/*****************************************************************************
 * cpu.c: cpu detection
 *****************************************************************************
 * Copyright (C) 2003-2010 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

28 29 30 31
#define _GNU_SOURCE // for sched_getaffinity
#include "common.h"
#include "cpu.h"

Steven Walters's avatar
Steven Walters committed
32
#if HAVE_PTHREAD && SYS_LINUX
33 34
#include <sched.h>
#endif
Steven Walters's avatar
Steven Walters committed
35
#if SYS_BEOS
36 37
#include <kernel/OS.h>
#endif
Steven Walters's avatar
Steven Walters committed
38
#if SYS_MACOSX || SYS_FREEBSD
39 40 41
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
Steven Walters's avatar
Steven Walters committed
42
#if SYS_OPENBSD
43 44 45 46
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
47

48 49 50
const x264_cpu_name_t x264_cpu_names[] = {
    {"Altivec", X264_CPU_ALTIVEC},
//  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
51 52
    {"MMX2",    X264_CPU_MMX|X264_CPU_MMXEXT},
    {"MMXEXT",  X264_CPU_MMX|X264_CPU_MMXEXT},
53 54
//  {"SSE",     X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
    {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
55
    {"SSE2",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
56
    {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
57 58
    {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
    {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
Fiona Glaser's avatar
Fiona Glaser committed
59
    {"FastShuffle",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
60 61
    {"SSE4.1",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4.2",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
62 63
    {"Cache32", X264_CPU_CACHELINE_32},
    {"Cache64", X264_CPU_CACHELINE_64},
Fiona Glaser's avatar
Fiona Glaser committed
64
    {"SSEMisalign", X264_CPU_SSE_MISALIGN},
65
    {"LZCNT", X264_CPU_LZCNT},
66
    {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
67 68 69
    {"ARMv6", X264_CPU_ARMV6},
    {"NEON",  X264_CPU_NEON},
    {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
70 71
    {"SlowCTZ", X264_CPU_SLOW_CTZ},
    {"SlowAtom", X264_CPU_SLOW_ATOM},
72 73 74
    {"", 0},
};

Steven Walters's avatar
Steven Walters committed
75
#if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON)
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

/* SIGILL handler installed while probing for optional instructions.
 *
 * When canjump is set, a probe is in progress: the flag is cleared and
 * control returns to the sigsetjmp() point stored in jmpbuf with value 1.
 * When canjump is clear, the signal did not come from a probe, so the
 * default disposition is restored and the signal is raised again. */
static void sigill_handler( int sig )
{
    if( !canjump )
    {
        signal( sig, SIG_DFL );
        raise( sig );
        /* Do not fall through: jmpbuf is not armed outside a probe window,
         * and jumping through a stale jump buffer is undefined. Returning
         * lets the re-raised signal take its default action. */
        return;
    }

    canjump = 0;
    siglongjmp( jmpbuf, 1 );
}
#endif
Fiona Glaser's avatar
Fiona Glaser committed
93

Steven Walters's avatar
Steven Walters committed
94
#if HAVE_MMX
Anton Mitrofanov's avatar
Anton Mitrofanov committed
95 96
int x264_cpu_cpuid_test( void );
uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
Laurent Aimar's avatar
Laurent Aimar committed
97 98 99 100 101

uint32_t x264_cpu_detect( void )
{
    uint32_t cpu = 0;
    uint32_t eax, ebx, ecx, edx;
102 103 104
    uint32_t vendor[4] = {0};
    int max_extended_cap;
    int cache;
Laurent Aimar's avatar
Laurent Aimar committed
105

Steven Walters's avatar
Steven Walters committed
106
#if !ARCH_X86_64
Laurent Aimar's avatar
Laurent Aimar committed
107 108
    if( !x264_cpu_cpuid_test() )
        return 0;
109
#endif
Laurent Aimar's avatar
Laurent Aimar committed
110

111
    x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
Laurent Aimar's avatar
Laurent Aimar committed
112 113 114 115
    if( eax == 0 )
        return 0;

    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
116 117 118
    if( edx&0x00800000 )
        cpu |= X264_CPU_MMX;
    else
Laurent Aimar's avatar
Laurent Aimar committed
119
        return 0;
120
    if( edx&0x02000000 )
Laurent Aimar's avatar
Laurent Aimar committed
121
        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
122
    if( edx&0x04000000 )
Laurent Aimar's avatar
Laurent Aimar committed
123
        cpu |= X264_CPU_SSE2;
124
    if( ecx&0x00000001 )
125
        cpu |= X264_CPU_SSE3;
126
    if( ecx&0x00000200 )
127
        cpu |= X264_CPU_SSSE3;
128 129
    if( ecx&0x00080000 )
        cpu |= X264_CPU_SSE4;
130 131
    if( ecx&0x00100000 )
        cpu |= X264_CPU_SSE42;
Laurent Aimar's avatar
Laurent Aimar committed
132

133 134 135
    if( cpu & X264_CPU_SSSE3 )
        cpu |= X264_CPU_SSE2_IS_FAST;
    if( cpu & X264_CPU_SSE4 )
Fiona Glaser's avatar
Fiona Glaser committed
136
        cpu |= X264_CPU_SHUFFLE_IS_FAST;
137

Laurent Aimar's avatar
Laurent Aimar committed
138
    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
139 140 141
    max_extended_cap = eax;

    if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
Laurent Aimar's avatar
Laurent Aimar committed
142
    {
143
        cpu |= X264_CPU_SLOW_CTZ;
144 145 146
        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
        if( edx&0x00400000 )
            cpu |= X264_CPU_MMXEXT;
147 148 149
        if( cpu & X264_CPU_SSE2 )
        {
            if( ecx&0x00000040 ) /* SSE4a */
Fiona Glaser's avatar
Fiona Glaser committed
150
            {
151
                cpu |= X264_CPU_SSE2_IS_FAST;
152
                cpu |= X264_CPU_LZCNT;
Fiona Glaser's avatar
Fiona Glaser committed
153
                cpu |= X264_CPU_SHUFFLE_IS_FAST;
154
                cpu &= ~X264_CPU_SLOW_CTZ;
Fiona Glaser's avatar
Fiona Glaser committed
155
            }
156 157
            else
                cpu |= X264_CPU_SSE2_IS_SLOW;
158 159 160 161 162 163

            if( ecx&0x00000080 ) /* Misalign SSE */
            {
                cpu |= X264_CPU_SSE_MISALIGN;
                x264_cpu_mask_misalign_sse();
            }
164
        }
Laurent Aimar's avatar
Laurent Aimar committed
165 166
    }

167
    if( !strcmp((char*)vendor, "GenuineIntel") )
Laurent Aimar's avatar
Laurent Aimar committed
168
    {
169
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
170 171
        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
172 173 174
        /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
         * theoretically support sse2, but it's significantly slower than mmx for
         * almost all of x264's functions, so let's just pretend they don't. */
175
        if( family == 6 && (model == 9 || model == 13 || model == 14) )
176 177 178 179
        {
            cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
            assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
        }
180 181 182 183 184 185
        /* Detect Atom CPU */
        if( family == 6 && model == 28 )
        {
            cpu |= X264_CPU_SLOW_ATOM;
            cpu |= X264_CPU_SLOW_CTZ;
        }
Laurent Aimar's avatar
Laurent Aimar committed
186
    }
187

188
    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
Laurent Aimar's avatar
Laurent Aimar committed
189
    {
190 191 192 193 194 195 196 197 198 199 200 201
        /* cacheline size is specified in 3 places, any of which may be missing */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        cache = (ebx&0xff00)>>5; // cflush size
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx&0xff; // cacheline size
        }
        if( !cache )
        {
            // Cache and TLB Information
            static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
202 203
            static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                                0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
204
            uint32_t buf[4];
205
            int max, i = 0;
206 207 208 209
            do {
                x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
                max = buf[0]&0xff;
                buf[0] &= ~0xff;
210
                for( int j = 0; j < 4; j++ )
211 212 213 214 215 216 217 218 219 220 221
                    if( !(buf[j]>>31) )
                        while( buf[j] )
                        {
                            if( strchr( cache32_ids, buf[j]&0xff ) )
                                cache = 32;
                            if( strchr( cache64_ids, buf[j]&0xff ) )
                                cache = 64;
                            buf[j] >>= 8;
                        }
            } while( ++i < max );
        }
Laurent Aimar's avatar
Laurent Aimar committed
222

223 224 225 226 227
        if( cache == 32 )
            cpu |= X264_CPU_CACHELINE_32;
        else if( cache == 64 )
            cpu |= X264_CPU_CACHELINE_64;
        else
228
            x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
229
    }
230

Steven Walters's avatar
Steven Walters committed
231
#if BROKEN_STACK_ALIGNMENT
232 233 234
    cpu |= X264_CPU_STACK_MOD4;
#endif

Laurent Aimar's avatar
Laurent Aimar committed
235 236 237
    return cpu;
}

Steven Walters's avatar
Steven Walters committed
238
#elif ARCH_PPC
Laurent Aimar's avatar
Laurent Aimar committed
239

Steven Walters's avatar
Steven Walters committed
240
#if SYS_MACOSX || SYS_OPENBSD
241
#include <sys/sysctl.h>
Laurent Aimar's avatar
Laurent Aimar committed
242 243
uint32_t x264_cpu_detect( void )
{
244
    /* Thank you VLC */
Laurent Aimar's avatar
Laurent Aimar committed
245
    uint32_t cpu = 0;
Steven Walters's avatar
Steven Walters committed
246
#if SYS_OPENBSD
247 248
    int      selectors[2] = { CTL_MACHDEP, CPU_ALTIVEC };
#else
Laurent Aimar's avatar
Laurent Aimar committed
249
    int      selectors[2] = { CTL_HW, HW_VECTORUNIT };
250
#endif
Laurent Aimar's avatar
Laurent Aimar committed
251 252 253 254 255 256 257 258 259 260
    int      has_altivec = 0;
    size_t   length = sizeof( has_altivec );
    int      error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );

    if( error == 0 && has_altivec != 0 )
        cpu |= X264_CPU_ALTIVEC;

    return cpu;
}

Steven Walters's avatar
Steven Walters committed
261
#elif SYS_LINUX
262

263 264
uint32_t x264_cpu_detect( void )
{
265
    static void (*oldsig)( int );
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282

    oldsig = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        signal( SIGILL, oldsig );
        return 0;
    }

    canjump = 1;
    asm volatile( "mtspr 256, %0\n\t"
                  "vand 0, 0, 0\n\t"
                  :
                  : "r"(-1) );
    canjump = 0;

    signal( SIGILL, oldsig );

283 284 285 286
    return X264_CPU_ALTIVEC;
}
#endif

Steven Walters's avatar
Steven Walters committed
287
#elif ARCH_ARM
288

289 290
void x264_cpu_neon_test( void );
int x264_cpu_fast_neon_mrc_test( void );
291 292 293 294

/* ARM capability detection.
 *
 * Returns 0 unless built with HAVE_ARMV6. Otherwise returns X264_CPU_ARMV6,
 * plus X264_CPU_NEON when NEON is usable (assumed when built with HAVE_NEON,
 * probed under a SIGILL handler otherwise), plus X264_CPU_FAST_NEON_MRC when
 * x264_cpu_fast_neon_mrc_test() reports non-zero. */
uint32_t x264_cpu_detect( void )
{
    int flags = 0;
#if HAVE_ARMV6
    flags |= X264_CPU_ARMV6;

    // don't do this hack if compiled with -mfpu=neon
#if !HAVE_NEON
    static void (* oldsig)( int );
    oldsig = signal( SIGILL, sigill_handler );
    /* Non-zero return means the NEON probe raised SIGILL. */
    if( sigsetjmp( jmpbuf, 1 ) )
    {
        signal( SIGILL, oldsig );
        return flags;
    }

    /* canjump marks the window in which a SIGILL belongs to the probe. */
    canjump = 1;
    x264_cpu_neon_test();
    canjump = 0;
    signal( SIGILL, oldsig );
#endif

    flags |= X264_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif
    return flags;
}

Laurent Aimar's avatar
Laurent Aimar committed
329 330 331 332 333 334 335
#else

/* Fallback used when none of the architecture-specific detection routines
 * above is compiled in: reports an empty capability mask. */
uint32_t x264_cpu_detect( void )
{
    const uint32_t no_capabilities = 0;

    return no_capabilities;
}

Loren Merritt's avatar
Loren Merritt committed
336 337
#endif

338 339
/* Returns the number of processors to use for threading.
 *
 * - Without HAVE_THREAD, and on systems with no branch below: 1.
 * - _WIN32: whatever x264_pthread_num_processors_np() reports.
 * - SYS_LINUX: the number of CPUs set in this process's affinity mask,
 *   or 1 if the mask cannot be read.
 * - SYS_BEOS: cpu_count from get_system_info().
 * - SYS_MACOSX / SYS_FREEBSD / SYS_OPENBSD: the hw.ncpu sysctl value,
 *   or 1 if the query fails. */
int x264_cpu_num_processors( void )
{
#if !HAVE_THREAD
    return 1;

#elif defined(_WIN32)
    return x264_pthread_num_processors_np();

#elif SYS_LINUX
    cpu_set_t p_aff;
    memset( &p_aff, 0, sizeof(p_aff) );
    /* On failure the mask holds no information; fall back to one processor
     * like the sysctl branch below does, instead of reporting zero. */
    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
        return 1;

    /* sizeof() is in bytes and the mask holds 8 CPUs per byte, so the bit
     * index must run to 8 * sizeof(p_aff) to cover the whole mask. */
    int np = 0;
    for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ )
        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
    return np;

#elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;

#elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD
    int ncpu;
    size_t length = sizeof( ncpu );
#if SYS_OPENBSD
    int mib[2] = { CTL_HW, HW_NCPU };
    if( sysctl(mib, 2, &ncpu, &length, NULL, 0) )
#else
    if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) )
#endif
    {
        /* Query failed: assume a single processor. */
        ncpu = 1;
    }
    return ncpu;

#else
    return 1;
#endif
}