/*****************************************************************************
 * cpu.c: cpu detection
 *****************************************************************************
 * Copyright (C) 2003-2011 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

28 29 30 31
#define _GNU_SOURCE // for sched_getaffinity
#include "common.h"
#include "cpu.h"

Steven Walters's avatar
Steven Walters committed
32
#if HAVE_PTHREAD && SYS_LINUX
33 34
#include <sched.h>
#endif
Steven Walters's avatar
Steven Walters committed
35
#if SYS_BEOS
36 37
#include <kernel/OS.h>
#endif
Steven Walters's avatar
Steven Walters committed
38
#if SYS_MACOSX || SYS_FREEBSD
39 40 41
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
Steven Walters's avatar
Steven Walters committed
42
#if SYS_OPENBSD
43 44 45 46
#include <sys/param.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#endif
47

48 49 50
const x264_cpu_name_t x264_cpu_names[] = {
    {"Altivec", X264_CPU_ALTIVEC},
//  {"MMX",     X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
51 52
    {"MMX2",    X264_CPU_MMX|X264_CPU_MMXEXT},
    {"MMXEXT",  X264_CPU_MMX|X264_CPU_MMXEXT},
53 54
//  {"SSE",     X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
    {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
55
    {"SSE2",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
56
    {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
57 58
    {"SSE3",    X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
    {"SSSE3",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
Fiona Glaser's avatar
Fiona Glaser committed
59
    {"FastShuffle",   X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
60 61
    {"SSE4.1",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
    {"SSE4.2",  X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
Fiona Glaser's avatar
Fiona Glaser committed
62
    {"AVX", X264_CPU_AVX},
63 64
    {"Cache32", X264_CPU_CACHELINE_32},
    {"Cache64", X264_CPU_CACHELINE_64},
Fiona Glaser's avatar
Fiona Glaser committed
65
    {"SSEMisalign", X264_CPU_SSE_MISALIGN},
66
    {"LZCNT", X264_CPU_LZCNT},
67
    {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
68 69 70
    {"ARMv6", X264_CPU_ARMV6},
    {"NEON",  X264_CPU_NEON},
    {"Fast_NEON_MRC",  X264_CPU_FAST_NEON_MRC},
71 72
    {"SlowCTZ", X264_CPU_SLOW_CTZ},
    {"SlowAtom", X264_CPU_SLOW_ATOM},
73 74 75
    {"", 0},
};

Steven Walters's avatar
Steven Walters committed
76
#if (ARCH_PPC && SYS_LINUX) || (ARCH_ARM && !HAVE_NEON)
77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

/* SIGILL handler used while probing for optional instructions.
 *
 * If a probe is armed (canjump != 0), the handler disarms it and unwinds to
 * the sigsetjmp() point stored in jmpbuf, which then sees a return value of 1.
 * If no probe is armed, the signal is not ours: the default disposition is
 * restored and the signal is raised again. */
static void sigill_handler( int sig )
{
    if( !canjump )
    {
        signal( sig, SIG_DFL );
        raise( sig );
        /* raise() can return (e.g. when sig is blocked while its handler
         * runs). Return instead of falling through, so that we never jump
         * through a jmpbuf that no probe has armed. */
        return;
    }

    canjump = 0;
    siglongjmp( jmpbuf, 1 );
}
#endif
Fiona Glaser's avatar
Fiona Glaser committed
94

Steven Walters's avatar
Steven Walters committed
95
#if HAVE_MMX
Anton Mitrofanov's avatar
Anton Mitrofanov committed
96 97
int x264_cpu_cpuid_test( void );
uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
Laurent Aimar's avatar
Laurent Aimar committed
98 99 100 101 102

/* Derive the cacheline size from the CPUID leaf 2 descriptor bytes
 * ("Cache and TLB Information").
 *
 * Returns 32 or 64 according to the last descriptor byte found in one of the
 * tables below, or 0 if no byte is recognised. */
static int x264_cpu_cacheline_from_descriptors( void )
{
    static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
    /* 0x7d replaces a duplicated 0x7c entry; see the Intel descriptor table. */
    static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67,
                                        0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7f, 0x86, 0x87, 0 };
    uint32_t buf[4];
    int cache = 0;
    int max, i = 0;

    do {
        x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
        /* The low byte of eax is the iteration count, not a descriptor. */
        max = buf[0]&0xff;
        buf[0] &= ~0xff;
        for( int j = 0; j < 4; j++ )
        {
            /* Registers with bit 31 set are skipped. */
            if( buf[j]>>31 )
                continue;
            for( ; buf[j]; buf[j] >>= 8 )
            {
                int id = buf[j]&0xff;
                /* strchr() also matches the terminating 0 of the id tables,
                 * so a zero byte must never be looked up. */
                if( !id )
                    continue;
                if( strchr( cache32_ids, id ) )
                    cache = 32;
                if( strchr( cache64_ids, id ) )
                    cache = 64;
            }
        }
    } while( ++i < max );

    return cache;
}

/* Detect x86 CPU capabilities through CPUID.
 *
 * Returns a bitmask of X264_CPU_* flags. The result is 0 when CPUID is not
 * usable (32-bit builds only), when leaf 0 reports no further leaves, or
 * when the CPU lacks MMX. */
uint32_t x264_cpu_detect( void )
{
    uint32_t cpu = 0;
    uint32_t eax, ebx, ecx, edx;
    uint32_t vendor[4] = {0};
    uint32_t max_extended_cap;

#if !ARCH_X86_64
    if( !x264_cpu_cpuid_test() )
        return 0;
#endif

    /* ebx, edx, ecx land in vendor[0..2] so the 12 vendor characters are
     * contiguous; vendor[3] stays 0 and terminates the string. */
    x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
    if( eax == 0 )
        return 0;

    /* Leaf 1: feature bits in edx/ecx. */
    x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
    if( edx&0x00800000 )
        cpu |= X264_CPU_MMX;
    else
        return 0;
    if( edx&0x02000000 )
        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
    if( edx&0x04000000 )
        cpu |= X264_CPU_SSE2;
    if( ecx&0x00000001 )
        cpu |= X264_CPU_SSE3;
    if( ecx&0x00000200 )
        cpu |= X264_CPU_SSSE3;
    if( ecx&0x00080000 )
        cpu |= X264_CPU_SSE4;
    if( ecx&0x00100000 )
        cpu |= X264_CPU_SSE42;
    if( ecx&0x10000000 )
        cpu |= X264_CPU_AVX;

    /* Speed hints implied by the instruction-set level. */
    if( cpu & X264_CPU_SSSE3 )
        cpu |= X264_CPU_SSE2_IS_FAST;
    if( cpu & X264_CPU_SSE4 )
        cpu |= X264_CPU_SHUFFLE_IS_FAST;

    /* Highest supported extended leaf; kept unsigned because the values of
     * interest are >= 0x80000000. */
    x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
    max_extended_cap = eax;

    if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
    {
        cpu |= X264_CPU_SLOW_CTZ;
        x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
        if( edx&0x00400000 )
            cpu |= X264_CPU_MMXEXT;
        if( cpu & X264_CPU_SSE2 )
        {
            if( ecx&0x00000040 ) /* SSE4a */
            {
                cpu |= X264_CPU_SSE2_IS_FAST;
                cpu |= X264_CPU_LZCNT;
                cpu |= X264_CPU_SHUFFLE_IS_FAST;
                cpu &= ~X264_CPU_SLOW_CTZ;
            }
            else
                cpu |= X264_CPU_SSE2_IS_SLOW;

            if( ecx&0x00000080 ) /* Misalign SSE */
            {
                cpu |= X264_CPU_SSE_MISALIGN;
                x264_cpu_mask_misalign_sse();
            }
        }
    }

    if( !strcmp((char*)vendor, "GenuineIntel") )
    {
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        /* Base family/model combined with the extended family/model fields. */
        int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
        /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
         * theoretically support sse2, but it's significantly slower than mmx for
         * almost all of x264's functions, so let's just pretend they don't. */
        if( family == 6 && (model == 9 || model == 13 || model == 14) )
        {
            cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
            assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
        }
        /* Detect Atom CPU */
        if( family == 6 && model == 28 )
        {
            cpu |= X264_CPU_SLOW_ATOM;
            cpu |= X264_CPU_SLOW_CTZ;
        }
    }

    if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
    {
        /* cacheline size is specified in 3 places, any of which may be missing */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        int cache = (ebx&0xff00)>>5; // cflush size
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx&0xff; // cacheline size
        }
        if( !cache )
            cache = x264_cpu_cacheline_from_descriptors();

        if( cache == 32 )
            cpu |= X264_CPU_CACHELINE_32;
        else if( cache == 64 )
            cpu |= X264_CPU_CACHELINE_64;
        else
            x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
    }

#if BROKEN_STACK_ALIGNMENT
    cpu |= X264_CPU_STACK_MOD4;
#endif

    return cpu;
}

Steven Walters's avatar
Steven Walters committed
241
#elif ARCH_PPC
Laurent Aimar's avatar
Laurent Aimar committed
242

Steven Walters's avatar
Steven Walters committed
243
#if SYS_MACOSX || SYS_OPENBSD
244
#include <sys/sysctl.h>
Laurent Aimar's avatar
Laurent Aimar committed
245 246
/* AltiVec detection through sysctl.
 *
 * Returns X264_CPU_ALTIVEC when the sysctl query succeeds and reports a
 * non-zero value, otherwise 0. */
uint32_t x264_cpu_detect( void )
{
    /* Thank you VLC */
#if SYS_OPENBSD
    int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
#else
    int mib[2] = { CTL_HW, HW_VECTORUNIT };
#endif
    int vector_unit = 0;
    size_t len = sizeof( vector_unit );

    if( sysctl( mib, 2, &vector_unit, &len, NULL, 0 ) != 0 )
        return 0;

    return vector_unit ? X264_CPU_ALTIVEC : 0;
}

Steven Walters's avatar
Steven Walters committed
264
#elif SYS_LINUX
265

266 267
/* AltiVec detection by executing a vector instruction under a temporary
 * SIGILL handler.
 *
 * Returns X264_CPU_ALTIVEC if the probe runs without raising SIGILL, 0 if
 * sigill_handler jumped back to the sigsetjmp() point. The previous SIGILL
 * disposition is reinstated on both paths. */
uint32_t x264_cpu_detect( void )
{
    /* static, so the value is still valid after siglongjmp() */
    static void (*prev_handler)( int );

    prev_handler = signal( SIGILL, sigill_handler );

    if( sigsetjmp( jmpbuf, 1 ) == 0 )
    {
        /* Direct return from sigsetjmp(): arm the handler and run the probe. */
        canjump = 1;
        asm volatile( "mtspr 256, %0\n\t"
                      "vand 0, 0, 0\n\t"
                      :
                      : "r"(-1) );
        canjump = 0;

        signal( SIGILL, prev_handler );
        return X264_CPU_ALTIVEC;
    }

    /* Arrived here through siglongjmp(): the probe trapped. */
    signal( SIGILL, prev_handler );
    return 0;
}
#endif

Steven Walters's avatar
Steven Walters committed
290
#elif ARCH_ARM
291

292 293
void x264_cpu_neon_test( void );
int x264_cpu_fast_neon_mrc_test( void );
294 295 296 297

/* ARM capability detection.
 *
 * Returns a bitmask of X264_CPU_* flags. Nothing is reported unless the build
 * has HAVE_ARMV6. Builds without HAVE_NEON probe for NEON at run time under a
 * temporary SIGILL handler and stop at X264_CPU_ARMV6 if the probe traps. */
uint32_t x264_cpu_detect( void )
{
    int caps = 0;
#if HAVE_ARMV6
    caps |= X264_CPU_ARMV6;

    // the run-time probe is skipped when compiled with -mfpu=neon
#if !HAVE_NEON
    /* static, so the value is still valid after siglongjmp() */
    static void (* prev_handler)( int );
    prev_handler = signal( SIGILL, sigill_handler );
    if( sigsetjmp( jmpbuf, 1 ) != 0 )
    {
        /* Back here through siglongjmp(): the NEON test raised SIGILL. */
        signal( SIGILL, prev_handler );
        return caps;
    }

    canjump = 1;
    x264_cpu_neon_test();
    canjump = 0;
    signal( SIGILL, prev_handler );
#endif

    caps |= X264_CPU_NEON;

    // fast neon -> arm (Cortex-A9) detection relies on user access to the
    // cycle counter; this assumes ARMv7 performance counters.
    // NEON requires at least ARMv7, ARMv8 may require changes here, but
    // hopefully this hacky detection method will have been replaced by then.
    // Note that there is potential for a race condition if another program or
    // x264 instance disables or reinits the counters while x264 is using them,
    // which may result in incorrect detection and the counters stuck enabled.
    if( x264_cpu_fast_neon_mrc_test() )
        caps |= X264_CPU_FAST_NEON_MRC;
    // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif
    return caps;
}

Laurent Aimar's avatar
Laurent Aimar committed
332 333 334 335 336 337 338
#else

/* Fallback for builds with no architecture-specific detection: reports no
 * capability flags. */
uint32_t x264_cpu_detect( void )
{
    const uint32_t no_capabilities = 0;
    return no_capabilities;
}

Loren Merritt's avatar
Loren Merritt committed
339 340
#endif

341 342
/* Returns the number of processors to use for threading.
 *
 * Builds without HAVE_THREAD, and platforms with no query implemented here,
 * always report 1. A failed affinity or sysctl query also reports 1. */
int x264_cpu_num_processors( void )
{
#if !HAVE_THREAD
    return 1;

#elif defined(_WIN32)
    return x264_pthread_num_processors_np();

#elif SYS_LINUX
    cpu_set_t p_aff;
    memset( &p_aff, 0, sizeof(p_aff) );
    if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
        return 1;
    /* Count every set bit of the mask: sizeof() is in bytes, so the bit
     * count is 8 times that. */
    int np = 0;
    for( unsigned int bit = 0; bit < 8 * sizeof(p_aff); bit++ )
        np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
    return np;

#elif SYS_BEOS
    system_info info;
    get_system_info( &info );
    return info.cpu_count;

#elif SYS_MACOSX || SYS_FREEBSD || SYS_OPENBSD
    int ncpu;
    size_t length = sizeof( ncpu );
#if SYS_OPENBSD
    int mib[2] = { CTL_HW, HW_NCPU };
    if( sysctl(mib, 2, &ncpu, &length, NULL, 0) )
#else
    if( sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0) )
#endif
    {
        /* query failed: fall back to a single processor */
        ncpu = 1;
    }
    return ncpu;

#else
    return 1;
#endif
}