Commit c0c0e1f4 authored by Loren Merritt

many changes to which asm functions are enabled on which cpus.

with Phenom, 3dnow is no longer equivalent to "sse2 is slow", so make a new flag for that.
some sse2 functions are useful only on Core2 and Phenom, so make a "sse2 is fast" flag for that.
some ssse3 instructions didn't become useful until Penryn, so yet another flag.
disable sse2 completely on Pentium M and Core1, because it's uniformly slower than mmx.
enable some sse2 functions on Athlon64 that always were faster and we just didn't notice.
remove mc_luma_sse3, because the only cpu that has lddqu (namely Pentium 4D) doesn't have "sse2 is fast".
don't print mmx1, sse1, or 3dnow in the detected cpuflags, since we don't really have any such functions. likewise, don't print sse3 unless it's actually used (Pentium 4D).
parent f9ad5ee2
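
To make the intent of the three new hints concrete, here is a minimal sketch of how init code branches on them. The function-pointer table and the stub names below are hypothetical stand-ins for x264's asm implementations; only the X264_CPU_* flags are the ones defined by this commit in x264.h.

    #include "x264.h"   /* X264_CPU_* flags as defined in this commit */

    /* Hypothetical stand-ins for x264's asm implementations (sketch only). */
    typedef void (*fn_t)( void );
    static void sad_mmxext( void ) {}
    static void sad_sse2( void ) {}
    static void mc_luma_sse2_stub( void ) {}
    static void satd_ssse3( void ) {}
    static void satd_ssse3_phadd( void ) {}

    typedef struct { fn_t sad, mc_luma, satd; } funcs_sketch_t;

    static void init_sketch( int cpu, funcs_sketch_t *pf )
    {
        if( cpu & X264_CPU_MMXEXT )
            pf->sad = sad_mmxext;
        /* SSE2_IS_SLOW (e.g. Athlon64 without SSE4a): most sse2 versions lose to mmx, skip them. */
        if( (cpu & X264_CPU_SSE2) && !(cpu & X264_CPU_SSE2_IS_SLOW) )
            pf->sad = sad_sse2;
        /* SSE2_IS_FAST (Core2, Phenom): a few functions are only enabled here. */
        if( cpu & X264_CPU_SSE2_IS_FAST )
            pf->mc_luma = mc_luma_sse2_stub;
        if( cpu & X264_CPU_SSSE3 )
            pf->satd = satd_ssse3;
        /* PHADD_IS_FAST (Penryn): phadd-based satd is slower on pre-Penryn Core2. */
        if( cpu & X264_CPU_PHADD_IS_FAST )
            pf->satd = satd_ssse3_phadd;
    }
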
......@@ -34,24 +34,23 @@
#endif
#include "common.h"
#include "cpu.h"
const struct {
const char name[8];
int flags;
} x264_cpu_names[] = {
{"MMX", X264_CPU_MMX},
const x264_cpu_name_t x264_cpu_names[] = {
{"Altivec", X264_CPU_ALTIVEC},
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
{"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT},
{"MMXEXT", X264_CPU_MMX|X264_CPU_MMXEXT},
{"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
{"SSE1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE},
// {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
{"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
{"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
{"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"3DNow", X264_CPU_3DNOW},
{"Altivec", X264_CPU_ALTIVEC},
{"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"", 0},
};
......@@ -92,57 +91,87 @@ uint32_t x264_cpu_detect( void )
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
if( cpu & X264_CPU_SSE4 )
cpu |= X264_CPU_PHADD_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
{
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx&0x80000000 )
cpu |= X264_CPU_3DNOW;
if( edx&0x00400000 )
cpu |= X264_CPU_MMXEXT;
if( cpu & X264_CPU_SSE2 )
{
if( ecx&0x00000040 ) /* SSE4a */
cpu |= X264_CPU_SSE2_IS_FAST;
else
cpu |= X264_CPU_SSE2_IS_SLOW;
}
}
if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
cpu |= X264_CPU_CACHELINE_SPLIT;
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
cache = (ebx&0xff00)>>5; // cflush size
if( !cache && max_extended_cap >= 0x80000006 )
if( !strcmp((char*)vendor, "GenuineIntel") )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
cache = ecx&0xff; // cacheline size
int family, model, stepping;
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
stepping = eax&0xf;
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
* theoretically support sse2, but it's significantly slower than mmx for
* almost all of x264's functions, so let's just pretend they don't. */
if( family==6 && (model==9 || model==13 || model==14) )
{
cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
}
}
if( !cache )
if( !strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead") )
{
// Cache and TLB Information
static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
uint32_t buf[4];
int max, i=0, j;
do {
x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
max = buf[0]&0xff;
buf[0] &= ~0xff;
for(j=0; j<4; j++)
if( !(buf[j]>>31) )
while( buf[j] )
{
if( strchr( cache32_ids, buf[j]&0xff ) )
cache = 32;
if( strchr( cache64_ids, buf[j]&0xff ) )
cache = 64;
buf[j] >>= 8;
}
} while( ++i < max );
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
cache = (ebx&0xff00)>>5; // cflush size
if( !cache && max_extended_cap >= 0x80000006 )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
cache = ecx&0xff; // cacheline size
}
if( !cache )
{
// Cache and TLB Information
static const char cache32_ids[] = { 0x0a, 0x0c, 0x41, 0x42, 0x43, 0x44, 0x45, 0x82, 0x83, 0x84, 0x85, 0 };
static const char cache64_ids[] = { 0x22, 0x23, 0x25, 0x29, 0x2c, 0x46, 0x47, 0x49, 0x60, 0x66, 0x67, 0x68, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7c, 0x7f, 0x86, 0x87, 0 };
uint32_t buf[4];
int max, i=0, j;
do {
x264_cpu_cpuid( 2, buf+0, buf+1, buf+2, buf+3 );
max = buf[0]&0xff;
buf[0] &= ~0xff;
for(j=0; j<4; j++)
if( !(buf[j]>>31) )
while( buf[j] )
{
if( strchr( cache32_ids, buf[j]&0xff ) )
cache = 32;
if( strchr( cache64_ids, buf[j]&0xff ) )
cache = 64;
buf[j] >>= 8;
}
} while( ++i < max );
}
if( cache == 32 )
cpu |= X264_CPU_CACHELINE_32;
else if( cache == 64 )
cpu |= X264_CPU_CACHELINE_64;
else
fprintf( stderr, "x264 [warning]: unable to determine cacheline size\n" );
}
if( cache == 32 )
cpu |= X264_CPU_CACHELINE_32;
if( cache == 64 )
cpu |= X264_CPU_CACHELINE_64;
return cpu;
}
......
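
The cacheline probe in the cpu.c hunk above pulls the line size from three sources in order: CPUID leaf 1 (EBX bits 15:8, the CLFLUSH size in 8-byte units, hence the `(ebx&0xff00)>>5`), extended leaf 0x80000006 (ECX bits 7:0, line size in bytes), and finally the leaf-2 cache descriptors. A condensed, standalone sketch of the same order of fallbacks; x264_cpu_cpuid() is the real wrapper used above, everything else is illustrative:

    #include <stdint.h>
    #include "cpu.h"   /* for x264_cpu_cpuid(), the same wrapper used above */

    /* Sketch of the three-source cacheline probe; the leaf-2 descriptor walk
     * is elided for brevity (see the cache32_ids/cache64_ids tables above). */
    static int cacheline_size_sketch( uint32_t max_extended_cap )
    {
        uint32_t eax, ebx, ecx, edx;
        int cache;

        /* CPUID.1: EBX bits 15:8 hold the CLFLUSH line size in 8-byte units. */
        x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
        cache = ((ebx >> 8) & 0xff) * 8;

        /* CPUID.0x80000006: ECX bits 7:0 hold the L2 line size in bytes. */
        if( !cache && max_extended_cap >= 0x80000006 )
        {
            x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );
            cache = ecx & 0xff;
        }

        /* Still unknown: cpu.c falls back to walking the CPUID leaf-2
         * cache/TLB descriptors against the cache32/cache64 id tables. */
        return cache;   /* 0 means "could not determine" */
    }
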
......@@ -42,9 +42,10 @@ void x264_stack_align( void (*func)(x264_t*), x264_t *arg );
#define x264_stack_align(func,arg) func(arg)
#endif
extern const struct {
const char name[8];
typedef struct {
const char name[12];
int flags;
} x264_cpu_names[];
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];
#endif
......@@ -394,20 +394,18 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_MMX )
{
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
dctf->add8x8_idct = x264_add8x8_idct_mmx;
dctf->add16x16_idct = x264_add16x16_idct_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
#ifndef ARCH_X86_64
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
dctf->add8x8_idct = x264_add8x8_idct_mmx;
dctf->add16x16_idct = x264_add16x16_idct_mmx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
......@@ -419,9 +417,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
{
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
......
......@@ -44,11 +44,10 @@ x264_frame_t *x264_frame_new( x264_t *h )
if( h->param.b_interlaced )
i_lines = ( i_lines + 31 ) & -32;
if( h->param.cpu&X264_CPU_CACHELINE_SPLIT )
{
int align = h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 64;
i_stride = (i_stride + align-1) & -align;
}
if( h->param.cpu&X264_CPU_CACHELINE_64 )
i_stride = (i_stride + 63) & ~63;
else if( h->param.cpu&X264_CPU_CACHELINE_32 )
i_stride = (i_stride + 31) & ~31;
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
......
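
The frame.c change above drops the generic CACHELINE_SPLIT branch in favour of explicit masks: `(i_stride + 63) & ~63` simply rounds the stride up to the next multiple of 64 so every row starts on a cacheline boundary (1921 becomes 1984; 1920 is already aligned and stays 1920). A minimal sketch of that round-up, assuming only the two cacheline flags from x264.h:

    /* Round the stride up so each row starts on a cacheline boundary (sketch). */
    static int align_stride_sketch( int i_stride, int cpu )
    {
        if( cpu & X264_CPU_CACHELINE_64 )
            i_stride = (i_stride + 63) & ~63;
        else if( cpu & X264_CPU_CACHELINE_32 )
            i_stride = (i_stride + 31) & ~31;
        return i_stride;
    }
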
......@@ -557,23 +557,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
if( cpu&X264_CPU_CACHELINE_SPLIT )
if( cpu&X264_CPU_CACHELINE_32 )
{
if( cpu&X264_CPU_CACHELINE_32 )
{
INIT5( sad, _cache32_mmxext );
INIT4( sad_x3, _cache32_mmxext );
INIT4( sad_x4, _cache32_mmxext );
}
else
{
INIT5( sad, _cache64_mmxext );
INIT4( sad_x3, _cache64_mmxext );
INIT4( sad_x4, _cache64_mmxext );
}
INIT5( sad, _cache32_mmxext );
INIT4( sad_x3, _cache32_mmxext );
INIT4( sad_x4, _cache32_mmxext );
}
else if( cpu&X264_CPU_CACHELINE_64 )
{
INIT5( sad, _cache64_mmxext );
INIT4( sad_x3, _cache64_mmxext );
INIT4( sad_x4, _cache64_mmxext );
}
#else
if( cpu&X264_CPU_CACHELINE_SPLIT )
if( cpu&X264_CPU_CACHELINE_64 )
{
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmxext;
......@@ -589,19 +586,15 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
// disable on AMD processors since it is slower
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_3DNOW) )
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
INIT_ADS( _sse2 );
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_SPLIT )
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_sse2 );
INIT2( sad_x3, _cache64_sse2 );
......@@ -609,10 +602,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
#endif
}
// these are faster on both Intel and AMD
if( cpu&X264_CPU_SSE2 )
{
INIT5( ssd, _sse2 );
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
......@@ -622,7 +617,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#endif
}
if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
{
INIT2( sad, _sse3 );
INIT2( sad_x3, _sse3 );
......@@ -643,20 +638,18 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
if( cpu&X264_CPU_CACHELINE_SPLIT )
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
}
if( cpu&X264_CPU_SSE4 )
{
// enabled on Penryn, but slower on Conroe
INIT5( satd, _ssse3_phadd );
INIT5( satd_x3, _ssse3_phadd );
INIT5( satd_x4, _ssse3_phadd );
if( cpu&X264_CPU_PHADD_IS_FAST )
{
INIT5( satd, _ssse3_phadd );
INIT5( satd_x3, _ssse3_phadd );
INIT5( satd_x4, _ssse3_phadd );
}
}
#endif //HAVE_MMX
......
......@@ -283,16 +283,12 @@ cglobal %1, 2,2,1
jmp %2
%endmacro
%ifndef ARCH_X86_64
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
%ifndef ARCH_X86_64
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx %+ .skip_prologue, 128, 8, 0, 0
......@@ -301,6 +297,9 @@ ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 1
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
SUB_NxN_DCT x264_sub16x16_dct_sse2, x264_sub8x8_dct_sse2 %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 0
......
......@@ -230,8 +230,7 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7
jg .height_loop
REP_RET
%macro PIXEL_AVG_SSE 1
cglobal x264_pixel_avg2_w16_%1, 6,7
cglobal x264_pixel_avg2_w16_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
......@@ -249,7 +248,7 @@ cglobal x264_pixel_avg2_w16_%1, 6,7
jg .height_loop
REP_RET
cglobal x264_pixel_avg2_w20_%1, 6,7
cglobal x264_pixel_avg2_w20_sse2, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
......@@ -272,12 +271,6 @@ cglobal x264_pixel_avg2_w20_%1, 6,7
sub r5d, 2
jg .height_loop
REP_RET
%endmacro
PIXEL_AVG_SSE sse2
%define movdqu lddqu
PIXEL_AVG_SSE sse3
%undef movdqu
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
......
......@@ -69,7 +69,6 @@ PIXEL_AVG_WALL(cache32_mmxext)
PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse3)
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
......@@ -104,7 +103,6 @@ PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_m
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
......@@ -118,7 +116,6 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
MC_COPY_WTAB(sse3,mmx,mmx,sse3)
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
......@@ -153,7 +150,6 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
MC_LUMA(cache64_sse3,cache64_sse3,sse3)
#define GET_REF(name)\
uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
......@@ -186,7 +182,6 @@ GET_REF(cache64_mmxext)
#endif
GET_REF(sse2)
GET_REF(cache64_sse2)
GET_REF(cache64_sse3)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
......@@ -270,7 +265,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mc_luma = mc_luma_cache32_mmxext;
pf->get_ref = get_ref_cache32_mmxext;
}
else if( cpu&X264_CPU_CACHELINE_SPLIT )
else if( cpu&X264_CPU_CACHELINE_64 )
{
pf->mc_luma = mc_luma_cache64_mmxext;
pf->get_ref = get_ref_cache64_mmxext;
......@@ -284,26 +279,22 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->memzero_aligned = x264_memzero_aligned_sse2;
pf->hpel_filter = x264_hpel_filter_sse2_amd;
// disable on AMD processors since it is slower
if( cpu&X264_CPU_3DNOW )
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
pf->mc_luma = mc_luma_sse2;
pf->get_ref = get_ref_sse2;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_CACHELINE_SPLIT )
if( cpu&X264_CPU_SSE2_IS_FAST )
{
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
/* lddqu doesn't work on Core2 */
if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
pf->mc_luma = mc_luma_sse2;
pf->get_ref = get_ref_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
{
pf->mc_luma = mc_luma_cache64_sse3;
pf->get_ref = get_ref_cache64_sse3;
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
}
......
......@@ -505,11 +505,13 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
if( !(cpu&X264_CPU_SSE2) || (cpu&X264_CPU_3DNOW) )
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_V] = predict_16x16_v_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
}
......
......@@ -660,9 +660,17 @@ x264_t *x264_encoder_open ( x264_param_t *param )
p = buf + sprintf( buf, "using cpu capabilities:" );
for( i=0; x264_cpu_names[i].flags; i++ )
{
if( !strcmp(x264_cpu_names[i].name, "SSE2")
&& param->cpu & (X264_CPU_SSE2_IS_FAST|X264_CPU_SSE2_IS_SLOW) )
continue;
if( !strcmp(x264_cpu_names[i].name, "SSE3")
&& (param->cpu & X264_CPU_SSSE3 || !(param->cpu & X264_CPU_CACHELINE_64)) )
continue;
if( (param->cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
}
if( !param->cpu )
p += sprintf( p, " none!" );
x264_log( h, X264_LOG_INFO, "%s\n", buf );
......
......@@ -120,9 +120,11 @@ static void print_bench(void)
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
......@@ -1112,6 +1114,8 @@ int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
*cpu_ref = *cpu_new;
*cpu_new |= flags;
if( *cpu_new & X264_CPU_SSE2_IS_FAST )
*cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
if( !quiet )
fprintf( stderr, "x264: %s\n", name );
return check_all_funcs( *cpu_ref, *cpu_new );
......@@ -1124,29 +1128,28 @@ int check_all_flags( void )
#ifdef HAVE_MMX
if( x264_cpu_detect() & X264_CPU_MMXEXT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMX" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" );
#ifdef ARCH_X86
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32);
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
}
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
}
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
......
......@@ -35,7 +35,7 @@
#include <stdarg.h>
#define X264_BUILD 59
#define X264_BUILD 60
/* x264_t:
* opaque handler for encoder */
......@@ -46,19 +46,19 @@ typedef struct x264_t x264_t;
****************************************************************************/
/* CPU flags
*/
#define X264_CPU_MMX 0x000001 /* mmx */
#define X264_CPU_MMXEXT 0x000002 /* mmx-ext*/
#define X264_CPU_SSE 0x000004 /* sse */
#define X264_CPU_SSE2 0x000008 /* sse 2 */
#define X264_CPU_3DNOW 0x000010 /* 3dnow! */
#define X264_CPU_3DNOWEXT 0x000020 /* 3dnow! ext */
#define X264_CPU_ALTIVEC 0x000040 /* altivec */
#define X264_CPU_SSE3 0x000080 /* sse 3 */
#define X264_CPU_SSSE3 0x000100 /* ssse 3 */
#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800
#define X264_CPU_SSE4 0x001000 /* sse 4.1 */
#define X264_CPU_CACHELINE_32 0x000001 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_64 0x000002 /* 32/64 is the size of a cacheline in bytes */
#define X264_CPU_ALTIVEC 0x000004
#define X264_CPU_MMX 0x000008
#define X264_CPU_MMXEXT 0x000010 /* MMX2 aka MMXEXT aka ISSE */
#define X264_CPU_SSE 0x000020
#define X264_CPU_SSE2 0x000040
#define X264_CPU_SSE2_IS_SLOW 0x000080 /* avoid most SSE2 functions on Athlon64 */
#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SSE3 0x000200
#define X264_CPU_SSSE3 0x000400
#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
#define X264_CPU_SSE4 0x001000 /* SSE4.1 */
/* Analyse flags
*/
......