Commit 57729402 authored by Fiona Glaser

Detect Atom CPU, enable appropriate asm functions

I'm not going to actually optimize for this pile of garbage unless someone pays me.
But it can't hurt to at least enable the correct functions based on benchmarks.

Also save some cache on Intel CPUs that don't need the decimate LUT due to having fast bsr/bsf.
parent 0f249f12
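
For reference, the Atom check added below keys off the family/model fields of CPUID leaf 1, exactly as the cpu.c hunk decodes them. The following is only a minimal standalone sketch of that decode, not part of the patch: it assumes GCC/Clang on x86 so that <cpuid.h> and __get_cpuid() are available, whereas the real code goes through x264_cpu_cpuid().

#include <cpuid.h>
#include <stdio.h>

int main( void )
{
    unsigned eax, ebx, ecx, edx;
    if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
        return 1;
    /* Same decode as x264_cpu_detect(): base + extended family/model fields. */
    int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
    int model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
    printf( "family %d, model %d%s\n", family, model,
            (family == 6 && model == 28) ? " -> Atom" : "" );
    return 0;
}

The first-generation in-order Atom identifies itself as family 6, model 28 (0x1c), which is the signature the patch tests before setting X264_CPU_SLOW_ATOM and X264_CPU_SLOW_CTZ.
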
@@ -64,6 +64,8 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"", 0},
};
@@ -135,6 +137,7 @@ uint32_t x264_cpu_detect( void )
if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
{
cpu |= X264_CPU_SLOW_CTZ;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx&0x00400000 )
cpu |= X264_CPU_MMXEXT;
@@ -145,6 +148,7 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_LZCNT;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
cpu &= ~X264_CPU_SLOW_CTZ;
}
else
cpu |= X264_CPU_SSE2_IS_SLOW;
@@ -159,11 +163,9 @@ uint32_t x264_cpu_detect( void )
if( !strcmp((char*)vendor, "GenuineIntel") )
{
int family, model, stepping;
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
stepping = eax&0xf;
int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
/* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
* theoretically support sse2, but it's significantly slower than mmx for
* almost all of x264's functions, so let's just pretend they don't. */
@@ -172,6 +174,12 @@ uint32_t x264_cpu_detect( void )
cpu &= ~(X264_CPU_SSE2|X264_CPU_SSE3);
assert(!(cpu&(X264_CPU_SSSE3|X264_CPU_SSE4)));
}
/* Detect Atom CPU */
if( family == 6 && model == 28 )
{
cpu |= X264_CPU_SLOW_ATOM;
cpu |= X264_CPU_SLOW_CTZ;
}
}
if( (!strcmp((char*)vendor, "GenuineIntel") || !strcmp((char*)vendor, "CyrixInstead")) && !(cpu&X264_CPU_SSE42))
@@ -457,7 +457,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
}
if( cpu&X264_CPU_SSSE3 )
if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
{
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
@@ -768,17 +768,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_SSSE3 )
{
INIT7( ssd, _ssse3 );
INIT7( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _ssse3 );
}
INIT_ADS( _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
if( !(cpu&X264_CPU_SLOW_ATOM) )
{
INIT7( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
INIT7( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
@@ -794,7 +797,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
@@ -312,6 +312,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
pf->decimate_score15 = x264_decimate_score15_mmxext;
pf->decimate_score16 = x264_decimate_score16_mmxext;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmxext;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
@@ -345,6 +350,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
}
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
@@ -369,6 +379,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_ssse3;
}
@@ -427,8 +427,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->weight = x264_mc_weight_wtab_sse2;
pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
if( !(cpu&X264_CPU_SLOW_ATOM) )
{
pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
}
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
@@ -481,7 +484,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->weight = x264_mc_weight_wtab_ssse3;
}
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_SSE4) )
@@ -583,9 +583,9 @@ DENOISE_DCT ssse3, 7
cextern decimate_table4
cextern decimate_table8
%macro DECIMATE4x4 2
%macro DECIMATE4x4 3
;A LUT is faster than bsf on AMD processors, and no slower on Intel
;A LUT is faster than bsf on AMD processors.
;This is not true for score64.
cglobal decimate_score%1_%2, 1,3
%ifdef PIC
@@ -605,6 +605,7 @@ cglobal decimate_score%1_%2, 1,3
%if %1==15
shr edx, 1
%endif
%if %3==1
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
cmp edx, ecx
@@ -617,8 +618,16 @@ cglobal decimate_score%1_%2, 1,3
shr edx, cl
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
%else
.loop:
bsf ecx, edx
shr edx, cl
add al, byte [table + rcx]
shr edx, 1
jne .loop
%endif
.ret:
REP_RET
RET
.ret9:
mov eax, 9
RET
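
To make the comment above concrete: the macro's new third parameter picks between the mask_table LUT path and the plain bit-scan loop added just above. The C sketch below is only an illustration of that loop, not x264's reference implementation; the function name, the uint32_t nonzero-coefficient mask, and the run_table parameter are assumptions for the example, with run_table standing in for decimate_table4 and __builtin_ctz standing in for bsf. Coefficients with |level| > 1 are assumed to have already been rejected (the .ret9 path).

#include <stdint.h>

/* Sum run-length scores over a bitmask of nonzero coefficients:
 * bit i of 'mask' is set when coefficient i is nonzero, and
 * run_table[r] is the score for a run of r zero coefficients
 * sitting below a nonzero one. */
static int decimate_score_ctz( uint32_t mask, const uint8_t *run_table )
{
    int score = 0;
    while( mask )
    {
        int run = __builtin_ctz( mask ); /* zeros below the next nonzero coeff */
        score += run_table[run];
        mask >>= run;   /* two shifts so we never shift by the full width */
        mask >>= 1;     /* drop the nonzero coefficient itself */
    }
    return score;
}

The _slowctz variants instead index a precomputed mask_table with the low byte of the mask, avoiding bsf at the cost of keeping that table warm; CPUs with fast bsr/bsf can use the loop form and skip the table entirely, which is the cache saving the commit message refers to.
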
@@ -627,14 +636,20 @@ cglobal decimate_score%1_%2, 1,3
%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext
DECIMATE4x4 16, mmxext
DECIMATE4x4 15, mmxext, 0
DECIMATE4x4 16, mmxext, 0
DECIMATE4x4 15, mmxext_slowctz, 1
DECIMATE4x4 16, mmxext_slowctz, 1
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2
DECIMATE4x4 15, ssse3
DECIMATE4x4 16, sse2
DECIMATE4x4 16, ssse3
DECIMATE4x4 15, sse2, 0
DECIMATE4x4 16, sse2, 0
DECIMATE4x4 15, sse2_slowctz, 1
DECIMATE4x4 16, sse2_slowctz, 1
DECIMATE4x4 15, ssse3, 0
DECIMATE4x4 16, ssse3, 0
DECIMATE4x4 15, ssse3_slowctz, 1
DECIMATE4x4 16, ssse3_slowctz, 1
%macro DECIMATE8x8 1
@@ -57,6 +57,12 @@ int x264_decimate_score15_ssse3 ( int16_t *dct );
int x264_decimate_score16_mmxext( int16_t *dct );
int x264_decimate_score16_sse2 ( int16_t *dct );
int x264_decimate_score16_ssse3 ( int16_t *dct );
int x264_decimate_score15_mmxext_slowctz( int16_t *dct );
int x264_decimate_score15_sse2_slowctz ( int16_t *dct );
int x264_decimate_score15_ssse3_slowctz ( int16_t *dct );
int x264_decimate_score16_mmxext_slowctz( int16_t *dct );
int x264_decimate_score16_sse2_slowctz ( int16_t *dct );
int x264_decimate_score16_ssse3_slowctz ( int16_t *dct );
int x264_decimate_score64_mmxext( int16_t *dct );
int x264_decimate_score64_sse2 ( int16_t *dct );
int x264_decimate_score64_ssse3 ( int16_t *dct );
@@ -993,10 +993,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
/* calculate dct coeffs */
for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
{
/* We don't need to zero the DC coefficient before quantization because we already
* checked that all the DCs were zero above at twice the precision that quant4x4
* uses. This applies even though the DC here is being quantized before the 2x2
* transform. */
dct4x4[i4x4][0] = 0;
if( !h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] ) )
continue;
h->zigzagf.scan_4x4( dctscan, dct4x4[i4x4] );
@@ -173,7 +173,9 @@ static void print_bench(void)
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
b->cpu&X264_CPU_SLOW_ATOM ? "_slow_atom" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -1700,6 +1702,8 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
@@ -1708,6 +1712,10 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSE2 SlowAtom" );
cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
@@ -1730,6 +1738,10 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
cpu1 &= ~X264_CPU_SLOW_ATOM;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
{
@@ -66,6 +66,8 @@ typedef struct x264_t x264_t;
#define X264_CPU_ARMV6 0x020000
#define X264_CPU_NEON 0x040000 /* ARM NEON */
#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
#define X264_CPU_SLOW_CTZ 0x100000 /* BSR/BSF x86 instructions are really slow on some CPUs */
#define X264_CPU_SLOW_ATOM 0x200000 /* The Atom just sucks */
/* Analyse flags
*/