Commit 5d60b9c9 authored by Fiona Glaser

x86: detect Bobcat, improve Atom optimizations, reorganize flags

The Bobcat has a 64-bit SIMD unit reminiscent of the Athlon 64; detect this
and apply the appropriate flags.
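
For reference, the Bobcat check boils down to reading the extended CPUID signature and computing the family number the same way the detection code in the diff below does (base family plus extended family; Bobcat reports family 0x14). A minimal standalone sketch, assuming GCC/Clang's <cpuid.h> rather than x264's own cpuid wrapper:

    #include <cpuid.h>
    #include <stdio.h>

    /* Illustrative sketch, not x264 code. */
    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if( !__get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
            return 1; /* extended leaf not supported */
        unsigned int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
        printf( "family %#x%s\n", family, family == 0x14 ? " (Bobcat)" : "" );
        return 0;
    }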

It also has an extremely slow palignr instruction; create a flag for this to
avoid massive penalties on palignr-heavy functions.

Improve Atom function selection and document exactly what the SLOW_ATOM flag
covers.

Add Atom-optimized SATD/SA8D/hadamard_ac functions: combine the ssse3
optimizations with the sse2 algorithm to avoid pmaddubsw, which, like other
SIMD multiplies, is slow on Atom.
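
For context, SATD here is the sum of absolute values of the Hadamard-transformed difference block; the pmaddubsw-based ssse3 trick only changes how the differences are formed and packed, not the result, so dropping it keeps the output identical. A plain-C 4x4 reference, as an illustrative sketch rather than the functions added by this commit:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar 4x4 SATD sketch: horizontal then vertical 4-point Hadamard on
     * the pixel differences, sum of absolute values, halved as in x264. */
    static int satd_4x4_ref( const uint8_t *pix1, intptr_t i_pix1,
                             const uint8_t *pix2, intptr_t i_pix2 )
    {
        int tmp[4][4], sum = 0;
        for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
        {
            int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
            int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
            int b0 = a0 + a1, b1 = a0 - a1, b2 = a2 + a3, b3 = a2 - a3;
            tmp[y][0] = b0 + b2; tmp[y][1] = b1 + b3;
            tmp[y][2] = b0 - b2; tmp[y][3] = b1 - b3;
        }
        for( int x = 0; x < 4; x++ )
        {
            int a0 = tmp[0][x], a1 = tmp[1][x], a2 = tmp[2][x], a3 = tmp[3][x];
            int b0 = a0 + a1, b1 = a0 - a1, b2 = a2 + a3, b3 = a2 - a3;
            sum += abs(b0 + b2) + abs(b1 + b3) + abs(b0 - b2) + abs(b1 - b3);
        }
        return sum >> 1;
    }

Keeping the transform in the sse2 style sidesteps the multiply while the rest of the ssse3 improvements still apply.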

Drop TBM detection; it'll probably never be useful for x264.

Invert FastShuffle to SlowShuffle; it only ever applied to one CPU (Conroe).

Detect CMOV, to fail more gracefully when run on a chip with MMX2 but no CMOV.
parent 75d92705
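
The CMOV check reads the standard CPUID leaf-1 EDX feature bits (15 = CMOV, 23 = MMX, 25 = SSE, 26 = SSE2). A small sketch of the graceful-degradation order described above, illustrative only and again assuming GCC/Clang's <cpuid.h>:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return 1;
        if( !(edx & (1u<<23)) )       /* no MMX: no capability flags at all */
            printf( "no MMX\n" );
        else if( !(edx & (1u<<15)) )  /* MMX but no CMOV: stop before MMX2/SSE,
                                       * since the asm requires CMOV */
            printf( "MMX only, no CMOV\n" );
        else
            printf( "CMOV present, MMX2/SSE detection can proceed\n" );
        return 0;
    }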
@@ -622,10 +622,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
b_error = 1;
}
free( buf );
if( p->cpu & X264_CPU_SSSE3 )
if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) )
p->cpu |= X264_CPU_SSE2_IS_FAST;
if( p->cpu & X264_CPU_SSE4 )
p->cpu |= X264_CPU_SHUFFLE_IS_FAST;
}
}
OPT("threads")
@@ -47,18 +47,19 @@
const x264_cpu_name_t x264_cpu_names[] =
{
{"Altivec", X264_CPU_ALTIVEC},
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
{"MMX2", X264_CPU_MMX|X264_CPU_MMX2},
{"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2},
{"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE},
#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
#if HAVE_MMX
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
#define SSE2 MMX2|X264_CPU_SSE|X264_CPU_SSE2
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", SSE2|X264_CPU_SSE3},
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
@@ -70,19 +71,26 @@ const x264_cpu_name_t x264_cpu_names[] =
{"FMA3", AVX|X264_CPU_FMA3},
#undef AVX
#undef SSE2
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
{"TBM", X264_CPU_TBM},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
{"SlowShuffle", X264_CPU_SLOW_SHUFFLE},
{"UnalignedStack", X264_CPU_STACK_MOD4},
#elif ARCH_PPC
{"Altivec", X264_CPU_ALTIVEC},
#elif ARCH_ARM
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"FastNeonMRC", X264_CPU_FAST_NEON_MRC},
#endif
{"", 0},
};
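
The table above pairs a printable name with the full set of flags that name implies. A rough sketch of how such a table can be walked to build the capability string that gets logged, using hypothetical stand-in flag values rather than the real X264_CPU_* bits or logging code:

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical flag bits, for illustration only. */
    #define DEMO_MMX2  0x1u
    #define DEMO_SSE2  0x2u
    #define DEMO_SSSE3 0x4u

    typedef struct { const char *name; uint32_t flags; } demo_cpu_name_t;

    static const demo_cpu_name_t demo_names[] =
    {
        {"MMX2",  DEMO_MMX2},
        {"SSE2",  DEMO_MMX2|DEMO_SSE2},
        {"SSSE3", DEMO_MMX2|DEMO_SSE2|DEMO_SSSE3},
        {"", 0},
    };

    int main(void)
    {
        uint32_t cpu = DEMO_MMX2|DEMO_SSE2;      /* pretend detection result */
        for( int i = 0; demo_names[i].flags; i++ )
            if( (cpu & demo_names[i].flags) == demo_names[i].flags )
                printf( "%s ", demo_names[i].name );
        printf( "\n" );                          /* prints: MMX2 SSE2 */
        return 0;
    }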
@@ -131,9 +139,13 @@ uint32_t x264_cpu_detect( void )
if( edx&0x00800000 )
cpu |= X264_CPU_MMX;
else
return 0;
return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
if( edx&0x00008000 )
cpu |= X264_CPU_CMOV;
else
return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
@@ -170,46 +182,50 @@ uint32_t x264_cpu_detect( void )
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
if( cpu & X264_CPU_SSE4 )
cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
if( !strcmp((char*)vendor, "AuthenticAMD") && max_extended_cap >= 0x80000001 )
if( max_extended_cap >= 0x80000001 )
{
cpu |= X264_CPU_SLOW_CTZ;
x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
if( cpu & X264_CPU_SSE2 )
if( ecx&0x00000020 )
cpu |= X264_CPU_LZCNT; /* Supported by Intel chips starting with Haswell */
if( ecx&0x00000040 ) /* SSE4a, AMD only */
{
if( ecx&0x00000040 ) /* SSE4a */
int family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
cpu |= X264_CPU_SSE2_IS_FAST; /* Phenom and later CPUs have fast SSE units */
if( family == 0x14 )
{
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_LZCNT;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
cpu &= ~X264_CPU_SLOW_CTZ;
cpu &= ~X264_CPU_SSE2_IS_FAST; /* SSSE3 doesn't imply fast SSE anymore... */
cpu |= X264_CPU_SSE2_IS_SLOW; /* Bobcat has 64-bit SIMD units */
cpu |= X264_CPU_SLOW_PALIGNR; /* palignr is insanely slow on Bobcat */
}
else
cpu |= X264_CPU_SSE2_IS_SLOW;
}
if( ecx&0x00000080 ) /* Misalign SSE */
{
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
if( ecx&0x00000080 ) /* Misalign SSE */
{
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
if( cpu & X264_CPU_AVX )
{
if( ecx&0x00000800 ) /* XOP */
cpu |= X264_CPU_XOP;
if( ecx&0x00010000 ) /* FMA4 */
cpu |= X264_CPU_FMA4;
}
if( cpu & X264_CPU_AVX )
{
if( ecx&0x00000800 ) /* XOP */
cpu |= X264_CPU_XOP;
if( ecx&0x00010000 ) /* FMA4 */
cpu |= X264_CPU_FMA4;
}
if( ecx&0x00200000 )
cpu |= X264_CPU_TBM;
if( !strcmp((char*)vendor, "AuthenticAMD") )
{
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
if( !(cpu&X264_CPU_LZCNT) )
cpu |= X264_CPU_SLOW_CTZ;
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
}
@@ -233,11 +249,12 @@ uint32_t x264_cpu_detect( void )
{
cpu |= X264_CPU_SLOW_ATOM;
cpu |= X264_CPU_SLOW_CTZ;
cpu |= X264_CPU_SLOW_PSHUFB;
}
/* Some Penryns and Nehalems are pointlessly crippled (SSE4 disabled), so
* detect them here. */
else if( model >= 23 )
cpu |= X264_CPU_SHUFFLE_IS_FAST;
/* Conroe has a slow shuffle unit. Check the model number to make sure not
* to include crippled low-end Penryns and Nehalems that don't have SSE4. */
else if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE4) && model < 23 )
cpu |= X264_CPU_SLOW_SHUFFLE;
}
}
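
The Atom and Conroe branches above key off the Intel model number (28 is Atom/Bonnell; models below 23 with SSSE3 but no SSE4 are treated as Conroe-era). The exact line that computes it is not shown in this hunk, but a minimal sketch of the standard leaf-1 derivation (model field plus the extended-model field shifted into the high nibble) looks like this:

    #include <cpuid.h>
    #include <stdio.h>

    /* Illustrative sketch of the standard CPUID model encoding. */
    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;
        if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return 1;
        unsigned int model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
        printf( "model %u\n", model );
        return 0;
    }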
@@ -640,23 +640,32 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
}
}
if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
if( !(cpu&X264_CPU_SLOW_ATOM) )
{
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
{
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
}
}
if( cpu&X264_CPU_SSE4 )
@@ -951,7 +960,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
if( cpu&X264_CPU_AVX )
@@ -962,8 +971,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
}
if( cpu&X264_CPU_XOP )
{
@@ -1005,7 +1013,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
}
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
@@ -73,7 +73,11 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_stride, i_width, i_lines, luma_plane_count;
int i_padv = PADV << PARAM_INTERLACED;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
#if ARCH_PPC
int disalign = 1<<9;
#else
int disalign = 1<<10;
#endif
CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
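
The align/disalign pair above controls buffer placement: align is the SIMD/cacheline alignment, while disalign (512 bytes on PPC/AltiVec, 1024 elsewhere) names a larger power of two whose exact multiples are worth avoiding so that rows and planes don't alias into the same cache sets. A hedged sketch of that general technique, not the actual x264_frame_new() allocation code:

    #include <stddef.h>

    /* Illustrative only: round a stride up for SIMD alignment, then nudge it
     * off an exact multiple of "disalign" to break cache-set aliasing. */
    static size_t pick_stride( size_t width, size_t align, size_t disalign )
    {
        size_t stride = (width + align - 1) & ~(align - 1);
        if( !(stride & (disalign - 1)) )  /* exact multiple of 512/1024? */
            stride += align;              /* stays aligned, breaks the pattern */
        return stride;
    }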
@@ -500,6 +500,7 @@ SATD_X_DECL7( _mmx2 )
#if !HIGH_BIT_DEPTH
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL6( _ssse3_atom )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
SATD_X_DECL7( _xop )
@@ -1024,14 +1025,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( sad_x3, _cache32_mmx2 );
INIT4( sad_x4, _cache32_mmx2 );
}
else if( cpu&X264_CPU_CACHELINE_64 )
else if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
{
INIT5( sad, _cache64_mmx2 );
INIT4( sad_x3, _cache64_mmx2 );
INIT4( sad_x4, _cache64_mmx2 );
}
#else
if( cpu&X264_CPU_CACHELINE_64 )
if( cpu&X264_CPU_CACHELINE_64 && !(cpu&X264_CPU_SLOW_ATOM) )
{
pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2;
@@ -1146,7 +1147,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#endif
}
INIT_ADS( _ssse3 );
if( !(cpu&X264_CPU_SLOW_ATOM) )
if( cpu&X264_CPU_SLOW_ATOM )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3_atom;
INIT6( satd, _ssse3_atom );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom;
INIT6( satd_x3, _ssse3_atom );
INIT6( satd_x4, _ssse3_atom );
INIT4( hadamard_ac, _ssse3_atom );
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
#endif
}
else
{
INIT8( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
@@ -1154,25 +1168,26 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
#endif
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
pixf->asd8 = x264_pixel_asd8_ssse3;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
#endif
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
@@ -214,6 +214,7 @@ PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3_atom, mmx2, mmx2, cache64_ssse3, cache64_ssse3, sse2)
#endif // HIGH_BIT_DEPTH
#define MC_COPY_WTAB(instr, name1, name2, name3)\
@@ -365,6 +366,7 @@ MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse)
MC_LUMA(cache64_ssse3_atom,cache64_ssse3_atom,sse)
#endif // !HIGH_BIT_DEPTH
#define GET_REF(name)\
@@ -408,6 +410,7 @@ GET_REF(cache64_mmx2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
GET_REF(cache64_ssse3_atom)
#endif // !HIGH_BIT_DEPTH
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
@@ -606,7 +609,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_AVX) )
@@ -649,48 +652,48 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
pf->weight = x264_mc_weight_wtab_sse2;
if( !(cpu&X264_CPU_SLOW_ATOM) )
{
pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
}
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_SSE_MISALIGN )
pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->mc_luma = mc_luma_sse2;
pf->get_ref = get_ref_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
pf->weight = x264_mc_weight_wtab_sse2;
if( !(cpu&X264_CPU_SLOW_ATOM) )
{
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
}
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_SSE_MISALIGN )
pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
{
pf->get_ref = get_ref_sse2_misalign;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2_misalign;
pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->mc_luma = mc_luma_sse2;
pf->get_ref = get_ref_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
{
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
if( cpu&X264_CPU_SSE_MISALIGN )
{
pf->get_ref = get_ref_sse2_misalign;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_sse2_misalign;
}
}
}
@@ -707,12 +710,21 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
{
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
}
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
if( !(cpu&X264_CPU_SLOW_PALIGNR) )
{
#if ARCH_X86_64
if( !(cpu&X264_CPU_SLOW_ATOM) ) /* The 64-bit version is slower, but the 32-bit version is faster? */
#endif
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
}
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_ssse3;
@@ -722,13 +734,17 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
pf->mc_luma = mc_luma_cache64_ssse3;
pf->get_ref = get_ref_cache64_ssse3;
/* ssse3 weight is slower on Nehalem, so only assign here. */
pf->weight_cache = x264_weight_cache_ssse3;
pf->weight = x264_mc_weight_wtab_ssse3;
if( cpu&X264_CPU_SLOW_ATOM )
{
pf->mc_luma = mc_luma_cache64_ssse3_atom;
pf->get_ref = get_ref_cache64_ssse3_atom;
}
}
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->weight_cache = x264_weight_cache_ssse3;
pf->weight = x264_mc_weight_wtab_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
if( !(cpu&X264_CPU_SSE4) )
@@ -744,9 +760,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = x264_integral_init8h_avx;
pf->hpel_filter = x264_hpel_filter_avx;
/* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
pf->weight_cache = x264_weight_cache_ssse3;
pf->weight = x264_mc_weight_wtab_ssse3;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
@@ -961,7 +961,7 @@ VAR2_8x8_SSSE3 16, 7
%if cpuflag(sse4)
; just use shufps on anything post conroe
shufps %1, %2, 0
%elif cpuflag(ssse3)
%elif cpuflag(ssse3) && notcpuflag(atom)
; join 2x 32 bit and duplicate them
; emulating shufps is faster on conroe
punpcklqdq %1, %2
@@ -1079,6 +1079,7 @@ VAR2_8x8_SSSE3 16, 7
SWAP %%n, 4
%endmacro
; in: %1 = horizontal if 0, vertical if 1
%macro SATD_8x4_SSE 8-9
%if %1
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
@@ -1253,7 +1254,7 @@ cglobal pixel_satd_4x4, 4,6
FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH && %3
pxor %2, %2
%elif cpuflag(ssse3)
%elif cpuflag(ssse3) && notcpuflag(atom)
mova %2, [hmul_8p]
%endif
lea r4, [3*r1]
@@ -1307,7 +1308,7 @@ cglobal pixel_satd_4x4, 4,6
%endif
%endmacro
%macro SATD_4x8_SSE 2
%macro SATD_4x8_SSE 3
%if HIGH_BIT_DEPTH
movh m0, [r0+0*r1]
movh m4, [r2+0*r3]
@@ -1348,7 +1349,7 @@ cglobal pixel_satd_4x4, 4,6
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
%if cpuflag(ssse3) && %1==1
%if %1==0 && %2==1
mova m3, [hmul_4p]
DIFFOP 0, 4, 1, 5, 3
%else
@@ -1366,21 +1367,23 @@ cglobal pixel_satd_4x4, 4,6
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
%if cpuflag(ssse3) && %1==1
%if %1==0 && %2==1
mova m4, [hmul_4p]
DIFFOP 2, 6, 3, 5, 4
%else
DIFFOP 2, 6, 3, 5, 7
%endif
%endif ; HIGH_BIT_DEPTH
SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 7, %2
SATD_8x4_SSE %1, 0, 1, 2, 3, 4, 5, 7, %3
%endmacro
;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%if cpuflag(ssse3)
%define vertical ((notcpuflag(ssse3) || cpuflag(atom)) || HIGH_BIT_DEPTH)
%if vertical==0 || HIGH_BIT_DEPTH
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
@@ -1399,33 +1402,33 @@ cglobal pixel_satd_4x4, 4, 6, 6
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
mova m7, [hmul_4p]
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE 0, swap
HADDW m7, m1
movd eax, m7
SATD_4x8_SSE vertical, 0, swap
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_4x16, 4, 6, 8
SATD_START_MMX
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
%if vertical==0
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE 0, swap