Commit d25d50c9 authored by Fiona Glaser

SSE2 zigzag_interleave

Replace PHADD with FastShuffle (more accurate naming).
This flag represents asm functions that rely on fast SSE2 shuffle units, and thus are only faster on Phenom, Nehalem, and Penryn CPUs.
parent acd4b264
......@@ -53,7 +53,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"PHADD", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_PHADD_IS_FAST},
{"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
{"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
......@@ -107,7 +107,7 @@ uint32_t x264_cpu_detect( void )
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
if( cpu & X264_CPU_SSE4 )
cpu |= X264_CPU_PHADD_IS_FAST;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
......@@ -124,6 +124,7 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
cpu |= X264_CPU_LZCNT;
cpu |= X264_CPU_SHUFFLE_IS_FAST;
x264_cpu_mask_misalign_sse();
}
else
......
......@@ -663,9 +663,9 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
if( cpu&X264_CPU_PHADD_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif
#ifdef ARCH_PPC
......@@ -678,5 +678,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
#endif
}
......@@ -763,7 +763,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
if( !(cpu&X264_CPU_PHADD_IS_FAST) )
if( !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
......
......@@ -35,7 +35,7 @@ pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
pb_1: times 8 db 1
pb_1: times 16 db 1
SECTION .text
......@@ -785,3 +785,50 @@ cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 3,3
shr r0d, 16
mov [r2+8], r0w
RET
; INTERLEAVE_XMM offset
; Loads four 16-byte vectors from [r1 + offset*4], applies two rounds of
; word-granularity SBUTTERFLY to each register pair, and writes the result to
; r0 as eight 8-byte stores whose destinations are spaced 32 bytes apart.
; It also keeps a running bitwise OR of everything stored in m2/m3.
;   %1: byte offset added to the destination (r0); the source (r1) uses %1*4.
; NOTE(review): SBUTTERFLY and SWAP are defined outside this excerpt --
; presumably x264's word-interleave and register-renaming helper macros; confirm.
%macro INTERLEAVE_XMM 1
; Load 64 contiguous source bytes (aligned loads, assuming mova maps to an
; aligned move under INIT_XMM -- TODO confirm).
mova m0, [r1+%1*4+ 0]
mova m1, [r1+%1*4+16]
mova m4, [r1+%1*4+32]
mova m5, [r1+%1*4+48]
; Two interleave passes per pair; m6/m7 are passed as the scratch registers.
SBUTTERFLY wd, 0, 1, 6
SBUTTERFLY wd, 4, 5, 7
SBUTTERFLY wd, 0, 1, 6
SBUTTERFLY wd, 4, 5, 7
; Scatter: the low and high 8 bytes of each register go to destinations
; 32 bytes apart. m0/m1 fill bytes 0-7 of each 32-byte slot, m4/m5 bytes 8-15.
movq [r0+%1+ 0], m0
movhps [r0+%1+ 32], m0
movq [r0+%1+ 64], m1
movhps [r0+%1+ 96], m1
movq [r0+%1+ 8], m4
movhps [r0+%1+ 40], m4
movq [r0+%1+ 72], m5
movhps [r0+%1+104], m5
%if %1
; Non-zero offset: fold this invocation's data into the existing accumulators.
; m2 collects what was stored at +0/+32 (m0, m4), m3 what went to +64/+96 (m1, m5).
por m2, m0
por m3, m1
por m2, m4
por m3, m5
%else
; Offset 0: no accumulator exists yet, so m0/m1 are taken over as m2/m3
; (via SWAP, presumably a rename rather than a copy) before OR-ing in m4/m5.
SWAP 0,2
SWAP 3,1
por m2, m4
por m3, m5
%endif
%endmacro
; x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz )
; SSE2 version of the 8x8 CAVLC zigzag interleave: rearranges 128 bytes of
; coefficients from src into dst (see INTERLEAVE_XMM) and writes four 0/1
; "has non-zero data" flag bytes to nnz[0], nnz[1], nnz[8] and nnz[9].
; NOTE(review): assumes the usual x86inc argument mapping r0=dst, r1=src,
; r2=nnz, and that "3,3,8" declares 3 args, 3 GPRs and 8 XMM registers --
; cglobal is defined outside this excerpt; confirm.
INIT_XMM
cglobal x264_zigzag_interleave_8x8_cavlc_sse2, 3,3,8
; Two passes cover the whole block: destination byte offsets 0 and 16,
; i.e. source byte offsets 0 and 64.
INTERLEAVE_XMM 0
INTERLEAVE_XMM 16
; m2/m3 now hold the OR of all stored data. Three saturating packs reduce
; each 8-byte half of m2 and m3 to one byte; signed saturation never turns a
; non-zero value into zero, so each byte is non-zero iff its half was.
packsswb m2, m3
pxor m5, m5
packsswb m2, m2
packsswb m2, m2
; Compare against zero (0xFF where the byte is zero), then add 1 per byte:
; 0xFF+1 wraps to 0 and 0+1 gives 1, yielding a 0/1 flag per byte.
pcmpeqb m5, m2
paddb m5, [pb_1 GLOBAL]
; The low dword holds the four flags: bytes 0-1 go to nnz+0, bytes 2-3 to nnz+8.
movd r0d, m5
mov [r2+0], r0w
shr r0d, 16
mov [r2+8], r0w
RET
......@@ -69,5 +69,6 @@ void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
......@@ -151,7 +151,7 @@ static void print_bench(void)
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
b->cpu&X264_CPU_SHUFFLE_IS_FAST ? "fastshuffle" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
/* print sse2slow only if there's also a sse2fast version of the same func */
......@@ -1364,10 +1364,10 @@ static int check_intra( int cpu_ref, int cpu_new )
for( i = 0; i < 12; i++ )
INTRA_TEST( predict_8x8, i, 8, edge );
used_asm = 1;
set_func_name("intra_predict_8x8_filter");
if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
{
used_asm = 1;
for( i = 0; i < 32; i++ )
{
memcpy( edge2, edge, 33 );
......@@ -1463,6 +1463,8 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2 | X264_CPU_SSE2_IS_SLOW, "SSE2Slow" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSE2 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
......@@ -1483,7 +1485,8 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SHUFFLE_IS_FAST, "SSSE3 FastShuffle" );
cpu1 &= ~X264_CPU_SHUFFLE_IS_FAST;
}
if( x264_cpu_detect() & X264_CPU_SSE4 )
{
......
......@@ -57,7 +57,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE2_IS_FAST 0x000100 /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SSE3 0x000200
#define X264_CPU_SSSE3 0x000400
#define X264_CPU_PHADD_IS_FAST 0x000800 /* pre-Penryn Core2 have a uselessly slow PHADD instruction */
#define X264_CPU_SHUFFLE_IS_FAST 0x000800 /* Penryn, Nehalem, and Phenom have fast shuffle units */
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment