Commit 69e69197 authored by Fiona Glaser

Faster width4 SSD+SATD, SSE4 optimizations

Do satd 4x8 by transposing the two blocks' positions and running satd 8x4.
Use pinsrd (SSE4) for faster width4 SSD.
Globally replace movlhps with punpcklqdq (it seems to be faster on Conroe).
Move the mask_misalign declaration to cpu.h to avoid a warning in encoder.c.
These optimizations help on Nehalem, Phenom, and Penryn CPUs.
parent e76caf36
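The movlhps→punpcklqdq swap is behavior-preserving for every use in this patch: both instructions keep the destination's low 64 bits and copy the source's low 64 bits into the high half. They differ only in execution domain (floating-point vs integer), which is why their relative speed varies by microarchitecture; per the message above, the integer form seems faster on Conroe. A minimal intrinsics sketch of the pattern (the helper name is illustrative, not from the patch):

    #include <emmintrin.h> /* SSE2 */

    /* punpcklqdq a, a: duplicate the low qword across the register.
     * _mm_movelh_ps on the same bits is the movlhps spelling of this. */
    static __m128i splat_low_qword( __m128i a )
    {
        return _mm_unpacklo_epi64( a, a );
    }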
@@ -60,7 +60,6 @@ const x264_cpu_name_t x264_cpu_names[] = {
 #ifdef HAVE_MMX
 extern int x264_cpu_cpuid_test( void );
-extern void x264_cpu_mask_misalign_sse( void );
 extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
 
 uint32_t x264_cpu_detect( void )
......
@@ -24,6 +24,7 @@
 uint32_t x264_cpu_detect( void );
 int  x264_cpu_num_processors( void );
 void x264_emms( void );
+void x264_cpu_mask_misalign_sse( void );
 
 /* kluge:
  * gcc can't give variables any greater alignment than the stack frame has.
......
@@ -414,23 +414,23 @@ static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_
     scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
     scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
 }
-#define SATD_X_DECL5( cpu )\
+#define SATD_X_DECL6( cpu )\
 SATD_X( 16x16, cpu )\
 SATD_X( 16x8, cpu )\
 SATD_X( 8x16, cpu )\
 SATD_X( 8x8, cpu )\
-SATD_X( 8x4, cpu )
+SATD_X( 8x4, cpu )\
+SATD_X( 4x8, cpu )
 #define SATD_X_DECL7( cpu )\
-SATD_X_DECL5( cpu )\
-SATD_X( 4x8, cpu )\
+SATD_X_DECL6( cpu )\
 SATD_X( 4x4, cpu )
 
 SATD_X_DECL7()
 #ifdef HAVE_MMX
 SATD_X_DECL7( _mmxext )
-SATD_X_DECL5( _sse2 )
+SATD_X_DECL6( _sse2 )
 SATD_X_DECL7( _ssse3 )
-SATD_X_DECL5( _ssse3_phadd )
+SATD_X_DECL6( _ssse3_phadd )
 #endif
 
 /****************************************************************************
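For context: SATD_X generates the satd_x3/satd_x4 multi-candidate wrappers, so renaming SATD_X_DECL5 to SATD_X_DECL6 reflects that 4x8 has joined the set of sizes declared for every CPU type with a native satd_4x8. Written out by hand, one generated wrapper looks roughly like this (a hypothetical expansion of SATD_X( 4x8, _sse2 ), following the macro body above):

    static void x264_pixel_satd_x4_4x8_sse2( uint8_t *fenc, uint8_t *pix0,
                                             uint8_t *pix1, uint8_t *pix2,
                                             uint8_t *pix3, int i_stride,
                                             int scores[4] )
    {
        scores[0] = x264_pixel_satd_4x8_sse2( fenc, FENC_STRIDE, pix0, i_stride );
        scores[1] = x264_pixel_satd_4x8_sse2( fenc, FENC_STRIDE, pix1, i_stride );
        scores[2] = x264_pixel_satd_4x8_sse2( fenc, FENC_STRIDE, pix2, i_stride );
        scores[3] = x264_pixel_satd_4x8_sse2( fenc, FENC_STRIDE, pix3, i_stride );
    }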
@@ -582,13 +582,16 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #define INIT5_NAME( name1, name2, cpu ) \
     INIT4_NAME( name1, name2, cpu ) \
     pixf->name1[PIXEL_8x4] = x264_pixel_##name2##_8x4##cpu;
-#define INIT7_NAME( name1, name2, cpu ) \
+#define INIT6_NAME( name1, name2, cpu ) \
     INIT5_NAME( name1, name2, cpu ) \
-    pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;\
+    pixf->name1[PIXEL_4x8] = x264_pixel_##name2##_4x8##cpu;
+#define INIT7_NAME( name1, name2, cpu ) \
+    INIT6_NAME( name1, name2, cpu ) \
     pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu;
 #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
 #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
 #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
+#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
 #define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
 
 #define INIT_ADS( cpu ) \
@@ -698,9 +701,18 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_SSE2 )
     {
         INIT5( ssd, _sse2 );
-        INIT5( satd, _sse2 );
-        INIT5( satd_x3, _sse2 );
-        INIT5( satd_x4, _sse2 );
+        if( cpu&X264_CPU_SSE2_IS_FAST )
+        {
+            INIT6( satd, _sse2 );
+            INIT6( satd_x3, _sse2 );
+            INIT6( satd_x4, _sse2 );
+        }
+        else
+        {
+            INIT5( satd, _sse2 );
+            INIT5( satd_x3, _sse2 );
+            INIT5( satd_x4, _sse2 );
+        }
         INIT2_NAME( sad_aligned, sad, _sse2_aligned );
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
         pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
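The INIT5/INIT6 split keeps the new xmm satd_4x8 off CPUs where SSE2 is slow; those fall back to the mmxext version registered earlier. Callers never see the choice, since everything goes through the function table. A usage sketch (buffer arguments are placeholders, and the encoder performs this init once at startup rather than per call):

    #include "common/common.h" /* x264-internal header, assumed */

    static int satd_4x8_cost( uint8_t *fenc, uint8_t *fref, int i_stride )
    {
        x264_pixel_function_t pixf;
        x264_pixel_init( x264_cpu_detect(), &pixf );
        return pixf.satd[PIXEL_4x8]( fenc, FENC_STRIDE, fref, i_stride );
    }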
@@ -758,11 +770,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
         if( cpu&X264_CPU_PHADD_IS_FAST )
         {
-            INIT5( satd, _ssse3_phadd );
-            INIT5( satd_x3, _ssse3_phadd );
-            INIT5( satd_x4, _ssse3_phadd );
+            INIT6( satd, _ssse3_phadd );
+            INIT6( satd_x3, _ssse3_phadd );
+            INIT6( satd_x4, _ssse3_phadd );
         }
     }
+
+    if( cpu&X264_CPU_SSE4 )
+    {
+        pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sse4;
+        pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sse4;
+    }
 #endif //HAVE_MMX
 
 #ifdef ARCH_PPC
......
@@ -295,7 +295,7 @@ cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
     movdq2q mm7, xmm7
 %else
     movhlps xmm3, xmm7
-    movlhps xmm7, xmm7
+    punpcklqdq xmm7, xmm7
     movdq2q mm7, xmm3
 %endif
@@ -531,8 +531,8 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
     punpckldq xmm2, xmm3
     punpckldq xmm4, xmm5
     punpckldq xmm6, xmm7
-    movlhps xmm0, xmm2
-    movlhps xmm4, xmm6
+    punpcklqdq xmm0, xmm2
+    punpcklqdq xmm4, xmm6
     movdqa xmm7, [pb_sub4frame GLOBAL]
     pshufb xmm0, xmm7
     pshufb xmm4, xmm7
......
@@ -200,7 +200,7 @@ SECTION .text
 %macro SPLATW 1
 %ifidn m0, xmm0
     pshuflw %1, %1, 0
-    movlhps %1, %1
+    punpcklqdq %1, %1
 %else
     pshufw %1, %1, 0
 %endif
......
@@ -77,7 +77,7 @@ SECTION .text
 %macro SPLATW 2
 %if mmsize==16
     pshuflw %1, %2, 0
-    movlhps %1, %1
+    punpcklqdq %1, %1
 %else
     pshufw %1, %2, 0
 %endif
......
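Both SPLATW variants implement the same word-broadcast: on xmm registers, pshuflw copies word 0 across the low four words, and punpcklqdq (formerly movlhps) then duplicates that qword into the high half. An equivalent intrinsics sketch (illustrative helper name, not from the patch):

    #include <emmintrin.h>

    /* Broadcast word 0 of x into all eight 16-bit lanes. */
    static __m128i splatw( __m128i x )
    {
        x = _mm_shufflelo_epi16( x, 0 );   /* pshuflw x, x, 0  */
        return _mm_unpacklo_epi64( x, x ); /* punpcklqdq x, x  */
    }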
@@ -134,12 +134,46 @@ SECTION .text
     paddd m0, m3
 %endmacro
 
+%macro SSD_QUARTER 6
+    movd m1, [r0+%1]
+    movd m2, [r2+%2]
+    movd m3, [r0+%3]
+    movd m4, [r2+%4]
+    lea  r0, [r0+2*r1]
+    lea  r2, [r2+2*r3]
+    pinsrd m1, [r0+%1], 1
+    pinsrd m2, [r2+%2], 1
+    pinsrd m3, [r0+%3], 1
+    pinsrd m4, [r2+%4], 1
+    punpcklbw m1, m7
+    punpcklbw m2, m7
+    punpcklbw m3, m7
+    punpcklbw m4, m7
+    psubw m1, m2
+    psubw m3, m4
+    pmaddwd m1, m1
+    pmaddwd m3, m3
+%if %6
+    lea r0, [r0+2*r1]
+    lea r2, [r2+2*r3]
+%endif
+%if %5
+    paddd m0, m1
+%else
+    SWAP m0, m1
+%endif
+    paddd m0, m3
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
 ;-----------------------------------------------------------------------------
 %macro SSD 3
 cglobal x264_pixel_ssd_%1x%2_%3, 4,4
+%if %1 >= mmsize
     pxor m7, m7
+%endif
 %assign i 0
 %rep %2/2
 %if %1 > mmsize
@@ -172,6 +206,19 @@ SSD 8, 16, sse2
 SSD 8, 8, sse2
 SSD 8, 4, sse2
 
+cglobal x264_pixel_ssd_4x8_sse4, 4,4
+    SSD_QUARTER 0, 0, r1, r3, 0, 1
+    SSD_QUARTER 0, 0, r1, r3, 1, 0
+    HADDD m0, m1
+    movd eax, m0
+    RET
+
+cglobal x264_pixel_ssd_4x4_sse4, 4,4
+    SSD_QUARTER 0, 0, r1, r3, 0, 0
+    HADDD m0, m1
+    movd eax, m0
+    RET
+
 ;=============================================================================
 ; variance
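SSD_QUARTER packs two 4-pixel rows per register with movd+pinsrd, widens bytes to words, subtracts, and accumulates squared differences with pmaddwd; one invocation covers four rows, so ssd_4x4 needs one call and ssd_4x8 two. Note that m7 is never zeroed on this path (the SSD macro's pxor is now skipped for widths below mmsize): punpcklbw interleaves the same garbage high bytes into both operands, and psubw cancels them exactly, leaving correct word differences. A C intrinsics sketch of the 4x4 case (a hypothetical helper, not the shipped implementation; it zeroes explicitly instead of relying on that cancellation):

    #include <smmintrin.h> /* SSE4.1: _mm_insert_epi32 == pinsrd */
    #include <stdint.h>

    static int ssd_4x4_sse4_sketch( const uint8_t *a, int ia,
                                    const uint8_t *b, int ib )
    {
        __m128i z   = _mm_setzero_si128();
        __m128i a01 = _mm_cvtsi32_si128( *(const uint32_t*)a );     /* movd   */
        __m128i b01 = _mm_cvtsi32_si128( *(const uint32_t*)b );
        a01 = _mm_insert_epi32( a01, *(const uint32_t*)(a+ia), 1 ); /* pinsrd */
        b01 = _mm_insert_epi32( b01, *(const uint32_t*)(b+ib), 1 );
        __m128i a23 = _mm_cvtsi32_si128( *(const uint32_t*)(a+2*ia) );
        __m128i b23 = _mm_cvtsi32_si128( *(const uint32_t*)(b+2*ib) );
        a23 = _mm_insert_epi32( a23, *(const uint32_t*)(a+3*ia), 1 );
        b23 = _mm_insert_epi32( b23, *(const uint32_t*)(b+3*ib), 1 );
        /* punpcklbw + psubw: 16-bit differences of rows 0/1 and 2/3 */
        __m128i d0 = _mm_sub_epi16( _mm_unpacklo_epi8( a01, z ),
                                    _mm_unpacklo_epi8( b01, z ) );
        __m128i d1 = _mm_sub_epi16( _mm_unpacklo_epi8( a23, z ),
                                    _mm_unpacklo_epi8( b23, z ) );
        /* pmaddwd squares and pairwise-adds; then a horizontal add (HADDD) */
        __m128i s = _mm_add_epi32( _mm_madd_epi16( d0, d0 ),
                                   _mm_madd_epi16( d1, d1 ) );
        s = _mm_add_epi32( s, _mm_srli_si128( s, 8 ) );
        s = _mm_add_epi32( s, _mm_srli_si128( s, 4 ) );
        return _mm_cvtsi128_si32( s );
    }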
@@ -339,14 +386,9 @@ cglobal x264_pixel_var_8x8_sse2, 2,3
     HADAMARD4x4_SUM %1
 %endmacro
 
-%macro SATD_8x4_SSE2 2
-    LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
-%if %1
-    lea r0, [r0+4*r1]
-    lea r2, [r2+4*r3]
-%endif
+%macro SATD_8x4_SSE2 1
     HADAMARD4_1D m0, m1, m2, m3
-%ifidn %2, ssse3_phadd
+%ifidn %1, ssse3_phadd
     HADAMARD4_ROW_PHADD 0, 1, 2, 3, 4
 %else
     TRANSPOSE2x4x4W 0, 1, 2, 3, 4
@@ -447,15 +489,15 @@ cglobal x264_pixel_satd_8x4_mmxext, 4,6
     call x264_pixel_satd_8x4_internal_mmxext
     SATD_END_MMX
 
-%macro SATD_W4 1
-INIT_MMX
-cglobal x264_pixel_satd_4x8_%1, 4,6
+cglobal x264_pixel_satd_4x8_mmxext, 4,6
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 1
     SATD_4x4_MMX m1, 0, 0
     paddw m0, m1
     SATD_END_MMX
 
+%macro SATD_W4 1
+INIT_MMX
 cglobal x264_pixel_satd_4x4_%1, 4,6
     SATD_START_MMX
     SATD_4x4_MMX m0, 0, 0
@@ -502,9 +544,15 @@ SATD_W4 mmxext
 %macro SATDS_SSE2 1
 INIT_XMM
 cglobal x264_pixel_satd_8x8_internal_%1
-    SATD_8x4_SSE2 1, %1
+    LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
+    SATD_8x4_SSE2 %1
+    lea r0, [r0+4*r1]
+    lea r2, [r2+4*r3]
 x264_pixel_satd_8x4_internal_%1:
-    SATD_8x4_SSE2 0, %1
+    LOAD_DIFF_8x4P m0, m1, m2, m3, m4, m5
+x264_pixel_satd_4x8_internal_%1:
+    SAVE_MM_PERMUTATION satd_4x8_internal
+    SATD_8x4_SSE2 %1
     ret
 
 cglobal x264_pixel_satd_16x16_%1, 4,6
@@ -547,6 +595,29 @@ cglobal x264_pixel_satd_8x4_%1, 4,6
     call x264_pixel_satd_8x4_internal_%1
     SATD_END_SSE2
 
+cglobal x264_pixel_satd_4x8_%1, 4,6
+INIT_XMM
+LOAD_MM_PERMUTATION satd_4x8_internal
+%define movh movd
+    SATD_START_SSE2
+    LOAD_DIFF m0, m7, m6, [r0], [r2]
+    LOAD_DIFF m1, m7, m6, [r0+r1], [r2+r3]
+    LOAD_DIFF m2, m7, m6, [r0+2*r1], [r2+2*r3]
+    LOAD_DIFF m3, m7, m6, [r0+r4], [r2+r5]
+    lea r0, [r0+4*r1]
+    lea r2, [r2+4*r3]
+    LOAD_DIFF m4, m7, m6, [r0], [r2]
+    LOAD_DIFF m5, m7, m6, [r0+r1], [r2+r3]
+    punpcklqdq m0, m4
+    punpcklqdq m1, m5
+    LOAD_DIFF m4, m7, m6, [r0+2*r1], [r2+2*r3]
+    LOAD_DIFF m5, m7, m6, [r0+r4], [r2+r5]
+    punpcklqdq m2, m4
+    punpcklqdq m3, m5
+%define movh movq
+    call x264_pixel_satd_4x8_internal_%1
+    SATD_END_SSE2
 
 %ifdef ARCH_X86_64
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_sa8d_8x8_sse2( uint8_t *, int, uint8_t *, int )
......
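Why the 4x8 SATD can ride on an 8x4 kernel: SATD is a sum over independent 4x4 Hadamard blocks, so the wrapper above simply loads the top and bottom 4x4 halves of the 4x8 block side by side (the punpcklqdq pairing of rows) and lets the existing 8x4 transform process both halves at once. For reference, a plain-C sketch of the underlying 4x4 SATD, with x264's usual convention of returning half the sum of absolute transform coefficients:

    #include <stdint.h>
    #include <stdlib.h> /* abs */

    static int satd_4x4_ref( const uint8_t *p1, int i1, const uint8_t *p2, int i2 )
    {
        int d[4][4], t[4][4], sum = 0;
        for( int i = 0; i < 4; i++, p1 += i1, p2 += i2 )
            for( int j = 0; j < 4; j++ )
                d[i][j] = p1[j] - p2[j];
        for( int i = 0; i < 4; i++ )
        {   /* horizontal 4-point Hadamard */
            int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
            t[i][0] = s01 + s23; t[i][1] = s01 - s23;
            t[i][2] = d01 + d23; t[i][3] = d01 - d23;
        }
        for( int j = 0; j < 4; j++ )
        {   /* vertical 4-point Hadamard, accumulate |coefficients| */
            int s01 = t[0][j] + t[1][j], d01 = t[0][j] - t[1][j];
            int s23 = t[2][j] + t[3][j], d23 = t[2][j] - t[3][j];
            sum += abs(s01 + s23) + abs(s01 - s23)
                 + abs(d01 + d23) + abs(d01 - d23);
        }
        return sum >> 1;
    }

satd_4x8 is then this quantity summed over the two 4x4 halves, which is what both the mmxext pair-of-calls version and the packed xmm version compute.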
@@ -50,6 +50,7 @@ DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
 DECL_X1( ssd, mmx )
 DECL_X1( ssd, sse2 )
+DECL_X1( ssd, sse4 )
 DECL_X1( satd, mmxext )
 DECL_X1( satd, sse2 )
 DECL_X1( satd, ssse3 )
......
@@ -136,7 +136,7 @@
     punpcklbw %1, %1
 %if mmsize==16
     pshuflw %1, %1, 0xff
-    movlhps %1, %1
+    punpcklqdq %1, %1
 %else
     pshufw %1, %1, 0xff
 %endif
......
@@ -146,6 +146,7 @@ static void print_bench(void)
             for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
             if( k<j ) continue;
             printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
+                    b->cpu&X264_CPU_SSE4 ? "sse4" :
                    b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
                    b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                    b->cpu&X264_CPU_SSE3 ? "sse3" :
@@ -1305,6 +1306,11 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_PHADD_IS_FAST, "PHADD" );
     }
+    if( x264_cpu_detect() & X264_CPU_SSE4 )
+    {
+        cpu1 &= ~X264_CPU_CACHELINE_64;
+        ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
+    }
 #elif ARCH_PPC
     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
     {
......