Commit f9ad5ee2 authored by Loren Merritt

enable ssse3 phadd satd on Penryn.

parent b8670681
...@@ -47,6 +47,7 @@ const struct { ...@@ -47,6 +47,7 @@ const struct {
{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2}, {"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3}, {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"3DNow", X264_CPU_3DNOW}, {"3DNow", X264_CPU_3DNOW},
{"Altivec", X264_CPU_ALTIVEC}, {"Altivec", X264_CPU_ALTIVEC},
{"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32}, {"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
...@@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void ) ...@@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE3; cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 ) if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3; cpu |= X264_CPU_SSSE3;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax; max_extended_cap = eax;
......
...@@ -360,6 +360,7 @@ SATD_X_DECL7() ...@@ -360,6 +360,7 @@ SATD_X_DECL7()
SATD_X_DECL7( _mmxext ) SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 ) SATD_X_DECL5( _sse2 )
SATD_X_DECL7( _ssse3 ) SATD_X_DECL7( _ssse3 )
SATD_X_DECL5( _ssse3_phadd )
#endif #endif
/**************************************************************************** /****************************************************************************
...@@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) ...@@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _cache64_ssse3 ); INIT2( sad_x4, _cache64_ssse3 );
} }
} }
if( cpu&X264_CPU_SSE4 )
{
// enabled on Penryn, but slower on Conroe
INIT5( satd, _ssse3_phadd );
INIT5( satd_x3, _ssse3_phadd );
INIT5( satd_x4, _ssse3_phadd );
}
#endif //HAVE_MMX #endif //HAVE_MMX
#ifdef ARCH_PPC #ifdef ARCH_PPC
......
...@@ -274,19 +274,23 @@ SSD_SSE2 8, 4 ...@@ -274,19 +274,23 @@ SSD_SSE2 8, 4
LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5] LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
%endmacro %endmacro
;;; row transform not used, because phaddw is much slower than paddw on a Conroe ; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
;%macro PHSUMSUB 3 ; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; movdqa %3, %1 ; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
; phaddw %1, %2 ; whereas phaddw-based transform doesn't care what order the coefs end up in.
; phsubw %3, %2
;%endmacro %macro PHSUMSUB 3
movdqa %3, %1
;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc phaddw %1, %2
; PHSUMSUB %1, %2, %5 phsubw %3, %2
; PHSUMSUB %3, %4, %2 %endmacro
; PHSUMSUB %1, %3, %4
; PHSUMSUB %5, %2, %3 %macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
;%endmacro PHSUMSUB %1, %2, %5
PHSUMSUB %3, %4, %2
PHSUMSUB %1, %3, %4
PHSUMSUB %5, %2, %3
%endmacro
%macro SUMSUB_BADC 4 %macro SUMSUB_BADC 4
paddw %1, %2 paddw %1, %2
...@@ -494,6 +498,21 @@ SSD_SSE2 8, 4 ...@@ -494,6 +498,21 @@ SSD_SSE2 8, 4
paddusw xmm6, xmm2 paddusw xmm6, xmm2
%endmacro %endmacro
; SATD_8x4_PHADD advance
; Going by the macro name, handles one 8x4 block of a SATD computation using
; the phaddw-based row transform, and adds the result into xmm6.
;   %1 (advance): when nonzero, r0 and r2 are moved forward by four rows
;                 (4*r1 and 4*r3 respectively) after the loads.
; Registers: reads r0-r3 here; LOAD_DIFF_8x4P may use others (its body is not
; shown). Overwrites xmm0-xmm5. xmm6 is only ever added to, never cleared, so
; the caller is expected to have initialised it as the running accumulator.
%macro SATD_8x4_PHADD 1
; NOTE(review): presumably loads four rows of pixel differences into
; xmm0-xmm3 with xmm4/xmm5 as scratch -- confirm against LOAD_DIFF_8x4P.
LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
%if %1
; Step both pointers down four rows so the next invocation sees the next block.
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
HADAMARD4_1D xmm0, xmm1, xmm2, xmm3
; Row transform built from phaddw/phsubw. Per its "abcd-t -> adtc" note the
; four results end up in xmm0, xmm3, xmm4, xmm2 (in that order), not xmm0-xmm3.
HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
; The first four operands are exactly the registers holding the transform
; output; xmm1 and xmm5 are presumably temporaries for ABS4 -- TODO confirm.
ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
; Pairwise reduce the four vectors, then fold both partial sums into the xmm6
; accumulator. paddusw is an unsigned saturating 16-bit add, so lanes clamp
; instead of wrapping.
paddusw xmm0, xmm3
paddusw xmm2, xmm4
paddusw xmm6, xmm0
paddusw xmm6, xmm2
%endmacro
%macro SATD_START_MMX 0 %macro SATD_START_MMX 0
lea r4, [3*r1] ; 3*stride1 lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2 lea r5, [3*r3] ; 3*stride2
...@@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3 ...@@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3 INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3 INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3. SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
%define SATD_8x4_SSE2 SATD_8x4_PHADD
SATDS_SSE2 ssse3_phadd
......
...@@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 ) ...@@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 )
DECL_X1( satd, mmxext ) DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 ) DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 ) DECL_X1( satd, ssse3 )
DECL_X1( satd, ssse3_phadd )
DECL_X1( sa8d, mmxext ) DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 ) DECL_X1( sa8d, ssse3 )
......
...@@ -120,6 +120,7 @@ static void print_bench(void) ...@@ -120,6 +120,7 @@ static void print_bench(void)
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ ); for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue; if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" : b->cpu&X264_CPU_SSE3 ? "sse3" :
b->cpu&X264_CPU_SSE2 ? "sse2" : b->cpu&X264_CPU_SSE2 ? "sse2" :
...@@ -1142,6 +1143,11 @@ int check_all_flags( void ) ...@@ -1142,6 +1143,11 @@ int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
} }
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
}
#elif ARCH_PPC #elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC ) if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{ {
......
...@@ -58,6 +58,7 @@ typedef struct x264_t x264_t; ...@@ -58,6 +58,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */ #define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */ #define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800 #define X264_CPU_CACHELINE_64 0x0800
#define X264_CPU_SSE4 0x001000 /* sse 4.1 */
/* Analyse flags /* Analyse flags
*/ */
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment