Commit f9ad5ee2 authored by Loren Merritt

enable ssse3 phadd satd on Penryn.

parent b8670681
......@@ -47,6 +47,7 @@ const struct {
{"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
{"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
{"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"3DNow", X264_CPU_3DNOW},
{"Altivec", X264_CPU_ALTIVEC},
{"Cache32", X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32},
......@@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
......
......@@ -360,6 +360,7 @@ SATD_X_DECL7()
SATD_X_DECL7( _mmxext )
SATD_X_DECL5( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL5( _ssse3_phadd )
#endif
/****************************************************************************
......@@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _cache64_ssse3 );
}
}
if( cpu&X264_CPU_SSE4 )
{
// The functions installed here are the _ssse3_phadd variants (phaddw-based SATD).
// They are gated on the SSE4 flag rather than SSSE3 because they are
// enabled on Penryn, but slower on Conroe
// NOTE(review): presumably X264_CPU_SSE4 acts as a stand-in for "Penryn or newer" -- confirm.
INIT5( satd, _ssse3_phadd );
INIT5( satd_x3, _ssse3_phadd );
INIT5( satd_x4, _ssse3_phadd );
}
#endif //HAVE_MMX
#ifdef ARCH_PPC
......
......@@ -274,19 +274,23 @@ SSD_SSE2 8, 4
LOAD_DIFF_8P %4, %6, [r0+r4], [r2+r5]
%endmacro
;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro PHSUMSUB 3
; movdqa %3, %1
; phaddw %1, %2
; phsubw %3, %2
;%endmacro
;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
; PHSUMSUB %1, %2, %5
; PHSUMSUB %3, %4, %2
; PHSUMSUB %1, %3, %4
; PHSUMSUB %5, %2, %3
;%endmacro
; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
; whereas phaddw-based transform doesn't care what order the coefs end up in.
; PHSUMSUB a, b, tmp
; Horizontal sum and difference of adjacent word pairs.
;   arg 1 (in/out): first input; overwritten with the phaddw result of (arg 1, arg 2)
;   arg 2 (in):     second input; not modified
;   arg 3 (out):    overwritten with the phsubw result of (original arg 1, arg 2)
%macro PHSUMSUB 3
movdqa %3, %1 ; save the first input, since phaddw overwrites it
phaddw %1, %2 ; pairwise sums
phsubw %3, %2 ; pairwise differences, computed from the saved copy
%endmacro
; HADAMARD4_ROW_PHADD a, b, c, d, t
; Four PHSUMSUB steps arranged as two stages of horizontal add/sub on the
; four inputs a, b, c, d, using t as the extra register.
; On exit the four results are in args 1, 4, 5, 3 (in that order, per the
; "adtc" note below); arg 2 was used as scratch and holds an intermediate.
%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
PHSUMSUB %1, %2, %5 ; stage 1: sums of (a,b) in a, differences in t; b is now free
PHSUMSUB %3, %4, %2 ; stage 1: sums of (c,d) in c, differences in b; d is now free
PHSUMSUB %1, %3, %4 ; stage 2 on the stage-1 sums: results in a and d
PHSUMSUB %5, %2, %3 ; stage 2 on the stage-1 differences: results in t and c
%endmacro
%macro SUMSUB_BADC 4
paddw %1, %2
......@@ -494,6 +498,21 @@ SSD_SSE2 8, 4
paddusw xmm6, xmm2
%endmacro
; SATD_8x4_PHADD advance
; phaddw-based variant of the 8x4 SATD step: loads a block of differences,
; transforms it, takes absolute values and accumulates them into xmm6.
;   arg 1: if nonzero, r0 and r2 are advanced by 4*r1 and 4*r3 respectively
;          after the load (r1/r3 are the two strides).
; Uses xmm0-xmm5 as working registers; xmm6 is the running accumulator and
; must be set up by the caller.
; NOTE(review): LOAD_DIFF_8x4P, HADAMARD4_1D and ABS4 are defined elsewhere;
; the descriptions of them below are inferred from their names and usage -- confirm.
%macro SATD_8x4_PHADD 1
LOAD_DIFF_8x4P xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 ; presumably: 4 rows of differences into xmm0-3, xmm4/5 as temps
%if %1
lea r0, [r0+4*r1] ; step the first address down 4 rows
lea r2, [r2+4*r3] ; step the second address down 4 rows
%endif
HADAMARD4_1D xmm0, xmm1, xmm2, xmm3 ; presumably the transform across the four row registers
HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4 ; transform within rows; results land in xmm0, xmm3, xmm4, xmm2
ABS4 xmm0, xmm3, xmm4, xmm2, xmm1, xmm5 ; presumably absolute values of the four result registers, xmm1/xmm5 as temps
paddusw xmm0, xmm3 ; saturating unsigned word adds from here on
paddusw xmm2, xmm4
paddusw xmm6, xmm0 ; fold both partial sums into the accumulator
paddusw xmm6, xmm2
%endmacro
%macro SATD_START_MMX 0
lea r4, [3*r1] ; 3*stride1
lea r5, [3*r3] ; 3*stride2
......@@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2 ssse3
INTRA_SATDS_MMX ssse3
SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
%define SATD_8x4_SSE2 SATD_8x4_PHADD
SATDS_SSE2 ssse3_phadd
......
......@@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 )
DECL_X1( satd, mmxext )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, ssse3_phadd )
DECL_X1( sa8d, mmxext )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
......
......@@ -120,6 +120,7 @@ static void print_bench(void)
for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
if( k<j ) continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
......@@ -1142,6 +1143,11 @@ int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
}
/* NOTE(review): this branch adds the SSE4 flag but is gated on SSSE3 detection.
 * That looks deliberate: the only functions this change enables under
 * X264_CPU_SSE4 are the *_ssse3_phadd ones, which presumably need only SSSE3,
 * so they can be checked on any SSSE3 machine -- confirm. If real SSE4.1
 * instructions are ever put behind this flag, the guard must test
 * X264_CPU_SSE4 instead. */
if( x264_cpu_detect() & X264_CPU_SSSE3 )
{
/* drop the cacheline flags (possibly set by the Cache64 pass above) before adding SSE4 */
cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64);
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" );
}
#elif ARCH_PPC
if( x264_cpu_detect() & X264_CPU_ALTIVEC )
{
......
......@@ -58,6 +58,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_CACHELINE_SPLIT 0x200 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400 /* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800
#define X264_CPU_SSE4 0x001000 /* sse 4.1 */
/* Analyse flags
*/
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment