Commit cb688111 authored by Fiona Glaser

Add support for SSE4a (Phenom) LZCNT instruction

Significantly speeds up coeff_last and coeff_level_run on Phenom CPUs for faster CAVLC and CABAC.
Also a small tweak to coeff_level_run asm.
parent 9e1f3000
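For context: coeff_last returns the index of the last nonzero coefficient in a scanned block of DCT coefficients, which CAVLC and CABAC need before coding the block, and coeff_level_run additionally records each nonzero level together with the run of zeros preceding it. Below is a minimal C sketch of the coeff_last semantics (illustrative only, not x264's actual C fallback; the name coeff_last16_ref is invented here), followed by the identity that lets LZCNT stand in for BSR in the asm changes further down.

#include <stdint.h>

/* Minimal sketch of the coeff_last semantics; illustrative only, not
 * x264's actual C fallback.  Returns the index of the last nonzero
 * coefficient, or -1 when the block is all zero. */
static int coeff_last16_ref( const int16_t *dct )
{
    int i = 15;
    while( i >= 0 && !dct[i] )
        i--;
    return i;
}

/* The asm instead builds a bitmask with one bit per coefficient position
 * and bit-scans it.  For a nonzero 32-bit mask m:
 *     bsr(m) == 31 - lzcnt(m) == lzcnt(m) ^ 0x1f   (0x3f for 64-bit operands)
 * so LZCNT can replace BSR at the cost of one XOR, and replaces a BSR+XOR
 * pair outright wherever the leading-zero count itself is the value needed. */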
@@ -54,6 +54,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
     {"Cache32", X264_CPU_CACHELINE_32},
     {"Cache64", X264_CPU_CACHELINE_64},
     {"SSEMisalign", X264_CPU_SSE_MISALIGN},
+    {"LZCNT", X264_CPU_LZCNT},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
     {"", 0},
 };
@@ -117,6 +118,7 @@ uint32_t x264_cpu_detect( void )
     {
         cpu |= X264_CPU_SSE2_IS_FAST;
         cpu |= X264_CPU_SSE_MISALIGN;
+        cpu |= X264_CPU_LZCNT;
         x264_cpu_mask_misalign_sse();
     }
     else
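On the detection side, the hunk above enables X264_CPU_LZCNT in the same branch that already sets SSE2_IS_FAST and SSE_MISALIGN. For reference, LZCNT availability can also be probed directly: it is advertised by the ABM bit, bit 5 of ECX for CPUID leaf 0x80000001. The sketch below shows that probe using GCC's <cpuid.h>; it is illustrative and not the logic used by x264_cpu_detect().

#include <cpuid.h>

/* Illustrative probe for LZCNT support, not x264's detection code:
 * the ABM/LZCNT capability is bit 5 of ECX for CPUID leaf 0x80000001. */
static int cpu_has_lzcnt( void )
{
    unsigned int eax, ebx, ecx, edx;
    if( !__get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
        return 0;           /* extended leaf not available */
    return (ecx >> 5) & 1;  /* ABM: LZCNT (and POPCNT on AMD) */
}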
@@ -352,6 +352,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #endif
         pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
         pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
+            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+        }
     }
     if( cpu&X264_CPU_SSE2 )
@@ -376,6 +381,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
+        if( cpu&X264_CPU_LZCNT )
+        {
+            pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
+            pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
+            pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+            pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
+            pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
+        }
     }
     if( cpu&X264_CPU_SSSE3 )
@@ -688,37 +688,53 @@ DECIMATE8x8 ssse3
     or   %1, %3
 %endmacro

+%macro LAST_X86 3
+    bsr %1, %2
+%endmacro
+
+%macro LAST_SSE4A 3
+    lzcnt %1, %2
+    xor %1, %3
+%endmacro
+
+%macro COEFF_LAST4 1
 %ifdef ARCH_X86_64
-cglobal x264_coeff_last4_mmxext, 1,1
-    bsr  rax, [r0]
+cglobal x264_coeff_last4_%1, 1,1
+    LAST rax, [r0], 0x3f
     shr  eax, 4
     RET
 %else
-cglobal x264_coeff_last4_mmxext, 0,3
+cglobal x264_coeff_last4_%1, 0,3
     mov   edx, r0m
     mov   eax, [edx+4]
     xor   ecx, ecx
     test  eax, eax
     cmovz eax, [edx]
     setnz cl
-    bsr   eax, eax
+    LAST  eax, eax, 0x1f
     shr   eax, 4
     lea   eax, [eax+ecx*2]
     RET
 %endif
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt

 %macro COEFF_LAST 1
 cglobal x264_coeff_last15_%1, 1,3
     LAST_MASK r1d, r0-2, r2d
     xor r1d, 0xffff
-    bsr eax, r1d
+    LAST eax, r1d, 0x1f
     dec eax
     RET

 cglobal x264_coeff_last16_%1, 1,3
     LAST_MASK r1d, r0, r2d
     xor r1d, 0xffff
-    bsr eax, r1d
+    LAST eax, r1d, 0x1f
     RET

 %ifndef ARCH_X86_64
@@ -738,17 +754,18 @@ cglobal x264_coeff_last16_%1, 1,3
     not r1d
     xor r2d, -1
     jne .secondhalf
-    bsr eax, r1d
+    LAST eax, r1d, 0x1f
     RET
 .secondhalf:
-    bsr eax, r2d
+    LAST eax, r2d, 0x1f
     add eax, 32
     RET
 %endif
 %endmacro

 %ifdef ARCH_X86_64
-cglobal x264_coeff_last64_sse2, 1,4
+%macro COEFF_LAST64 1
+cglobal x264_coeff_last64_%1, 1,4
     LAST_MASK_SSE2 r1d, r0
     LAST_MASK_SSE2 r2d, r0+32
     LAST_MASK_SSE2 r3d, r0+64
@@ -760,16 +777,25 @@ cglobal x264_coeff_last16_%1, 1,3
     shl r3, 32
     or  r1, r3
     not r1
-    bsr rax, r1
+    LAST rax, r1, 0x3f
     RET
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST64 sse2
+%define LAST LAST_SSE4A
+COEFF_LAST64 sse2_lzcnt
 %endif

+%define LAST LAST_X86
 %ifndef ARCH_X86_64
 %define LAST_MASK LAST_MASK_MMX
 COEFF_LAST mmxext
 %endif
 %define LAST_MASK LAST_MASK_SSE2
 COEFF_LAST sse2
+%define LAST LAST_SSE4A
+COEFF_LAST sse2_lzcnt

 ;-----------------------------------------------------------------------------
 ; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
@@ -783,6 +809,15 @@ COEFF_LAST sse2
     pmovmskb %1, mm0
 %endmacro

+%macro LZCOUNT_X86 3
+    bsr %1, %2
+    xor %1, %3
+%endmacro
+
+%macro LZCOUNT_SSE4A 3
+    lzcnt %1, %2
+%endmacro
+
 ; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
 %ifdef ARCH_X86_64
 DECLARE_REG_TMP 0,1,2,3,4,5,6
@@ -794,21 +829,18 @@ COEFF_LAST sse2
 cglobal x264_coeff_level_run%2_%1,0,7
     movifnidn t0d, r0m
     movifnidn t1d, r1m
-    LAST_MASK t2d, t0-(%2&1)*2, t4d
-    not t2d
-    shl t2d, 32-((%2+1)&~1)
+    LAST_MASK t5d, t0-(%2&1)*2, t4d
+    not t5d
+    shl t5d, 32-((%2+1)&~1)
     mov t4d, %2-1
-    mov t5d, t2d
-    bsr t3d, t2d
+    LZCOUNT t3d, t5d, 0x1f
     xor t6d, t6d
     shl t5d, 1
-    xor t3d, 0x1f
     sub t4d, t3d
     shl t5d, t3b
     mov [t1], t4d
 .loop:
-    bsr t3d, t5d
-    xor t3d, 0x1f
+    LZCOUNT t3d, t5d, 0x1f
     mov t2w, [t0+t4*2]
     mov [t1+t6 +36], t3b
     mov [t1+t6*2+ 4], t2w
@@ -820,6 +852,7 @@ cglobal x264_coeff_level_run%2_%1,0,7
     RET
 %endmacro

+%define LZCOUNT LZCOUNT_X86
 %ifndef ARCH_X86_64
 %define LAST_MASK LAST_MASK_MMX
 COEFF_LEVELRUN mmxext, 15
@@ -830,3 +863,8 @@ COEFF_LEVELRUN mmxext, 4
 %define LAST_MASK LAST_MASK_SSE2
 COEFF_LEVELRUN sse2, 15
 COEFF_LEVELRUN sse2, 16
+%define LZCOUNT LZCOUNT_SSE4A
+COEFF_LEVELRUN sse2_lzcnt, 15
+COEFF_LEVELRUN sse2_lzcnt, 16
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext_lzcnt, 4
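The coeff_level_run changes above use the same identity through the LZCOUNT macro: LZCOUNT_X86 is BSR plus an XOR, while LZCOUNT_SSE4A is a bare LZCNT, so on LZCNT-capable CPUs the inner loop is one instruction shorter per nonzero coefficient. What the routine computes is sketched below in C; the struct and field names (run_level_ref_t, last, level, run) are placeholders for illustration, not x264's exact x264_run_level_t layout.

#include <stdint.h>

/* Placeholder struct for illustration; not x264's exact x264_run_level_t. */
typedef struct
{
    int     last;       /* index of the last nonzero coefficient       */
    int16_t level[16];  /* nonzero values, stored last-to-first        */
    uint8_t run[16];    /* zero-run length preceding each stored level */
} run_level_ref_t;

/* Minimal sketch of the coeff_level_run semantics: returns the number of
 * nonzero coefficients and fills in last/level/run, mirroring what the
 * asm loop records. */
static int coeff_level_run16_ref( const int16_t *dct, run_level_ref_t *rl )
{
    int total = 0;
    int i = 15;
    while( i >= 0 && !dct[i] )          /* same scan as coeff_last */
        i--;
    rl->last = i;
    while( i >= 0 )
    {
        int run = 0;
        rl->level[total] = dct[i];
        while( --i >= 0 && !dct[i] )
            run++;
        rl->run[total++] = run;
    }
    return total;
}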
@@ -64,10 +64,17 @@ int x264_coeff_last64_mmxext( int16_t *dct );
 int x264_coeff_last15_sse2( int16_t *dct );
 int x264_coeff_last16_sse2( int16_t *dct );
 int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
+int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
 int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
 #endif
@@ -156,7 +156,8 @@ static void print_bench(void)
                 b->cpu&X264_CPU_MMX ? "mmx" : "c",
                 b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                 b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
-                b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
+                b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
+                b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
                 ((int64_t)10*b->cycles/b->den - nop_time)/4 );
         }
 }
@@ -1392,6 +1393,11 @@ static int check_all_flags( void )
         ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
         cpu1 &= ~X264_CPU_CACHELINE_32;
 #endif
+        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        {
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
     }
     if( x264_cpu_detect() & X264_CPU_SSE2 )
     {
@@ -1405,6 +1411,12 @@ static int check_all_flags( void )
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
             cpu1 &= ~X264_CPU_SSE_MISALIGN;
         }
+        if( x264_cpu_detect() & X264_CPU_LZCNT )
+        {
+            cpu1 &= ~X264_CPU_CACHELINE_64;
+            ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
+            cpu1 &= ~X264_CPU_LZCNT;
+        }
         if( x264_cpu_detect() & X264_CPU_SSE3 )
             ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
         if( x264_cpu_detect() & X264_CPU_SSSE3 )
@@ -62,6 +62,7 @@ typedef struct x264_t x264_t;
 #define X264_CPU_SSE4           0x002000  /* SSE4.1 */
 #define X264_CPU_SSE42          0x004000  /* SSE4.2 */
 #define X264_CPU_SSE_MISALIGN   0x008000  /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT          0x010000  /* Phenom support for "leading zero count" instruction. */

 /* Analyse flags
  */