Commit cb688111 authored by Fiona Glaser

Add support for SSE4a (Phenom) LZCNT instruction

Significantly speeds up coeff_last and coeff_level_run on Phenom CPUs for faster CAVLC and CABAC.
Also a small tweak to coeff_level_run asm.
parent 9e1f3000
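For context on the change itself: for any nonzero 32-bit value, bsr(x) == lzcnt(x) ^ 31 (and ^ 63 for 64-bit operands), so the two instructions are interchangeable at the cost of a single XOR, but LZCNT is substantially cheaper than BSR on Phenom (K10). That equivalence is what the LAST/LZCOUNT macro pairs in the asm below exploit. A minimal C sketch of the relationship, using compiler builtins rather than anything from x264:

```c
#include <assert.h>
#include <stdint.h>

/* Index of the highest set bit of a nonzero 32-bit mask.
 * BSR form:   bsr   eax, mask
 * LZCNT form: lzcnt eax, mask ; xor eax, 0x1f   (the LAST_SSE4A macro below)
 * Both equal 31 - clz(mask) when mask != 0. */
static int last_set_bit( uint32_t mask )
{
    assert( mask != 0 );
    return 31 - __builtin_clz( mask );  /* same as __builtin_clz( mask ) ^ 31 */
}
```

The 0x1f/0x3f constants threaded through the macros exist purely to convert between the BSR and LZCNT conventions, so the same macro body can be instantiated for either instruction.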
@@ -54,6 +54,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"", 0},
};
@@ -117,6 +118,7 @@ uint32_t x264_cpu_detect( void )
{
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
cpu |= X264_CPU_LZCNT;
x264_cpu_mask_misalign_sse();
}
else
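The hunk above only shows the flag being ORed in next to X264_CPU_SSE_MISALIGN inside the existing AMD branch; the CPUID query itself sits outside the visible context. For reference, LZCNT support is advertised by the ABM bit (ECX bit 5) of CPUID leaf 0x80000001. A standalone check could look like the sketch below — the function is illustrative only; x264 routes its CPUID calls through its own asm helpers rather than <cpuid.h>:

```c
#include <cpuid.h>

/* Illustrative ABM/LZCNT probe (not x264 code). */
static int cpu_has_lzcnt( void )
{
    unsigned int eax, ebx, ecx, edx;
    if( !__get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
        return 0;
    return ( ecx >> 5 ) & 1;  /* CPUID.80000001H:ECX.ABM[bit 5] */
}
```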
@@ -352,6 +352,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
}
}
if( cpu&X264_CPU_SSE2 )
@@ -376,6 +381,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
}
if( cpu&X264_CPU_SSSE3 )
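Once x264_quant_init() has run, callers never name the _lzcnt functions directly; they go through the dispatch table and pick up whichever variant the detected CPU flags selected. A hedged usage sketch against the signature shown above — the real encoder keeps this table inside x264_t rather than on the stack, and the include path here is an assumption:

```c
#include "common/common.h"  /* assumed x264-internal header for x264_t, x264_quant_function_t */

/* Sketch: index of the last nonzero coefficient of a 4x4 luma block,
 * dispatched to the mmxext/sse2/_lzcnt implementation chosen at init time. */
static int last_nonzero_4x4( x264_t *h, int16_t dct[16] )
{
    x264_quant_function_t qf;
    x264_quant_init( h, x264_cpu_detect(), &qf );
    return qf.coeff_last[DCT_LUMA_4x4]( dct );
}
```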
@@ -688,37 +688,53 @@ DECIMATE8x8 ssse3
or %1, %3
%endmacro
%macro LAST_X86 3
bsr %1, %2
%endmacro
%macro LAST_SSE4A 3
lzcnt %1, %2
xor %1, %3
%endmacro
%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
cglobal x264_coeff_last4_mmxext, 1,1
bsr rax, [r0]
cglobal x264_coeff_last4_%1, 1,1
LAST rax, [r0], 0x3f
shr eax, 4
RET
%else
cglobal x264_coeff_last4_mmxext, 0,3
cglobal x264_coeff_last4_%1, 0,3
mov edx, r0m
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
bsr eax, eax
LAST eax, eax, 0x1f
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
%endmacro
%define LAST LAST_X86
COEFF_LAST4 mmxext
%define LAST LAST_SSE4A
COEFF_LAST4 mmxext_lzcnt
%macro COEFF_LAST 1
cglobal x264_coeff_last15_%1, 1,3
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
bsr eax, r1d
LAST eax, r1d, 0x1f
dec eax
RET
cglobal x264_coeff_last16_%1, 1,3
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
bsr eax, r1d
LAST eax, r1d, 0x1f
RET
%ifndef ARCH_X86_64
@@ -738,17 +754,18 @@ cglobal x264_coeff_last16_%1, 1,3
not r1d
xor r2d, -1
jne .secondhalf
bsr eax, r1d
LAST eax, r1d, 0x1f
RET
.secondhalf:
bsr eax, r2d
LAST eax, r2d, 0x1f
add eax, 32
RET
%endif
%endmacro
%ifdef ARCH_X86_64
cglobal x264_coeff_last64_sse2, 1,4
%macro COEFF_LAST64 1
cglobal x264_coeff_last64_%1, 1,4
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
LAST_MASK_SSE2 r3d, r0+64
@@ -760,16 +777,25 @@ cglobal x264_coeff_last16_%1, 1,3
shl r3, 32
or r1, r3
not r1
bsr rax, r1
LAST rax, r1, 0x3f
RET
%endmacro
%define LAST LAST_X86
COEFF_LAST64 sse2
%define LAST LAST_SSE4A
COEFF_LAST64 sse2_lzcnt
%endif
%define LAST LAST_X86
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LAST mmxext
%endif
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
%define LAST LAST_SSE4A
COEFF_LAST sse2_lzcnt
;-----------------------------------------------------------------------------
; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
@@ -783,6 +809,15 @@ COEFF_LAST sse2
pmovmskb %1, mm0
%endmacro
%macro LZCOUNT_X86 3
bsr %1, %2
xor %1, %3
%endmacro
%macro LZCOUNT_SSE4A 3
lzcnt %1, %2
%endmacro
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
@@ -794,21 +829,18 @@ COEFF_LAST sse2
cglobal x264_coeff_level_run%2_%1,0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
LAST_MASK t2d, t0-(%2&1)*2, t4d
not t2d
shl t2d, 32-((%2+1)&~1)
LAST_MASK t5d, t0-(%2&1)*2, t4d
not t5d
shl t5d, 32-((%2+1)&~1)
mov t4d, %2-1
mov t5d, t2d
bsr t3d, t2d
LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
shl t5d, 1
xor t3d, 0x1f
sub t4d, t3d
shl t5d, t3b
mov [t1], t4d
.loop:
bsr t3d, t5d
xor t3d, 0x1f
LZCOUNT t3d, t5d, 0x1f
mov t2w, [t0+t4*2]
mov [t1+t6 +36], t3b
mov [t1+t6*2+ 4], t2w
@@ -820,6 +852,7 @@ cglobal x264_coeff_level_run%2_%1,0,7
RET
%endmacro
%define LZCOUNT LZCOUNT_X86
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LEVELRUN mmxext, 15
@@ -830,3 +863,8 @@ COEFF_LEVELRUN mmxext, 4
%define LAST_MASK LAST_MASK_SSE2
COEFF_LEVELRUN sse2, 15
COEFF_LEVELRUN sse2, 16
%define LZCOUNT LZCOUNT_SSE4A
COEFF_LEVELRUN sse2_lzcnt, 15
COEFF_LEVELRUN sse2_lzcnt, 16
%define LAST_MASK LAST_MASK4_MMX
COEFF_LEVELRUN mmxext_lzcnt, 4
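To summarize what the asm above computes: coeff_last returns the index of the last nonzero coefficient in the block, and the SIMD paths get there by building a mask of nonzero positions (the LAST_MASK macros, via SIMD compares + pmovmskb) and bit-scanning it — exactly where BSR vs. LZCNT matters. coeff_level_run then walks the same mask from the top down, emitting (level, run) pairs for CAVLC, so it hits LZCOUNT once per nonzero coefficient. A plain-C sketch of the coeff_last idea, not x264's C reference (which simply scans backwards):

```c
#include <stdint.h>

/* Sketch: coeff_last16 semantics via a nonzero-position mask plus a bit scan.
 * Returns the index of the last nonzero coefficient, or -1 if all are zero. */
static int coeff_last16_ref( const int16_t dct[16] )
{
    uint32_t mask = 0;
    for( int i = 0; i < 16; i++ )
        mask |= (uint32_t)( dct[i] != 0 ) << i;  /* LAST_MASK builds an equivalent mask in SIMD */
    if( !mask )
        return -1;
    return 31 - __builtin_clz( mask );           /* BSR, or LZCNT xor 31 */
}
```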
@@ -64,10 +64,17 @@ int x264_coeff_last64_mmxext( int16_t *dct );
int x264_coeff_last15_sse2( int16_t *dct );
int x264_coeff_last16_sse2( int16_t *dct );
int x264_coeff_last64_sse2( int16_t *dct );
int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
#endif
@@ -156,7 +156,8 @@ static void print_bench(void)
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -1392,6 +1393,11 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
@@ -1405,6 +1411,12 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
if( x264_cpu_detect() & X264_CPU_LZCNT )
{
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )
@@ -62,6 +62,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
/* Analyse flags
*/