Commit a79dc7b5 authored by Fiona Glaser

Cacheline-split SSSE3 chroma MC

~70% faster chroma MC on 32-bit Conroe
Also slightly faster SSSE3 intra_sad_8x8c
parent 1921079d
......@@ -50,7 +50,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
{
frame->i_stride[i] = ALIGN( i_stride >> !!i, 16 );
frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
}
......
......@@ -25,8 +25,9 @@
%include "x86inc.asm"
SECTION_RODATA
SECTION_RODATA 32
ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_32: times 8 dw 32
......@@ -869,8 +870,9 @@ MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2, 8
%macro MC_CHROMA_SSSE3 2
INIT_MMX
cglobal x264_mc_chroma_ssse3, 0,6,8
cglobal x264_mc_chroma_ssse3%1, 0,6,%2
MC_CHROMA_START
and r4d, 7
and r5d, 7
......@@ -887,7 +889,7 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
movifnidn r0, r0mp
movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
......@@ -925,23 +927,28 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
INIT_XMM
.width8:
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
movifnidn r0, r0mp
movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
SPLATW m7, m7
%ifidn %1, _cache64
mov r5, r2
and r5, 0x3f
cmp r5, 0x38
jge .split
%endif
mova m5, [pw_32 GLOBAL]
movh m0, [r2]
movh m1, [r2+1]
punpcklbw m0, m1
add r2, r3
.loop8:
movh m1, [r2]
movh m2, [r2+1]
movh m3, [r2+r3]
movh m4, [r2+r3+1]
movh m1, [r2+1*r3]
movh m2, [r2+1*r3+1]
movh m3, [r2+2*r3]
movh m4, [r2+2*r3+1]
punpcklbw m1, m2
punpcklbw m3, m4
lea r2, [r2+2*r3]
......@@ -965,6 +972,53 @@ INIT_XMM
lea r0, [r0+2*r1]
jg .loop8
REP_RET
%ifidn %1, _cache64
.split:
and r2, ~7
and r5, 7
%ifdef PIC
lea r11, [ch_shuffle GLOBAL]
movu m5, [r11 + r5*2]
%else
movu m5, [ch_shuffle + r5*2 GLOBAL]
%endif
movu m0, [r2]
pshufb m0, m5
%ifdef ARCH_X86_64
mova m8, [pw_32 GLOBAL]
%define round m8
%else
%define round [pw_32 GLOBAL]
%endif
.splitloop8:
movu m1, [r2+r3]
pshufb m1, m5
movu m3, [r2+2*r3]
pshufb m3, m5
lea r2, [r2+2*r3]
mova m2, m1
mova m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, round
paddw m2, round
paddw m1, m0
paddw m3, m2
mova m0, m4
psrlw m1, 6
psrlw m3, 6
packuswb m1, m3
movh [r0], m1
movhps [r0+r1], m1
sub r4d, 2
lea r0, [r0+2*r1]
jg .splitloop8
REP_RET
%endif
; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
%endmacro
MC_CHROMA_SSSE3 , 8
MC_CHROMA_SSSE3 _cache64, 9
......@@ -59,6 +59,9 @@ extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height );
extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
......@@ -340,6 +343,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->integral_init4v = x264_integral_init4v_ssse3;
......
......@@ -28,9 +28,8 @@
SECTION_RODATA
pb_3: times 16 db 3
pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
pw_8: times 4 dw 8
pb_shuf8x8c0: db 0,0,0,0,2,2,2,2
pb_shuf8x8c1: db 4,4,4,4,6,6,6,6
sw_64: dd 64
SECTION .text
......@@ -450,16 +449,32 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
%ifidn %1, ssse3
movq m1, m0
pshufb m0, [pb_shuf8x8c0 GLOBAL]
pshufb m1, [pb_shuf8x8c1 GLOBAL]
movq2dq xmm0, m0
pshufb xmm0, [pb_shuf8x8c GLOBAL]
movq xmm1, [r0+FENC_STRIDE*0]
movq xmm2, [r0+FENC_STRIDE*1]
movq xmm3, [r0+FENC_STRIDE*2]
movq xmm4, [r0+FENC_STRIDE*3]
movhps xmm1, [r0+FENC_STRIDE*4]
movhps xmm2, [r0+FENC_STRIDE*5]
movhps xmm3, [r0+FENC_STRIDE*6]
movhps xmm4, [r0+FENC_STRIDE*7]
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
paddw xmm1, xmm2
paddw xmm1, xmm3
paddw xmm1, xmm4
movhlps xmm0, xmm1
paddw xmm1, xmm0
movd [r2], xmm1
%else
packuswb m0, m0
punpcklbw m0, m0
movq m1, m0
punpcklbw m0, m0 ; 4x dc0 4x dc1
punpckhbw m1, m1 ; 4x dc2 4x dc3
%endif
movq m2, [r0+FENC_STRIDE*0]
movq m3, [r0+FENC_STRIDE*1]
movq m4, [r0+FENC_STRIDE*2]
......@@ -483,6 +498,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
paddw m6, m0
paddw m2, m6
movd [r2], m2
%endif
RET
%endmacro
......
......@@ -37,14 +37,14 @@
; Name of the .rodata section.
; Switches to the read-only data section for the current output format.
; Optional argument %1: section alignment in bytes, passed through as align=%1
; (the "0-1 16" form of the macro header makes it default to 16 when omitted).
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
; NOTE(review): in this view each changed line appears twice -- the fixed-alignment
; (align=16) form immediately followed by the parameterized (align=%1) form.
%macro SECTION_RODATA 0
%macro SECTION_RODATA 0-1 16
; macho64: place the constants in .text instead of .rodata (see kludge above)
%ifidn __OUTPUT_FORMAT__,macho64
SECTION .text align=16
SECTION .text align=%1
; macho (32-bit): same .text placement, and additionally defines the fakegot label
%elifidn __OUTPUT_FORMAT__,macho
SECTION .text align=16
SECTION .text align=%1
fakegot:
; every other output format: a real .rodata section
%else
SECTION .rodata align=16
SECTION .rodata align=%1
%endif
%endmacro
......
......@@ -803,8 +803,8 @@ static int check_mc( int cpu_ref, int cpu_new )
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
call_c( mc_c.mc_chroma, dst1, 16, src, 32, dx, dy, w, h ); \
call_a( mc_a.mc_chroma, dst2, 16, src, 32, dx, dy, w, h ); \
call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
for( j=0; j<h; j++ ) \
for( i=w; i<4; i++ ) \
......@@ -834,8 +834,9 @@ static int check_mc( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
for( dy = -1; dy < 9; dy++ )
for( dx = -1; dx < 9; dx++ )
for( dx = -128; dx < 128; dx++ )
{
if( rand()&15 ) continue;
MC_TEST_CHROMA( 8, 8 );
MC_TEST_CHROMA( 8, 4 );
MC_TEST_CHROMA( 4, 8 );
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment