Commit 4becc3e9, authored by Henrik Gramner, committed by Fiona Glaser

x86: Faster AVX2 pixel_sad_x3 and pixel_sad_x4

parent 401edc3a
......@@ -32,7 +32,6 @@
SECTION_RODATA 32
pb_shuf8x8c2: times 2 db 0,0,0,0,8,8,8,8,-1,-1,-1,-1,-1,-1,-1,-1
deinterleave_sadx4: dd 0,4,2,6
hpred_shuf: db 0,0,2,2,8,8,10,10,1,1,3,3,9,9,11,11
SECTION .text
......@@ -1387,12 +1386,12 @@ SAD_X 4, 4, 4
vbroadcasti128 m4, [r0]
vbroadcasti128 m5, [r0+FENC_STRIDE]
movu xm0, [r1]
movu xm1, [r3]
movu xm1, [r2]
movu xm2, [r1+r5]
movu xm3, [r3+r5]
vinserti128 m0, m0, [r2], 1
movu xm3, [r2+r5]
vinserti128 m0, m0, [r3], 1
vinserti128 m1, m1, [r4], 1
vinserti128 m2, m2, [r2+r5], 1
vinserti128 m2, m2, [r3+r5], 1
vinserti128 m3, m3, [r4+r5], 1
psadbw m0, m4
psadbw m1, m4
......@@ -1406,12 +1405,12 @@ SAD_X 4, 4, 4
vbroadcasti128 m6, [r0+%1]
vbroadcasti128 m7, [r0+%3]
movu xm2, [r1+%2]
movu xm3, [r3+%2]
movu xm3, [r2+%2]
movu xm4, [r1+%4]
movu xm5, [r3+%4]
vinserti128 m2, m2, [r2+%2], 1
movu xm5, [r2+%4]
vinserti128 m2, m2, [r3+%2], 1
vinserti128 m3, m3, [r4+%2], 1
vinserti128 m4, m4, [r2+%4], 1
vinserti128 m4, m4, [r3+%4], 1
vinserti128 m5, m5, [r4+%4], 1
psadbw m2, m6
psadbw m3, m6
......@@ -1443,35 +1442,22 @@ SAD_X 4, 4, 4
%endmacro
%macro SAD_X3_END_AVX2 0
vextracti128 xm4, m0, 1
vextracti128 xm5, m1, 1
vextracti128 xm6, m2, 1
paddw xm0, xm4
paddw xm1, xm5
paddw xm2, xm6
movhlps xm4, xm0
movhlps xm5, xm1
movhlps xm6, xm2
paddw xm0, xm4
paddw xm1, xm5
paddw xm2, xm6
movifnidn r5, r5mp
movd [r5+0], xm0
movd [r5+4], xm1
movd [r5+8], xm2
packssdw m0, m1 ; 0 0 1 1 0 0 1 1
packssdw m2, m2 ; 2 2 _ _ 2 2 _ _
phaddd m0, m2 ; 0 1 2 _ 0 1 2 _
vextracti128 xm1, m0, 1
paddd xm0, xm1 ; 0 1 2 _
mova [r5], xm0
RET
%endmacro
%macro SAD_X4_END_AVX2 0
mov r0, r6mp
punpckhqdq m2, m0, m0
punpckhqdq m3, m1, m1
paddw m0, m2
paddw m1, m3
packssdw m0, m1
mova xm2, [deinterleave_sadx4]
vpermd m0, m2, m0
mova [r0], xm0
mov r0, r6mp
packssdw m0, m1 ; 0 0 1 1 2 2 3 3
vextracti128 xm1, m0, 1
phaddd xm0, xm1 ; 0 1 2 3
mova [r0], xm0
RET
%endmacro
......
......@@ -365,14 +365,14 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
/* hexagon */
COST_MV_X3_DIR( -2,0, -1, 2, 1, 2, costs );
COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+3 );
COST_MV_X3_DIR( 2,0, 1,-2, -1,-2, costs+4 ); /* +4 for 16-byte alignment */
bcost <<= 3;
COPY1_IF_LT( bcost, (costs[0]<<3)+2 );
COPY1_IF_LT( bcost, (costs[1]<<3)+3 );
COPY1_IF_LT( bcost, (costs[2]<<3)+4 );
COPY1_IF_LT( bcost, (costs[3]<<3)+5 );
COPY1_IF_LT( bcost, (costs[4]<<3)+6 );
COPY1_IF_LT( bcost, (costs[5]<<3)+7 );
COPY1_IF_LT( bcost, (costs[4]<<3)+5 );
COPY1_IF_LT( bcost, (costs[5]<<3)+6 );
COPY1_IF_LT( bcost, (costs[6]<<3)+7 );
if( bcost&7 )
{
......@@ -671,7 +671,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
for( i = 0; i < xn-2; i += 3 )
{
pixel *ref = p_fref_w+min_x+my*stride;
int sads[3];
ALIGNED_ARRAY_16( int, sads,[4] ); /* padded to [4] for asm */
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( int j = 0; j < 3; j++ )
{
......
......@@ -406,7 +406,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
} \
else \
call_a( pixel_asm.sad_x3[i], pbuf1, pix2, pix2+6, pix2+1, (intptr_t)64, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
if( memcmp(res_c, res_asm, N*sizeof(int)) ) \
{ \
ok = 0; \
fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment