Commit f753caea authored by Henrik Gramner

Add minor x86 bilin mc optimizations

parent f813285c
Pipeline #3959 passed with stages in 5 minutes and 11 seconds
@@ -265,7 +265,6 @@ INIT_YMM avx2
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
imul mxyd, 0xff01
vbroadcasti128 m4, [bilin_h_shuf8]
WIN64_SPILL_XMM 7
add mxyd, 16 << 8
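; editor's note: mx*0xff01 + (16<<8) leaves ((16-mx)<<8 | mx) in the low word,
; i.e. the (mx, 16-mx) byte pair that pmaddubsw needs per pixel
; (e.g. mx=4 -> 0x0c04, weights 4 and 12)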
movd xm5, mxyd
mov mxyd, r7m ; my
@@ -273,7 +272,7 @@ INIT_YMM avx2
test mxyd, mxyd
jnz .hv
movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
vpbroadcastd m6, [pw_2048]
vpbroadcastd m3, [pw_2048]
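; editor's note: pw_2048 = 2^11, and pmulhrsw computes (a*b + 0x4000) >> 15,
; so multiplying by 2048 reduces the pmaddubsw sums by (x + 8) >> 4,
; matching the "+ 8) >> 4" in the formula above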
add wq, t2
jmp wq
.h_w2:
@@ -282,7 +281,7 @@ INIT_YMM avx2
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pmaddubsw xm0, xm5
pmulhrsw xm0, xm6
pmulhrsw xm0, xm3
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 2
@@ -298,7 +297,7 @@ INIT_YMM avx2
lea srcq, [srcq+ssq*2]
pshufb xm0, xm4
pmaddubsw xm0, xm5
pmulhrsw xm0, xm6
pmulhrsw xm0, xm3
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
@@ -314,8 +313,8 @@ INIT_YMM avx2
pshufb xm1, xm4
pmaddubsw xm0, xm5
pmaddubsw xm1, xm5
pmulhrsw xm0, xm6
pmulhrsw xm1, xm6
pmulhrsw xm0, xm3
pmulhrsw xm1, xm3
packuswb xm0, xm1
movq [dstq+dsq*0], xm0
movhps [dstq+dsq*1], xm0
@@ -333,8 +332,8 @@ INIT_YMM avx2
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
@@ -350,8 +349,8 @@ INIT_YMM avx2
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
@@ -361,25 +360,25 @@ INIT_YMM avx2
.h_w64:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
movu m2, [srcq+8*4]
movu m3, [srcq+8*5]
add srcq, ssq
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m2, m6
pmulhrsw m3, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
packuswb m2, m3
movu m1, [srcq+8*4]
movu m2, [srcq+8*5]
add srcq, ssq
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5
pmaddubsw m2, m5
pmulhrsw m1, m3
pmulhrsw m2, m3
packuswb m1, m2
mova [dstq+32*0], m0
mova [dstq+32*1], m2
mova [dstq+32*1], m1
add dstq, dsq
dec hd
jg .h_w64
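; editor's note: each 64-pixel row is filtered as two 32-byte halves, packing
; and storing the first half before loading the second, which keeps fewer
; vector registers live at once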
@@ -393,8 +392,8 @@ INIT_YMM avx2
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+t1+32*3], m0
add t1, 32
@@ -406,14 +405,12 @@ INIT_YMM avx2
RET
.v:
movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
imul mxyd, 0xff01
vpbroadcastd m7, [pw_2048]
vpbroadcastd m5, [pw_2048]
add mxyd, 16 << 8
add wq, t2
movd xm6, mxyd
vpbroadcastw m6, xm6
movd xm4, mxyd
vpbroadcastw m4, xm4
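; editor's note: m4 holds ((16-my)<<8 | my) in every word, so pmaddubsw on
; each interleaved current/next byte pair yields
; (16-my)*src[x] + my*src[x+src_stride]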
jmp wq
.v_w2:
movd xm0, [srcq+ssq*0]
@@ -423,8 +420,8 @@ INIT_YMM avx2
pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
pshuflw xm1, xm1, q2301 ; 1 0
punpcklbw xm1, xm0, xm1
pmaddubsw xm1, xm6
pmulhrsw xm1, xm7
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
packuswb xm1, xm1
pextrw [dstq+dsq*0], xm1, 1
pextrw [dstq+dsq*1], xm1, 0
@@ -441,8 +438,8 @@ INIT_YMM avx2
vpbroadcastd xm0, [srcq+ssq*0]
vpblendd xm1, xm1, xm0, 0x02 ; 1 2
punpcklbw xm1, xm2
pmaddubsw xm1, xm6
pmulhrsw xm1, xm7
pmaddubsw xm1, xm4
pmulhrsw xm1, xm5
packuswb xm1, xm1
movd [dstq+dsq*0], xm1
pextrd [dstq+dsq*1], xm1, 1
@@ -453,20 +450,18 @@ INIT_YMM avx2
.v_w8:
movq xm0, [srcq+ssq*0]
.v_w8_loop:
vpbroadcastq xm1, [srcq+ssq*1]
movq xm3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
vpblendd xm2, xm1, xm0, 0x03 ; 0 1
vpbroadcastq xm0, [srcq+ssq*0]
vpblendd xm1, xm1, xm0, 0x0c ; 1 2
punpcklbw xm3, xm1, xm2
punpckhbw xm1, xm2
pmaddubsw xm3, xm6
pmaddubsw xm1, xm6
pmulhrsw xm3, xm7
pmulhrsw xm1, xm7
packuswb xm3, xm1
movq [dstq+dsq*0], xm3
movhps [dstq+dsq*1], xm3
punpcklbw xm1, xm3, xm0
movq xm0, [srcq+ssq*0]
punpcklbw xm2, xm0, xm3
pmaddubsw xm1, xm4
pmaddubsw xm2, xm4
pmulhrsw xm1, xm5
pmulhrsw xm2, xm5
packuswb xm1, xm2
movq [dstq+dsq*0], xm1
movhps [dstq+dsq*1], xm1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
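; editor's note: xm0 carries the newest row across iterations, so every source
; row is loaded from memory once and paired with both of its neighbours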
@@ -481,10 +476,10 @@ INIT_YMM avx2
vpblendd m2, m2, m0, 0xf0 ; 1 2
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m1, m6
pmaddubsw m2, m6
pmulhrsw m1, m7
pmulhrsw m2, m7
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*0], xm1
vextracti128 [dstq+dsq*1], m1, 1
@@ -496,25 +491,25 @@ INIT_YMM avx2
%macro PUT_BILIN_V_W32 0
movu m0, [srcq+ssq*0]
%%loop:
movu m4, [srcq+ssq*1]
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw m1, m4, m0
punpckhbw m3, m4, m0
punpcklbw m1, m3, m0
punpckhbw m2, m3, m0
movu m0, [srcq+ssq*0]
punpcklbw m2, m0, m4
punpckhbw m4, m0, m4
pmaddubsw m1, m6
pmaddubsw m3, m6
pmaddubsw m2, m6
pmaddubsw m4, m6
pmulhrsw m1, m7
pmulhrsw m3, m7
pmulhrsw m2, m7
pmulhrsw m4, m7
packuswb m1, m3
packuswb m2, m4
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*0], m1
mova [dstq+dsq*1], m2
punpcklbw m1, m0, m3
punpckhbw m2, m0, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
@@ -527,25 +522,25 @@ INIT_YMM avx2
.v_w64_loop:
add srcq, ssq
movu m3, [srcq+32*0]
movu m4, [srcq+32*1]
punpcklbw m2, m3, m0
punpckhbw m5, m3, m0
pmaddubsw m2, m6
pmaddubsw m5, m6
punpckhbw m0, m3, m0
pmaddubsw m2, m4
pmaddubsw m0, m4
pmulhrsw m2, m5
pmulhrsw m0, m5
packuswb m2, m0
mova m0, m3
pmulhrsw m2, m7
pmulhrsw m5, m7
packuswb m2, m5
punpcklbw m3, m4, m1
punpckhbw m5, m4, m1
pmaddubsw m3, m6
pmaddubsw m5, m6
mova m1, m4
pmulhrsw m3, m7
pmulhrsw m5, m7
packuswb m3, m5
movu m3, [srcq+32*1]
mova [dstq+32*0], m2
mova [dstq+32*1], m3
punpcklbw m2, m3, m1
punpckhbw m1, m3, m1
pmaddubsw m2, m4
pmaddubsw m1, m4
pmulhrsw m2, m5
pmulhrsw m1, m5
packuswb m2, m1
mova m1, m3
mova [dstq+32*1], m2
add dstq, dsq
dec hd
jg .v_w64_loop
@@ -568,7 +563,6 @@ INIT_YMM avx2
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
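; editor's note: pmulhw returns (a*b) >> 16, so with the difference doubled
; in the loop, (2*diff * (my<<11)) >> 16 = (my*diff) >> 4; my<<12 could reach
; 15*4096 = 61440, which no longer fits in a signed word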
vpbroadcastd m7, [pw_2048]
@@ -684,7 +678,14 @@ INIT_YMM avx2
jg .hv_w16_loop
RET
.hv_w32:
%macro PUT_BILIN_HV_W32 0
xor t2d, t2d
.hv_w32gt:
mov t0, dstq
mov t1, srcq
%if WIN64
movaps r4m, xmm8
%endif
.hv_w32_loop0:
movu m0, [srcq+8*0]
vinserti128 m0, m0, [srcq+8*2], 1
movu m1, [srcq+8*1]
@@ -693,10 +694,7 @@ INIT_YMM avx2
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
%if WIN64
movaps r4m, xmm8
%endif
%%loop:
.hv_w32_loop:
add srcq, ssq
movu xm2, [srcq+8*1]
vinserti128 m2, m2, [srcq+8*3], 1
@@ -722,41 +720,24 @@ INIT_YMM avx2
mova [dstq], m3
add dstq, dsq
dec hd
jg %%loop
%if WIN64
movaps xmm8, r4m
%endif
%endmacro
PUT_BILIN_HV_W32
RET
.hv_w64:
mov t0, dstq
mov t1, srcq
lea t2d, [hq+(1<<8)]
.hv_w64_loop:
PUT_BILIN_HV_W32
mov hb, t2b
jg .hv_w32_loop
movzx hd, t2b
add t0, 32
add t1, 32
mov dstq, t0
mov srcq, t1
sub t2d, 1<<8
jg .hv_w64_loop
jg .hv_w32_loop0
%if WIN64
movaps xmm8, r4m
%endif
RET
.hv_w64:
lea t2d, [hq+(1<<8)]
jmp .hv_w32gt
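; editor's note: t2d packs both loop counters: the low byte holds the row
; count and the high byte the number of additional 32-pixel column strips
; (0 for w32, 1 for w64, 3 for w128); each strip restarts at .hv_w32_loop0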
.hv_w128:
mov t0, dstq
mov t1, srcq
lea t2d, [hq+(3<<8)]
.hv_w128_loop:
PUT_BILIN_HV_W32
mov hb, t2b
add t0, 32
add t1, 32
mov dstq, t0
mov srcq, t1
sub t2d, 1<<8
jg .hv_w128_loop
RET
jmp .hv_w32gt
DECLARE_REG_TMP 3, 5, 6
cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
@@ -266,7 +266,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
imul mxyd, 0xff01
mova m4, [base+bilin_h_shuf8]
mova m0, [base+bilin_h_shuf4]
WIN64_SPILL_XMM 7
add mxyd, 16 << 8
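; editor's note: same 0xff01 trick as the AVX2 version: the low word of
; mx*0xff01 + (16<<8) is the ((16-mx)<<8 | mx) coefficient pair for pmaddubsw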
movd m5, mxyd
mov mxyd, r7m ; my
@@ -275,7 +274,7 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
test mxyd, mxyd
jnz .hv
movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
mova m6, [base+pw_2048]
mova m3, [base+pw_2048]
add wq, t0
RESTORE_DSQ_32 t0
jmp wq
@@ -288,7 +287,7 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
punpckldq m0, m1
pshufb m0, m4
pmaddubsw m0, m5
pmulhrsw m0, m6
pmulhrsw m0, m3
packuswb m0, m0
movd r6d, m0
mov [dstq+dsq*0], r6w
@@ -304,10 +303,10 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
lea srcq, [srcq+ssq*2]
pshufb m4, m0
pmaddubsw m4, m5
pmulhrsw m4, m6
pmulhrsw m4, m3
packuswb m4, m4
movd [dstq+dsq*0], m4
pshufd m4, m4, q0101
psrlq m4, 32
movd [dstq+dsq*1], m4
lea dstq, [dstq+dsq*2]
sub hd, 2
@@ -321,8 +320,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
@@ -338,8 +337,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
@@ -349,25 +348,25 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.h_w32:
movu m0, [srcq+mmsize*0+8*0]
movu m1, [srcq+mmsize*0+8*1]
movu m2, [srcq+mmsize*1+8*0]
movu m3, [srcq+mmsize*1+8*1]
add srcq, ssq
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m2, m6
pmulhrsw m3, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
packuswb m2, m3
movu m1, [srcq+mmsize*1+8*0]
movu m2, [srcq+mmsize*1+8*1]
add srcq, ssq
pshufb m1, m4
pshufb m2, m4
pmaddubsw m1, m5
pmaddubsw m2, m5
pmulhrsw m1, m3
pmulhrsw m2, m3
packuswb m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m2
mova [dstq+16*1], m1
add dstq, dsq
dec hd
jg .h_w32
@@ -381,8 +380,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+r6+16*3], m0
add r6, 16
@@ -401,8 +400,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
mova [dstq+r6+16*7], m0
add r6, 16
@@ -414,15 +413,13 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
RET
.v:
movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
imul mxyd, 0xff01
mova m7, [base+pw_2048]
mova m5, [base+pw_2048]
add mxyd, 16 << 8
add wq, t0
movd m6, mxyd
pshuflw m6, m6, q0000
punpcklqdq m6, m6
movd m4, mxyd
pshuflw m4, m4, q0000
punpcklqdq m4, m4
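; editor's note: movd+pshuflw+punpcklqdq splats the ((16-my)<<8 | my)
; coefficient word to all lanes, standing in for AVX2's vpbroadcastw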
RESTORE_DSQ_32 t0
jmp wq
.v_w2:
@@ -433,8 +430,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshuflw m2, m0, q2301
pinsrw m0, [srcq+ssq*0], 0 ; 2 1
punpcklbw m1, m0, m2
pmaddubsw m1, m6
pmulhrsw m1, m7
pmaddubsw m1, m4
pmulhrsw m1, m5
packuswb m1, m1
movd r6d, m1
mov [dstq+dsq*1], r6w
@@ -453,8 +450,8 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
movd m0, [srcq+ssq*0]
punpckldq m1, m0 ; 1 2
punpcklbw m1, m2
pmaddubsw m1, m6
pmulhrsw m1, m7
pmaddubsw m1, m4
pmulhrsw m1, m5
packuswb m1, m1
movd [dstq+dsq*0], m1
psrlq m1, 32
@@ -467,20 +464,18 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
.v_w8:
movq m0, [srcq+ssq*0]
.v_w8_loop:
movddup m2, [srcq+ssq*1]
movq m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq m3, m0, m2 ; 0 1 m2qh:m0ql
movddup m0, [srcq+ssq*0]
punpcklqdq m4, m2, m0 ; 1 2 m0qh:m2ql
punpcklbw m1, m4, m3
punpckhbw m4, m3
pmaddubsw m1, m6
pmaddubsw m4, m6
pmulhrsw m1, m7
pmulhrsw m4, m7
packuswb m1, m4
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
punpcklbw m1, m3, m0
movq m0, [srcq+ssq*0]
punpcklbw m2, m0, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
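; editor's note: as in the AVX2 version, m0 carries the previous row between
; iterations so each source row is loaded only once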
@@ -489,25 +484,25 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
%macro PUT_BILIN_V_W16 0
movu m0, [srcq+ssq*0]
%%loop:
movu m4, [srcq+ssq*1]
movu m3, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw m1, m4, m0
punpckhbw m3, m4, m0
punpcklbw m1, m3, m0
punpckhbw m2, m3, m0
movu m0, [srcq+ssq*0]
punpcklbw m2, m0, m4
pmaddubsw m1, m6
pmaddubsw m3, m6
pmulhrsw m1, m7
pmulhrsw m3, m7
packuswb m1, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*0], m1
punpckhbw m3, m0, m4
pmaddubsw m2, m6
pmaddubsw m3, m6
pmulhrsw m2, m7
pmulhrsw m3, m7
packuswb m2, m3
mova [dstq+dsq*1], m2
punpcklbw m1, m0, m3
punpckhbw m2, m0, m3
pmaddubsw m1, m4
pmaddubsw m2, m4
pmulhrsw m1, m5
pmulhrsw m2, m5
packuswb m1, m2
mova [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
@@ -549,7 +544,6 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
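; editor's note: same shift as the AVX2 path: pmulhw with my<<11 on the
; doubled difference computes (my*diff) >> 4 without signed-word overflow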
mova m7, [base+pw_2048]
@@ -579,10 +573,14 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4)
pmulhrsw m1, m7
packuswb m1, m1
%if ARCH_X86_64
movq r6, m1
%else
pshuflw m1, m1, q2020
movd r6d, m1
%endif
mov [dstq+dsq*0], r6w
shr r6d, 16
shr r6, gprsize*4
mov [dstq+dsq*1], r6w
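; editor's note: gprsize*4 evaluates to 32 on x86-64 and 16 on x86-32, so the
; same shr brings the second row's word into r6w on both targets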
lea dstq, [dstq+dsq*2]
sub hd, 2
@@ -595,9 +593,9 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pshufb m0, m4
pmaddubsw m0, m5
.hv_w4_loop:
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
pshufb m1, m4
pmaddubsw m1, m5 ; 1 2
shufps m2, m0, m1, q1032 ; 0 1
@@ -617,21 +615,21 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
RET
.hv_w8:
RESTORE_DSQ_32 t0
movu m0, [srcq+ssq*0+8*0]
movu m0, [srcq+ssq*0+8*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
movu m2, [srcq+ssq*1+8*0]
lea srcq, [srcq+ssq*2]
movu m3, [srcq+ssq*0+8*0]
movu m2, [srcq+ssq*1+8*0]
lea srcq, [srcq+ssq*2]
pshufb m2, m4
pshufb m3, m4
pmaddubsw m2, m5
psubw m1, m2, m0
paddw m1, m1
pmulhw m1, m6
paddw m1, m0
pmaddubsw m0, m3, m5
movu m0, [srcq+ssq*0+8*0]
pshufb m0, m4
pmaddubsw m0, m5
psubw m3, m0, m2
paddw m3, m3
pmulhw m3, m6
@@ -639,79 +637,69 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
pmulhrsw m1, m7
pmulhrsw m3, m7
packuswb m1, m3
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
;
; 32bit has ssq, dsq free
%macro PUT_BILIN_HV_W16 0
.hv_w16:
xor t0d, t0d
.hv_w16gt:
mov r4, dstq
mov r6, srcq
%if WIN64
movaps r4m, xmm8
%endif
.hv_w16_loop0:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
%if WIN64
movaps r4m, xmm8
%endif
%%loop:
.hv_w16_loop:
%if ARCH_X86_32
%define m3back [dstq]
%define dsqval dsm
%define m0tmp [dstq]
%else
%define m3back m8
%define dsqval dsq
%define m0tmp m8
%endif
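; editor's note: x86-32 has only 8 xmm registers, so the value that x86-64
; keeps in m8 is presumably spilled to [dstq] instead; that row is rewritten
; with the filtered result before dstq advances, so it is usable as scratch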
add srcq, ssq
movu m2, [srcq+8*1]
movu m2, [srcq+8*0]
movu m3, [srcq+8*1]
pshufb m2, m4
pshufb m3, m4
pmaddubsw m2, m5
psubw m3, m2, m1
pmaddubsw m3, m5
mova m0tmp, m2
psubw m2, m0
paddw m2, m2
pmulhw m2, m6
paddw m2, m0
mova m0, m3
psubw m3, m1
paddw m3, m3
pmulhw m3, m6
paddw m3, m1
mova m1, m2
pmulhrsw m3, m7
mova m3back, m3
movu m2, [srcq+8*0]
pshufb m2, m4
pmaddubsw m2, m5
psubw m3, m2, m0
paddw m3, m3
pmulhw m3, m6
paddw m3, m0