Commit 0afec6b1 authored by François Cartegnie, committed by Victorien Le Couviour--Tuffet

x86: add SSSE3 mc prep_8tap implementation

x86_64:
------------------------------------------
mct_8tap_regular_w4_0_8bpc_c: 115.6
mct_8tap_regular_w4_0_8bpc_ssse3: 13.1
mct_8tap_regular_w4_0_8bpc_avx2: 13.3
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 363.0
mct_8tap_regular_w4_h_8bpc_ssse3: 19.1
mct_8tap_regular_w4_h_8bpc_avx2: 16.5
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 832.2
mct_8tap_regular_w4_hv_8bpc_ssse3: 113.4
mct_8tap_regular_w4_hv_8bpc_avx2: 53.1
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 488.5
mct_8tap_regular_w4_v_8bpc_ssse3: 38.9
mct_8tap_regular_w4_v_8bpc_avx2: 26.0
------------------------------------------
mct_8tap_regular_w8_0_8bpc_c: 259.3
mct_8tap_regular_w8_0_8bpc_ssse3: 20.4
mct_8tap_regular_w8_0_8bpc_avx2: 18.0
------------------------------------------
mct_8tap_regular_w8_h_8bpc_c: 1124.3
mct_8tap_regular_w8_h_8bpc_ssse3: 67.7
mct_8tap_regular_w8_h_8bpc_avx2: 43.3
------------------------------------------
mct_8tap_regular_w8_hv_8bpc_c: 2155.0
mct_8tap_regular_w8_hv_8bpc_ssse3: 340.8
mct_8tap_regular_w8_hv_8bpc_avx2: 151.3
------------------------------------------
mct_8tap_regular_w8_v_8bpc_c: 1195.4
mct_8tap_regular_w8_v_8bpc_ssse3: 72.4
mct_8tap_regular_w8_v_8bpc_avx2: 39.8
------------------------------------------
mct_8tap_regular_w16_0_8bpc_c: 158.3
mct_8tap_regular_w16_0_8bpc_ssse3: 52.9
mct_8tap_regular_w16_0_8bpc_avx2: 30.2
------------------------------------------
mct_8tap_regular_w16_h_8bpc_c: 4267.4
mct_8tap_regular_w16_h_8bpc_ssse3: 211.9
mct_8tap_regular_w16_h_8bpc_avx2: 121.4
------------------------------------------
mct_8tap_regular_w16_hv_8bpc_c: 5430.9
mct_8tap_regular_w16_hv_8bpc_ssse3: 986.8
mct_8tap_regular_w16_hv_8bpc_avx2: 428.4
------------------------------------------
mct_8tap_regular_w16_v_8bpc_c: 4604.2
mct_8tap_regular_w16_v_8bpc_ssse3: 199.1
mct_8tap_regular_w16_v_8bpc_avx2: 100.7
------------------------------------------
mct_8tap_regular_w32_0_8bpc_c: 372.9
mct_8tap_regular_w32_0_8bpc_ssse3: 231.9
mct_8tap_regular_w32_0_8bpc_avx2: 99.7
------------------------------------------
mct_8tap_regular_w32_h_8bpc_c: 15975.0
mct_8tap_regular_w32_h_8bpc_ssse3: 802.9
mct_8tap_regular_w32_h_8bpc_avx2: 468.5
------------------------------------------
mct_8tap_regular_w32_hv_8bpc_c: 18555.5
mct_8tap_regular_w32_hv_8bpc_ssse3: 3673.5
mct_8tap_regular_w32_hv_8bpc_avx2: 1587.6
------------------------------------------
mct_8tap_regular_w32_v_8bpc_c: 16632.4
mct_8tap_regular_w32_v_8bpc_ssse3: 743.5
mct_8tap_regular_w32_v_8bpc_avx2: 337.8
------------------------------------------
mct_8tap_regular_w64_0_8bpc_c: 675.9
mct_8tap_regular_w64_0_8bpc_ssse3: 513.6
mct_8tap_regular_w64_0_8bpc_avx2: 285.4
------------------------------------------
mct_8tap_regular_w64_h_8bpc_c: 37161.3
mct_8tap_regular_w64_h_8bpc_ssse3: 1929.7
mct_8tap_regular_w64_h_8bpc_avx2: 1138.1
------------------------------------------
mct_8tap_regular_w64_hv_8bpc_c: 42434.0
mct_8tap_regular_w64_hv_8bpc_ssse3: 8822.1
mct_8tap_regular_w64_hv_8bpc_avx2: 3853.5
------------------------------------------
mct_8tap_regular_w64_v_8bpc_c: 37969.1
mct_8tap_regular_w64_v_8bpc_ssse3: 1805.6
mct_8tap_regular_w64_v_8bpc_avx2: 826.1
------------------------------------------
mct_8tap_regular_w128_0_8bpc_c: 1532.7
mct_8tap_regular_w128_0_8bpc_ssse3: 1397.7
mct_8tap_regular_w128_0_8bpc_avx2: 813.8
------------------------------------------
mct_8tap_regular_w128_h_8bpc_c: 91204.3
mct_8tap_regular_w128_h_8bpc_ssse3: 4783.0
mct_8tap_regular_w128_h_8bpc_avx2: 2767.2
------------------------------------------
mct_8tap_regular_w128_hv_8bpc_c: 102396.0
mct_8tap_regular_w128_hv_8bpc_ssse3: 22202.3
mct_8tap_regular_w128_hv_8bpc_avx2: 9637.2
------------------------------------------
mct_8tap_regular_w128_v_8bpc_c: 92294.3
mct_8tap_regular_w128_v_8bpc_ssse3: 4952.8
mct_8tap_regular_w128_v_8bpc_avx2: 2370.1
------------------------------------------

x86_32:
------------------------------------------
mct_8tap_regular_w4_0_8bpc_c: 131.3
mct_8tap_regular_w4_0_8bpc_ssse3: 18.7
------------------------------------------
mct_8tap_regular_w4_h_8bpc_c: 422.0
mct_8tap_regular_w4_h_8bpc_ssse3: 27.3
------------------------------------------
mct_8tap_regular_w4_hv_8bpc_c: 1012.6
mct_8tap_regular_w4_hv_8bpc_ssse3: 123.6
------------------------------------------
mct_8tap_regular_w4_v_8bpc_c: 589.6
mct_8tap_regular_w4_v_8bpc_ssse3: 48.9
------------------------------------------
mct_8tap_regular_w8_0_8bpc_c: 278.5
mct_8tap_regular_w8_0_8bpc_ssse3: 26.3
------------------------------------------
mct_8tap_regular_w8_h_8bpc_c: 1129.3
mct_8tap_regular_w8_h_8bpc_ssse3: 80.6
------------------------------------------
mct_8tap_regular_w8_hv_8bpc_c: 2556.4
mct_8tap_regular_w8_hv_8bpc_ssse3: 354.6
------------------------------------------
mct_8tap_regular_w8_v_8bpc_c: 1460.2
mct_8tap_regular_w8_v_8bpc_ssse3: 103.8
------------------------------------------
mct_8tap_regular_w16_0_8bpc_c: 218.9
mct_8tap_regular_w16_0_8bpc_ssse3: 58.4
------------------------------------------
mct_8tap_regular_w16_h_8bpc_c: 4471.8
mct_8tap_regular_w16_h_8bpc_ssse3: 237.2
------------------------------------------
mct_8tap_regular_w16_hv_8bpc_c: 5570.5
mct_8tap_regular_w16_hv_8bpc_ssse3: 1044.1
------------------------------------------
mct_8tap_regular_w16_v_8bpc_c: 4885.5
mct_8tap_regular_w16_v_8bpc_ssse3: 268.3
------------------------------------------
mct_8tap_regular_w32_0_8bpc_c: 495.6
mct_8tap_regular_w32_0_8bpc_ssse3: 236.6
------------------------------------------
mct_8tap_regular_w32_h_8bpc_c: 15903.5
mct_8tap_regular_w32_h_8bpc_ssse3: 872.5
------------------------------------------
mct_8tap_regular_w32_hv_8bpc_c: 19402.2
mct_8tap_regular_w32_hv_8bpc_ssse3: 3832.8
------------------------------------------
mct_8tap_regular_w32_v_8bpc_c: 17119.5
mct_8tap_regular_w32_v_8bpc_ssse3: 935.2
------------------------------------------
mct_8tap_regular_w64_0_8bpc_c: 877.0
mct_8tap_regular_w64_0_8bpc_ssse3: 515.7
------------------------------------------
mct_8tap_regular_w64_h_8bpc_c: 36832.1
mct_8tap_regular_w64_h_8bpc_ssse3: 2094.1
------------------------------------------
mct_8tap_regular_w64_hv_8bpc_c: 43965.3
mct_8tap_regular_w64_hv_8bpc_ssse3: 9423.0
------------------------------------------
mct_8tap_regular_w64_v_8bpc_c: 37041.2
mct_8tap_regular_w64_v_8bpc_ssse3: 2348.9
------------------------------------------
mct_8tap_regular_w128_0_8bpc_c: 1929.9
mct_8tap_regular_w128_0_8bpc_ssse3: 1392.3
------------------------------------------
mct_8tap_regular_w128_h_8bpc_c: 86022.5
mct_8tap_regular_w128_h_8bpc_ssse3: 5110.8
------------------------------------------
mct_8tap_regular_w128_hv_8bpc_c: 105793.5
mct_8tap_regular_w128_hv_8bpc_ssse3: 23278.8
------------------------------------------
mct_8tap_regular_w128_v_8bpc_c: 88223.5
mct_8tap_regular_w128_v_8bpc_ssse3: 7442.7
------------------------------------------
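For reference, prep (mct) produces the 16-bit intermediate later consumed by avg/w_avg. Below is a minimal C sketch (a hypothetical helper, not dav1d's actual C reference) of what the hv path computes for 8bpc, using only the rounding visible in the asm: the horizontal pass rounds with (x + 2) >> 2 (pmulhrsw against pw_8192) and the vertical pass with (x + 32) >> 6 (pd_32 + psrad 6). Since dav1d's int8 subpel taps sum to 64 (half the spec's 128), both the filtered paths and the unfiltered copy come out at pel << 4.

```c
#include <stdint.h>
#include <stddef.h>

/* Sketch of the 8bpc prep 8tap hv path; fh/fv are the 8 signed taps
 * selected by the mx/my fractions (tap sum 64). */
static void prep_8tap_hv_sketch(int16_t *tmp, const uint8_t *src,
                                ptrdiff_t stride, int w, int h,
                                const int8_t fh[8], const int8_t fv[8])
{
    int16_t mid[135 * 128], *m = mid; /* (h + 7) rows, up to 128 wide */
    src -= 3 * stride + 3; /* the 8-tap window starts 3 pixels left/above */
    for (int y = 0; y < h + 7; y++, src += stride, m += w)
        for (int x = 0; x < w; x++) {
            int s = 0;
            for (int k = 0; k < 8; k++)
                s += fh[k] * src[x + k];
            m[x] = (int16_t)((s + 2) >> 2); /* pmulhrsw with pw_8192 */
        }
    m = mid;
    for (int y = 0; y < h; y++, m += w, tmp += w)
        for (int x = 0; x < w; x++) {
            int s = 0;
            for (int k = 0; k < 8; k++)
                s += fv[k] * m[x + k * w];
            tmp[x] = (int16_t)((s + 32) >> 6); /* pd_32 + psrad 6 */
        }
}
```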
parent 65ee1233
@@ -50,14 +50,23 @@ decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);
@@ -108,6 +117,15 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
@@ -64,6 +64,7 @@ pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
pd_32: times 4 dd 32
pd_512: times 4 dd 512
pw_258: times 2 dw 258
@@ -141,6 +142,7 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128
%endmacro
HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
@@ -2424,6 +2426,891 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
jg .hv_w8_loop0
RET
%if ARCH_X86_32
DECLARE_REG_TMP 1, 2
%elif WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
%macro PREP_8TAP_FN 3 ; type, type_h, type_v
cglobal prep_8tap_%1
mov t0d, FILTER_%2
mov t1d, FILTER_%3
%ifnidn %1, sharp_smooth ; skip the jump in the last filter
jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
%endif
%endmacro
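; Each PREP_8TAP_FN entry below is a thin trampoline: it loads the
; FILTER_* ids for the horizontal/vertical taps into t0d/t1d and jumps
; to the shared prep_8tap body; sharp_smooth is emitted last, so it
; skips the jump and simply falls through.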
PREP_8TAP_FN regular, REGULAR, REGULAR
PREP_8TAP_FN regular_sharp, REGULAR, SHARP
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
PREP_8TAP_FN smooth, SMOOTH, SMOOTH
PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
PREP_8TAP_FN sharp_regular, SHARP, REGULAR
PREP_8TAP_FN sharp, SHARP, SHARP
PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
%if ARCH_X86_32
%define base_reg r2
%define base base_reg-prep_ssse3
%define W32_RESTORE_SSQ mov strideq, stridem
%else
%define base_reg r7
%define base 0
%define W32_RESTORE_SSQ
%endif
cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
%assign org_stack_offset stack_offset
imul mxd, mxm, 0x010101
add mxd, t0d ; 8tap_h, mx, 4tap_h
imul myd, mym, 0x010101
add myd, t1d ; 8tap_v, my, 4tap_v
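; mxm/mym hold the 4-bit subpel fractions; multiplying by 0x010101
; replicates them into three byte lanes, and adding the FILTER_* id
; leaves a combined type+fraction index for the 8-tap filter table in
; the high bytes and for the 4-tap one in the low byte, while the 0xf00
; tests below flag a nonzero fraction.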
movsxd wq, wm
movifnidn srcd, srcm
movifnidn hd, hm
LEA base_reg, prep_ssse3
test mxd, 0xf00
jnz .h
test myd, 0xf00
jnz .v
tzcnt wd, wd
movzx wd, word [base_reg+wq*2+table_offset(prep,)]
add wq, base_reg
movifnidn strided, stridem
lea r6, [strideq*3]
%assign stack_offset org_stack_offset
%if WIN64
pop r8
pop r7
%endif
jmp wq
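; Zero fraction in both directions: dispatch via the prep jump table to
; the plain per-width copy that widens pixels to the 16-bit prep scale.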
.h:
test myd, 0xf00
jnz .hv
WIN64_SPILL_XMM 12
cmp wd, 4
je .h_w4
tzcnt wd, wd
%if ARCH_X86_64
mova m10, [base+subpel_h_shufA]
mova m11, [base+subpel_h_shufB]
mova m9, [base+subpel_h_shufC]
%endif
shr mxd, 16
sub srcq, 3
movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
pshufd m5, m5, q0000
movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
pshufd m6, m6, q0000
mova m7, [base+pw_8192]
add wq, base_reg
jmp wq
.h_w4:
%if ARCH_X86_32
and mxd, 0xff
%else
movzx mxd, mxb
%endif
dec srcq
movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
pshufd m4, m4, q0000
mova m6, [base+pw_8192]
mova m5, [base+subpel_h_shufA]
W32_RESTORE_SSQ
%if ARCH_X86_64
lea stride3q, [strideq*3]
%endif
.h_w4_loop:
movq m0, [srcq+strideq*0] ; 0
movq m1, [srcq+strideq*1] ; 1
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
movq m2, [srcq+strideq*0] ; 2
movq m3, [srcq+strideq*1] ; 3
lea srcq, [srcq+strideq*2]
%else
movq m2, [srcq+strideq*2] ; 2
movq m3, [srcq+stride3q ] ; 3
lea srcq, [srcq+strideq*4]
%endif
pshufb m0, m5 ; subpel_h_shufA
pshufb m1, m5
pshufb m2, m5
pshufb m3, m5
pmaddubsw m0, m4 ; subpel_filters + 2
pmaddubsw m1, m4
pmaddubsw m2, m4
pmaddubsw m3, m4
phaddw m0, m1
phaddw m2, m3
pmulhrsw m0, m6 ; pw_8192
pmulhrsw m2, m6 ; pw_8192
mova [tmpq+16*0], m0
mova [tmpq+16*1], m2
add tmpq, 32
sub hd, 4
jg .h_w4_loop
RET
;
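; PREP_8TAP_H filters 8 horizontal outputs from one unaligned 16-byte
; load: subpel_h_shufA/B/C gather the overlapping 8-pixel windows so
; that two pmaddubsw per shuffle (taps 0-3 in m5, taps 4-7 in m6) plus
; paddw/phaddw yield the eight sums, rounded to prep scale via
; pmulhrsw with pw_8192, i.e. (x + 2) >> 2.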
%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
%if ARCH_X86_32
pshufb %2, %1, [base+subpel_h_shufB]
pshufb %3, %1, [base+subpel_h_shufC]
pshufb %1, [base+subpel_h_shufA]
%else
pshufb %2, %1, m11; subpel_h_shufB
pshufb %3, %1, m9 ; subpel_h_shufC
pshufb %1, m10 ; subpel_h_shufA
%endif
pmaddubsw %4, %2, m5 ; subpel +0 B0
pmaddubsw %2, m6 ; subpel +4 B4
pmaddubsw %3, m6 ; subpel +4 C4
pmaddubsw %1, m5 ; subpel +0 A0
paddw %3, %4
paddw %1, %2
phaddw %1, %3
pmulhrsw %1, m7 ; 8192
%endmacro
;
.h_w8:
%if ARCH_X86_32
mov r3, r2
%define base_reg r3
W32_RESTORE_SSQ
%endif
.h_w8_loop:
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
PREP_8TAP_H m0, m2, m3, m4
PREP_8TAP_H m1, m2, m3, m4
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 32
sub hd, 2
jg .h_w8_loop
RET
.h_w16:
xor r6d, r6d
jmp .h_start
.h_w32:
mov r6, -16*1
jmp .h_start
.h_w64:
mov r6, -16*3
jmp .h_start
.h_w128:
mov r6, -16*7
.h_start:
%if ARCH_X86_32
mov r3, r2
%define base_reg r3
%endif
sub srcq, r6
mov r5, r6
W32_RESTORE_SSQ
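; Column loop trick: r6 starts at -(w - 16) and srcq is biased by the
; same amount, so the inner loop below walks r6 up to 0 in 16-pixel
; steps (jle), and r5 restores it for the next row.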
.h_loop:
movu m0, [srcq+r6+8*0]
movu m1, [srcq+r6+8*1]
PREP_8TAP_H m0, m2, m3, m4
PREP_8TAP_H m1, m2, m3, m4
mova [tmpq+16*0], m0
mova [tmpq+16*1], m1
add tmpq, 32
add r6, 16
jle .h_loop
add srcq, strideq
mov r6, r5
dec hd
jg .h_loop
RET
%if ARCH_X86_32
%define base_reg r2
%endif
.v:
%if ARCH_X86_32
mov mxd, myd
and mxd, 0xff
%else
%assign stack_offset org_stack_offset
WIN64_SPILL_XMM 16
movzx mxd, myb
%endif
shr myd, 16
cmp hd, 4
cmovle myd, mxd
lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
mova m2, [base+pw_512]
psrlw m2, m2, 1 ; 0x0100
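; m2 = 0x0100 in every word, a pshufb control that broadcasts one
; two-tap pair of the vertical filter into all lanes for pmaddubsw.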
mova m7, [base+pw_8192]
%if ARCH_X86_32
%define subpel0 [rsp+mmsize*0]
%define subpel1 [rsp+mmsize*1]
%define subpel2 [rsp+mmsize*2]
%define subpel3 [rsp+mmsize*3]
%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
ALLOC_STACK -mmsize*4
%assign regs_used 7
movd m0, [myq+0]
pshufb m0, m2
mova subpel0, m0
movd m0, [myq+2]
pshufb m0, m2
mova subpel1, m0
movd m0, [myq+4]
pshufb m0, m2
mova subpel2, m0
movd m0, [myq+6]
pshufb m0, m2
mova subpel3, m0
mov strideq, [rstk+stack_offset+gprsize*3]
lea strideq, [strideq*3]
sub [rstk+stack_offset+gprsize*2], strideq
mov strideq, [rstk+stack_offset+gprsize*3]
mov srcq, [rstk+stack_offset+gprsize*2]
%else
%define subpel0 m8
%define subpel1 m9
%define subpel2 m10
%define subpel3 m11
movd subpel0, [myq+0]
pshufb subpel0, m2
movd subpel1, [myq+2]
pshufb subpel1, m2
movd subpel2, [myq+4]
pshufb subpel2, m2
movd subpel3, [myq+6]
pshufb subpel3, m2
lea stride3q, [strideq*3]
sub srcq, stride3q
cmp wd, 8
jg .v_w16
je .v_w8
%endif
.v_w4:
%if ARCH_X86_32
%if STACK_ALIGNMENT < mmsize
%define srcm [rsp+mmsize*4+gprsize*1]
%define tmpm [rsp+mmsize*4+gprsize*2]
%endif
mov tmpm, tmpq
mov srcm, srcq
lea r5d, [wq - 4] ; horizontal loop
shl r5d, (16 - 2) ; (wq / 4) << 16
mov r5w, hw
.v_w4_loop0:
%endif
movd m2, [srcq+strideq*0] ; 0
movhps m2, [srcq+strideq*2] ; 0 _ 2
movd m3, [srcq+strideq*1] ; 1
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
movhps m3, [srcq+strideq*1] ; 1 _ 3
lea srcq, [srcq+strideq*2]
%else
movhps m3, [srcq+stride3q ] ; 1 _ 3
lea srcq, [srcq+strideq*4]
%endif
pshufd m2, m2, q2020 ; 0 2 0 2
pshufd m3, m3, q2020 ; 1 3 1 3
punpckldq m2, m3 ; 0 1 2 3
movd m3, [srcq+strideq*0] ; 4
movd m1, [srcq+strideq*1] ; 5
movd m0, [srcq+strideq*2] ; 6
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
add srcq, strideq
%else
add srcq, stride3q
%endif
punpckldq m3, m1 ; 4 5 _ _
punpckldq m1, m0 ; 5 6 _ _
palignr m4, m3, m2, 4 ; 1 2 3 4
punpcklbw m3, m1 ; 45 56
punpcklbw m1, m2, m4 ; 01 12
punpckhbw m2, m4 ; 23 34
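; The prologue above leaves adjacent source rows interleaved bytewise
; (01 12 / 23 34 / 45 56), so each pmaddubsw in the loop below filters
; two output rows at once; the pipeline shifts down one pair per
; iteration as rows 7 and 8 are fetched.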
.v_w4_loop:
pmaddubsw m5, m1, subpel0 ; a0 b0
mova m1, m2
pmaddubsw m2, subpel1 ; a1 b1
paddw m5, m2
mova m2, m3
pmaddubsw m3, subpel2 ; a2 b2
paddw m5, m3
movd m4, [srcq+strideq*0]
punpckldq m3, m0, m4 ; 6 7 _ _
movd m0, [srcq+strideq*1]
lea srcq, [srcq+strideq*2]
punpckldq m4, m0 ; 7 8 _ _
punpcklbw m3, m4 ; 67 78
pmaddubsw m4, m3, subpel3 ; a3 b3
paddw m5, m4
pmulhrsw m5, m7
movq [tmpq+wq*0], m5
movhps [tmpq+wq*2], m5
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w4_loop
%if ARCH_X86_32
mov hw, r5w ; reset vertical loop
mov tmpq, tmpm
mov srcq, srcm
add tmpq, 8
add srcq, 4
mov tmpm, tmpq
mov srcm, srcq
sub r5d, 1<<16 ; horizontal--
jg .v_w4_loop0
%endif
RET
%if ARCH_X86_64
.v_w8:
.v_w16:
lea r5d, [wq - 8] ; horizontal loop
mov r8, tmpq
mov r6, srcq
shl r5d, 8 - 3; (wq / 8) << 8
mov r5b, hb
.v_w8_loop0:
movq m4, [srcq+strideq*0] ; 0
movq m5, [srcq+strideq*1] ; 1
lea srcq, [srcq+strideq*2]
movq m6, [srcq+strideq*0] ; 2
movq m0, [srcq+strideq*1] ; 3
lea srcq, [srcq+strideq*2]
movq m1, [srcq+strideq*0] ; 4
movq m2, [srcq+strideq*1] ; 5
lea srcq, [srcq+strideq*2]
movq m3, [srcq+strideq*0] ; 6
shufpd m4, m0, 0x0c
shufpd m5, m1, 0x0c
punpcklbw m1, m4, m5 ; 01
punpckhbw m4, m5 ; 34
shufpd m6, m2, 0x0c
punpcklbw m2, m5, m6 ; 12
punpckhbw m5, m6 ; 45
shufpd m0, m3, 0x0c
punpcklbw m3, m6, m0 ; 23
punpckhbw m6, m0 ; 56
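; Same two-rows-per-iteration scheme as .v_w4, but with full 8-pixel
; rows: shufpd packs rows n and n+3 per register, so the punpck{l,h}bw
; pairs above come out as 01/34, 12/45 and 23/56.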
.v_w8_loop:
movq m12, [srcq+strideq*1] ; 7
lea srcq, [srcq+strideq*2]
movq m13, [srcq+strideq*0] ; 8
pmaddubsw m14, m1, subpel0 ; a0
pmaddubsw m15, m2, subpel0 ; b0
mova m1, m3
mova m2, m4
pmaddubsw m3, subpel1 ; a1
pmaddubsw m4, subpel1 ; b1
paddw m14, m3
paddw m15, m4
mova m3, m5
mova m4, m6
pmaddubsw m5, subpel2 ; a2
pmaddubsw m6, subpel2 ; b2
paddw m14, m5
paddw m15, m6
shufpd m6, m0, m12, 0x0d
shufpd m0, m12, m13, 0x0c
punpcklbw m5, m6, m0 ; 67
punpckhbw m6, m0 ; 78
pmaddubsw m12, m5, subpel3 ; a3
pmaddubsw m13, m6, subpel3 ; b3
paddw m14, m12
paddw m15, m13
pmulhrsw m14, m7
pmulhrsw m15, m7
movu [tmpq+wq*0], xm14
movu [tmpq+wq*2], xm15
lea tmpq, [tmpq+wq*4]
sub hd, 2
jg .v_w8_loop
movzx hd, r5b ; reset vertical loop
add r8, 16
add r6, 8
mov tmpq, r8
mov srcq, r6
sub r5d, 1<<8 ; horizontal--
jg .v_w8_loop0
RET
%endif ;ARCH_X86_64
%undef subpel0
%undef subpel1
%undef subpel2
%undef subpel3
.hv:
%assign stack_offset org_stack_offset
cmp wd, 4
jg .hv_w8
and mxd, 0xff
movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
%if ARCH_X86_32
mov mxd, myd
and mxd, 0xff
shr myd, 16
cmp hd, 4
cmovle myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
mov r5, r2; use as new base
%define base_reg r5
%assign regs_used 2
ALLOC_STACK -mmsize*14
%assign regs_used 7
mov strideq, [rstk+stack_offset+gprsize*3]
lea strideq, [strideq*3 + 1]
sub [rstk+stack_offset+gprsize*2], strideq
mov strideq, [rstk+stack_offset+gprsize*3]
mov srcq, [rstk+stack_offset+gprsize*2]
%define subpelv0 [rsp+mmsize*0]
%define subpelv1 [rsp+mmsize*1]
%define subpelv2 [rsp+mmsize*2]
%define subpelv3 [rsp+mmsize*3]
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
pshufd m6, m0, q0000
mova subpelv0, m6
pshufd m6, m0, q1111
mova subpelv1, m6
pshufd m6, m0, q2222
mova subpelv2, m6
pshufd m6, m0, q3333
mova subpelv3, m6
%else
movzx mxd, myb
shr myd, 16
cmp hd, 4
cmovle myd, mxd
movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
ALLOC_STACK mmsize*14, 14
lea stride3q, [strideq*3]
sub srcq, stride3q
dec srcq
%define subpelv0 m10
%define subpelv1 m11
%define subpelv2 m12
%define subpelv3 m13
punpcklbw m0, m0
psraw m0, 8 ; sign-extend
mova m8, [base+pw_8192]
mova m9, [base+pd_32]
pshufd m10, m0, q0000
pshufd m11, m0, q1111
pshufd m12, m0, q2222
pshufd m13, m0, q3333
%endif
pshufd m7, m1, q0000
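; The hv path keeps 16-bit H results, so the vertical taps are widened
; to words (punpcklbw + psraw 8 above) and accumulated with pmaddwd in
; 32 bits; pd_32 + psrad 6 then implements the (x + 32) >> 6 rounding
; back to prep scale.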
.hv_w4:
%define hv4_line_0_0 4
%define hv4_line_0_1 5
%define hv4_line_0_2 6
%define hv4_line_0_3 7
%define hv4_line_0_4 8
%define hv4_line_0_5 9
%define hv4_line_1_0 10
%define hv4_line_1_1 11
%define hv4_line_1_2 12
%define hv4_line_1_3 13
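; hv_w4 runs two interleaved 4-pixel pipelines, one per subpel_h_shuf4
; half; the hv4_line_<bank>_<n> ids above name the spill slots that
; SAVELINE_W4/RESTORELINE_W4 (defined earlier in the file) use to park
; one pipeline's row registers while the other is being processed.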
;
;
%if ARCH_X86_32
%define w8192reg [base+pw_8192]
%define d32reg [base+pd_32]
%else
%define w8192reg m8
%define d32reg m9
%endif
; lower shuffle 0 1 2 3 4
mova m6, [base+subpel_h_shuf4]
movq m5, [srcq+strideq*0] ; 0 _ _ _
movhps m5, [srcq+strideq*1] ; 0 _ 1 _
movq m4, [srcq+strideq*2] ; 2 _ _ _
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
add srcq, strideq
movhps m4, [srcq+strideq*0] ; 2 _ 3 _
add srcq, strideq
%else
movhps m4, [srcq+stride3q ] ; 2 _ 3 _
lea srcq, [srcq+strideq*4]
%endif
pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
pmaddubsw m2, m7 ;H subpel_filters
pmaddubsw m0, m7 ;H subpel_filters
phaddw m2, m0 ;H 0 1 2 3
pmulhrsw m2, w8192reg ;H pw_8192
SAVELINE_W4 m2, 2, 0
; upper shuffle 2 3 4 5 6
mova m6, [base+subpel_h_shuf4+16]
pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
pmaddubsw m2, m7 ;H subpel_filters
pmaddubsw m0, m7 ;H subpel_filters
phaddw m2, m0 ;H 0 1 2 3
pmulhrsw m2, w8192reg ;H pw_8192
;
; lower shuffle
mova m6, [base+subpel_h_shuf4]
movq m5, [srcq+strideq*0] ; 4 _ _ _
movhps m5, [srcq+strideq*1] ; 4 _ 5 _
movq m4, [srcq+strideq*2] ; 6 _ _ _
pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
pmaddubsw m3, m7 ;H subpel_filters
pmaddubsw m0, m7 ;H subpel_filters
phaddw m3, m0 ;H 4 5 6 7
pmulhrsw m3, w8192reg ;H pw_8192
SAVELINE_W4 m3, 3, 0
; upper shuffle
mova m6, [base+subpel_h_shuf4+16]
pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
pmaddubsw m3, m7 ;H subpel_filters
pmaddubsw m0, m7 ;H subpel_filters
phaddw m3, m0 ;H 4 5 6 7
pmulhrsw m3, w8192reg ;H pw_8192
;
%if ARCH_X86_32
lea srcq, [srcq+strideq*2]
add srcq, strideq
%else
add srcq, stride3q
%endif
;process high
palignr m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
punpckhwd m2, m4 ; V 23 34
pshufd m0, m3, q2121;V 5 6 5 6
punpcklwd m3, m0 ; V 45 56
SAVELINE_W4 m0, 0, 1
SAVELINE_W4 m1, 1, 1
SAVELINE_W4 m2, 2, 1
SAVELINE_W4 m3, 3, 1
;process low
RESTORELINE_W4 m2, 2, 0
RESTORELINE_W4 m3, 3, 0
palignr m4, m3, m2, 4;V 1 2 3 4
punpcklwd m1, m2, m4 ; V 01 12
punpckhwd m2, m4 ; V 23 34
pshufd m0, m3, q2121;V 5 6 5 6
punpcklwd m3, m0 ; V 45 56
.hv_w4_loop:
;process low
pmaddwd m5, m1, subpelv0 ; V a0 b0
mova m1, m2
pmaddwd m2, subpelv1; V a1 b1
paddd m5, m2
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
;
mova m6, [base+subpel_h_shuf4]
movq m4, [srcq+strideq*0] ; 7
movhps m4, [srcq+strideq*1] ; 7 _ 8 _
pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
pmaddubsw m4, m7 ;H subpel_filters
phaddw m4, m4 ;H 7 8 7 8
pmulhrsw m4, w8192reg ;H pw_8192
palignr m3, m4, m0, 12 ; 6 7 8 7
mova m0, m4
punpcklwd m3, m4 ; 67 78
pmaddwd m4, m3, subpelv3; a3 b3
paddd m5, d32reg ; pd_32
paddd m5, m4
psrad m5, 6
SAVELINE_W4 m0, 0, 0
SAVELINE_W4 m1, 1, 0
SAVELINE_W4 m2, 2, 0
SAVELINE_W4 m3, 3, 0
SAVELINE_W4 m5, 5, 0
;process high
RESTORELINE_W4 m0, 0, 1
RESTORELINE_W4 m1, 1, 1
RESTORELINE_W4 m2, 2, 1
RESTORELINE_W4 m3, 3, 1
pmaddwd m5, m1, subpelv0; V a0 b0
mova m1, m2
pmaddwd m2, subpelv1; V a1 b1
paddd m5, m2
mova m2, m3
pmaddwd m3, subpelv2; V a2 b2
paddd m5, m3
;
mova m6, [base+subpel_h_shuf4+16]
movq m4, [srcq+strideq*0] ; 7
movhps m4, [srcq+strideq*1] ; 7 _ 8 _
pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
pmaddubsw m4, m7 ;H subpel_filters
phaddw m4, m4 ;H 7 8 7 8
pmulhrsw m4, w8192reg ;H pw_8192
palignr m3, m4, m0, 12 ; 6 7 8 7
mova m0, m4
punpcklwd m3, m4 ; 67 78
pmaddwd m4, m3, subpelv3; a3 b3
paddd m5, d32reg ; pd_32
paddd m5, m4
psrad m4, m5, 6
;
RESTORELINE_W4 m5, 5, 0
packssdw m5, m4
pshufd m5, m5, q3120
movu [tmpq], m5
lea srcq, [srcq+strideq*2]
add tmpq, 16
sub hd, 2
SAVELINE_W4 m0, 0, 1
SAVELINE_W4 m1, 1, 1
SAVELINE_W4 m2, 2, 1
SAVELINE_W4 m3, 3, 1
RESTORELINE_W4 m0, 0, 0
RESTORELINE_W4 m1, 1, 0
RESTORELINE_W4 m2, 2, 0
RESTORELINE_W4 m3, 3, 0
jg .hv_w4_loop