Commit ee58d65d authored by François Cartegnie, committed by Henrik Gramner

Add SSSE3 put_bilin

parent aaba9f8e
Pipeline #3874 passed with stages in 5 minutes and 11 seconds
@@ -38,6 +38,7 @@ decl_mc_fn(dav1d_put_8tap_sharp_avx2);
decl_mc_fn(dav1d_put_8tap_sharp_regular_avx2);
decl_mc_fn(dav1d_put_8tap_sharp_smooth_avx2);
decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
@@ -83,6 +84,8 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
return;
#if BITDEPTH == 8
init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3);
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
...
@@ -45,7 +45,10 @@ obmc_masks: db 0, 0, 0, 0
db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
pb_64: times 16 db 64
pw_8: times 8 dw 8
@@ -76,10 +79,650 @@ BIDIR_JMP_TABLE blend_ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v_ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
%xdefine %%base %1_%2
%%table:
%rep %0 - 2
dw %%base %+ _w%3 - %%base
%rotate 1
%endrep
%endmacro
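; each table entry is the 16-bit distance from the .put entry point to the
; matching .put_wN branch; the table symbol is biased by the smallest width
; so that indexing with tzcnt(w)*2 lands on the first entry, and the caller
; rebuilds the branch address as base (t0) + entry before jumping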
%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
BASE_JMP_TABLE put, ssse3, 2, 4, 8, 16, 32, 64, 128
%macro HV_JMP_TABLE 5-*
%xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
%xdefine %%base %1_%3
%assign %%types %4
%if %%types & 1
%xdefine %1_%2_h_%3_table (%%h - %5)
%%h:
%rep %0 - 4
dw %%prefix %+ .h_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 2
%xdefine %1_%2_v_%3_table (%%v - %5)
%%v:
%rep %0 - 4
dw %%prefix %+ .v_w%5 - %%base
%rotate 1
%endrep
%rotate 4
%endif
%if %%types & 4
%xdefine %1_%2_hv_%3_table (%%hv - %5)
%%hv:
%rep %0 - 4
dw %%prefix %+ .hv_w%5 - %%base
%rotate 1
%endrep
%endif
%endmacro
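; %4 is a bitmask selecting which tables to emit (1 = .h, 2 = .v, 4 = .hv);
; every entry is again a 16-bit offset of the width-specific label relative
; to the shared put_ssse3 base, and the %rotate bookkeeping restores the
; argument list between the three tables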
HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
SECTION .text
INIT_XMM ssse3
%if ARCH_X86_32
DECLARE_REG_TMP 1
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
%define base t0-put_ssse3
%else
DECLARE_REG_TMP 7
%define base 0
cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
%endif
;
%macro RESTORE_DSQ_32 1
%if ARCH_X86_32
mov %1, dsm ; restore dsq
%endif
%endmacro
;
movifnidn mxyd, r6m ; mx
LEA t0, put_ssse3
tzcnt wd, wm
mov hd, hm
test mxyd, mxyd
jnz .h
mov mxyd, r7m ; my
test mxyd, mxyd
jnz .v
.put:
movzx wd, word [t0+wq*2+table_offset(put,)]
add wq, t0
lea r6, [ssq*3]
RESTORE_DSQ_32 t0
jmp wq
.put_w2:
movzx r4d, word [srcq+ssq*0]
movzx r6d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r4w
mov [dstq+dsq*1], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w2
RET
.put_w4:
mov r4d, [srcq+ssq*0]
mov r6d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
mov [dstq+dsq*0], r4d
mov [dstq+dsq*1], r6d
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w4
RET
.put_w8:
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movq [dstq+dsq*0], m0
movq [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w8
RET
.put_w16:
lea r4, [dsq*3]
.put_w16_in:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
movu m2, [srcq+ssq*2]
movu m3, [srcq+r6 ]
lea srcq, [srcq+ssq*4]
mova [dstq+dsq*0], m0
mova [dstq+dsq*1], m1
mova [dstq+dsq*2], m2
mova [dstq+r4 ], m3
lea dstq, [dstq+dsq*4]
sub hd, 4
jg .put_w16_in
RET
.put_w32:
movu m0, [srcq+ssq*0+16*0]
movu m1, [srcq+ssq*0+16*1]
movu m2, [srcq+ssq*1+16*0]
movu m3, [srcq+ssq*1+16*1]
lea srcq, [srcq+ssq*2]
mova [dstq+dsq*0+16*0], m0
mova [dstq+dsq*0+16*1], m1
mova [dstq+dsq*1+16*0], m2
mova [dstq+dsq*1+16*1], m3
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .put_w32
RET
.put_w64:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
add srcq, ssq
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
add dstq, dsq
dec hd
jg .put_w64
RET
.put_w128:
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
movu m3, [srcq+16*3]
mova [dstq+16*0], m0
mova [dstq+16*1], m1
mova [dstq+16*2], m2
mova [dstq+16*3], m3
movu m0, [srcq+16*4]
movu m1, [srcq+16*5]
movu m2, [srcq+16*6]
movu m3, [srcq+16*7]
mova [dstq+16*4], m0
mova [dstq+16*5], m1
mova [dstq+16*6], m2
mova [dstq+16*7], m3
add srcq, ssq
add dstq, dsq
dec hd
jg .put_w128
RET
.h:
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
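; both taps are packed into one coefficient word per pixel so a single
; pmaddubsw does the whole filter: mx*0xff01 + (16 << 8) = ((16-mx) << 8) | mx
; in the low word (e.g. mx=1: 0xff01 + 0x1000 -> 0x0f01 = 15,1), and the
; bilin_h_shuf* masks pair the source bytes as (src[x+1], src[x]);
; pmulhrsw with pw_2048 then performs the final (x + 8) >> 4 rounding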
imul mxyd, 0xff01
mova m4, [base+bilin_h_shuf8]
mova m0, [base+bilin_h_shuf4]
WIN64_SPILL_XMM 7
add mxyd, 16 << 8
movd m5, mxyd
mov mxyd, r7m ; my
pshuflw m5, m5, q0000
punpcklqdq m5, m5
test mxyd, mxyd
jnz .hv
movzx wd, word [t0+wq*2+table_offset(put, _bilin_h)]
mova m6, [base+pw_2048]
add wq, t0
RESTORE_DSQ_32 t0
jmp wq
.h_w2:
pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
.h_w2_loop:
movd m0, [srcq+ssq*0]
movd m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpckldq m0, m1
pshufb m0, m4
pmaddubsw m0, m5
pmulhrsw m0, m6
packuswb m0, m0
movd r6d, m0
mov [dstq+dsq*0], r6w
shr r6d, 16
mov [dstq+dsq*1], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w2_loop
RET
.h_w4:
movq m4, [srcq+ssq*0]
movhps m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m4, m0
pmaddubsw m4, m5
pmulhrsw m4, m6
packuswb m4, m4
movd [dstq+dsq*0], m4
pshufd m4, m4, q0101
movd [dstq+dsq*1], m4
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w4
RET
.h_w8:
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
packuswb m0, m1
movq [dstq+dsq*0], m0
movhps [dstq+dsq*1], m0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .h_w8
RET
.h_w16:
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
add srcq, ssq
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
dec hd
jg .h_w16
RET
.h_w32:
movu m0, [srcq+mmsize*0+8*0]
movu m1, [srcq+mmsize*0+8*1]
movu m2, [srcq+mmsize*1+8*0]
movu m3, [srcq+mmsize*1+8*1]
add srcq, ssq
pshufb m0, m4
pshufb m1, m4
pshufb m2, m4
pshufb m3, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmaddubsw m2, m5
pmaddubsw m3, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
pmulhrsw m2, m6
pmulhrsw m3, m6
packuswb m0, m1
packuswb m2, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m2
add dstq, dsq
dec hd
jg .h_w32
RET
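; w64/w128 walk each row with a negative offset: r6 runs from -16*3 (w64)
; or -16*7 (w128) up to 0 in 16-byte steps, so one loop body covers every
; chunk and the row is done exactly when r6 reaches zero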
.h_w64:
mov r6, -16*3
.h_w64_loop:
movu m0, [srcq+r6+16*3+8*0]
movu m1, [srcq+r6+16*3+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
packuswb m0, m1
mova [dstq+r6+16*3], m0
add r6, 16
jle .h_w64_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w64
RET
.h_w128:
mov r6, -16*7
.h_w128_loop:
movu m0, [srcq+r6+16*7+8*0]
movu m1, [srcq+r6+16*7+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
pmulhrsw m0, m6
pmulhrsw m1, m6
packuswb m0, m1
mova [dstq+r6+16*7], m0
add r6, 16
jle .h_w128_loop
add srcq, ssq
add dstq, dsq
dec hd
jg .h_w128
RET
.v:
movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
imul mxyd, 0xff01
mova m7, [base+pw_2048]
add mxyd, 16 << 8
add wq, t0
movd m6, mxyd
pshuflw m6, m6, q0000
punpcklqdq m6, m6
RESTORE_DSQ_32 t0
jmp wq
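; each output row blends the incoming source row with the previous one:
; punpcklbw pairs the bytes as (row[y+1], row[y]) so one pmaddubsw with the
; broadcast ((16-my) << 8) | my word yields my*row[y+1] + (16-my)*row[y],
; rounded by pmulhrsw with pw_2048 as in the horizontal path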
.v_w2:
movd m0, [srcq+ssq*0]
.v_w2_loop:
pinsrw m0, [srcq+ssq*1], 1 ; 0 1
lea srcq, [srcq+ssq*2]
pshuflw m2, m0, q2301
pinsrw m0, [srcq+ssq*0], 0 ; 2 1
punpcklbw m1, m0, m2
pmaddubsw m1, m6
pmulhrsw m1, m7
packuswb m1, m1
movd r6d, m1
mov [dstq+dsq*1], r6w
shr r6d, 16
mov [dstq+dsq*0], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w2_loop
RET
.v_w4:
movd m0, [srcq+ssq*0]
.v_w4_loop:
movd m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpckldq m2, m0, m1 ; 0 1
movd m0, [srcq+ssq*0]
punpckldq m1, m0 ; 1 2
punpcklbw m1, m2
pmaddubsw m1, m6
pmulhrsw m1, m7
packuswb m1, m1
movd [dstq+dsq*0], m1
psrlq m1, 32
movd [dstq+dsq*1], m1
;
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w4_loop
RET
.v_w8:
movq m0, [srcq+ssq*0]
.v_w8_loop:
movddup m2, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklqdq m3, m0, m2 ; 0 1 m2qh:m0ql
movddup m0, [srcq+ssq*0]
punpcklqdq m4, m2, m0 ; 1 2 m0qh:m2ql
punpcklbw m1, m4, m3
punpckhbw m4, m3
pmaddubsw m1, m6
pmaddubsw m4, m6
pmulhrsw m1, m7
pmulhrsw m4, m7
packuswb m1, m4
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .v_w8_loop
RET
;
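; filters one 16-pixel-wide column, two output rows per iteration, keeping
; the most recent source row in m0 so every row is loaded only once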
%macro PUT_BILIN_V_W16 0
movu m0, [srcq+ssq*0]
%%loop:
movu m4, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
punpcklbw m1, m4, m0
punpckhbw m3, m4, m0
movu m0, [srcq+ssq*0]
punpcklbw m2, m0, m4
pmaddubsw m1, m6
pmaddubsw m3, m6
pmulhrsw m1, m7
pmulhrsw m3, m7
packuswb m1, m3
mova [dstq+dsq*0], m1
punpckhbw m3, m0, m4
pmaddubsw m2, m6
pmaddubsw m3, m6
pmulhrsw m2, m7
pmulhrsw m3, m7
packuswb m2, m3
mova [dstq+dsq*1], m2
lea dstq, [dstq+dsq*2]
sub hd, 2
jg %%loop
%endmacro
;
.v_w16:
PUT_BILIN_V_W16
RET
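; the wider sizes reuse the 16-wide column loop: t0d carries the height in
; its low word and the number of additional 16-pixel columns in its high
; word (1/3/7 for w32/w64/w128); after each column the height is reloaded
; from t0w, src/dst are rewound to the next column, and the high word is
; decremented until it reaches zero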
.v_w16gt:
mov r4, dstq
mov r6, srcq
.v_w16gt_loop:
%if ARCH_X86_32
mov bakm, t0q
RESTORE_DSQ_32 t0
PUT_BILIN_V_W16
mov t0q, bakm
%else
PUT_BILIN_V_W16
%endif
mov hw, t0w
add r4, mmsize
add r6, mmsize
mov dstq, r4
mov srcq, r6
sub t0d, 1<<16
jg .v_w16gt
RET
.v_w32:
lea t0d, [hq+(1<<16)]
jmp .v_w16gt
.v_w64:
lea t0d, [hq+(3<<16)]
jmp .v_w16gt
.v_w128:
lea t0d, [hq+(7<<16)]
jmp .v_w16gt
.hv:
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
shl mxyd, 11 ; can't shift by 12 due to signed overflow
mova m7, [base+pw_2048]
movd m6, mxyd
add wq, t0
pshuflw m6, m6, q0000
punpcklqdq m6, m6
jmp wq
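; each .hv_wN path h-filters the first row before its loop and carries the
; 16x-scaled result across iterations; the vertical pass then computes
; (my * diff) >> 4 as ((2*diff) * (my << 11)) >> 16 via paddw+pmulhw
; (my << 12 would no longer fit in a signed word), adds the earlier row
; back and lets pmulhrsw with pw_2048 do the final (x + 8) >> 4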
.hv_w2:
RESTORE_DSQ_32 t0
movd m0, [srcq+ssq*0]
pshufd m0, m0, q0000 ; src[x - src_stride]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w2_loop:
movd m1, [srcq+ssq*1] ; src[x]
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0] ; src[x + src_stride]
pshufd m1, m1, q3120
pshufb m1, m4
pmaddubsw m1, m5 ; 1 _ 2 _
shufps m2, m0, m1, q1032 ; 0 _ 1 _
mova m0, m1
psubw m1, m2 ; src[x + src_stride] - src[x]
paddw m1, m1
pmulhw m1, m6 ; (my * (src[x + src_stride] - src[x])) >> 4
paddw m1, m2 ; src[x] + ((my * (src[x + src_stride] - src[x])) >> 4)
pmulhrsw m1, m7
packuswb m1, m1
pshuflw m1, m1, q2020
movd r6d, m1
mov [dstq+dsq*0], r6w
shr r6d, 16
mov [dstq+dsq*1], r6w
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w2_loop
RET
.hv_w4:
mova m4, [base+bilin_h_shuf4]
RESTORE_DSQ_32 t0
movddup xm0, [srcq+ssq*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w4_loop:
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
movhps m1, [srcq+ssq*0]
pshufb m1, m4
pmaddubsw m1, m5 ; 1 2
shufps m2, m0, m1, q1032 ; 0 1
mova m0, m1
psubw m1, m2
paddw m1, m1
pmulhw m1, m6
paddw m1, m2
pmulhrsw m1, m7
packuswb m1, m1
movd [dstq+dsq*0], m1
psrlq m1, 32
movd [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w4_loop
RET
.hv_w8:
RESTORE_DSQ_32 t0
movu m0, [srcq+ssq*0+8*0]
pshufb m0, m4
pmaddubsw m0, m5
.hv_w8_loop:
movu m2, [srcq+ssq*1+8*0]
lea srcq, [srcq+ssq*2]
movu m3, [srcq+ssq*0+8*0]
pshufb m2, m4
pshufb m3, m4
pmaddubsw m2, m5
psubw m1, m2, m0
paddw m1, m1
pmulhw m1, m6
paddw m1, m0
pmaddubsw m0, m3, m5
psubw m3, m0, m2
paddw m3, m3
pmulhw m3, m6
paddw m3, m2
pmulhrsw m1, m7
pmulhrsw m3, m7
packuswb m1, m3
movq [dstq+dsq*0], m1
movhps [dstq+dsq*1], m1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .hv_w8_loop
RET
;
; 32bit has ssq, dsq free
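; handles one 16-pixel-wide column; the already-rounded words for the upper
; eight pixels are parked in m3back until the lower half is ready to be
; packed with them: on x86_32 they are spilled to the destination row that
; is about to be overwritten, on x86_64 they sit in m8 (saved to and
; restored from a stack slot around the loop on WIN64)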
%macro PUT_BILIN_HV_W16 0
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m5
%if WIN64
movaps r4m, xmm8
%endif
%%loop:
%if ARCH_X86_32
%define m3back [dstq]
%define dsqval dsm
%else
%define m3back m8
%define dsqval dsq
%endif
add srcq, ssq
movu m2, [srcq+8*1]
pshufb m2, m4
pmaddubsw m2, m5
psubw m3, m2, m1
paddw m3, m3
pmulhw m3, m6
paddw m3, m1
mova m1, m2
pmulhrsw m3, m7
mova m3back, m3
movu m2, [srcq+8*0]
pshufb m2, m4
pmaddubsw m2, m5
psubw m3, m2, m0
paddw m3, m3
pmulhw m3, m6
paddw m3, m0
mova m0, m2
pmulhrsw m3, m7
packuswb m3, m3back
mova [dstq], m3
add dstq, dsqval
dec hd
jg %%loop
%if WIN64
movaps xmm8, r4m
%endif
%undef m3back
%undef dsqval
%endmacro
;
.hv_w16:
PUT_BILIN_HV_W16
RET
.hv_w16gt:
mov r4, dstq
mov r6, srcq
.hv_w16gt_loop:
PUT_BILIN_HV_W16
mov hw, t0w
add r4, mmsize
add r6, mmsize
mov dstq, r4
mov srcq, r6
sub t0d, 1<<16
jg .hv_w16gt_loop
RET
.hv_w32:
lea t0d, [hq+(1<<16)]
jmp .hv_w16gt
.hv_w64:
lea t0d, [hq+(3<<16)]
jmp .hv_w16gt
.hv_w128:
lea t0d, [hq+(7<<16)]
jmp .hv_w16gt
%if WIN64
DECLARE_REG_TMP 6, 4
%else
...