Commit b3da18fe authored by François Cartegnie

add SSSE3 w_mask_420

parent 00d1f4d5
@@ -57,6 +57,7 @@ decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_w_mask_fn(dav1d_w_mask_420_ssse3);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_dir_fn(dav1d_blend_v_avx2);
decl_blend_dir_fn(dav1d_blend_h_avx2);
@@ -81,6 +82,7 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
c->w_mask[2] = dav1d_w_mask_420_ssse3;
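/* w_mask[2] selects the 4:2:0 variant (presumably [0] = 4:4:4 and [1] = 4:2:2) */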
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
...
@@ -29,6 +29,9 @@
SECTION_RODATA 16
pw_8: times 8 dw 8
pw_26: times 8 dw 26
pw_258: times 8 dw 258
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
@@ -48,6 +51,7 @@ pw_2048: times 8 dw 2048
BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420_ssse3, 4, 8, 16, 16, 16, 16
SECTION .text
@@ -251,3 +255,175 @@ cglobal mask, 4, 7, 7, dst, stride, tmp1, tmp2, w, mask, stride3
mov maskq, r6m
BIDIR_FN MASK
%undef hd %undef hd
%if ARCH_X86_64
%define reg_pw_8 m8
%define reg_pw_27 m9
%define reg_pw_2048 m10
%else
%define reg_pw_8 [pw_8]
%define reg_pw_27 [pw_26] ; 64 - 38
%define reg_pw_2048 [pw_2048]
%endif
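; Rough scalar equivalent of W_MASK_420_B below, for 8 bpc (a sketch, not the
; verbatim C reference):
;   m      = min(38 + ((abs(tmp1[x] - tmp2[x]) + 8) >> 8), 64)
;   dst[x] = clip_pixel((tmp1[x] * m + tmp2[x] * (64 - m) + 512) >> 10)
; The macro folds this into tmp1 + ((tmp2 - tmp1) * (64 - m) >> 6), then a
; rounding >> 4 via pmulhrsw with 2048. The (64 - m) words are kept so the
; callers can build the 2x-downsampled 4:2:0 mask.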
%macro W_MASK_420_B 2 ; src_offset in bytes, mask_out
;**** do m0 = u16.dst[7..0], m%2 = u16.m[7..0] ****
mova m0, [tmp1q+(%1)]
mova m1, [tmp2q+(%1)]
psubw m1, m0 ; tmp2 - tmp1
pabsw m3, m1 ; abs(tmp1 - tmp2)
paddw m3, reg_pw_8 ; abs(tmp1 - tmp2) + 8
psrlw m3, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
psubusw m%2, reg_pw_27, m3 ; 64 - min(m, 64)
psllw m2, m%2, 10 ; (64 - m) << 10
pmulhw m1, m2 ; (tmp2 - tmp1) * (64 - m) >> 6
paddw m0, m1 ; tmp1 + ((tmp2 - tmp1) * (64 - m) >> 6)
;**** do m1 = u16.dst[15..8], merge u16.m[15..8] into m%2 ****
mova m1, [tmp1q+(%1)+mmsize]
mova m2, [tmp2q+(%1)+mmsize]
psubw m2, m1 ; tmp2 - tmp1
pabsw m7, m2 ; abs(tmp1 - tmp2)
paddw m7, reg_pw_8 ; abs(tmp1 - tmp2) + 8
psrlw m7, 8 ; (abs(tmp1 - tmp2) + 8) >> 8
psubusw m3, reg_pw_27, m7 ; 64 - min(m, 64)
phaddw m%2, m3 ; horizontal pair sums of (64 - m) for all 16 pixels
psllw m3, 10 ; (64 - m) << 10
pmulhw m2, m3 ; (tmp2 - tmp1) * (64 - m) >> 6
paddw m1, m2 ; tmp1 + ((tmp2 - tmp1) * (64 - m) >> 6)
;********
pmulhrsw m0, reg_pw_2048 ; round/scale 2048
pmulhrsw m1, reg_pw_2048 ; round/scale 2048
packuswb m0, m1 ; concat m0 = u8.dst[15..0]
%endmacro
%macro W_MASK_420 2
W_MASK_420_B (%1*16), %2
%endmacro
%if ARCH_X86_64
; args: dst, stride, tmp1, tmp2, w, h, mask, sign
cglobal w_mask_420, 4, 9, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
lea r7, [w_mask_420_ssse3_table]
mov wd, wm
tzcnt r8d, wd
movifnidn hd, hm
mov maskq, maskmp
movd m0, r7m
pshuflw m0, m0, q0000 ; sign
punpcklqdq m0, m0
movsxd r8, dword [r7+r8*4]
mova reg_pw_8, [pw_8]
mova reg_pw_27, [pw_26] ; 64 - 38
mova reg_pw_2048, [pw_2048]
mova m6, [pw_258] ; 64 * 4 + 2
psubw m6, m0
add r8, r7
W_MASK_420 0, 4
lea stride3q, [strideq*3]
jmp r8
%define dst_bak r8
%define loop_w r7
%define orig_w wq
%else
cglobal w_mask_420, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask, stride3
tzcnt r6d, r4m
mov wd, w_mask_420_ssse3_table
add wd, [wq+r6*4]
mov maskq, r6mp
movd m0, r7m
pshuflw m0, m0, q0000 ; sign
punpcklqdq m0, m0
mova m6, [pw_258] ; 64 * 4 + 2
psubw m6, m0
W_MASK_420 0, 4
lea stride3q, [strideq*3]
jmp wd
%define dst_bak r0m
%define loop_w r6q
%define orig_w r4m
%define hd dword r5m
%endif
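; All width paths below derive the 4:2:0 mask the same way: per 2x2 luma
; block, mask = (m00 + m01 + m10 + m11 + 2 - sign) >> 2 (a sketch of the
; intended math). With horizontal pair sums of (64 - m) already in the
; mask_out registers, this becomes
; ((64 * 4 + 2 - sign) - both rows' pair sums) >> 2, using m6 == 258 - sign
; as set up above.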
.w4_loop:
add tmp1q, 2*16
add tmp2q, 2*16
W_MASK_420 0, 4
lea dstq, [dstq+strideq*4]
add maskq, 4
.w4:
movd [dstq ], m0 ; copy m0[0]
pshuflw m1, m0, q1032
movd [dstq+strideq*1], m1 ; copy m0[1]
punpckhqdq m0, m0
movd [dstq+strideq*2], m0 ; copy m0[2]
psrlq m0, 32
movd [dstq+stride3q ], m0 ; copy m0[3]
pshufd m5, m4, q3131 ; DBDB: rows 1/3 pair sums, repeated
pshufd m4, m4, q2020 ; CACA: rows 0/2 pair sums, repeated
psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
psubw m1, m5 ; 2 - sign + 2x2 sums of m
psrlw m1, 2 ; >> 2
packuswb m1, m1
movd [maskq], m1
sub hd, 4
jg .w4_loop
RET
.w8_loop:
add tmp1q, 2*16
add tmp2q, 2*16
W_MASK_420 0, 4
lea dstq, [dstq+strideq*2]
add maskq, 4
.w8:
movq [dstq ], m0
movhps [dstq+strideq*1], m0
pshufd m1, m4, q3232 ; second row's (64 - m) pair sums
psubw m0, m6, m4 ; m6 == 64 * 4 + 2 - sign
psubw m0, m1 ; 2 - sign + 2x2 sums of m
psrlw m0, 2
packuswb m0, m0
movd [maskq], m0
sub hd, 2
jg .w8_loop
RET
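; Widths 16/32/64/128 all use .w16: each inner iteration blends 16 pixels of
; the current row (W_MASK_420_B 0) and the 16 pixels one row below
; (W_MASK_420_B wq*2, one tmp row further), writes both dst rows plus 8 chroma
; mask bytes, and the outer loop then skips the already-consumed second tmp
; row and steps dst down by two lines.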
.w16: ; w32/64/128
mov dst_bak, dstq
mov loop_w, orig_w ; use width as counter
%if ARCH_X86_32
mov wq, orig_w ; because we altered it in 32bit setup
%endif
jmp .w16ge_inner_loop_first
.w16ge_loop:
lea tmp1q, [tmp1q+wq*2] ; skip the second row (already blended above)
lea tmp2q, [tmp2q+wq*2] ; skip the second row (already blended above)
lea dstq, [dstq+strideq*2]
mov dst_bak, dstq
mov loop_w, orig_w
.w16ge_inner_loop:
W_MASK_420_B 0, 4
.w16ge_inner_loop_first:
mova [dstq ], m0
W_MASK_420_B wq*2, 5 ; blend the row below (offset = w * 2 bytes = one tmp row)
mova [dstq+strideq*1], m0
psubw m1, m6, m4 ; m6 == 64 * 4 + 2 - sign
psubw m1, m5 ; subtract the second row's (64 - m) pair sums
psrlw m1, 2 ; >> 2
packuswb m1, m1
movq [maskq], m1
add tmp1q, 2*16
add tmp2q, 2*16
add maskq, 8
add dstq, 16
sub loop_w, 16
jg .w16ge_inner_loop
mov dstq, dst_bak
sub hd, 2
jg .w16ge_loop
RET
%undef reg_pw_8
%undef reg_pw_27
%undef reg_pw_2048
%undef dst_bak
%undef loop_w
%undef orig_w
%undef hd