Commit 6044a1ae authored by François Cartegnie

add SSSE3 avg/w_avg/mask

Adaptation of the AVX2 code
parent 08184fd8
@@ -121,6 +121,7 @@ if is_asm_enabled
'x86/loopfilter.asm',
'x86/looprestoration.asm',
'x86/mc.asm',
'x86/mc_ssse3.asm',
)
# Compile the ASM sources with NASM
@@ -51,8 +51,11 @@ decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_avg_fn(dav1d_avg_avx2);
decl_avg_fn(dav1d_avg_ssse3);
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_w_avg_fn(dav1d_w_avg_ssse3);
decl_mask_fn(dav1d_mask_avx2);
decl_mask_fn(dav1d_mask_ssse3);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
decl_blend_fn(dav1d_blend_avx2);
decl_blend_dir_fn(dav1d_blend_v_avx2);
@@ -70,7 +73,18 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->mct[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3))
return;
#if BITDEPTH == 8 && ARCH_X86_64
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
c->mask = dav1d_mask_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2))
return;
#if BITDEPTH == 8 && ARCH_X86_64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; Copyright © 2018, VideoLabs
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 16
pw_1024: times 8 dw 1024
pw_2048: times 8 dw 2048
%macro BIDIR_JMP_TABLE 1-*
; the base is biased so that indexing with tzcnt(w)*4 lands on the entry
; for the smallest width (%2)
%xdefine %1_table (%%table - 2*%2)
%xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%%table:
%rep %0 - 1 ; one entry per width argument
dd %%prefix %+ .w%2 - %%base
%rotate 1
%endrep
%endmacro
BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_ssse3, 4, 8, 16, 32, 64, 128
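; For reference, "BIDIR_JMP_TABLE avg_ssse3, 4, 8, 16, 32, 64, 128" expands
; to roughly:
;   %xdefine avg_ssse3_table (%%table - 8)
;   %%table: dd dav1d_avg_ssse3.w4   - avg_ssse3_table
;            dd dav1d_avg_ssse3.w8   - avg_ssse3_table
;            ...
;            dd dav1d_avg_ssse3.w128 - avg_ssse3_table
; so "movsxd wq, dword [r6+wq*4]" with wq == tzcnt(w) picks the .wN branch.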
SECTION .text
INIT_XMM ssse3
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif
%macro BIDIR_FN 1 ; op
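; shared store dispatch: the op macro (%1) computes the first 16 output
; pixels into m0, then we jump to the width-specific store loop whose
; address was loaded from the BIDIR_JMP_TABLE into wq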
%1 0
lea stride3q, [strideq*3]
jmp wq
.w4_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq*4]
.w4: ; four 4-pixel rows per iteration, one dword each
movd [dstq ], m0 ; store dword 0 (row 0)
pshuflw m1, m0, q1032 ; move dword 1 down into the low dword
movd [dstq+strideq*1], m1 ; store dword 1 (row 1)
punpckhqdq m0, m0 ; bring dwords 2-3 into the low qword
movd [dstq+strideq*2], m0 ; store dword 2 (row 2)
psrlq m0, 32 ; shift dword 3 down into the low dword
movd [dstq+stride3q ], m0 ; store dword 3 (row 3)
sub hd, 4
jg .w4_loop
RET
.w8_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq*2]
.w8:
movq [dstq ], m0
movhps [dstq+strideq*1], m0
sub hd, 2
jg .w8_loop
RET
.w16_loop:
%1_INC_PTR 2
%1 0
lea dstq, [dstq+strideq]
.w16:
mova [dstq ], m0
dec hd
jg .w16_loop
RET
.w32_loop:
%1_INC_PTR 4
%1 0
lea dstq, [dstq+strideq]
.w32:
mova [dstq ], m0
%1 2
mova [dstq + 16 ], m0
dec hd
jg .w32_loop
RET
.w64_loop:
%1_INC_PTR 8
%1 0
add dstq, strideq
.w64:
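; each row is 64 pixels == 4 xmm stores; each store of m0 is interleaved
; with computing the next 16 pixels so m0 is ready for the next store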
%assign i 0
%rep 4
mova [dstq + i*16 ], m0
%assign i i+1
%if i < 4
%1 2*i
%endif
%endrep
dec hd
jg .w64_loop
RET
.w128_loop:
%1_INC_PTR 16
%1 0
add dstq, strideq
.w128:
%assign i 0
%rep 8
mova [dstq + i*16 ], m0
%assign i i+1
%if i < 8
%1 2*i
%endif
%endrep
dec hd
jg .w128_loop
RET
%endmacro
%macro AVG 1 ; src_offset
; averages the int16 intermediates from tmp1 and tmp2 into uint8 pixels
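; the net operation is (a + b + 16) >> 5: pmulhrsw with pw_1024 computes
; (x * 1024 + (1 << 14)) >> 15 == (x + 16) >> 5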
mova m0, [tmp1q+(%1+0)*mmsize] ; load 8 int16 coefs from tmp1
paddw m0, [tmp2q+(%1+0)*mmsize] ; add the matching 8 coefs from tmp2
mova m1, [tmp1q+(%1+1)*mmsize]
paddw m1, [tmp2q+(%1+1)*mmsize]
pmulhrsw m0, m2
pmulhrsw m1, m2
packuswb m0, m1 ; pack the 16 words of m0/m1 to bytes with unsigned saturation
%endmacro
%macro AVG_INC_PTR 1
add tmp1q, %1*mmsize
add tmp2q, %1*mmsize
%endmacro
cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
lea r6, [avg_ssse3_table]
tzcnt wd, wm ; trailing zero count: wd = log2(w)
movifnidn hd, hm ; load h from the stack unless it is already in a register
movsxd wq, dword [r6+wq*4] ; sign-extend the jump table entry for this width into wq
mova m2, [pw_1024+r6-avg_ssse3_table] ; rounding multiplier for pmulhrsw
add wq, r6
BIDIR_FN AVG
%macro W_AVG 1 ; src_offset
; (a * weight + b * (16 - weight) + 128) >> 8
; = ((a - b) * weight + (b << 4) + 128) >> 8
; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
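; the trailing "+ b + 8) >> 4" maps to the paddw below plus pmulhrsw with
; pw_2048, since (x * 2048 + (1 << 14)) >> 15 == (x + 8) >> 4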
mova m0, [tmp2q+(%1+0)*mmsize]
psubw m2, m0, [tmp1q+(%1+0)*mmsize]
mova m1, [tmp2q+(%1+1)*mmsize]
psubw m3, m1, [tmp1q+(%1+1)*mmsize]
paddw m2, m2 ; compensate for the weight only being half
paddw m3, m3 ; of what it should be
pmulhw m2, m4 ; (b-a) * (-weight << 12)
pmulhw m3, m4 ; (b-a) * (-weight << 12)
paddw m0, m2 ; ((b-a) * -weight) + b
paddw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
%endmacro
%define W_AVG_INC_PTR AVG_INC_PTR
cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
lea r6, [w_avg_ssse3_table]
tzcnt wd, wm
movifnidn hd, hm
movd m0, r6m ; weight argument from the stack
pshuflw m0, m0, q0000 ; broadcast weight to the 4 low words...
punpcklqdq m0, m0 ; ...and then to all 8 words
movsxd wq, dword [r6+wq*4]
pxor m4, m4
psllw m0, 11 ; can't shift by 12, sign bit must be preserved
psubw m4, m0
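; sanity check with weight == 8 (equal weights): m4 == -(8 << 11), so in
; W_AVG ((b - a) << 1) * m4 >> 16 == (a - b) >> 1, the sum becomes
; b + (a - b)/2 == (a + b)/2, and pmulhrsw by 2048 rounds that to
; ((a + b)/2 + 8) >> 4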
mova m5, [pw_2048+r6-w_avg_ssse3_table]
add wq, r6
BIDIR_FN W_AVG
%macro MASK 1 ; src_offset
; (a * m + b * (64 - m) + 512) >> 10
; = ((a - b) * m + (b << 6) + 512) >> 10
; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
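; e.g. m == 64 (all weight on a): ((b - a) * (-64 << 10)) >> 16 == a - b,
; so the sum is b + (a - b) == a, and the final pmulhrsw by 2048 yields
; (a + 8) >> 4 == (a*64 + b*0 + 512) >> 10, matching the formula above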
mova m3, [maskq+(%1+0)*(mmsize/2)]
mova m0, [tmp2q+(%1+0)*mmsize] ; b
psubw m1, m0, [tmp1q+(%1+0)*mmsize] ; b - a
mova m6, m3 ; m
psubb m3, m4, m6 ; -m
paddw m1, m1 ; (b - a) << 1
paddb m3, m3 ; -m << 1
punpcklbw m2, m4, m3 ; (-m << 1) << 8 == -m << 9 (zeros in the low bytes)
pmulhw m1, m2 ; (-m * (b - a)) << 10
paddw m0, m1 ; + b
mova m1, [tmp2q+(%1+1)*mmsize] ; b
psubw m2, m1, [tmp1q+(%1+1)*mmsize] ; b - a
paddw m2, m2 ; (b - a) << 1
mova m6, m3 ; (-m << 1)
punpckhbw m3, m4, m6 ; (-m << 9)
pmulhw m2, m3 ; (-m * (b - a)) << 10
paddw m1, m2 ; + b
pmulhrsw m0, m5 ; round
pmulhrsw m1, m5 ; round
packuswb m0, m1 ; pack the 16 words to bytes with unsigned saturation
%endmacro
%macro MASK_INC_PTR 1
add maskq, %1*mmsize/2
add tmp1q, %1*mmsize
add tmp2q, %1*mmsize
%endmacro
cglobal mask, 4, 8, 7, dst, stride, tmp1, tmp2, w, h, mask, stride3
lea r7, [mask_ssse3_table]
tzcnt wd, wm
movifnidn hd, hm
mov maskq, maskmp
movsxd wq, dword [r7+wq*4]
pxor m4, m4
mova m5, [pw_2048+r7-mask_ssse3_table]
add wq, r7
BIDIR_FN MASK
%endif ; ARCH_X86_64