Commit 973d11bf authored by Henrik Gramner

x86: Add warp8x8 and warp8x8t AVX2 asm

parent 22e144fb
......@@ -30,6 +30,10 @@
SECTION_RODATA 32
; pshufb control masks for the horizontal pass of the warp filter
; (warp_affine_8x8.h). Each table is 32 bytes (two 16-byte rows, one per
; 128-bit lane) and is applied to a source row that has been broadcast to both
; lanes. Every group of 8 indices selects the 8 source bytes feeding one output
; pixel, in the order +0,+2,+4,+6,+1,+3,+5,+7 relative to that pixel:
;   shufA: output pixels 0 and 1 (low lane), 4 and 5 (high lane)
;   shufB: output pixels 2 and 3 (low lane), 6 and 7 (high lane)
; NOTE(review): this implies the coefficients in mc_warp_filter are stored in
; the same even/odd tap order - confirm against the table definition.
warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
; Shuffle masks belonging to the subpel code; they are not referenced by the
; warp routines visible in this view.
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
......@@ -47,8 +51,9 @@ pw_512: times 2 dw 512
; 4-byte broadcast constants (two words or one dword each), intended to be
; loaded with vpbroadcastd. Every label is defined exactly once; a repeated
; definition of pd_32 / pd_512 would be a symbol redefinition error.
;
; pw_8192 is used by the warp code both as a pmaddwd multiplier (sums word
; pairs and scales by 1 << 13) and as a pmulhrsw multiplier (rounded >> 2).
; pd_32768 is the rounding bias added to the dword results of that pmaddwd.
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_8192: times 2 dw 8192
pd_32: dd 32
pd_512: dd 512
pd_32768: dd 32768
; Subpel filter coefficient table; declared external, so it is defined outside
; this file.
cextern mc_subpel_filters
; Alias for the table address biased by -8 bytes.
; NOTE(review): presumably lets callers use a 1-based filter index scaled by 8
; without an extra subtraction - confirm against the users of subpel_filters.
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
......@@ -123,6 +128,8 @@ HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
; table_offset(type, fn): distance from the label <type><SUFFIX> to the label
; <type><fn><SUFFIX>_table, built by token pasting.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
; Warp filter coefficient table (external). The warp code below reads 8-byte
; entries from it, indexed by a position value shifted right by 10.
cextern mc_warp_filter
SECTION .text
; NOTE(review): the warp routines in this view use vinserti128/vextracti128 on
; mN registers, which needs 256-bit registers; an INIT_YMM presumably appears
; in the lines elided between here and that code - confirm.
INIT_XMM avx2
......@@ -2627,6 +2634,211 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
jg .hv_w8_loop0
RET
; WARP_V dst, r01, r23, r45, r67
;
; Vertical 8-tap stage of the warp filter for one output row of 8 pixels.
;
; Inputs:
;   args 2-5  register numbers holding four consecutive row pairs of the
;             horizontally filtered data (two 16-bit rows interleaved per
;             dword, as assembled by .main/.main2 with psrld + pblendw)
;   myq       vertical filter position for column 0
;   deltaq    per-column step of the vertical position
;   gammaq    per-row step; the caller has already subtracted 6*delta from it
;   filterq   base address of the 8-byte-per-entry filter table
; Outputs:
;   arg 1     eight dword sums (the coefficients are applied pre-shifted left
;             by 8, so the sums carry that extra scale)
;   args 2-4  advanced by one row pair (arg2 <- arg3, arg3 <- arg4,
;             arg4 <- arg5), ready for the output row two lines further down
;   myq       my + 6*delta + gammaq, i.e. my plus the caller's original gamma
; Clobbers: m0, m8, m9, m10, tmp1, tmp2
%macro WARP_V 5 ; dst, 01, 23, 45, 67
; Can be done using gathers, but that's terribly slow on many CPU:s
; Columns 0 and 1: filter index = (my + k*delta) >> 10
lea tmp1d, [myq+deltaq*1]
lea tmp2d, [myq+deltaq*2]
shr myd, 10
shr tmp1d, 10
movq xm8, [filterq+myq *8]
movq xm10, [filterq+tmp1q*8]
; Columns 2 and 3; my is re-based to my + 4*delta for the upper half
lea tmp1d, [tmp2q+deltaq*1]
lea myd, [tmp2q+deltaq*2]
shr tmp2d, 10
shr tmp1d, 10
movq xm0, [filterq+tmp2q*8]
movq xm9, [filterq+tmp1q*8]
; Columns 4 and 5 go into the high 128-bit lane
lea tmp1d, [myq+deltaq*1]
lea tmp2d, [myq+deltaq*2]
shr myd, 10
shr tmp1d, 10
vinserti128 m8, [filterq+myq *8], 1 ; a e
vinserti128 m10, [filterq+tmp1q*8], 1 ; b f
; Columns 6 and 7; tmp2 holds my + 6*delta when my is advanced to the next row
lea tmp1d, [tmp2q+deltaq*1]
lea myd, [tmp2q+gammaq] ; my += gamma
shr tmp2d, 10
shr tmp1d, 10
punpcklbw m8, m10
vpbroadcastq m10, [filterq+tmp2q*8] ; c g
vpblendd m0, m10, 0x30
vpbroadcastq m10, [filterq+tmp1q*8] ; d h
vpblendd m9, m10, 0x30
; Byte/word interleaves regroup the eight per-column filters so that the
; coefficients for one row pair of all columns share a register
punpcklbw m0, m9
punpcklwd m9, m8, m0
punpckhwd m8, m0
; Interleaving with zero in the low byte turns each signed coefficient byte
; into the 16-bit value (coef << 8)
pxor m10, m10
punpcklbw m0, m9, m8
punpckhbw m9, m8
punpcklbw m8, m10, m0 ; a0 a4 b0 b4 c0 c4 d0 d4 << 8
punpckhbw m0, m10, m0 ; a1 a5 b1 b5 c1 c5 d1 d5 << 8
; Multiply-accumulate the four row pairs; the mova instructions slide the
; row-pair window while the products are being summed
pmaddwd m%2, m8
pmaddwd m0, m%3
punpcklbw m8, m10, m9 ; a2 a6 b2 b6 c2 c6 d2 d6 << 8
punpckhbw m9, m10, m9 ; a3 a7 b3 b7 c3 c7 d3 d7 << 8
pmaddwd m8, m%4
pmaddwd m9, m%5
paddd m0, m%2
mova m%2, m%3
paddd m0, m8
mova m%3, m%4
paddd m%1, m0, m9
mova m%4, m%5
%endmacro
; warp_affine_8x8t: warp an 8x8 block and store it as 16-bit values.
;
; Reuses the filtering code of warp_affine_8x8 through its mangled local
; labels (.main, .main2, .end); only the final scaling and store differ.
;   tmpq  destination pointer
;   tsq   destination stride; rows are tsq*2 bytes apart, so it appears to be
;         counted in 16-bit elements
; The remaining arguments are consumed by .main using warp_affine_8x8's
; register names. r4d is the iteration counter set up by .main (4 iterations,
; two rows each). The function returns through warp_affine_8x8's .end, so the
; epilogue is the one emitted there.
cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
%if WIN64
; Same stack reservation as warp_affine_8x8, so the shared code and its
; epilogue see an identical stack layout
sub rsp, 0xa0
%endif
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
.loop:
; m11 and m0 hold the dword sums of two consecutive output rows
psrad m11, 13
psrad m0, 13
packssdw m11, m0
pmulhrsw m11, m14 ; (x + (1 << 6)) >> 7
; packssdw works per 128-bit lane; swapping the two middle qwords puts one
; complete row in each lane
vpermq m0, m11, q3120
; mova is an aligned store, so tmp is expected to be 16-byte aligned
mova [tmpq+tsq*0], xm0
vextracti128 [tmpq+tsq*2], m0, 1
dec r4d
jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
lea tmpq, [tmpq+tsq*4]
jmp .loop
; warp_affine_8x8: warp an 8x8 block and store it as 8-bit pixels.
;
; Named registers, in argument order: dst, ds (destination stride), src,
; ss (source stride), abcd (pointer to four 16-bit warp parameters), mx; the
; rest (tmp2, alpha, beta, filter, tmp1, delta, my, gamma) are working
; registers filled in by .main. cglobal is asked to load no arguments and to
; reserve no vector registers; on WIN64 the vector registers are saved by
; hand in .main.
;
; Flow: .main filters the first rows and produces output rows 0 and 1 in
; m11/m0; each later pass through .loop calls .main2 for two more rows.
; r4d counts the remaining row pairs.
cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
beta, filter, tmp1, delta, my, gamma
%if WIN64
; Reserve 0xa0 bytes by hand and update x86inc's bookkeeping to match.
; NOTE(review): presumably this makes RET restore xmm6-15 and release the
; reservation - confirm against x86inc's epilogue.
sub rsp, 0xa0
%assign xmm_regs_used 16
%assign stack_size_padded 0xa0
%assign stack_offset stack_offset+stack_size_padded
%endif
call .main
jmp .start
.loop:
call .main2
lea dstq, [dstq+dsq*2]
.start:
; m11 and m0 hold the dword sums of two consecutive output rows
psrad m11, 17
psrad m0, 17
packssdw m11, m0
pmulhrsw m11, m14 ; (x + (1 << 10)) >> 11
; Narrow to unsigned bytes with saturation, then swap the two middle dwords so
; each 64-bit half is one complete 8-pixel row
vextracti128 xm0, m11, 1
packuswb xm11, xm0
pshufd xm0, xm11, q3120
movq [dstq+dsq*0], xm0
movhps [dstq+dsq*1], xm0
dec r4d
jg .loop
.end:
; Also the exit point of warp_affine_8x8t
RET
ALIGN function_align
; .main: one-time setup plus the first two output rows. Reached with `call`
; from warp_affine_8x8 and warp_affine_8x8t.
;   - loads alpha/beta/delta/gamma from abcd[0..3] (sign-extended words)
;   - loads the shuffle masks (m12, m13), the 8192 multiplier (m14) and the
;     rounding bias (m15), and points filterq at mc_warp_filter
;   - moves src back by 3 rows and 3 bytes
;   - runs .h on the first seven source rows, building the interleaved row
;     pairs m1..m6
;   - sets r4d = 4 and falls through into .main2
; .main2: filters two more source rows and produces two output rows of dword
; sums, the first in m11 and the second in m0.
.main:
; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
mov abcdq, r5m
mov mxd, r6m
; Save the vector registers that are callee-saved on WIN64.
; NOTE(review): the stack_offset-relative slots used for xmm6/xmm7 are
; presumably the caller-provided shadow space - confirm. xmm8-15 go into the
; 0xa0 bytes reserved by the entry points (offsets include the 8-byte return
; address of this call).
movaps [rsp+stack_offset+0x10], xmm6
movaps [rsp+stack_offset+0x20], xmm7
movaps [rsp+0x28], xmm8
movaps [rsp+0x38], xmm9
movaps [rsp+0x48], xmm10
movaps [rsp+0x58], xmm11
movaps [rsp+0x68], xmm12
movaps [rsp+0x78], xmm13
movaps [rsp+0x88], xmm14
movaps [rsp+0x98], xmm15
%endif
movsx alphad, word [abcdq+2*0]
movsx betad, word [abcdq+2*1]
mova m12, [warp_8x8_shufA]
mova m13, [warp_8x8_shufB]
vpbroadcastd m14, [pw_8192]
vpbroadcastd m15, [pd_32768]
lea filterq, [mc_warp_filter]
lea tmp1q, [ssq*3+3]
add mxd, 512+(64<<10)
lea tmp2d, [alphaq*3]
add tmp2d, tmp2d
sub srcq, tmp1q ; src -= src_stride*3 + 3
; .h leaves its running position at mx + 6*alpha before adding beta, so the
; bias below makes each call advance mx by exactly the original beta
sub betad, tmp2d ; beta -= alpha*6
mov myd, r7m
; After each .h call m0 holds one filtered row in the upper 16 bits of every
; dword. psrld moves that row to the lower words and the following pblendw
; takes the upper words from the next row, giving one row pair per register.
call .h
psrld m1, m0, 16
call .h
pblendw m1, m0, 0xaa ; 01
psrld m2, m0, 16
call .h
pblendw m2, m0, 0xaa ; 12
psrld m3, m0, 16
call .h
pblendw m3, m0, 0xaa ; 23
psrld m4, m0, 16
call .h
pblendw m4, m0, 0xaa ; 34
psrld m5, m0, 16
call .h
pblendw m5, m0, 0xaa ; 45
psrld m6, m0, 16
call .h
pblendw m6, m0, 0xaa ; 56
movsx deltad, word [abcdq+2*2]
movsx gammad, word [abcdq+2*3]
add myd, 512+(64<<10)
; r4 is the abcd register; it is free to become the row-pair counter now that
; all four parameters have been read
mov r4d, 4
lea tmp1d, [deltaq*3]
add tmp1d, tmp1d
; Same pre-bias as for beta: WARP_V adds gamma to my + 6*delta
sub gammad, tmp1d ; gamma -= delta*6
.main2:
call .h
psrld m7, m6, 16
pblendw m7, m0, 0xaa ; 67
; First output row from pairs 01/23/45/67; WARP_V also shifts m1 <- m3,
; m3 <- m5, m5 <- m7 for the next call
WARP_V 11, 1, 3, 5, 7
call .h
psrld m7, 16
pblendw m7, m0, 0xaa ; 78
; Second output row from pairs 12/34/56/78; shifts m2 <- m4, m4 <- m6,
; m6 <- m7
WARP_V 0, 2, 4, 6, 7
ret
ALIGN function_align
; .h: horizontal 8-tap filter for one source row, producing 8 output pixels.
;
; Inputs:
;   srcq     current source row (16 bytes are read)
;   mxq      horizontal filter position for pixel 0
;   alphaq   per-pixel step of the horizontal position
;   betaq    per-row step, pre-biased by -6*alpha by .main
;   filterq  base address of the 8-byte-per-entry filter table
;   m12/m13  shuffle masks, m14 = words of 8192, m15 = dwords of 32768
; Outputs:
;   m0       8 dwords, one per output pixel (pixels 0-3 in the low lane,
;            4-7 in the high lane), rounded result in the upper 16 bits
;   srcq     advanced by ssq
;   mxq      mx + 6*alpha + betaq, i.e. mx plus the caller's original beta
; Clobbers: m8, m9, m10, tmp1, tmp2
.h:
; Pixels 0 and 1: filter index = (mx + k*alpha) >> 10
lea tmp1d, [mxq+alphaq*1]
lea tmp2d, [mxq+alphaq*2]
shr mxd, 10
shr tmp1d, 10
; The source row is copied to both 128-bit lanes so each lane can shuffle out
; its own pixels
vbroadcasti128 m10, [srcq]
movq xm8, [filterq+mxq *8]
movhps xm8, [filterq+tmp1q*8]
; Pixels 2 and 3; mx is re-based to mx + 4*alpha for the upper half
lea tmp1d, [tmp2q+alphaq*1]
lea mxd, [tmp2q+alphaq*2]
shr tmp2d, 10
shr tmp1d, 10
movq xm9, [filterq+tmp2q*8]
movhps xm9, [filterq+tmp1q*8]
; Pixels 4 and 5 are blended into the high lane of m8
lea tmp1d, [mxq+alphaq*1]
lea tmp2d, [mxq+alphaq*2]
shr mxd, 10
shr tmp1d, 10
vpbroadcastq m0, [filterq+mxq *8]
vpblendd m8, m0, 0x30
vpbroadcastq m0, [filterq+tmp1q*8]
vpblendd m8, m0, 0xc0 ; 0 1 4 5
; Unsigned source bytes times signed coefficients, adjacent pairs summed
pshufb m0, m10, m12
pmaddubsw m0, m8
; Pixels 6 and 7; tmp2 holds mx + 6*alpha when mx is advanced to the next row
lea tmp1d, [tmp2q+alphaq*1]
lea mxd, [tmp2q+betaq] ; mx += beta
shr tmp2d, 10
shr tmp1d, 10
vpbroadcastq m8, [filterq+tmp2q*8]
vpblendd m9, m8, 0x30
vpbroadcastq m8, [filterq+tmp1q*8]
vpblendd m9, m8, 0xc0 ; 2 3 6 7
pshufb m10, m13
pmaddubsw m10, m9
add srcq, ssq
; phaddw reduces four partial sums per pixel to two; pmaddwd by 8192 adds the
; last pair and scales the total by 1 << 13
phaddw m0, m10
pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
ret
%macro BIDIR_FN 1 ; op
%1 0
lea stride3q, [strideq*3]
......
......@@ -55,6 +55,9 @@ decl_w_avg_fn(dav1d_w_avg_avx2);
decl_mask_fn(dav1d_mask_avx2);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
/* AVX2 warp kernels; these names match the warp_affine_8x8 and
 * warp_affine_8x8t routines defined in the assembly source. */
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_##suffix
......@@ -91,5 +94,8 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
c->warp8x8 = dav1d_warp_affine_8x8_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
#endif
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment