Commit 59c3370e authored by Henrik Gramner

x86: Add smooth intra prediction AVX2 asm

parent 613ef787
@@ -30,11 +30,44 @@
SECTION_RODATA 32
paeth_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 6, 6, 6, 6, 2, 2, 2, 2
db 5, 5, 5, 5, 1, 1, 1, 1, 4, 4, 4, 4, 0, 0, 0, 0
%macro SMOOTH_WEIGHT_TABLE 1-*
%rep %0
db %1-128, 127-%1
%rotate 1
%endrep
%endmacro
; sm_weights[], but modified to precalculate x and 256-x with offsets to
; enable efficient use of pmaddubsw (which requires signed values)
smooth_weights: SMOOTH_WEIGHT_TABLE \
0, 0, 255, 128, 255, 149, 85, 64, \
255, 197, 146, 105, 73, 50, 37, 32, \
255, 225, 196, 170, 145, 123, 102, 84, \
68, 54, 43, 33, 26, 20, 17, 16, \
255, 240, 225, 210, 196, 182, 169, 157, \
145, 133, 122, 111, 101, 92, 83, 74, \
66, 59, 52, 45, 39, 34, 29, 25, \
21, 17, 14, 12, 10, 9, 8, 8, \
255, 248, 240, 233, 225, 218, 210, 203, \
196, 189, 182, 176, 169, 163, 156, 150, \
144, 138, 133, 127, 121, 116, 111, 106, \
101, 96, 91, 86, 82, 77, 73, 69, \
65, 61, 57, 54, 50, 47, 44, 41, \
38, 35, 32, 29, 27, 25, 22, 20, \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
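; each entry packs (w-128, 127-w) as two signed bytes, so one pmaddubsw
; against an interleaved (a, b) unsigned pixel pair yields
; (w-128)*a + (127-w)*b; adding a precomputed 128*a + 129*b term then
; recovers the full blend. scalar sketch of the intended result
; (illustrative only, not part of the codebase):
;   int blend(int a, int b, int w) { /* w in [0, 256] */
;       return (w*a + (256-w)*b + 128) >> 8;
;   }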
ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15
ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
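; splats each of 8 packed left-edge pixels across 4 bytes, distributing the
; rows between the two 128-bit lanes so that the 4-rows-per-register store
; order used by the w4/w8 loops below comes out directly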
pb_1: times 4 db 1
pb_128: times 4 db 128
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pb_127_m127: times 2 db 127, -127
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@@ -48,6 +81,9 @@ pb_128: times 4 db 128
%define ipred_dc_splat_avx2_table (ipred_dc_avx2_table + 10*4)
JMP_TABLE ipred_smooth, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
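; each table entry is the width handler's offset from the table base; the
; dispatch sequences below load it with movsxd and add the table address
; back in (add wq, r5/r6) before jumping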
@@ -423,17 +459,18 @@ INIT_YMM avx2
%endmacro
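; paeth picks whichever of left/top/topleft is closest to
; base = left + top - topleft, with ties broken in that order.
; reference scalar form (illustrative):
;   int paeth(int l, int t, int tl) {
;       int b = l + t - tl;
;       int pl = abs(b - l), pt = abs(b - t), ptl = abs(b - tl);
;       return (pl <= pt && pl <= ptl) ? l : (pt <= ptl) ? t : tl;
;   }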
cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
%define base r5-ipred_paeth_avx2_table
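; with the table address held in r5, all RODATA accesses below can use the
; [base+constant] form, avoiding further rip-relative address calculations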
lea r5, [ipred_paeth_avx2_table]
tzcnt wd, wm
vpbroadcastb m5, [tlq] ; topleft
movifnidn hd, hm
movsxd wq, [r5+wq*4]
vpbroadcastd m4, [r5-ipred_paeth_avx2_table+pb_1]
vpbroadcastd m4, [base+pb_1]
add wq, r5
jmp wq
.w4:
vpbroadcastd m6, [tlq+1] ; top
mova m8, [r5-ipred_paeth_avx2_table+paeth_shuf]
mova m8, [base+ipred_h_shuf]
lea r3, [strideq*3]
psubusb m7, m5, m6
psubusb m0, m6, m5
@@ -445,15 +482,15 @@ cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
PAETH 6, 7
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 2
movd [dstq+strideq*2], xm1
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r3 ], xm1, 2
cmp hd, 4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm0, 3
pextrd [dstq+strideq*2], xm1, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
sub hd, 8
@@ -463,7 +500,7 @@ cglobal ipred_paeth, 3, 6, 9, dst, stride, tl, w, h
ALIGN function_align
.w8:
vpbroadcastq m6, [tlq+1]
mova m8, [r5-ipred_paeth_avx2_table+paeth_shuf]
mova m8, [base+ipred_h_shuf]
lea r3, [strideq*3]
psubusb m7, m5, m6
psubusb m0, m6, m5
@@ -475,8 +512,8 @@ ALIGN function_align
PAETH 6, 7
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
@@ -543,4 +580,654 @@ ALIGN function_align
%endif
RET
%macro SMOOTH 6 ; src[1-2], mul[1-2], add[1-2]
; w * a = (w - 128) * a + 128 * a
; (256 - w) * b = (127 - w) * b + 129 * b
pmaddubsw m0, m%3, m%1
pmaddubsw m1, m%4, m%2
paddw m0, m%5
paddw m1, m%6
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
%endmacro
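; given mul = (w-128, 127-w) byte pairs and add = the precomputed
; 128*a + 129*b (+ rounding) words, each output pixel is effectively
; (w*a + (256-w)*b + rnd) >> 8 -- a weighted average, since the two
; weights always sum to 256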
cglobal ipred_smooth_v, 3, 7, 0, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_avx2_table
lea r6, [ipred_smooth_v_avx2_table]
tzcnt wd, wm
mov hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastd m0, [base+pb_127_m127]
vpbroadcastd m1, [base+pw_128]
lea weightsq, [base+smooth_weights+hq*4]
neg hq
vpbroadcastb m5, [tlq+hq] ; bottom
add wq, r6
jmp wq
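; h is negated so that both the bottom pixel and the per-row weights can
; be indexed with hq while the row loops simply count up toward zero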
.w4:
vpbroadcastd m2, [tlq+1]
punpcklbw m2, m5 ; top, bottom
mova m5, [base+ipred_v_shuf]
lea r3, [strideq*3]
punpckldq m4, m5, m5
punpckhdq m5, m5
pmaddubsw m3, m2, m0
paddw m1, m2 ; 1 * top + 256 * bottom + 128, overflow is ok
paddw m3, m1 ; 128 * top + 129 * bottom + 128
.w4_loop:
vbroadcasti128 m1, [weightsq+hq*2]
pshufb m0, m1, m4
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 1
pextrd [dstq+r3 ], xm1, 1
cmp hd, -4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 2
pextrd [dstq+strideq*1], xm1, 2
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
add hq, 8
jl .w4_loop
.ret:
RET
ALIGN function_align
.w8:
vpbroadcastq m2, [tlq+1]
punpcklbw m2, m5
mova m5, [base+ipred_v_shuf]
lea r3, [strideq*3]
pshufd m4, m5, q0000
pshufd m5, m5, q1111
pmaddubsw m3, m2, m0
paddw m1, m2
paddw m3, m1
.w8_loop:
vpbroadcastq m1, [weightsq+hq*2]
pshufb m0, m1, m4
pshufb m1, m5
SMOOTH 0, 1, 2, 2, 3, 3
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
add hq, 4
jl .w8_loop
RET
ALIGN function_align
.w16:
WIN64_SPILL_XMM 7
vbroadcasti128 m3, [tlq+1]
mova m6, [base+ipred_v_shuf]
punpcklbw m2, m3, m5
punpckhbw m3, m5
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w16_loop:
vpbroadcastd m1, [weightsq+hq*2]
pshufb m1, m6
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
add hq, 2
jl .w16_loop
RET
ALIGN function_align
.w32:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 6
movu m3, [tlq+1]
punpcklbw m2, m3, m5
punpckhbw m3, m5
pmaddubsw m4, m2, m0
pmaddubsw m5, m3, m0
paddw m0, m1, m2
paddw m1, m3
paddw m4, m0
paddw m5, m1
.w32_loop:
vpbroadcastw m1, [weightsq+hq*2]
SMOOTH 1, 1, 2, 3, 4, 5
mova [dstq], m0
add dstq, strideq
inc hq
jl .w32_loop
RET
ALIGN function_align
.w64:
WIN64_SPILL_XMM 11
movu m4, [tlq+ 1]
movu m8, [tlq+33]
punpcklbw m3, m4, m5
punpckhbw m4, m5
punpcklbw m7, m8, m5
punpckhbw m8, m5
pmaddubsw m5, m3, m0
pmaddubsw m6, m4, m0
pmaddubsw m9, m7, m0
pmaddubsw m10, m8, m0
paddw m2, m1, m3
paddw m5, m2
paddw m2, m1, m4
paddw m6, m2
paddw m0, m1, m7
paddw m9, m0
paddw m1, m8
paddw m10, m1
.w64_loop:
vpbroadcastw m2, [weightsq+hq*2]
SMOOTH 2, 2, 3, 4, 5, 6
mova [dstq+32*0], m0
SMOOTH 2, 2, 7, 8, 9, 10
mova [dstq+32*1], m0
add dstq, strideq
inc hq
jl .w64_loop
RET
%macro SETUP_STACK_FRAME 3 ; stack_size, regs_used, xmm_regs_used
%assign stack_offset 0
%assign stack_size_padded 0
%assign regs_used %2
%xdefine rstk rsp
SETUP_STACK_POINTER %1
%if regs_used != %2 && WIN64
PUSH r%2
%endif
ALLOC_STACK %1, %3
%endmacro
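; resets the x86inc stack bookkeeping so that each width-specific entry
; point can allocate its own frame of %1 bytes (spilling %3 xmm registers
; on win64) instead of inheriting whatever a previous branch set up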
cglobal ipred_smooth_h, 3, 7, 0, dst, stride, tl, w, h
%define base r6-ipred_smooth_h_avx2_table
lea r6, [ipred_smooth_h_avx2_table]
mov wd, wm
vpbroadcastb m3, [tlq+wq] ; right
tzcnt wd, wd
mov hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastd m4, [base+pb_127_m127]
vpbroadcastd m5, [base+pw_128]
add wq, r6
jmp wq
.w4:
WIN64_SPILL_XMM 8
vpbroadcastq m6, [base+smooth_weights+4*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 8
sub tlq, hq
lea r3, [strideq*3]
.w4_loop:
vpbroadcastq m2, [tlq+hq]
pshufb m2, m7
punpcklbw m1, m2, m3 ; left, right
punpckhbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m0, m1 ; 128 * left + 129 * right
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r3 ], xm1, 2
cmp hd, 4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
sub hd, 8
jg .w4_loop
.ret:
RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 8
vbroadcasti128 m6, [base+smooth_weights+8*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 4
lea r3, [strideq*3]
sub tlq, hq
.w8_loop:
vpbroadcastd m2, [tlq+hq]
pshufb m2, m7
punpcklbw m1, m2, m3
punpckhbw m2, m3
pmaddubsw m0, m1, m4
paddw m0, m1
pmaddubsw m1, m6
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m1, m2
pmaddubsw m2, m6
paddw m2, m5
paddw m1, m2
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
ALIGN function_align
.w16:
SETUP_STACK_FRAME 32*4, 7, 8
lea r3, [rsp+64*2-4]
call .prep ; only worthwhile for w16 and above
sub tlq, 2
vpbroadcastd xm6, [base+pb_1]
mova xm7, [base+ipred_v_shuf+16]
vinserti128 m7, [base+ipred_v_shuf+ 0], 1
vbroadcasti128 m4, [base+smooth_weights+16*2]
vbroadcasti128 m5, [base+smooth_weights+16*3]
.w16_loop:
vpbroadcastd m1, [tlq+hq]
vpbroadcastd m2, [r3+hq*2]
pshufb m1, m6
punpcklbw m1, m3
pshufb m2, m7
SMOOTH 4, 5, 1, 1, 2, 2
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_loop
RET
ALIGN function_align
.w32:
SETUP_STACK_FRAME 32*4, 7, 6
lea r3, [rsp+64*2-2]
call .prep
dec tlq
mova xm4, [base+smooth_weights+16*4]
vinserti128 m4, [base+smooth_weights+16*6], 1
mova xm5, [base+smooth_weights+16*5]
vinserti128 m5, [base+smooth_weights+16*7], 1
.w32_loop:
vpbroadcastb m1, [tlq+hq]
punpcklbw m1, m3
vpbroadcastw m2, [r3+hq*2]
SMOOTH 4, 5, 1, 1, 2, 2
mova [dstq], m0
add dstq, strideq
dec hd
jg .w32_loop
RET
ALIGN function_align
.w64:
SETUP_STACK_FRAME 32*4, 7, 9
lea r3, [rsp+64*2-2]
call .prep
add r6, smooth_weights+16*15-ipred_smooth_h_avx2_table
dec tlq
mova xm5, [r6-16*7]
vinserti128 m5, [r6-16*5], 1
mova xm6, [r6-16*6]
vinserti128 m6, [r6-16*4], 1
mova xm7, [r6-16*3]
vinserti128 m7, [r6-16*1], 1
mova xm8, [r6-16*2]
vinserti128 m8, [r6-16*0], 1
.w64_loop:
vpbroadcastb m2, [tlq+hq]
punpcklbw m2, m3
vpbroadcastw m4, [r3+hq*2]
SMOOTH 5, 6, 2, 2, 4, 4
mova [dstq+32*0], m0
SMOOTH 7, 8, 2, 2, 4, 4
mova [dstq+32*1], m0
add dstq, strideq
dec hd
jg .w64_loop
RET
ALIGN function_align
.prep:
vpermq m2, [tlq-32*1], q3120
punpckhbw m1, m2, m3
punpcklbw m2, m3
pmaddubsw m0, m1, m4 ; 127 * left - 127 * right
paddw m1, m5 ; 1 * left + 256 * right + 128
paddw m0, m1 ; 128 * left + 129 * right + 128
pmaddubsw m1, m2, m4
paddw m2, m5
paddw m1, m2
vpermq m2, [tlq-32*2], q3120
mova [rsp+gprsize+32*3], m0
mova [rsp+gprsize+32*2], m1
punpckhbw m1, m2, m3
punpcklbw m2, m3
pmaddubsw m0, m1, m4
paddw m1, m5
paddw m0, m1
pmaddubsw m1, m2, m4
paddw m2, m5
paddw m1, m2
mova [rsp+gprsize+32*1], m0
mova [rsp+gprsize+32*0], m1
sub r3, hq
sub tlq, hq
sub r3, hq
ret
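; .prep leaves 128*left + 129*right + 128 terms for the whole column in the
; stack buffer, so the per-row loops above only need one pmaddubsw with the
; width weights plus a paddw to finish each horizontal blend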
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
pmaddubsw m0, m%3, m%1
pmaddubsw m1, m%4, m%2
%ifnum %5
paddw m0, m%5
%else
paddw m0, %5
%endif
%ifnum %6
paddw m1, m%6
%else
paddw m1, %6
%endif
pavgw m0, m2
pavgw m1, m3
psrlw m0, 8
psrlw m1, 8
packuswb m0, m1
%endmacro
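; pavgw merges the vertical and horizontal halves: with the vertical sum
; biased by 255 (pw_255) and pavgw contributing the final +1, the result
; after the trailing >> 8 matches
;   pred = (wv*top + (256-wv)*bottom + wh*left + (256-wh)*right + 256) >> 9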
cglobal ipred_smooth, 3, 7, 0, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_avx2_table
lea r6, [ipred_smooth_avx2_table]
mov wd, wm
vpbroadcastb m4, [tlq+wq] ; right
tzcnt wd, wd
mov hd, hm
mov r5, tlq
sub r5, hq
movsxd wq, [r6+wq*4]
vpbroadcastd m5, [base+pb_127_m127]
vpbroadcastb m0, [r5] ; bottom
vpbroadcastd m3, [base+pw_255]
add wq, r6
lea v_weightsq, [base+smooth_weights+hq*2]
jmp wq
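; same per-direction setup as smooth_v/smooth_h, but biased with pw_255
; instead of pw_128 since SMOOTH_2D_END's pavgw supplies the last rounding
; unit before the combined shift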
.w4:
WIN64_SPILL_XMM 12
mova m10, [base+ipred_h_shuf]
vpbroadcastq m11, [base+smooth_weights+4*2]
mova m7, [base+ipred_v_shuf]
vpbroadcastd m8, [tlq+1]
sub tlq, 8
lea r3, [strideq*3]
sub tlq, hq
punpcklbw m8, m0 ; top, bottom
pshufd m6, m7, q2200
pshufd m7, m7, q3311
pmaddubsw m9, m8, m5
paddw m3, m8 ; 1 * top + 256 * bottom + 255
paddw m9, m3 ; 128 * top + 129 * bottom + 255
.w4_loop:
vpbroadcastq m1, [tlq+hq]
pshufb m1, m10
punpcklbw m0, m1, m4 ; left, right
punpckhbw m1, m4
pmaddubsw m2, m0, m5 ; 127 * left - 127 * right
pmaddubsw m3, m1, m5
paddw m2, m0 ; 128 * left + 129 * right
paddw m3, m1
pmaddubsw m0, m11
pmaddubsw m1, m11
paddw m2, m0
paddw m3, m1
vbroadcasti128 m1, [v_weightsq]
add v_weightsq, 16
pshufb m0, m1, m6
pshufb m1, m7
SMOOTH_2D_END 0, 1, 8, 8, 9, 9
vextracti128 xm1, m0, 1
movd [dstq+strideq*0], xm0
movd [dstq+strideq*1], xm1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r3 ], xm1, 2
cmp hd, 4
je .ret
lea dstq, [dstq+strideq*4]
pextrd [dstq+strideq*0], xm0, 1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm0, 3
pextrd [dstq+r3 ], xm1, 3
lea dstq, [dstq+strideq*4]
sub hd, 8
jg .w4_loop
.ret:
RET
ALIGN function_align
.w8:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
mova m10, [base+ipred_h_shuf]
vbroadcasti128 m11, [base+smooth_weights+8*2]
mova m7, [base+ipred_v_shuf]
vpbroadcastq m8, [tlq+1]
sub tlq, 4
lea r3, [strideq*3]
sub tlq, hq
punpcklbw m8, m0
pshufd m6, m7, q0000
pshufd m7, m7, q1111
pmaddubsw m9, m8, m5
paddw m3, m8
paddw m9, m3
.w8_loop:
vpbroadcastd m1, [tlq+hq]
pshufb m1, m10
punpcklbw m0, m1, m4
punpckhbw m1, m4
pmaddubsw m2, m0, m5
pmaddubsw m3, m1, m5
paddw m2, m0
paddw m3, m1
pmaddubsw m0, m11
pmaddubsw m1, m11
paddw m2, m0
paddw m3, m1
vpbroadcastq m1, [v_weightsq]
add v_weightsq, 8
pshufb m0, m1, m6
pshufb m1, m7
SMOOTH_2D_END 0, 1, 8, 8, 9, 9
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+strideq*2], xm0
movhps [dstq+r3 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_loop
RET
ALIGN function_align
.w16:
SETUP_STACK_FRAME 32*4, 7, 14
vbroadcasti128 m11, [tlq+1]
lea r3, [rsp+64*2-4]
punpcklbw m10, m11, m0 ; top, bottom
punpckhbw m11, m0
call .prep_v
sub tlq, 2
pmaddubsw m12, m10, m5
pmaddubsw m13, m11, m5
vpbroadcastd xm5, [base+pb_1]
mova m9, [base+ipred_v_shuf]
vbroadcasti128 m6, [base+smooth_weights+16*2]
vbroadcasti128 m7, [base+smooth_weights+16*3]
vpermq m8, m9, q1032
paddw m0, m10, m3
paddw m3, m11
paddw m12, m0
paddw m13, m3
.w16_loop:
vpbroadcastd m3, [tlq+hq]
vpbroadcastd m0, [r3+hq*2]
vpbroadcastd m1, [v_weightsq]
add v_weightsq, 4
pshufb m3, m5
punpcklbw m3, m4 ; left, right
pmaddubsw m2, m3, m6
pmaddubsw m3, m7
pshufb m0, m8
pshufb m1, m9
paddw m2, m0
paddw m3, m0
SMOOTH_2D_END 1, 1, 10, 11, 12, 13
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_loop
RET
ALIGN function_align
.w32:
SETUP_STACK_FRAME 32*4, 7, 11
movu m8, [tlq+1]
lea r3, [rsp+64*2-2]
punpcklbw m7, m8, m0
punpckhbw m8, m0
call .prep_v
dec tlq
pmaddubsw m9, m7, m5
pmaddubsw m10, m8, m5
mova xm5, [base+smooth_weights+16*4]
vinserti128 m5, [base+smooth_weights+16*6], 1
mova xm6, [base+smooth_weights+16*5]
vinserti128 m6, [base+smooth_weights+16*7], 1
paddw m0, m7, m3
paddw m3, m8
paddw m9, m0
paddw m10, m3
.w32_loop:
vpbroadcastb m3, [tlq+hq]
punpcklbw m3, m4
vpbroadcastw m0, [r3+hq*2]
vpbroadcastw m1, [v_weightsq]
add v_weightsq, 2
pmaddubsw m2, m3, m5
pmaddubsw m3, m6
paddw m2, m0
paddw m3, m0
SMOOTH_2D_END 1, 1, 7, 8, 9, 10
mova [dstq], m0
add dstq, strideq
dec hd
jg .w32_loop
RET
ALIGN function_align
.w64:
SETUP_STACK_FRAME 32*8, 7, 16
movu m13, [tlq+1 ]
movu m15, [tlq+33]
add r6, smooth_weights+16*15-ipred_smooth_avx2_table
lea r3, [rsp+64*2-2]
punpcklbw m12, m13, m0
punpckhbw m13, m0
punpcklbw m14, m15, m0
punpckhbw m15, m0
call .prep_v
dec tlq
pmaddubsw m0, m12, m5
pmaddubsw m1, m13, m5
pmaddubsw m2, m14, m5
pmaddubsw m5, m15, m5
mova xm8, [r6-16*7]
vinserti128 m8, [r6-16*5], 1
mova xm9, [r6-16*6]
vinserti128 m9, [r6-16*4], 1
mova xm10, [r6-16*3]
vinserti128 m10, [r6-16*1], 1
mova xm11, [r6-16*2]