Commit a440af4a authored by Henrik Gramner

Add ipred_z3 AVX2 asm

Also backport some minor optimizations to z1.
parent 18d2d750
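For context, the z3 predictor this asm vectorizes walks down the left edge: for every output column a fixed-point position (6 fractional bits, stepped by dy from dr_intra_derivative) picks two neighbouring left samples and blends them, falling back to left[max_base_y] once the edge runs out. A rough scalar sketch of the non-upsampled inner loop (illustrative only, not dav1d's actual C code; function and parameter names are made up):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar model of the z3 inner loop. The AVX2 code below computes
     * several columns at a time and byte-transposes the block with
     * z_transpose4 before storing. */
    static void z3_scalar_sketch(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *left, int w, int h,
                                 int dy, int max_base_y)
    {
        for (int x = 0, ypos = dy; x < w; x++, ypos += dy) {
            const int frac = ypos & 0x3e;          // fractional part, cf. pw_62
            for (int y = 0, base = ypos >> 6; y < h; y++, base++) {
                if (base < max_base_y) {           // blend two left-edge samples
                    const int v = left[base] * (64 - frac) + left[base + 1] * frac;
                    dst[y * stride + x] = (v + 32) >> 6;
                } else {                           // ran off the edge: flat fill
                    dst[y * stride + x] = left[max_base_y];
                }
            }
        }
    }

The asm keeps the weights pre-doubled ((32 - frac) << 1 and frac << 1, masked with pw_62/pw_64), so pmaddubsw followed by pmulhrsw against pw_512 produces the same (v + 32) >> 6 rounding.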
@@ -28,7 +28,7 @@
%if ARCH_X86_64
SECTION_RODATA 32
SECTION_RODATA 64
%macro SMOOTH_WEIGHT_TABLE 1-*
%rep %0
@@ -57,7 +57,6 @@ smooth_weights: SMOOTH_WEIGHT_TABLE \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
; Note that the order of (some of) the following z constants matters
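; (e.g. pb_8, pb_15 and pw_8 are %define'd further down as offsets into
;  z_upsample2, z_filter_s and z_filter_k, so those tables cannot be moved freely)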
z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
@@ -65,10 +64,18 @@ z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
pb_12: times 4 db 12 ; those are just placed here for alignment.
pb_14: times 4 db 14
z3_shuf: db 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
z_upsample: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
z_upsample3: db 0, 0, 0, 0, 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5
z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
z3_shuf_w4: db 4, 3, 3, 2, 2, 1, 1, 0, 12, 11, 11, 10, 10, 9, 9, 8
z_transpose4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
@@ -76,13 +83,14 @@ z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
filter_shuf3: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
pb_127_m127: times 2 db 127, -127
ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
db 2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15
ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
pw_64: times 2 dw 64
pb_0to15:
cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; w=8, w_pad=1 as well as second half of previous one
@@ -94,26 +102,27 @@ cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
; w=16,w_pad=3
db 0, 1, 2, 3, 4, 5
times 13 db 6, 7
pb_15to0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
pb_1: times 4 db 1
pb_2: times 4 db 2
pb_4: times 4 db 4
pb_8: times 4 db 8
pb_12: times 4 db 12
pb_14: times 4 db 14
pb_15: times 4 db 15
pb_31: times 4 db 31
pb_128: times 4 db 128
pw_1: times 2 dw 1
pw_8: times 2 dw 8
pw_62: times 2 dw 62
pw_64: times 2 dw 64
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pw_512: times 2 dw 512
%define pb_0to15 cfl_ac_w16_pad_shuffle
%define pb_1 (ipred_h_shuf+12)
%define pb_2 (ipred_h_shuf+20)
%define pb_3 (ipred_h_shuf+ 4)
%define pb_4 (ipred_h_shuf+24)
%define pb_7 (ipred_h_shuf+ 0)
%define pb_8 (z_upsample2 +12)
%define pb_15 (z_filter_s +32)
%define pw_8 (z_filter_k +32)
pb_36_m4: times 2 db 36, -4
pb_127_m127: times 2 db 127, -127
pb_27: times 4 db 27
pb_31: times 4 db 31
pb_128: times 4 db 128
pw_1: times 2 dw 1
pw_62: times 2 dw 62
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pw_512: times 2 dw 512
pb_36_m4: times 2 db 36, -4
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@@ -138,6 +147,7 @@ JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
@@ -1315,10 +1325,8 @@ cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
ALLOC_STACK -32, 8
mova xm1, [tlq-1]
pshufb xm0, xm1, [z_upsample]
vpbroadcastd xm2, [pb_8]
pminub xm2, [z_filter_s+6]
pshufb xm1, xm2
pshufb xm0, xm1, [z_upsample1]
pshufb xm1, [z_upsample2]
vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
add dxd, dxd ; pw_512 (which is already in m3)
pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
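; For reference: the upsample kernel is (-1, 9, 9, -1) with (s + 8) >> 4
; rounding; with the taps scaled by 4 to (-4, 36, 36, -4), pmulhrsw against
; 512 computes (4*s*512*2 + 0x8000) >> 16 = (s + 8) >> 4, i.e. the same
; result pw_2048 would give with the unscaled taps.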
@@ -1375,13 +1383,14 @@ ALIGN function_align
.filter_strength: ; w4/w8/w16
; The C version uses a lot of branches, but we can do all the comparisons
; in parallel and use popcnt to get the final filter strength value.
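; Roughly: each byte lane tests one (block size, angle) threshold pair, so the
; strength is the popcount of the lanes where maxbased matches z_filter_wh[]
; and the angle delta exceeds the corresponding z_filter_t0/t1 entry.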
%define base r3-z_filter_t0
lea r3, [z_filter_t0]
movd xm0, maxbased
movd xm2, angled
lea r3, [z_filter_t0]
shr angled, 8 ; is_sm << 1
vpbroadcastb m0, xm0
vpbroadcastb m2, xm2
pcmpeqb m1, m0, [r3-z_filter_t0+z_filter_wh]
pcmpeqb m1, m0, [base+z_filter_wh]
pand m1, m2
mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
pcmpgtb m1, m2
@@ -1398,14 +1407,13 @@ ALIGN function_align
call .filter_strength
mov maxbased, 7
jz .w4_main ; filter_strength == 0
lea r3, [z_filter_k-4]
vpbroadcastd m7, [pb_8]
vpbroadcastd m7, [base+pb_8]
vbroadcasti128 m2, [tlq-1]
pminub m1, m7, [r3-z_filter_k+z_filter_s+4]
vpbroadcastd m8, [r3+r5*4+12*0]
pminub m7, [r3-z_filter_k+z_filter_s+12]
vpbroadcastd m9, [r3+r5*4+12*1]
vpbroadcastd m10, [r3+r5*4+12*2]
pminub m1, m7, [base+z_filter_s]
vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
pminub m7, [base+z_filter_s+8]
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
pshufb m0, m2, m1
shufps m1, m7, q2121
pmaddubsw m0, m8
@@ -1432,7 +1440,7 @@ ALIGN function_align
mov r3d, dxd ; xpos
movd xm9, maxbased
vpbroadcastw m9, xm9
vbroadcasti128 m8, [z_shuf_w4]
vbroadcasti128 m8, [z1_shuf_w4]
psrlw m7, 8 ; top[max_base_x]
paddw m10, m6, m6
psubw m9, m0 ; max_base_x
@@ -1502,7 +1510,7 @@ ALIGN function_align
movd xm6, hd
vinserti128 m0, [tlq+7], 1
vpbroadcastb xm6, xm6
vbroadcasti128 m1, [z_upsample]
vbroadcasti128 m1, [z_upsample1]
pminub xm6, xm2
vpbroadcastd m7, [pb_36_m4]
vinserti128 m2, xm6, 1
@@ -1561,9 +1569,8 @@ ALIGN function_align
jg .w8_upsample_loop
RET
.w8_no_intra_edge_filter:
mov r3d, 15
cmp hd, 8
cmova maxbased, r3d
and maxbased, 7
or maxbased, 8 ; imin(h+7, 15)
jmp .w8_main
.w8_no_upsample:
%assign stack_offset org_stack_offset
@@ -1572,27 +1579,22 @@ ALIGN function_align
test angled, 0x400
jnz .w8_no_intra_edge_filter
call .filter_strength
vpbroadcastd xm6, [pb_15]
pminub xm6, xm0 ; imin(h, 8) + 7
movd maxbased, xm6
movzx maxbased, maxbaseb
jz .w8_main ; filter_strength == 0
lea r3, [z_filter_k-4]
movu xm2, [tlq]
pminub xm1, xm6, [r3-z_filter_k+z_filter_s+18]
pminub xm1, xm0, [base+z_filter_s+14]
vinserti128 m2, [tlq-1], 1
vinserti128 m1, [r3-z_filter_k+z_filter_s+ 4], 1
vpbroadcastd m7, [r3+r5*4+12*0]
pminub xm6, [r3-z_filter_k+z_filter_s+26]
vinserti128 m6, [r3-z_filter_k+z_filter_s+12], 1
pshufb m0, m2, m1
pmaddubsw m0, m7
vpbroadcastd m7, [r3+r5*4+12*1]
vinserti128 m1, [base+z_filter_s+ 0], 1
vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*0]
pminub xm0, [base+z_filter_s+22]
vinserti128 m0, [base+z_filter_s+ 8], 1
pshufb m6, m2, m1
pmaddubsw m6, m7
vpbroadcastd m7, [base+z_filter_k-4+r5*4+12*1]
movzx r3d, byte [tlq+15]
shufps m1, m6, q2121
shufps m1, m0, q2121
pshufb m1, m2, m1
pmaddubsw m1, m7
paddw m0, m1
paddw m1, m6
sub r5d, 3
jnz .w8_3tap
; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
@@ -1600,24 +1602,24 @@ ALIGN function_align
; slightly different from out[max_base_x] when h > w.
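; For reference, the three edge filter kernels are {0, 4, 8, 4, 0},
; {0, 5, 6, 5, 0} and {2, 4, 4, 4, 2} (kept upshifted by 2 in z_filter_k);
; only strength 3 has nonzero outermost taps, hence the extra pass here.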
vpbroadcastd m7, [z_filter_k+4*8]
movzx r2d, byte [tlq+14]
pshufb m2, m6
pshufb m2, m0
pmaddubsw m2, m7
sub r2d, r3d
lea r2d, [r2+r3*8+4]
shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
mov [rsp+16], r2b
paddw m0, m2
paddw m1, m2
.w8_3tap:
pmulhrsw m0, m3
pmulhrsw m1, m3
sar r5d, 1
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
cmp hd, 8
cmova maxbased, r5d
mov [tlq+r5], r3b
vextracti128 xm1, m0, 1
packuswb xm1, xm0
mova [tlq], xm1
vextracti128 xm0, m1, 1
packuswb xm0, xm1
mova [tlq], xm0
.w8_main:
movd xm2, dxd
vbroadcasti128 m0, [z_base_inc]
@@ -1668,9 +1670,8 @@ ALIGN function_align
.w8_end:
RET
.w16_no_intra_edge_filter:
mov r3d, 31
cmp hd, 16
cmova maxbased, r3d
and maxbased, 15
or maxbased, 16 ; imin(h+15, 31)
jmp .w16_main
ALIGN function_align
.w16:
@@ -1680,25 +1681,18 @@ ALIGN function_align
test angled, 0x400
jnz .w16_no_intra_edge_filter
call .filter_strength
vpbroadcastd m1, [pb_31]
pminub m0, m1 ; imin(h, 16) + 15
movd maxbased, xm0
movzx maxbased, maxbaseb
jz .w16_main ; filter_strength == 0
lea r3, [z_filter_k-4]
vpbroadcastd m1, [pb_12]
vpbroadcastd m11, [pb_15]
vbroadcasti128 m6, [r3-z_filter_k+z_filter_s+12]
vinserti128 m2, m6, [r3-z_filter_k+z_filter_s+4], 0
vinserti128 m6, [r3-z_filter_k+z_filter_s+20], 1
vpbroadcastd m1, [base+pb_12]
vbroadcasti128 m6, [base+z_filter_s+8]
vinserti128 m2, m6, [base+z_filter_s], 0
vinserti128 m6, [base+z_filter_s+16], 1
mova xm10, [tlq-1]
vinserti128 m10, [tlq+3], 1
vpbroadcastd m9, [r3+r5*4+12*0]
vbroadcasti128 m7, [r3-z_filter_k+z_filter_s+18]
vinserti128 m8, m7, [r3-z_filter_k+z_filter_s+10], 0
vinserti128 m7, [r3-z_filter_k+z_filter_s+26], 1
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*0]
vbroadcasti128 m7, [base+z_filter_s+14]
vinserti128 m8, m7, [base+z_filter_s+6], 0
vinserti128 m7, [base+z_filter_s+22], 1
psubw m0, m1
pminub m0, m11 ; imin(h+3, 15)
movu xm11, [tlq+12]
vinserti128 m11, [tlq+16], 1
pminub m8, m0
@@ -1709,7 +1703,7 @@ ALIGN function_align
pshufb m1, m11, m8
shufps m8, m7, q2121
pmaddubsw m1, m9
vpbroadcastd m9, [r3+r5*4+12*1]
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
movzx r3d, byte [tlq+31]
pshufb m2, m10, m2
pmaddubsw m2, m9
@@ -2131,6 +2125,1169 @@ ALIGN function_align
.w64_end:
RET
cglobal ipred_z3, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
%assign org_stack_offset stack_offset
lea r6, [ipred_z3_avx2_table]
tzcnt hd, hm
movifnidn angled, anglem
lea r7, [dr_intra_derivative+90*2]
dec tlq
movsxd hq, [r6+hq*4]
sub angled, 180
add hq, r6
movzx dyd, angleb
xor angled, 0x400
neg dyq
movzx dyd, word [r7+dyq*2]
vpbroadcastd m3, [pw_512]
vpbroadcastd m4, [pw_62]
vpbroadcastd m5, [pw_64]
mov org_wd, wd
jmp hq
.h4:
lea r7, [strideq*3]
cmp angleb, 40
jae .h4_no_upsample
lea r4d, [angleq-1024]
sar r4d, 7
add r4d, wd
jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
ALLOC_STACK -32, 9
movu xm8, [tlq-7]
pshufb xm0, xm8, [z_upsample3]
vpbroadcastb xm2, xm8
pshufb xm1, xm8, [z_filter_s+2]
mova [rsp+16], xm2 ; left[max_base_y]
vpbroadcastd xm2, [pb_36_m4]
add dyd, dyd
pmaddubsw xm0, xm2
pmaddubsw xm1, xm2
movd xm7, dyd
mov r2d, dyd
vpbroadcastw m7, xm7
paddw xm1, xm0
pmulhrsw xm1, xm3
pslldq m6, m7, 8
paddw xm2, xm7, xm7
paddw m6, m7
packuswb xm1, xm1
paddw m6, m2
punpcklbw xm1, xm8
mova xm8, [z_transpose4]
psllw m7, 2
pshufb xm1, [pb_15to0]
mova [rsp], xm1
.h4_upsample_loop:
lea r4d, [r2+dyq]
shr r2d, 6
vpbroadcastq m1, [rsp+r2]
lea r2d, [r4+dyq]
shr r4d, 6
vpbroadcastq m2, [rsp+r4]
lea r4d, [r2+dyq]
shr r2d, 6
movq xm0, [rsp+r2]
lea r2d, [r4+dyq]
shr r4d, 6
movhps xm0, [rsp+r4]
vpblendd m1, m2, 0xc0
pand m2, m4, m6
vpblendd m0, m1, 0xf0
psubw m1, m5, m2
psllw m2, 8
por m1, m2
pmaddubsw m0, m1
paddw m6, m7
pmulhrsw m0, m3
vextracti128 xm1, m0, 1
packuswb xm1, xm0
pshufb xm1, xm8
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm1, 2
pextrd [dstq+r7 ], xm1, 3
add dstq, 4
sub wd, 4
jg .h4_upsample_loop
RET
ALIGN function_align
.filter_strength: ; h4/h8/h16
%define base r4-z_filter_t0
lea r4, [z_filter_t0]
movd xm0, maxbased
movd xm2, angled
shr angled, 8 ; is_sm << 1
vpbroadcastb m0, xm0
vpbroadcastb m2, xm2
pcmpeqb m1, m0, [base+z_filter_wh]
pand m1, m2
mova xm2, [r4+angleq*8]
pcmpgtb m1, m2
pmovmskb r5d, m1
popcnt r5d, r5d
ret
.h4_no_upsample:
%assign stack_offset org_stack_offset
ALLOC_STACK -16, 12
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
jnz .h4_main
lea maxbased, [wq+3]
call .filter_strength
mov maxbased, 7
jz .h4_main ; filter_strength == 0
vpbroadcastd m7, [base+pb_7]
vbroadcasti128 m2, [tlq-14]
pmaxub m1, m7, [base+z_filter_s-4]
vpbroadcastd m8, [base+z_filter_k-4+r5*4+12*0]
pmaxub m7, [base+z_filter_s+4]
vpbroadcastd m9, [base+z_filter_k-4+r5*4+12*1]
vpbroadcastd m10, [base+z_filter_k-4+r5*4+12*2]
pshufb m0, m2, m1
shufps m1, m7, q2121
pmaddubsw m0, m8
pshufb m1, m2, m1
pmaddubsw m1, m9
pshufb m2, m7
pmaddubsw m2, m10
paddw m0, m1
paddw m0, m2
pmulhrsw m0, m3
mov r4d, 9
lea tlq, [rsp+15]
cmp wd, 4
cmova maxbased, r4d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [rsp], xm0
.h4_main:
movd xm6, dyd
vpbroadcastq m0, [z_base_inc] ; base_inc << 6
mov r4, tlq
sub tlq, 4
neg dyq
vpbroadcastw m6, xm6
sub r4, maxbaseq
shl maxbased, 6
vpbroadcastb m7, [r4]
lea r4, [dyq+63] ; ypos
movd xm9, maxbased
sub maxbased, 63
vbroadcasti128 m8, [z3_shuf_w4]
neg maxbaseq
vpbroadcastw m9, xm9
psrlw m7, 8 ; left[max_base_y]
paddw m10, m6, m6
psubw m9, m0 ; max_base_y
vpblendd m6, m10, 0xcc
mova xm0, xm10
paddw m6, m0 ; ypos2 ypos3 ypos0 ypos1
paddw m10, m10
mova xm11, [z_transpose4]
.h4_loop:
lea r5, [r4+dyq]
sar r4, 6 ; base0
vpbroadcastq m1, [tlq+r4]
lea r4, [r5+dyq]
sar r5, 6 ; base1
vpbroadcastq m2, [tlq+r5]
lea r5, [r4+dyq]
sar r4, 6 ; base2
movq xm0, [tlq+r4]
lea r4, [r5+dyq]
sar r5, 6 ; base3
movhps xm0, [tlq+r5]
vpblendd m1, m2, 0xc0
pand m2, m4, m6 ; frac << 1
vpblendd m0, m1, 0xf0
psubw m1, m5, m2 ; (32 - frac) << 1
psllw m2, 8
pshufb m0, m8
por m1, m2 ; (32-frac, frac) << 1
pmaddubsw m0, m1
pcmpgtw m1, m9, m6 ; base < max_base_y
pmulhrsw m0, m3
paddsw m6, m10 ; ypos += dy
vpblendvb m0, m7, m0, m1
vextracti128 xm1, m0, 1
packuswb xm1, xm0
pshufb xm1, xm11 ; transpose
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
pextrd [dstq+strideq*2], xm1, 2
pextrd [dstq+r7 ], xm1, 3
add dstq, 4
sub wd, 4
jz .h4_end
cmp r4d, maxbased
jg .h4_loop
packuswb xm7, xm7
.h4_end_loop:
movd [dstq+strideq*0], xm7
movd [dstq+strideq*1], xm7
movd [dstq+strideq*2], xm7
movd [dstq+r7 ], xm7
add dstq, 4
sub wd, 4
jg .h4_end_loop
.h4_end:
RET
ALIGN function_align
.h8:
lea r4d, [angleq+216]
mov r4b, wb
cmp r4d, 8
ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
%assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
and r4d, 4
mova xm0, [tlq-15]
vinserti128 m0, [tlq- 9], 1
movd xm1, r4d
movu xm2, [z_filter_s+2]
vinserti128 m2, [z_filter_s+6], 1
vpbroadcastb xm1, xm1 ; w & 4
vpbroadcastd m7, [pb_36_m4]
pmaxub xm1, [z_upsample3] ; clip 4x8
vinserti128 m1, [z_upsample1], 1
add dyd, dyd
pshufb m1, m0, m1
pshufb m2, m0, m2
vinserti128 m0, [tlq-7], 1
movd xm6, dyd
pmaddubsw m1, m7
pmaddubsw m2, m7
vpbroadcastw m6, xm6
mov r2d, dyd
lea r5, [strideq*3]
paddw m7, m6, m6
paddw m1, m2
vpblendd m6, m7, 0xf0
pmulhrsw m1, m3
pslldq m2, m7, 8
paddw m7, m7
paddw m6, m2
vbroadcasti128 m2, [pb_15to0]
packuswb m1, m1
punpcklbw m1, m0
pshufb m1, m2
vextracti128 [rsp+ 0], m1, 1
mova [rsp+16], xm1
.h8_upsample_loop:
lea r4d, [r2+dyq]
shr r2d, 6 ; base0
movu xm0, [rsp+r2]
lea r2d, [r4+dyq]
shr r4d, 6 ; base1
vinserti128 m0, [rsp+r4], 1
lea r4d, [r2+dyq]
shr r2d, 6 ; base2
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
punpcklqdq m1, m2, m2 ; frac0 frac1
pmaddubsw m0, m1
movu xm1, [rsp+r2]
lea r2d, [r4+dyq]
shr r4d, 6 ; base3
vinserti128 m1, [rsp+r4], 1
punpckhqdq m2, m2 ; frac2 frac3
pmaddubsw m1, m2
pmulhrsw m0, m3
paddw m6, m7
pmulhrsw m1, m3
lea r4, [dstq+strideq*4]
psllw m1, 8
por m0, m1
vextracti128 xm1, m0, 1
punpcklbw xm2, xm0, xm1
punpckhbw xm0, xm1
movd [dstq+strideq*0], xm2
pextrd [dstq+strideq*1], xm2, 1
pextrd [dstq+strideq*2], xm2, 2
pextrd [dstq+r5 ], xm2, 3
movd [r4 +strideq*0], xm0
pextrd [r4 +strideq*1], xm0, 1
pextrd [r4 +strideq*2], xm0, 2
pextrd [r4 +r5 ], xm0, 3