Commit d6770f93, authored by Henrik Gramner, committed by Ronald S. Bultje

Add ipred_z1 AVX2 asm

parent f816d5cf
......@@ -57,6 +57,21 @@ smooth_weights: SMOOTH_WEIGHT_TABLE \
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
; Note that the order of (some of) the following z constants matters: several
; loads in ipred_z1 read past the end of one table into the start of the next
; (see the per-table notes below), so do not reorder or insert data here.
;
; z_filter_wh: 28 bytes. .filter_strength compares a broadcast maxbase byte for
; equality against a 32-byte load from this label, so that load also covers the
; first 4 bytes of z_filter_k (0, 16, 0, 16). The visible callers pass h+3,
; h+7 or h+15 as maxbase.
z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
; z_filter_k: three 12-byte rows of edge-filter weights. ipred_z1 addresses it
; as [z_filter_k-4 + strength*4 + 12*row] with strength in 1..3, broadcasting
; one dword (a byte pair repeated twice) for use with pmaddubsw. The third row
; is only non-zero for strength 3 (the 5-tap case, read as z_filter_k+4*8).
z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16
db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0
; z_filter_s: 32 bytes of pshufb indices forming overlapping (i, i+1) pairs.
; It is read at several byte offsets (+2, +4, +6, +10, +12, +18, +20, +26);
; the 16/32-byte loads at the larger offsets extend past the end of this table
; into z_filter_t0.
z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
; z_filter_t0 / z_filter_t1: 16-byte threshold rows used by .filter_strength
; via a signed byte compare. The row is selected with
; [z_filter_t0 + (angle >> 8)*8], so z_filter_t1 must directly follow
; z_filter_t0.
z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0
z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0
; z_upsample: pshufb pattern producing (i+1, i) byte pairs; used by the w4/w8
; edge upsampling paths.
z_upsample: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
; z_shuf_w4: pshufb pattern used by .w4_main; each 8-byte half expands five
; consecutive source bytes (0-4 and 8-12) into four (i, i+1) pairs.
z_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
; z_base_inc: per-column offsets x*64 as words (x = 0..7, then 16..23). They
; are subtracted from (maxbase << 6) to build the per-column limit that xpos is
; compared against.
z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
......@@ -67,6 +82,7 @@ ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
; pb_0to15 and cfl_ac_w16_pad_shuffle are two names for the same 16-byte
; identity sequence 0..15. The .w32 path of ipred_z1 broadcasts pb_0to15 and
; uses it with pminub to clamp shuffle indices.
pb_0to15:
cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; w=8, w_pad=1 as well as second half of previous one
......@@ -81,12 +97,21 @@ cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
; Small broadcast constants. Every entry is exactly 4 bytes wide (4 bytes,
; 2 words or 2 byte pairs) so it can be splatted across a vector register
; with a single vpbroadcastd.
;
; pb_N: byte N repeated 4 times.
pb_1: times 4 db 1
pb_2: times 4 db 2
pb_8: times 4 db 8
pb_12: times 4 db 12
pb_14: times 4 db 14
; pb_15 previously lacked the trailing colon; it only assembled because the
; name is not a mnemonic or macro. Declared like its siblings now.
pb_15: times 4 db 15
pb_31: times 4 db 31
pb_128: times 4 db 128
; pw_N: word N repeated twice.
pw_1: times 2 dw 1
pw_8: times 2 dw 8
; pw_62 / pw_64 / pw_512 are broadcast at the top of ipred_z1: pw_62 masks
; xpos to "frac << 1", pw_64 is the complementary weight base, and pw_512 is
; the pmulhrsw multiplier that yields (x + 32) >> 6.
pw_62: times 2 dw 62
pw_64: times 2 dw 64
pw_128: times 2 dw 128
pw_255: times 2 dw 255
pw_512: times 2 dw 512
; Byte pairs used as pmaddubsw weights. pb_36_m4 holds the edge upsampling
; taps pre-scaled by 4 so that pw_512 can be reused for the rounding shift.
pb_36_m4: times 2 db 36, -4
pb_127_m127: times 2 db 127, -127
%macro JMP_TABLE 3-*
......@@ -111,12 +136,14 @@ JMP_TABLE ipred_dc, avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
; Dispatch table for ipred_z1. The function loads a signed 32-bit entry at
; [ipred_z1_avx2_table + tzcnt(w)*4], adds the table address to it and jumps
; there, so the entries are offsets to .w4 .. .w64 relative to the table.
; NOTE(review): the JMP_TABLE macro body is not visible here; presumably it
; biases the table symbol so that tzcnt(4) = 2 selects the first entry -
; confirm against the macro definition.
JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
cextern dr_intra_derivative
cextern filter_intra_taps
SECTION .text
......@@ -1259,6 +1286,849 @@ ALIGN function_align
sub r3, hq
ret
; ipred_z1 entry point: common setup, then a jump to the width-specific code.
;
; State handed to the .wN labels:
;   dstq, strideq  destination pointer and stride
;   tlq            tl + 1 (the edge is indexed relative to this from here on)
;   hd             block height
;   angled         (angle + 165) ^ 0x4ff, see below
;   dxd            word [dr_intra_derivative + (angle & 0xff)*2]
;   m3, m4, m5     broadcast words 512, 62 and 64
;
; NOTE(review): cglobal/ALLOC_STACK come from x86inc, which is not part of this
; file. Presumably "3, 8, 0" means 3 arguments preloaded, 8 GPRs used and no
; vector registers declared yet, and the eight names alias r0..r7 in order -
; which would explain why later code freely uses r3 (w), r5 (angle) and r2 (tl)
; as scratch once those values are dead. Confirm against x86inc.
cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
; Remember the stack offset so that each width path can reset it before
; issuing its own ALLOC_STACK.
%assign org_stack_offset stack_offset
lea r6, [ipred_z1_avx2_table]
tzcnt wd, wm
movifnidn angled, anglem
movifnidn hd, hm
lea r7, [dr_intra_derivative]
inc tlq
; wq = table base + signed 32-bit entry selected by tzcnt(w)
movsxd wq, [r6+wq*4]
add wq, r6
; dx table index = low byte of angle (taken before angled is transformed)
movzx dxd, angleb
; Together with the xor below: the low byte becomes 255 - (angle + 165), i.e.
; 90 - angle, provided the add does not carry out of the low byte. The xor also
; inverts bit 10, so later "test angled, 0x400" checks
; !enable_intra_edge_filter.
add angled, 165 ; ~90
movzx dxd, word [r7+dxq*2]
xor angled, 0x4ff ; d = 90 - angle
; pw_512: pmulhrsw multiplier giving (x + 32) >> 6
; pw_62:  mask extracting "frac << 1" from xpos
; pw_64:  64 - (frac << 1) gives the complementary weight
vpbroadcastd m3, [pw_512]
vpbroadcastd m4, [pw_62]
vpbroadcastd m5, [pw_64]
jmp wq
; w == 4: decide whether the edge is upsampled 2x before predicting. Falls
; through into the upsampling path; everything else goes to .w4_no_upsample.
.w4:
cmp angleb, 40
jae .w4_no_upsample
lea r3d, [angleq-1024]
sar r3d, 7
add r3d, hd
jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
; Upsampling path: build a double-resolution copy of the edge at [rsp]
; (original pixels interleaved with filtered in-between pixels), then predict
; four rows per loop iteration from it.
ALLOC_STACK -32, 8
mova xm1, [tlq-1]
; xm0 = (v[i+1], v[i]) byte pairs of the loaded vector v
pshufb xm0, xm1, [z_upsample]
; xm1 = (v[i+2], v[i+3]) byte pairs, with shuffle indices clamped to 8
vpbroadcastd xm2, [pb_8]
pminub xm2, [z_filter_s+6]
pshufb xm1, xm2
vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
; dx is doubled because the upsampled edge has twice the resolution
add dxd, dxd ; pw_512 (which is already in m3)
pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
; The last dword of the clamped shuffle holds four copies of v[8]; it is stored
; right after the 16 interleaved bytes as padding.
pextrd [rsp+16], xm1, 3 ; top[max_base_x]
pmaddubsw xm1, xm2
movd xm7, dxd
mov r3d, dxd ; xpos
vpbroadcastw m7, xm7
; xm1 = 36*v[i+1] - 4*v[i] + 36*v[i+2] - 4*v[i+3]
paddw xm1, xm0
movq xm0, [tlq]
; (sum + 32) >> 6
pmulhrsw xm1, xm3
; Build the per-row xpos vector in m6 (see the comment on the last paddw) and
; the per-iteration step 4*dx in m7.
pslldq m6, m7, 8
paddw xm2, xm7, xm7
; tl is no longer needed as a pointer: r2 becomes stride*3 for the row stores
lea r2, [strideq*3]
paddw m6, m7
packuswb xm1, xm1
paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
; Interleave the original pixels with the filtered ones
punpcklbw xm0, xm1
psllw m7, 2
mova [rsp], xm0
.w4_upsample_loop:
; r3 and r5 alternate as the scalar xpos of consecutive rows; base = xpos >> 6
; is the byte offset into the upsampled edge.
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
vpbroadcastq m1, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vpbroadcastq m2, [rsp+r5]
lea r5d, [r3+dxq]
shr r3d, 6 ; base2
movq xm0, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base3
movhps xm0, [rsp+r5]
; Combine: lower lane = rows 2 and 3, upper lane = rows 0 and 1
vpblendd m1, m2, 0xc0
pand m2, m4, m6 ; frac << 1
vpblendd m0, m1, 0xf0
psubw m1, m5, m2 ; (32 - frac) << 1
psllw m2, 8
por m1, m2 ; (32-frac, frac) << 1
; Adjacent bytes of the upsampled edge already form the pixel pair to blend,
; so no pshufb is needed here.
pmaddubsw m0, m1
paddw m6, m7 ; xpos += dx
pmulhrsw m0, m3
packuswb m0, m0
vextracti128 xm1, m0, 1
; Lower lane holds rows 2/3, upper lane (xm1) holds rows 0/1
movd [dstq+strideq*2], xm0
pextrd [dstq+r2 ], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_upsample_loop
RET
ALIGN function_align
; Local helper, reached with "call" from the w4/w8/w16 paths.
; In:    maxbased = size key (the callers pass h+3, h+7 or h+15)
;        angled   = transformed angle from the function entry
; Out:   r5d      = filter strength (population count of the compare mask)
;        ZF       = set when the strength is 0
;        m0       = low byte of maxbase broadcast to all bytes (the w8/w16
;                   callers reuse this)
; Clobbers: r3, m1, m2; angled is shifted right by 8 in place.
.filter_strength: ; w4/w8/w16
; The C version uses a lot of branches, but we can do all the comparisons
; in parallel and use popcnt to get the final filter strength value.
movd xm0, maxbased
movd xm2, angled
lea r3, [z_filter_t0]
shr angled, 8 ; is_sm << 1
vpbroadcastb m0, xm0
vpbroadcastb m2, xm2
; 32-byte compare: bytes of the z_filter_wh region equal to the maxbase byte
pcmpeqb m1, m0, [r3-z_filter_t0+z_filter_wh]
; Keep the low byte of angled (d) only in the matching positions
pand m1, m2
; (is_sm << 1) * 8 is 0 or 16, selecting z_filter_t0 or z_filter_t1
mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
; Signed per-byte compare of the masked d against the thresholds
pcmpgtb m1, m2
pmovmskb r5d, m1
popcnt r5d, r5d ; sets ZF which can be used by caller
ret
; w == 4 without upsampling: optionally smooth the edge into a stack buffer,
; then predict four rows per iteration in .w4_loop.
.w4_no_upsample:
%assign stack_offset org_stack_offset
ALLOC_STACK -16, 11
mov maxbased, 7
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w4_main
lea maxbased, [hq+3]
call .filter_strength
; mov does not modify flags, so the jz below still tests the ZF produced by
; popcnt inside .filter_strength.
mov maxbased, 7
jz .w4_main ; filter_strength == 0
; Edge filter. r5 = strength (1..3); with r3 = z_filter_k-4 the address
; [r3+r5*4+12*n] selects this strength's weight pair from row n.
lea r3, [z_filter_k-4]
vpbroadcastd m7, [pb_8]
vbroadcasti128 m2, [tlq-1]
; Shuffle patterns taken from z_filter_s, with indices clamped to 8
pminub m1, m7, [r3-z_filter_k+z_filter_s+4]
vpbroadcastd m8, [r3+r5*4+12*0]
pminub m7, [r3-z_filter_k+z_filter_s+12]
vpbroadcastd m9, [r3+r5*4+12*1]
vpbroadcastd m10, [r3+r5*4+12*2]
pshufb m0, m2, m1
shufps m1, m7, q2121
pmaddubsw m0, m8
pshufb m1, m2, m1
pmaddubsw m1, m9
pshufb m2, m7
pmaddubsw m2, m10
; Sum of the three weighted pixel pairs, then (x + 32) >> 6 via pw_512
paddw m0, m1
paddw m0, m2
pmulhrsw m0, m3
; maxbase becomes 9 when h > 4, otherwise it stays 7
mov r3d, 9
; From here on the edge is read from the filtered copy on the stack
mov tlq, rsp
cmp hd, 4
cmova maxbased, r3d
vextracti128 xm1, m0, 1
packuswb xm0, xm1
mova [tlq], xm0
.w4_main:
; Register roles in the loop:
;   m6  per-row xpos (words)        m10 xpos step per iteration (4*dx)
;   m7  edge[maxbase] as words      m9  (maxbase << 6) - x*64 per column
;   m8  z_shuf_w4 pair shuffle      r3  scalar xpos of the next row
movd xm6, dxd
vpbroadcastq m0, [z_base_inc] ; base_inc << 6
vpbroadcastb m7, [tlq+maxbaseq]
; maxbased holds maxbase << 6 from here on (compared against xpos below)
shl maxbased, 6
vpbroadcastw m6, xm6
mov r3d, dxd ; xpos
movd xm9, maxbased
vpbroadcastw m9, xm9
vbroadcasti128 m8, [z_shuf_w4]
psrlw m7, 8 ; top[max_base_x]
paddw m10, m6, m6
psubw m9, m0 ; max_base_x
vpblendd m6, m10, 0xcc
; The 128-bit move clears the upper lane of m0, so 2*dx is added to the lower
; lane only.
mova xm0, xm10
paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
paddw m10, m10
.w4_loop:
; r3 and r5 alternate as the scalar xpos of consecutive rows; base = xpos >> 6
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
vpbroadcastq m1, [tlq+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vpbroadcastq m2, [tlq+r5]
lea r5d, [r3+dxq]
shr r3d, 6 ; base2
movq xm0, [tlq+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base3
movhps xm0, [tlq+r5]
vpblendd m1, m2, 0xc0
pand m2, m4, m6 ; frac << 1
vpblendd m0, m1, 0xf0
psubw m1, m5, m2 ; (32 - frac) << 1
psllw m2, 8
; Expand each row's bytes into (p[x], p[x+1]) pairs
pshufb m0, m8
por m1, m2 ; (32-frac, frac) << 1
pmaddubsw m0, m1
pcmpgtw m1, m9, m6 ; base < max_base_x
pmulhrsw m0, m3
; Saturating add, so xpos cannot wrap around and pass the compare again
paddsw m6, m10 ; xpos += dx
lea r5, [dstq+strideq*2]
; Keep the interpolated value where the mask is set, else edge[maxbase]
vpblendvb m0, m7, m0, m1
packuswb m0, m0
vextracti128 xm1, m0, 1
; Lower lane holds rows 2/3, upper lane (xm1) holds rows 0/1
movd [r5 +strideq*0], xm0
pextrd [r5 +strideq*1], xm0, 1
movd [dstq+strideq*0], xm1
pextrd [dstq+strideq*1], xm1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jz .w4_end
; Continue interpolating while the next row's xpos is below maxbase << 6
cmp r3d, maxbased
jb .w4_loop
; All remaining rows consist solely of edge[maxbase]
packuswb xm7, xm7
lea r6, [strideq*3]
.w4_end_loop:
movd [dstq+strideq*0], xm7
movd [dstq+strideq*1], xm7
movd [dstq+strideq*2], xm7
movd [dstq+r6 ], xm7
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4_end_loop
.w4_end:
RET
ALIGN function_align
; w == 8: decide whether the edge is upsampled 2x. Falls through into the
; upsampling path; everything else goes to .w8_no_upsample.
.w8:
; 216 = 256 - 40, so the add carries out of the low byte exactly when the low
; byte of angled is >= 40. The low byte is then replaced by h, so r3d <= 8
; only if all bits above the low byte are clear and h <= 8.
lea r3d, [angleq+216]
mov r3b, hb
cmp r3d, 8
ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
%assign stack_offset org_stack_offset
ALLOC_STACK -32, 8
; Build a 32-byte double-resolution edge at [rsp] using the same
; (-4, 36, 36, -4) taps and (x + 32) >> 6 rounding as the w4 upsampling path.
movu xm2, [z_filter_s+6]
mova xm0, [tlq-1]
movd xm6, hd
vinserti128 m0, [tlq+7], 1
vpbroadcastb xm6, xm6
vbroadcasti128 m1, [z_upsample]
; Upper-lane shuffle indices are clamped to h
pminub xm6, xm2
vpbroadcastd m7, [pb_36_m4]
vinserti128 m2, xm6, 1
; dx is doubled because the upsampled edge has twice the resolution
add dxd, dxd
pshufb m1, m0, m1
pshufb m2, m0, m2
movd xm6, dxd
pmaddubsw m1, m7
pmaddubsw m2, m7
vpbroadcastw m6, xm6
mov r3d, dxd
; Shift each lane down by one byte so that it starts at tlq+0 / tlq+8
psrldq m0, 1
; tl is no longer needed as a pointer: r2 becomes stride*3 for the row stores
lea r2, [strideq*3]
; m6 ends up as xpos for rows (0, 2) in the lower lane and rows (1, 3) in the
; upper lane, one row per qword; m7 becomes the per-iteration step 4*dx.
paddw m7, m6, m6
paddw m1, m2
vpblendd m6, m7, 0xf0
pmulhrsw m1, m3
pslldq m2, m7, 8
paddw m7, m7
paddw m6, m2
packuswb m1, m1
; Interleave the original pixels with the filtered ones
punpcklbw m0, m1
mova [rsp], m0
.w8_upsample_loop:
; r3 and r5 alternate as the scalar xpos of consecutive rows; base = xpos >> 6
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
movu xm0, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vinserti128 m0, [rsp+r5], 1
lea r5d, [r3+dxq]
shr r3d, 6 ; base2
; m2 = (64 - frac2, frac2) byte pairs, where frac2 = xpos & 62
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
punpcklqdq m1, m2, m2 ; frac0 frac1
pmaddubsw m0, m1
movu xm1, [rsp+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base3
vinserti128 m1, [rsp+r5], 1
punpckhqdq m2, m2 ; frac2 frac3
pmaddubsw m1, m2
pmulhrsw m0, m3
paddw m6, m7
pmulhrsw m1, m3
; After packing: lower lane = rows 0 and 2, upper lane = rows 1 and 3
packuswb m0, m1
vextracti128 xm1, m0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*2], xm0
movq [dstq+strideq*1], xm1
movhps [dstq+r2 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8_upsample_loop
RET
; Reached from .w8_no_upsample with maxbase = h+7 when the edge filter is
; disabled: maxbase becomes 15 if h > 8, otherwise it stays h+7.
.w8_no_intra_edge_filter:
mov r3d, 15
cmp hd, 8
cmova maxbased, r3d
jmp .w8_main
; w == 8 without upsampling: optionally smooth the edge into a stack buffer,
; then predict two rows per iteration in .w8_loop.
.w8_no_upsample:
%assign stack_offset org_stack_offset
ALLOC_STACK -32, 10
lea maxbased, [hq+7]
test angled, 0x400
jnz .w8_no_intra_edge_filter
call .filter_strength
; None of the instructions up to the jz modify flags, so it still tests the ZF
; produced by popcnt inside .filter_strength. xm0 is the broadcast maxbase
; byte left behind by the helper.
vpbroadcastd xm6, [pb_15]
pminub xm6, xm0 ; imin(h, 8) + 7
movd maxbased, xm6
movzx maxbased, maxbaseb
jz .w8_main ; filter_strength == 0
; Edge filter. r5 = strength (1..3); with r3 = z_filter_k-4 the address
; [r3+r5*4+12*n] selects this strength's weight pair from row n.
lea r3, [z_filter_k-4]
; Lower lane works on [tlq], upper lane on [tlq-1]; the lower-lane shuffle
; indices are clamped to xm6.
movu xm2, [tlq]
pminub xm1, xm6, [r3-z_filter_k+z_filter_s+18]
vinserti128 m2, [tlq-1], 1
vinserti128 m1, [r3-z_filter_k+z_filter_s+ 4], 1
vpbroadcastd m7, [r3+r5*4+12*0]
pminub xm6, [r3-z_filter_k+z_filter_s+26]
vinserti128 m6, [r3-z_filter_k+z_filter_s+12], 1
pshufb m0, m2, m1
pmaddubsw m0, m7
vpbroadcastd m7, [r3+r5*4+12*1]
; r3d = unfiltered edge byte 15, stored past the filtered data further below
movzx r3d, byte [tlq+15]
shufps m1, m6, q2121
pshufb m1, m2, m1
pmaddubsw m1, m7
paddw m0, m1
; r5d = strength - 3; non-zero means the third tap pair is not needed
sub r5d, 3
jnz .w8_3tap
; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
; which also results in an awkward edge case where out[w*2] is
; slightly different from out[max_base_x] when h > w.
vpbroadcastd m7, [z_filter_k+4*8]
; This overwrites r2 (tl); tlq is re-pointed at rsp before its next use
movzx r2d, byte [tlq+14]
pshufb m2, m6
pmaddubsw m2, m7
; r2d = (e14 - e15) + e15*8 + 4 = e14 + e15*7 + 4
sub r2d, r3d
lea r2d, [r2+r3*8+4]
shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
mov [rsp+16], r2b
paddw m0, m2
.w8_3tap:
pmulhrsw m0, m3
; r5d is -2, -1 or 0 here; the arithmetic shift maps that to -1, -1 or 0
sar r5d, 1
; From here on the edge is read from the filtered copy on the stack
mov tlq, rsp
add r5d, 17 ; w*2 + (filter_strength == 3)
cmp hd, 8
cmova maxbased, r5d
; Store the unfiltered last edge byte at index 16 or 17
mov [tlq+r5], r3b
vextracti128 xm1, m0, 1
; xm1 (the upper lane) is packed first, so it supplies bytes 0-7 of the
; stored edge and the lower lane supplies bytes 8-15.
packuswb xm1, xm0
mova [tlq], xm1
.w8_main:
; Register roles in the loop:
;   m2  xpos (lower lane: row n, upper lane: row n+1)   m6  step 2*dx
;   m7  edge[maxbase] as words      m9  (maxbase << 6) - x*64 per column
;   m8  (i, i+1) pair shuffle       r3  scalar xpos of the next row
movd xm2, dxd
vbroadcasti128 m0, [z_base_inc]
vpbroadcastw m2, xm2
vpbroadcastb m7, [tlq+maxbaseq]
; maxbased holds maxbase << 6 from here on (compared against xpos below)
shl maxbased, 6
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
psrlw m7, 8
psubw m9, m0
mov r3d, dxd
paddw m6, m2, m2
vpblendd m2, m6, 0xf0
.w8_loop:
lea r5d, [r3+dxq]
shr r3d, 6
; m1 = (64 - frac2, frac2) byte pairs, where frac2 = xpos & 62
pand m0, m4, m2
psubw m1, m5, m0
psllw m0, 8
por m1, m0
movu xm0, [tlq+r3]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vinserti128 m0, [tlq+r5], 1
pshufb m0, m8
pmaddubsw m0, m1
; Mask is set where (maxbase << 6) - x*64 > xpos
pcmpgtw m1, m9, m2
; Saturating add, so xpos cannot wrap around and pass the compare again
paddsw m2, m6
pmulhrsw m0, m3
; Keep the interpolated value where the mask is set, else edge[maxbase]
vpblendvb m0, m7, m0, m1
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
lea dstq, [dstq+strideq*2]
sub hd, 2
jz .w8_end
; Continue interpolating while the next row's xpos is below maxbase << 6
cmp r3d, maxbased
jb .w8_loop
; All remaining rows consist solely of edge[maxbase]
packuswb xm7, xm7
.w8_end_loop:
movq [dstq+strideq*0], xm7
movq [dstq+strideq*1], xm7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w8_end_loop
.w8_end:
RET
; Reached from .w16 with maxbase = h+15 when the edge filter is disabled:
; maxbase becomes 31 if h > 16, otherwise it stays h+15.
.w16_no_intra_edge_filter:
mov r3d, 31
cmp hd, 16
cmova maxbased, r3d
jmp .w16_main
ALIGN function_align
; w == 16 (this width has no upsampling path): optionally smooth the edge into
; a stack buffer, then predict two rows per iteration in .w16_loop.
.w16:
%assign stack_offset org_stack_offset
ALLOC_STACK -64, 12
lea maxbased, [hq+15]
test angled, 0x400
jnz .w16_no_intra_edge_filter
call .filter_strength
; None of the instructions up to the jz modify flags, so it still tests the ZF
; produced by popcnt inside .filter_strength. m0 is the broadcast maxbase byte
; left behind by the helper.
vpbroadcastd m1, [pb_31]
pminub m0, m1 ; imin(h, 16) + 15
movd maxbased, xm0
movzx maxbased, maxbaseb
jz .w16_main ; filter_strength == 0
; Edge filter over 32 edge bytes, processed as two 256-bit halves:
;   m10 = edge data for the first half  (loaded from tlq-1 / tlq+3)
;   m11 = edge data for the second half (loaded from tlq+12 / tlq+16)
;   m2, m6 = shuffle patterns for the first half
;   m8, m7 = shuffle patterns for the second half, clamped by m0
; r5 = strength (1..3); with r3 = z_filter_k-4 the address [r3+r5*4+12*n]
; selects this strength's weight pair from row n.
lea r3, [z_filter_k-4]
vpbroadcastd m1, [pb_12]
vpbroadcastd m11, [pb_15]
vbroadcasti128 m6, [r3-z_filter_k+z_filter_s+12]
vinserti128 m2, m6, [r3-z_filter_k+z_filter_s+4], 0
vinserti128 m6, [r3-z_filter_k+z_filter_s+20], 1
mova xm10, [tlq-1]
vinserti128 m10, [tlq+3], 1
vpbroadcastd m9, [r3+r5*4+12*0]
vbroadcasti128 m7, [r3-z_filter_k+z_filter_s+18]
vinserti128 m8, m7, [r3-z_filter_k+z_filter_s+10], 0
vinserti128 m7, [r3-z_filter_k+z_filter_s+26], 1
; Every byte of m0 is at most 31 and at least 12 here (h+15 clamped), so the
; word-wise subtract of 12 per byte cannot borrow between bytes.
psubw m0, m1
pminub m0, m11 ; imin(h+3, 15)
movu xm11, [tlq+12]
vinserti128 m11, [tlq+16], 1
pminub m8, m0
pminub m7, m0
pshufb m0, m10, m2
shufps m2, m6, q2121
pmaddubsw m0, m9
pshufb m1, m11, m8
shufps m8, m7, q2121
pmaddubsw m1, m9
vpbroadcastd m9, [r3+r5*4+12*1]
; r3d = unfiltered edge byte 31, stored past the filtered data further below
movzx r3d, byte [tlq+31]
pshufb m2, m10, m2
pmaddubsw m2, m9
pshufb m8, m11, m8
pmaddubsw m8, m9
paddw m0, m2
paddw m1, m8
; r5d = strength - 3; non-zero means the third tap pair is not needed
sub r5d, 3
jnz .w16_3tap
; Strength 3: add the third tap pair and compute the extra edge byte
; (e30 + e31*7 + 4) >> 3, which is stored at [rsp+32].
vpbroadcastd m9, [z_filter_k+4*8]
; This overwrites r2 (tl); tlq is re-pointed at rsp before its next use
movzx r2d, byte [tlq+30]
pshufb m10, m6
pmaddubsw m10, m9
pshufb m11, m7
pmaddubsw m11, m9
sub r2d, r3d
lea r2d, [r2+r3*8+4]
shr r2d, 3
mov [rsp+32], r2b
paddw m0, m10
paddw m1, m11
.w16_3tap:
pmulhrsw m0, m3
pmulhrsw m1, m3
; r5d is -2, -1 or 0 here; the arithmetic shift maps that to -1, -1 or 0,
; so r5d becomes 32 or 33 (33 only for strength 3)
sar r5d, 1
; From here on the edge is read from the filtered copy on the stack
mov tlq, rsp
add r5d, 33
cmp hd, 16
cmova maxbased, r5d
; Store the unfiltered last edge byte at index 32 or 33
mov [tlq+r5], r3b
packuswb m0, m1
; packuswb works per 128-bit lane; vpermq restores linear byte order
vpermq m0, m0, q3120
mova [tlq], m0
.w16_main:
; Register roles in the loop:
;   m6  xpos (lower lane: row n, upper lane: row n+1)   m11 step 2*dx
;   m7  edge[maxbase] as bytes (blended after packing, so no psrlw here)
;   m9  limit for columns 0-7       m10 limit for columns 8-15 (m9 - 8*64)
;   m8  (i, i+1) pair shuffle       r3  scalar xpos of the next row
movd xm6, dxd
vbroadcasti128 m0, [z_base_inc]
vpbroadcastb m7, [tlq+maxbaseq]
; maxbased holds maxbase << 6 from here on (compared against xpos below)
shl maxbased, 6
vpbroadcastw m6, xm6
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
mov r3d, dxd
psubw m9, m0
paddw m11, m6, m6
; m3 holds 512 = 64*8, reused here as the offset for columns 8-15
psubw m10, m9, m3 ; 64*8
vpblendd m6, m11, 0xf0
.w16_loop:
lea r5d, [r3+dxq]
shr r3d, 6 ; base0
; m2 = (64 - frac2, frac2) byte pairs, where frac2 = xpos & 62
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
; m0 = columns 0-7, m1 = columns 8-15 (row n lower lane, row n+1 upper lane)
movu xm0, [tlq+r3+0]
movu xm1, [tlq+r3+8]
lea r3d, [r5+dxq]
shr r5d, 6 ; base1
vinserti128 m0, [tlq+r5+0], 1
vinserti128 m1, [tlq+r5+8], 1
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
; Build a byte mask in the same lane layout as the packed pixels
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
; Saturating add, so xpos cannot wrap around and pass the compare again
paddsw m6, m11
; Keep the interpolated value where the mask is set, else edge[maxbase]
vpblendvb m0, m7, m0, m1
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
lea dstq, [dstq+strideq*2]
sub hd, 2
jz .w16_end
; Continue interpolating while the next row's xpos is below maxbase << 6
cmp r3d, maxbased
jb .w16_loop
; All remaining rows consist solely of edge[maxbase]
.w16_end_loop:
mova [dstq+strideq*0], xm7
mova [dstq+strideq*1], xm7
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16_end_loop
.w16_end:
RET
ALIGN function_align
.w32:
%assign stack_offset org_stack_offset
ALLOC_STACK -96, 15
lea r3d, [hq+31]
mov maxbased, 63
cmp hd, 32
cmovb maxbased, r3d
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w32_main
vbroadcasti128 m0, [pb_0to15]
sub r3d, 29 ; h+2
movu xm13, [tlq+29] ; 32-39
movd xm1, r3d
movu xm14, [tlq+37] ; 40-47
sub r3d, 8 ; h-6
vinserti128 m14, [tlq+51], 1 ; 56-63
vpbroadcastb xm1, xm1
mova xm11, [tlq- 1] ; 0- 7
vinserti128 m11, [tlq+13], 1 ; 16-23
movd xm2, r3d
movu xm12, [tlq+ 5] ; 8-15
vinserti128 m12, [tlq+19], 1 ; 24-31
pminub xm1, xm0 ; clip 32x8
mova m7, [z_filter_s+0]
pshufb xm13, xm1
vpbroadcastd m1, [pb_12]
vpbroadcastb xm2, xm2
vinserti128 m13, [tlq+43], 1 ; 48-55
vinserti128 m8, m7, [z_filter_s+4], 1
vpblendd m2, m1, 0xf0
vinserti128 m7, [z_filter_s+12], 0
pminub m2, m0 ; clip 32x16 and 32x(32|64)
vpbroadcastd m9, [z_filter_k+4*2+12*0]
pshufb m14, m2
pshufb m0, m11, m8
shufps m8, m7, q1021
pmaddubsw m0, m9
pshufb m2, m12, m8
pmaddubsw m2, m9
pshufb m1, m13, m8
pmaddubsw m1, m9
pshufb m6, m14, m8
pmaddubsw m6, m9
vpbroadcastd m9, [z_filter_k+4*2+12*1]
pshufb m10, m11, m8
shufps m8, m7, q2121
pmaddubsw m10, m9
paddw m0, m10
pshufb m10, m12, m8
pmaddubsw m10, m9
paddw m2, m10
pshufb m10, m13, m8
pmaddubsw m10, m9
paddw m1, m10
pshufb m10, m14, m8
pmaddubsw m10, m9
paddw m6, m10
vpbroadcastd m9, [z_filter_k+4*2+12*2]
pshufb m11, m8
pmaddubsw m11, m9
pshufb m12, m7
pmaddubsw m12, m9
movzx r3d, byte [tlq+63]
movzx r2d, byte [tlq+62]
paddw m0, m11
paddw m2, m12
pshufb m13, m7
pmaddubsw m13, m9
pshufb m14, m7
pmaddubsw m14, m9
paddw m1, m13
paddw m6, m14
sub r2d, r3d
lea r2d, [r2+r3*8+4] ; edge case for 32x64
pmulhrsw m0, m3
pmulhrsw m2, m3
pmulhrsw m1, m3
pmulhrsw m6, m3
shr r2d, 3
mov [rsp+64], r2b
mov tlq, rsp
mov [tlq+65], r3b
mov r3d, 65
cmp hd, 32
cmova maxbased, r3d
packuswb m0, m2
packuswb m1, m6
mova [tlq+ 0], m0
mova [tlq+32], m1
.w32_main:
movd xm6, dxd
vpbroadcastb m7, [tlq+maxbaseq]
shl maxbased, 6
vpbroadcastw m6, xm6
movd xm9, maxbased
vbroadcasti128 m8, [z_filter_s+2]
vpbroadcastw m9, xm9
mov r3d, dxd
psubw m9, [z_base_inc]
mova m11, m6
psubw m10, m9, m3 ; 64*8
.w32_loop:
mov r5d, r3d
shr r5d, 6
pand m1, m4, m6
psubw m2, m5, m1
psllw m1, 8
por m2, m1
movu m0, [tlq+r5+0]
movu m1, [tlq+r5+8]
add r3d, dxd
pshufb m0, m8
pshufb m1, m8
pmaddubsw m0, m2
pmaddubsw m1, m2
pmulhrsw m0, m3
pmulhrsw m1, m3
packuswb m0, m1
pcmpgtw m1, m9, m6
pcmpgtw m2, m10, m6
packsswb m1, m2
paddsw m6, m11
vpblendvb m0, m7, m0, m1
mova [dstq], m0
add dstq, strideq
dec hd