Commit 04b70ea5 authored by Ronald S. Bultje's avatar Ronald S. Bultje

Add vertical loopfilter AVX2 SIMD

parent a146437d
......@@ -347,8 +347,8 @@ void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
int level;
// set E/I/H values from loopfilter level
const int sharp = filter_sharpness;
for (level = 0; level < 64; level++) {
const int sharp = filter_sharpness;
int limit = level;
if (sharp > 0) {
......@@ -360,6 +360,8 @@ void dav1d_calc_eih(Av1FilterLUT *const lim_lut, const int filter_sharpness) {
lim_lut->i[level] = limit;
lim_lut->e[level] = 2 * (level + 2) + limit;
}
lim_lut->sharp[0] = (sharp + 3) >> 2;
lim_lut->sharp[1] = sharp ? 9 - sharp : 0xff;
}
static void dav1d_calc_lf_value(uint8_t (*const lflvl_values)[2],
......
......@@ -36,6 +36,7 @@
typedef struct Av1FilterLUT {
uint8_t e[64];
uint8_t i[64];
uint64_t sharp[2];
} Av1FilterLUT;
typedef struct Av1RestorationUnit {
......
......@@ -231,4 +231,8 @@ void bitfn(dav1d_loop_filter_dsp_init)(Dav1dLoopFilterDSPContext *const c) {
c->loop_filter_sb128y = loop_filter_v_sb128y_c;
c->loop_filter_sb128uv = loop_filter_v_sb128uv_c;
#if HAVE_ASM && ARCH_X86
bitfn(dav1d_loop_filter_dsp_init_x86)(c);
#endif
}
......@@ -62,4 +62,7 @@ typedef struct Dav1dLoopFilterDSPContext {
void dav1d_loop_filter_dsp_init_8bpc(Dav1dLoopFilterDSPContext *c);
void dav1d_loop_filter_dsp_init_10bpc(Dav1dLoopFilterDSPContext *c);
void dav1d_loop_filter_dsp_init_x86_8bpc(Dav1dLoopFilterDSPContext *c);
void dav1d_loop_filter_dsp_init_x86_10bpc(Dav1dLoopFilterDSPContext *c);
#endif /* __DAV1D_SRC_LOOPFILTER_H__ */
......@@ -99,6 +99,7 @@ if is_asm_enabled
libdav1d_tmpl_sources += files(
'x86/itx_init.c',
'x86/loopfilter_init.c',
'x86/mc_init.c',
)
......@@ -106,6 +107,7 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/mc.asm',
)
......
; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
pb_7_1: times 16 db 7, 1
pb_3_1: times 16 db 3, 1
pb_2_1: times 16 db 2, 1
pb_m1_0: times 16 db -1, 0
pb_m1_1: times 16 db -1, 1
pb_m1_2: times 16 db -1, 2
pb_1: times 32 db 1
pb_2: times 32 db 2
pb_3: times 32 db 3
pb_4: times 32 db 4
pb_16: times 32 db 16
pb_63: times 32 db 63
pb_64: times 32 db 64
pb_128: times 32 db 0x80
pb_129: times 32 db 0x81
pb_240: times 32 db 0xf0
pb_248: times 32 db 0xf8
pb_254: times 32 db 0xfe
pw_2048: times 16 dw 2048
pw_4096: times 16 dw 4096
pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
SECTION .text
%macro ABSSUB 4 ; dst, a, b, tmp
psubusb %1, %2, %3
psubusb %4, %3, %2
por %1, %4
%endmacro
%macro FILTER 1 ; width
movu m1, [lq]
movu m0, [lq+l_strideq]
pxor m2, m2
pcmpeqb m3, m2, m0
pand m1, m3
por m0, m1 ; l[x][] ? l[x][] : l[x-stride][]
pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
pcmpeqb m10, m2, m0 ; !L
pand m1, m0, [pb_240]
psrlq m1, 4 ; H
psrlq m2, m0, [lutq+128]
pand m2, [pb_63]
vpbroadcastb m4, [lutq+136]
pminub m2, m4
pmaxub m2, [pb_1] ; I
paddb m0, [pb_2]
paddb m0, m0
paddb m0, m2 ; E
pxor m1, [pb_128]
pxor m2, [pb_128]
pxor m0, [pb_128]
%if %1 == 4
lea tmpq, [dstq+mstrideq*2]
mova m3, [tmpq+strideq*0] ; p1
mova m4, [tmpq+strideq*1] ; p0
mova m5, [tmpq+strideq*2] ; q0
mova m6, [tmpq+stride3q] ; q1
%else
lea tmpq, [dstq+mstrideq*4]
mova m3, [tmpq+strideq*2]
mova m4, [tmpq+stride3q]
mova m5, [dstq+strideq*0]
mova m6, [dstq+strideq*1]
%endif
ABSSUB m8, m3, m4, m9 ; abs(p1-p0)
pmaxub m8, m10
ABSSUB m9, m5, m6, m10 ; abs(q1-q0)
pmaxub m8, m9
%if %1 == 4
pxor m8, [pb_128]
pcmpgtb m7, m8, m1 ; hev
%else
pxor m7, m8, [pb_128]
pcmpgtb m7, m1 ; hev
%if %1 != 6
mova m12, [tmpq+strideq*0]
%endif
mova m13, [tmpq+strideq*1]
mova m14, [dstq+strideq*2]
%if %1 != 6
mova m15, [dstq+stride3q]
%endif
%if %1 == 6
ABSSUB m9, m13, m4, m10 ; abs(p2-p0)
pmaxub m9, m8
%else
ABSSUB m9, m12, m4, m10 ; abs(p3-p0)
pmaxub m9, m8
ABSSUB m10, m13, m4, m11 ; abs(p2-p0)
pmaxub m9, m10
%endif
ABSSUB m10, m5, m14, m11 ; abs(q2-q0)
pmaxub m9, m10
%if %1 != 6
ABSSUB m10, m5, m15, m11 ; abs(q3-q0)
pmaxub m9, m10
%endif
pxor m9, [pb_128]
pcmpgtb m9, [pb_129] ; !flat8in
%if %1 == 6
ABSSUB m10, m13, m3, m1 ; abs(p2-p1)
%else
ABSSUB m10, m12, m13, m11 ; abs(p3-p2)
ABSSUB m11, m13, m3, m1 ; abs(p2-p1)
pmaxub m10, m11
ABSSUB m11, m14, m15, m1 ; abs(q3-q2)
pmaxub m10, m11
%endif
ABSSUB m11, m14, m6, m1 ; abs(q2-q1)
pmaxub m10, m11
%if %1 == 16
vpbroadcastd m11, [maskq+8]
vpbroadcastd m1, [maskq+4]
por m11, m1
pand m11, [pb_mask]
pcmpeqd m11, [pb_mask]
pand m10, m11
%else
vpbroadcastd m11, [maskq+4]
pand m11, [pb_mask]
pcmpeqd m11, [pb_mask]
pand m10, m11 ; only apply fm-wide to wd>4 blocks
%endif
pmaxub m8, m10
pxor m8, [pb_128]
%endif
pcmpgtb m8, m2
ABSSUB m10, m3, m6, m11 ; abs(p1-q1)
ABSSUB m11, m4, m5, m2 ; abs(p0-q0)
paddusb m11, m11
pand m10, [pb_254]
psrlq m10, 1
paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1)
pxor m10, [pb_128]
pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
por m8, m10
%if %1 == 16
lea tmpq, [dstq+mstrideq*8]
mova m0, [tmpq+strideq*1]
ABSSUB m1, m0, m4, m2
mova m0, [tmpq+strideq*2]
ABSSUB m2, m0, m4, m10
pmaxub m1, m2
mova m0, [tmpq+stride3q]
ABSSUB m2, m0, m4, m10
pmaxub m1, m2
lea tmpq, [dstq+strideq*4]
mova m0, [tmpq+strideq*0]
ABSSUB m2, m0, m5, m10
pmaxub m1, m2
mova m0, [tmpq+strideq*1]
ABSSUB m2, m0, m5, m10
pmaxub m1, m2
mova m0, [tmpq+strideq*2]
ABSSUB m2, m0, m5, m10
pmaxub m1, m2
pxor m1, [pb_128]
pcmpgtb m1, [pb_129] ; !flat8out
por m1, m9 ; !flat8in | !flat8out
vpbroadcastd m2, [maskq+8]
pand m10, m2, [pb_mask]
pcmpeqd m10, [pb_mask]
pandn m1, m10 ; flat16
pandn m1, m8, m1 ; flat16 & fm
vpbroadcastd m10, [maskq+4]
por m10, m2
pand m2, m10, [pb_mask]
pcmpeqd m2, [pb_mask]
pandn m9, m2 ; flat8in
pandn m9, m8, m9
vpbroadcastd m2, [maskq+0]
por m2, m10
pand m2, [pb_mask]
pcmpeqd m2, [pb_mask]
pandn m8, m2
pandn m8, m9, m8 ; fm & !flat8 & !flat16
pandn m9, m1, m9 ; flat8 & !flat16
%elif %1 != 4
vpbroadcastd m0, [maskq+4]
pand m2, m0, [pb_mask]
pcmpeqd m2, [pb_mask]
pandn m9, m2
pandn m9, m8, m9 ; flat8 & fm
vpbroadcastd m2, [maskq+0]
por m0, m2
pand m0, [pb_mask]
pcmpeqd m0, [pb_mask]
pandn m8, m0
pandn m8, m9, m8 ; fm & !flat8
%else
vpbroadcastd m0, [maskq+0]
pand m0, [pb_mask]
pcmpeqd m0, [pb_mask]
pandn m8, m0 ; fm
%endif
; short filter
pxor m3, [pb_128]
pxor m6, [pb_128]
psubsb m10, m3, m6 ; iclip_diff(p1-q1)
pand m10, m7 ; f=iclip_diff(p1-q1)&hev
pxor m4, [pb_128]
pxor m5, [pb_128]
psubsb m11, m5, m4
paddsb m10, m11
paddsb m10, m11
paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f)
pand m8, m10 ; f&=fm
paddsb m10, m8, [pb_3]
paddsb m8, [pb_4]
pand m10, [pb_248]
pand m8, [pb_248]
psrlq m10, 3
psrlq m8, 3
pxor m10, [pb_16]
pxor m8, [pb_16]
psubb m10, [pb_16] ; f2
psubb m8, [pb_16] ; f1
paddsb m4, m10
psubsb m5, m8
pxor m4, [pb_128]
pxor m5, [pb_128]
pxor m8, [pb_128]
pxor m10, m10
pavgb m8, m10 ; f=(f1+1)>>1
psubb m8, [pb_64]
pandn m8, m7, m8 ; f&=!hev
paddsb m3, m8
psubsb m6, m8
pxor m3, [pb_128]
pxor m6, [pb_128]
%if %1 == 16
; flat16 filter
lea tmpq, [dstq+mstrideq*8]
mova m0, [tmpq+strideq*1] ; p6
mova m2, [tmpq+strideq*2] ; p5
mova m7, [tmpq+stride3q] ; p4
mova [rsp+0*32], m9
mova [rsp+1*32], m14
mova [rsp+2*32], m15
; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
; write -6
punpcklbw m14, m0, m12
punpckhbw m15, m0, m12
pmaddubsw m10, m14, [pb_7_1]
pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3
punpcklbw m8, m2, m7
punpckhbw m9, m2, m7
pmaddubsw m8, [pb_2]
pmaddubsw m9, [pb_2]
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3
punpcklbw m8, m13, m3
punpckhbw m9, m13, m3
pmaddubsw m8, [pb_1]
pmaddubsw m9, [pb_1]
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1
punpcklbw m8, m4, m5
punpckhbw m9, m4, m5
pmaddubsw m8, [pb_1]
pmaddubsw m9, [pb_1]
paddw m10, m8
paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m2
por m8, m9
mova [tmpq+strideq*2], m8 ; p5
; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
; write -5
pmaddubsw m14, [pb_m1_1]
pmaddubsw m15, [pb_m1_1]
paddw m10, m14
paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
punpcklbw m8, m0, m6
punpckhbw m9, m0, m6
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
mova [rsp+3*32], m8
mova [rsp+4*32], m9
paddw m10, m8
paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m7
por m8, m9
mova [tmpq+stride3q], m8 ; p4
; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
; write -4
mova m14, [rsp+1*32]
punpcklbw m8, m0, m13
punpckhbw m9, m0, m13
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
punpcklbw m8, m2, m14
punpckhbw m2, m14
pmaddubsw m8, [pb_m1_1]
pmaddubsw m2, [pb_m1_1]
mova [rsp+1*32], m8
paddw m10, m8
paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m12
por m8, m9
mova [tmpq+strideq*4], m8 ; p3
; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
; write -3
mova m15, [rsp+2*32]
punpcklbw m8, m0, m3
punpckhbw m9, m0, m3
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
punpcklbw m8, m7, m15
punpckhbw m7, m15
pmaddubsw m8, [pb_m1_1]
pmaddubsw m7, [pb_m1_1]
mova [rsp+2*32], m8
paddw m10, m8
paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m13
por m8, m9
mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F
; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
; write -2
lea tmpq, [dstq+strideq*4]
punpcklbw m8, m0, m4
punpckhbw m9, m0, m4
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
mova m9, [tmpq+strideq*0] ; q4
punpcklbw m8, m12, m9
punpckhbw m9, m12, m9
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
mova [rsp+7*32], m8
mova [rsp+5*32], m9
paddw m10, m8
paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m3
por m8, m9
mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G
; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
; write -1
mova m9, [tmpq+strideq*1] ; q5
punpcklbw m8, m0, m5
punpckhbw m0, m5
pmaddubsw m8, [pb_m1_1]
pmaddubsw m0, [pb_m1_1]
paddw m10, m8
paddw m11, m0 ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
punpcklbw m0, m13, m9
punpckhbw m9, m13, m9
mova m13, [rsp+6*32]
pmaddubsw m0, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
mova [rsp+ 9*32], m0
mova [rsp+10*32], m9
paddw m10, m0
paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
pmulhrsw m0, m10, [pw_2048]
pmulhrsw m8, m11, [pw_2048]
packuswb m0, m8
pand m0, m1
pandn m8, m1, m4
por m0, m8
mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H
; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
; write +0
mova m0, [tmpq+strideq*2] ; q6
paddw m10, [rsp+3*32]
paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
punpcklbw m8, m3, m0
punpckhbw m9, m3, m0
mova m3, [rsp+8*32]
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
mova [rsp+3*32], m8
mova [rsp+4*32], m9
paddw m10, m8
paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m5
por m8, m9
mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I
; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
; write +1
paddw m10, [rsp+1*32]
paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
punpcklbw m8, m4, m0
punpckhbw m2, m4, m0
mova m4, [rsp+6*32]
pmaddubsw m8, [pb_m1_1]
pmaddubsw m2, [pb_m1_1]
paddw m10, m8
paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
pmulhrsw m2, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m2, m9
pand m2, m1
pandn m9, m1, m6
por m2, m9 ; don't clobber q1/m6 since we need it in K
; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
; write +2
paddw m10, [rsp+2*32]
paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
punpcklbw m8, m5, m0
punpckhbw m9, m5, m0
mova m5, [rsp+8*32]
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
pmulhrsw m7, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m7, m9
pand m7, m1
pandn m9, m1, m14
por m7, m9 ; don't clobber q2/m14 since we need it in K
; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
; write +3
paddw m10, [rsp+7*32]
paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
punpcklbw m8, m6, m0
punpckhbw m9, m6, m0
SWAP 2, 6
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, m15
por m8, m9
mova [tmpq+mstrideq], m8 ; q3
; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
; write +4
paddw m10, [rsp+ 9*32]
paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
punpcklbw m8, m14, m0
punpckhbw m9, m14, m0
SWAP 14, 7
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
pmulhrsw m8, m10, [pw_2048]
pmulhrsw m9, m11, [pw_2048]
packuswb m8, m9
pand m8, m1
pandn m9, m1, [tmpq+strideq*0]
por m8, m9
mova [tmpq+strideq*0], m8 ; q4
; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
; write +5
paddw m10, [rsp+3*32]
paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
punpcklbw m8, m15, m0
punpckhbw m9, m15, m0
pmaddubsw m8, [pb_m1_1]
pmaddubsw m9, [pb_m1_1]
paddw m10, m8
paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
pmulhrsw m10, [pw_2048]
pmulhrsw m11, [pw_2048]
packuswb m10, m11
pand m10, m1
pandn m11, m1, [tmpq+strideq*1]
por m10, m11
mova [tmpq+strideq*1], m10 ; q5
mova m9, [rsp+0*32]
lea tmpq, [dstq+mstrideq*4]
%endif
%if %1 >= 8
; flat8 filter
punpcklbw m0, m12, m3
punpckhbw m1, m12, m3
pmaddubsw m2, m0, [pb_3_1]
pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1
punpcklbw m8, m13, m4
punpckhbw m11, m13, m4
pmaddubsw m8, [pb_2_1]
pmaddubsw m11, [pb_2_1]
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0
punpcklbw m8, m5, [pb_4]
punpckhbw m11, m5, [pb_4]
pmaddubsw m8, [pb_1]
pmaddubsw m11, [pb_1]
paddw m2, m8
paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m13
por m8, m11 ; p2
mova [tmpq+strideq*1], m8 ; p2
pmaddubsw m8, m0, [pb_m1_1]
pmaddubsw m11, m1, [pb_m1_1]
paddw m2, m8
paddw m7, m11
punpcklbw m8, m13, m6
punpckhbw m11, m13, m6
pmaddubsw m8, [pb_m1_1]
pmaddubsw m11, [pb_m1_1]
paddw m2, m8
paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m3
por m8, m11 ; p1
mova [tmpq+strideq*2], m8 ; p1
pmaddubsw m0, [pb_1]
pmaddubsw m1, [pb_1]
psubw m2, m0
psubw m7, m1
punpcklbw m8, m4, m14
punpckhbw m11, m4, m14
pmaddubsw m8, [pb_1]
pmaddubsw m11, [pb_1]
paddw m2, m8
paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m4
por m8, m11 ; p0
mova [tmpq+stride3q ], m8 ; p0
punpcklbw m0, m5, m15
punpckhbw m1, m5, m15
pmaddubsw m8, m0, [pb_1]
pmaddubsw m11, m1, [pb_1]
paddw m2, m8
paddw m7, m11
punpcklbw m8, m4, m12
punpckhbw m11, m4, m12
pmaddubsw m8, [pb_1]
pmaddubsw m11, [pb_1]
psubw m2, m8
psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m5
por m8, m11 ; q0
mova [dstq+strideq*0], m8 ; q0
pmaddubsw m0, [pb_m1_1]
pmaddubsw m1, [pb_m1_1]
paddw m2, m0
paddw m7, m1
punpcklbw m8, m13, m6
punpckhbw m11, m13, m6
pmaddubsw m8, [pb_m1_1]
pmaddubsw m11, [pb_m1_1]
paddw m2, m8
paddw m7, m11 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
psrlw m8, m2, 3
psrlw m11, m7, 3
packuswb m8, m11
pand m8, m9
pandn m11, m9, m6
por m8, m11 ; q1
mova [dstq+strideq*1], m8 ; q1