Commit 58fc5165 authored by Henrik Gramner

Split MC blend

The mstride == 0, mstride == 1, and mstride == w cases are very different
from each other, and splitting them into separate functions makes it easier
to optimize them.

Also add some further optimizations to the AVX2 asm that became possible
after this change.
parent 8b8e9fe8
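
For context, here is an illustrative sketch (not part of the patch) of the semantics being split. The hypothetical blend_example() below mirrors the old blend_c() shown in the diff: the mstride argument selected between a full per-pixel mask (mstride == w), a per-column mask that stays constant down each column (mstride == 0), and a per-row mask that stays constant across each row (mstride == 1). After this commit those three cases map to the dedicated blend(), blend_v() and blend_h() entry points, so each function has a fixed mask access pattern instead of one selected at run time.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for the old unified interface; pixels are assumed to
 * be 8-bit here, whereas the real code is templated over the bit depth. */
#define blend_px(a, b, m) (((a) * (64 - (m)) + (b) * (m) + 32) >> 6)

static void blend_example(uint8_t *dst, const ptrdiff_t dst_stride,
                          const uint8_t *tmp, const int w, const int h,
                          const uint8_t *mask, const ptrdiff_t mstride)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            /* mstride == w: full per-pixel mask -> c->blend()
             * mstride == 0: per-column mask     -> c->blend_v()
             * mstride == 1: per-row mask        -> c->blend_h() */
            const int m = mstride == 1 ? mask[0] : mask[x];
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
        dst  += dst_stride;
        tmp  += w;
        mask += mstride;
    }
}

Keeping that mask-layout decision out of the hot loop is what the split buys: each specialized function knows its access pattern at compile time, which the SIMD versions can exploit.
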
@@ -81,11 +81,14 @@ void (name)(pixel *dst, ptrdiff_t dst_stride, \
 typedef decl_w_mask_fn(*w_mask_fn);
 
 #define decl_blend_fn(name) \
-void (name)(pixel *dst, ptrdiff_t dst_stride, \
-            const pixel *tmp, int w, int h, \
-            const uint8_t *mask, ptrdiff_t mstride)
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, \
+            int w, int h, const uint8_t *mask)
 typedef decl_blend_fn(*blend_fn);
 
+#define decl_blend_dir_fn(name) \
+void (name)(pixel *dst, ptrdiff_t dst_stride, const pixel *tmp, int w, int h)
+typedef decl_blend_dir_fn(*blend_dir_fn);
+
 #define decl_emu_edge_fn(name) \
 void (name)(intptr_t bw, intptr_t bh, intptr_t iw, intptr_t ih, intptr_t x, intptr_t y, \
             pixel *dst, ptrdiff_t dst_stride, const pixel *src, ptrdiff_t src_stride)
@@ -99,6 +102,8 @@ typedef struct Dav1dMCDSPContext {
     mask_fn mask;
     w_mask_fn w_mask[3 /* 444, 422, 420 */];
     blend_fn blend;
+    blend_dir_fn blend_v;
+    blend_dir_fn blend_h;
     warp8x8_fn warp8x8;
     warp8x8t_fn warp8x8t;
     emu_edge_fn emu_edge;
...
@@ -373,19 +373,46 @@ static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
     } while (--h);
 }
 
-static void blend_c(pixel *dst, const ptrdiff_t dst_stride,
-                    const pixel *tmp, const int w, const int h,
-                    const uint8_t *mask, const ptrdiff_t m_stride)
+#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
+static NOINLINE void
+blend_internal_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                 const int w, int h, const uint8_t *mask,
+                 const ptrdiff_t mask_stride)
 {
-    for (int y = 0; y < h; y++) {
+    do {
         for (int x = 0; x < w; x++) {
-#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
-            dst[x] = blend_px(dst[x], tmp[x], mask[m_stride == 1 ? 0 : x]);
+            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
         }
         dst += PXSTRIDE(dst_stride);
         tmp += w;
-        mask += m_stride;
-    }
+        mask += mask_stride;
+    } while (--h);
+}
+
+static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                    const int w, const int h, const uint8_t *mask)
+{
+    blend_internal_c(dst, dst_stride, tmp, w, h, mask, w);
+}
+
+static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, const int h)
+{
+    blend_internal_c(dst, dst_stride, tmp, w, h, &dav1d_obmc_masks[w], 0);
+}
+
+static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
+                      const int w, int h)
+{
+    const uint8_t *mask = &dav1d_obmc_masks[h];
+    do {
+        const int m = *mask++;
+        for (int x = 0; x < w; x++) {
+            dst[x] = blend_px(dst[x], tmp[x], m);
+        }
+        dst += PXSTRIDE(dst_stride);
+        tmp += w;
+    } while (--h);
 }
 
 static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
@@ -591,6 +618,8 @@ void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
     c->w_avg = w_avg_c;
     c->mask = mask_c;
     c->blend = blend_c;
+    c->blend_v = blend_v_c;
+    c->blend_h = blend_h_c;
     c->w_mask[0] = w_mask_444_c;
     c->w_mask[1] = w_mask_422_c;
     c->w_mask[2] = w_mask_420_c;
...
@@ -579,9 +579,8 @@ static int obmc(Dav1dTileContext *const t,
                               &f->refp[a_r->ref[0] - 1],
                               dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]]);
                 if (res) return res;
-                f->dsp->mc.blend(&dst[x * h_mul], dst_stride, lap,
-                                 h_mul * ow4, v_mul * oh4,
-                                 &dav1d_obmc_masks[v_mul * oh4], 1);
+                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
+                                   h_mul * ow4, v_mul * oh4);
                 i++;
             }
             x += imax(a_b_dim[0], 2);
@@ -603,9 +602,8 @@ static int obmc(Dav1dTileContext *const t,
                               &f->refp[l_r->ref[0] - 1],
                               dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]]);
                 if (res) return res;
-                f->dsp->mc.blend(&dst[y * v_mul * PXSTRIDE(dst_stride)],
-                                 dst_stride, lap, h_mul * ow4, v_mul * oh4,
-                                 &dav1d_obmc_masks[h_mul * ow4], 0);
+                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
+                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                 i++;
             }
             y += imax(l_b_dim[1], 2);
@@ -1144,7 +1142,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
                        dav1d_ii_masks[bs][0][b->interintra_mode] :
                        dav1d_wedge_masks[bs][0][0][b->wedge_idx];
         dsp->mc.blend(dst, f->cur.p.stride[0], tmp,
-                      bw4 * 4, bh4 * 4, ii_mask, bw4 * 4);
+                      bw4 * 4, bh4 * 4, ii_mask);
     }
 
     if (!has_chroma) goto skip_inter_chroma_pred;
@@ -1277,7 +1275,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
             dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
                                      tl_edge, cbw4 * 4, cbh4 * 4, 0);
             dsp->mc.blend(uvdst, f->cur.p.stride[1], tmp,
-                          cbw4 * 4, cbh4 * 4, ii_mask, cbw4 * 4);
+                          cbw4 * 4, cbh4 * 4, ii_mask);
         }
     }
 }
...
@@ -30,6 +30,23 @@
 
 SECTION_RODATA 32
 
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+            ; 2
+            db 45, 19, 64, 0
+            ; 4
+            db 39, 25, 50, 14, 59, 5, 64, 0
+            ; 8
+            db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+            ; 16
+            db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+            db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+            ; 32
+            db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+            db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+            db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+            db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
 warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
                 db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
 warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
@@ -42,10 +59,9 @@ subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
 deint_shuf4:   db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
-blend_shuf: ; bits 0-3: 0, 0, 0, 0, 1, 1, 1, 1
+blend_shuf:    db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
 
 pb_64: times 4 db 64
-       times 4 db 1
 pw_8:  times 2 dw 8
 pw_26: times 2 dw 26
 pw_34: times 2 dw 34
@@ -61,7 +77,7 @@ pd_32768: dd 32768
 cextern mc_subpel_filters
 %define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
 
-%macro BIDIR_JMP_TABLE 1-* 4, 8, 16, 32, 64, 128
+%macro BIDIR_JMP_TABLE 1-*
     %xdefine %1_table (%%table - 2*%2)
     %xdefine %%base %1_table
     %xdefine %%prefix mangle(private_prefix %+ _%1)
@@ -72,11 +88,13 @@ cextern mc_subpel_filters
     %endrep
 %endmacro
 
-BIDIR_JMP_TABLE avg_avx2
-BIDIR_JMP_TABLE w_avg_avx2
-BIDIR_JMP_TABLE mask_avx2
-BIDIR_JMP_TABLE w_mask_420_avx2
-BIDIR_JMP_TABLE blend_avx2, 2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
 
 %macro BASE_JMP_TABLE 3-*
     %xdefine %1_%2_table (%%table - %3)
@@ -3286,7 +3304,7 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
     jg .w128_loop
     RET
 
-cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
 %define base r6-blend_avx2_table
     lea r6, [blend_avx2_table]
     tzcnt wd, wm
@@ -3296,62 +3314,132 @@ cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
     vpbroadcastd m4, [base+pb_64]
     vpbroadcastd m5, [base+pw_512]
     add wq, r6
-    mov msq, msmp
+    lea r6, [dsq*3]
     jmp wq
-.w2:
-    cmp msq, 1
-    jb .w2_s0
-    je .w2_s1
-.w2_s2:
-    movd xm1, [maskq]
+.w4:
     movd xm0, [dstq+dsq*0]
-    pinsrw xm0, [dstq+dsq*1], 1
-    psubb xm2, xm4, xm1
-    punpcklbw xm2, xm1
-    movd xm1, [tmpq]
-    add maskq, 2*2
-    add tmpq, 2*2
-    punpcklbw xm0, xm1
+    pinsrd xm0, [dstq+dsq*1], 1
+    vpbroadcastd xm1, [dstq+dsq*2]
+    pinsrd xm1, [dstq+r6], 3
+    mova xm6, [maskq]
+    psubb xm3, xm4, xm6
+    punpcklbw xm2, xm3, xm6
+    punpckhbw xm3, xm6
+    mova xm6, [tmpq]
+    add maskq, 4*4
+    add tmpq, 4*4
+    punpcklbw xm0, xm6
+    punpckhbw xm1, xm6
     pmaddubsw xm0, xm2
+    pmaddubsw xm1, xm3
     pmulhrsw xm0, xm5
-    packuswb xm0, xm0
-    pextrw [dstq+dsq*0], xm0, 0
-    pextrw [dstq+dsq*1], xm0, 1
-    lea dstq, [dstq+dsq*2]
-    sub hd, 2
-    jg .w2_s2
+    pmulhrsw xm1, xm5
+    packuswb xm0, xm1
+    movd [dstq+dsq*0], xm0
+    pextrd [dstq+dsq*1], xm0, 1
+    pextrd [dstq+dsq*2], xm0, 2
+    pextrd [dstq+r6], xm0, 3
+    lea dstq, [dstq+dsq*4]
+    sub hd, 4
+    jg .w4
     RET
-.w2_s1:
-    movd xm1, [maskq]
-    movd xm0, [dstq+dsq*0]
-    psubb xm2, xm4, xm1
-    punpcklbw xm2, xm1
-    pinsrw xm0, [dstq+dsq*1], 1
-    movd xm1, [tmpq]
-    punpcklwd xm2, xm2
-    add maskq, 2
-    add tmpq, 2*2
-    punpcklbw xm0, xm1
-    pmaddubsw xm0, xm2
-    pmulhrsw xm0, xm5
-    packuswb xm0, xm0
-    pextrw [dstq+dsq*0], xm0, 0
-    pextrw [dstq+dsq*1], xm0, 1
+ALIGN function_align
+.w8:
+    movq xm1, [dstq+dsq*0]
+    movhps xm1, [dstq+dsq*1]
+    vpbroadcastq m2, [dstq+dsq*2]
+    vpbroadcastq m3, [dstq+r6]
+    mova m0, [maskq]
+    mova m6, [tmpq]
+    add maskq, 8*4
+    add tmpq, 8*4
+    vpblendd m1, m2, 0x30
+    vpblendd m1, m3, 0xc0
+    psubb m3, m4, m0
+    punpcklbw m2, m3, m0
+    punpckhbw m3, m0
+    punpcklbw m0, m1, m6
+    punpckhbw m1, m6
+    pmaddubsw m0, m2
+    pmaddubsw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    vextracti128 xm1, m0, 1
+    movq [dstq+dsq*0], xm0
+    movhps [dstq+dsq*1], xm0
+    movq [dstq+dsq*2], xm1
+    movhps [dstq+r6], xm1
+    lea dstq, [dstq+dsq*4]
+    sub hd, 4
+    jg .w8
+    RET
+ALIGN function_align
+.w16:
+    mova m0, [maskq]
+    mova xm1, [dstq+dsq*0]
+    vinserti128 m1, [dstq+dsq*1], 1
+    psubb m3, m4, m0
+    punpcklbw m2, m3, m0
+    punpckhbw m3, m0
+    mova m6, [tmpq]
+    add maskq, 16*2
+    add tmpq, 16*2
+    punpcklbw m0, m1, m6
+    punpckhbw m1, m6
+    pmaddubsw m0, m2
+    pmaddubsw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    mova [dstq+dsq*0], xm0
+    vextracti128 [dstq+dsq*1], m0, 1
     lea dstq, [dstq+dsq*2]
     sub hd, 2
-    jg .w2_s1
+    jg .w16
+    RET
+ALIGN function_align
+.w32:
+    mova m0, [maskq]
+    mova m1, [dstq]
+    mova m6, [tmpq]
+    add maskq, 32
+    add tmpq, 32
+    psubb m3, m4, m0
+    punpcklbw m2, m3, m0
+    punpckhbw m3, m0
+    punpcklbw m0, m1, m6
+    punpckhbw m1, m6
+    pmaddubsw m0, m2
+    pmaddubsw m1, m3
+    pmulhrsw m0, m5
+    pmulhrsw m1, m5
+    packuswb m0, m1
+    mova [dstq], m0
+    add dstq, dsq
+    dec hd
+    jg .w32
     RET
-.w2_s0:
-    vpbroadcastw xm0, [maskq]
-    psubb xm4, xm0
-    punpcklbw xm4, xm0
+
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+    lea r5, [blend_v_avx2_table]
+    tzcnt wd, wm
+    movifnidn hd, hm
+    movsxd wq, dword [r5+wq*4]
+    vpbroadcastd m5, [base+pw_512]
+    add wq, r5
+    add maskq, obmc_masks-blend_v_avx2_table
+    jmp wq
+.w2:
+    vpbroadcastd xm2, [maskq+2*2]
 .w2_s0_loop:
     movd xm0, [dstq+dsq*0]
     pinsrw xm0, [dstq+dsq*1], 1
     movd xm1, [tmpq]
     add tmpq, 2*2
     punpcklbw xm0, xm1
-    pmaddubsw xm0, xm4
+    pmaddubsw xm0, xm2
     pmulhrsw xm0, xm5
     packuswb xm0, xm0
     pextrw [dstq+dsq*0], xm0, 0
@@ -3362,17 +3450,11 @@ cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
     RET
 ALIGN function_align
 .w4:
-    cmp msq, 1
-    jb .w4_s0
-    je .w4_s1
-.w4_s4:
-    movq xm1, [maskq]
+    vpbroadcastq xm2, [maskq+4*2]
+.w4_loop:
     movd xm0, [dstq+dsq*0]
     pinsrd xm0, [dstq+dsq*1], 1
-    psubb xm2, xm4, xm1
-    punpcklbw xm2, xm1
     movq xm1, [tmpq]
-    add maskq, 4*2
     add tmpq, 4*2
     punpcklbw xm0, xm1
     pmaddubsw xm0, xm2
@@ -3382,109 +3464,12 @@ ALIGN function_align
     pextrd [dstq+dsq*1], xm0, 1
     lea dstq, [dstq+dsq*2]
     sub hd, 2
-    jg .w4_s4
-    RET
-.w4_s1:
-    movq xm3, [blend_shuf]
-.w4_s1_loop:
-    movd xm1, [maskq]
-    movd xm0, [dstq+dsq*0]
-    pshufb xm1, xm3
-    psubb xm2, xm4, xm1
-    pinsrd xm0, [dstq+dsq*1], 1
-    punpcklbw xm2, xm1
-    movq xm1, [tmpq]
-    add maskq, 2
-    add tmpq, 4*2
-    punpcklbw xm0, xm1
-    pmaddubsw xm0, xm2
-    pmulhrsw xm0, xm5
-    packuswb xm0, xm0
-    movd [dstq+dsq*0], xm0
-    pextrd [dstq+dsq*1], xm0, 1
-    lea dstq, [dstq+dsq*2]
-    sub hd, 2
-    jg .w4_s1_loop
-    RET
-.w4_s0:
-    vpbroadcastd xm0, [maskq]
-    psubb xm4, xm0
-    punpcklbw xm4, xm0
-.w4_s0_loop:
-    movd xm0, [dstq+dsq*0]
-    pinsrd xm0, [dstq+dsq*1], 1
-    movq xm1, [tmpq]
-    add tmpq, 4*2
-    punpcklbw xm0, xm1
-    pmaddubsw xm0, xm4
-    pmulhrsw xm0, xm5
-    packuswb xm0, xm0
-    movd [dstq+dsq*0], xm0
-    pextrd [dstq+dsq*1], xm0, 1
-    lea dstq, [dstq+dsq*2]
-    sub hd, 2
-    jg .w4_s0_loop
+    jg .w4_loop
     RET
 ALIGN function_align
 .w8:
-    cmp msq, 1
-    jb .w8_s0
-    je .w8_s1
-.w8_s8:
-    movq xm1, [maskq+8*1]
-    vinserti128 m1, [maskq+8*0], 1
-    vpbroadcastq m2, [dstq+dsq*0]
-    movq xm0, [dstq+dsq*1]
-    vpblendd m0, m2, 0x30
-    psubb m2, m4, m1
-    punpcklbw m2, m1
-    movq xm1, [tmpq+8*1]
-    vinserti128 m1, [tmpq+8*0], 1
-    add maskq, 8*2
-    add tmpq, 8*2
-    punpcklbw m0, m1
-    pmaddubsw m0, m2
-    pmulhrsw m0, m5
-    vextracti128 xm1, m0, 1
-    packuswb xm0, xm1
-    movhps [dstq+dsq*0], xm0
-    movq [dstq+dsq*1], xm0
-    lea dstq, [dstq+dsq*2]
-    sub hd, 2
-    jg .w8_s8
-    RET
-.w8_s1:
-    vpbroadcastd m0, [blend_shuf+0]
-    vpbroadcastd xm3, [blend_shuf+4]
-    vpblendd m3, m0, 0xf0
-.w8_s1_loop:
-    vpbroadcastd m0, [maskq]
-    vpbroadcastq m1, [dstq+dsq*0]
-    pshufb m0, m3
-    psubb m2, m4, m0
-    punpcklbw m2, m0
-    movq xm0, [dstq+dsq*1]
-    vpblendd m0, m1, 0x30
-    movq xm1, [tmpq+8*1]
-    vinserti128 m1, [tmpq+8*0], 1
-    add maskq, 2
-    add tmpq, 8*2
-    punpcklbw m0, m1
-    pmaddubsw m0, m2
-    pmulhrsw m0, m5
-    vextracti128 xm1, m0, 1
-    packuswb xm0, xm1
-    movhps [dstq+dsq*0], xm0
-    movq [dstq+dsq*1], xm0
-    lea dstq, [dstq+dsq*2]
-    sub hd, 2
-    jg .w8_s1_loop
-    RET
-.w8_s0:
-    vpbroadcastq m0, [maskq]
-    psubb m4, m0
-    punpcklbw m4, m0
-.w8_s0_loop:
+    vbroadcasti128 m4, [maskq+8*2]
+.w8_loop:
     vpbroadcastq m2, [dstq+dsq*0]
     movq xm0, [dstq+dsq*1]
     vpblendd m0, m2, 0x30
@@ -3500,28 +3485,21 @@ ALIGN function_align
     movq [dstq+dsq*1], xm0
     lea dstq, [dstq+dsq*2]
     sub hd, 2
-    jg .w8_s0_loop
+    jg .w8_loop
     RET
 ALIGN function_align
 .w16:
-    cmp msq, 1
-    jb .w16_s0
-    WIN64_SPILL_XMM 7
-    je .w16_s1
-.w16_s16:
-    mova m0, [maskq]
+    vbroadcasti128 m3, [maskq+16*2]
+    vbroadcasti128 m4, [maskq+16*3]
+.w16_loop:
     mova xm1, [dstq+dsq*0]
     vinserti128 m1, [dstq+dsq*1], 1
-    psubb m3, m4, m0