Commit e4fbbbce authored by Henrik Gramner, committed by Jean-Baptiste Kempf

Add blend AVX2 asm
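
Per pixel, blend computes dst = (dst * (64 - m) + tmp * m + 32) >> 6 with
mask values m in 0..64. Below is a scalar sketch of the operation for
reference; the function name and the mask-stride convention (0 = one mask
row shared by all rows, 1 = one mask byte per row, w = a full per-pixel
mask) are inferred from the assembly paths in this commit rather than
taken verbatim from the C fallback:

    #include <stddef.h>
    #include <stdint.h>

    static void blend_ref(uint8_t *dst, const ptrdiff_t dst_stride,
                          const uint8_t *tmp, const int w, int h,
                          const uint8_t *mask, const ptrdiff_t mask_stride)
    {
        do {
            for (int x = 0; x < w; x++) {
                const int m = mask_stride == 1 ? mask[0] : mask[x];
                dst[x] = (dst[x] * (64 - m) + tmp[x] * m + 32) >> 6;
            }
            dst += dst_stride;
            tmp += w;            /* tmp is packed at the block width */
            mask += mask_stride; /* 0, 1, or w */
        } while (--h);
    }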

parent 8676fda3
src/x86/mc.asm
@@ -43,6 +43,9 @@ bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: ; bits 0-3: 0, 0, 0, 0, 1, 1, 1, 1
pb_64: times 4 db 64
+      times 4 db 1
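; blend_shuf deliberately overlaps pb_64: pshufb only reads bits 0-3 of
; each control byte (plus bit 7 for zeroing), so the four 64s act as
; shuffle index 0 and the four 1s as index 1.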
pw_8: times 2 dw 8
pw_26: times 2 dw 26
pw_34: times 2 dw 34
@@ -58,12 +61,13 @@ pd_32768: dd 32768
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
-%macro BIDIR_JMP_TABLE 1-7 4, 8, 16, 32, 64, 128
- %xdefine %1_table (%%table - 2*4)
+%macro BIDIR_JMP_TABLE 1-* 4, 8, 16, 32, 64, 128
+ %xdefine %1_table (%%table - 2*%2)
+ %xdefine %%base %1_table
%xdefine %%prefix mangle(private_prefix %+ _%1)
%%table:
-%rep 6
- dd %%prefix %+ .w%2 - (%%table - 2*4)
+%rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
%rotate 1
%endrep
%endmacro
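; %1_table is biased by 2*%2 (twice the smallest width) so that indexing
; with tzcnt(w)*4 maps the smallest width to entry 0; making the width
; list variadic lets blend register a w2 entry point.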
@@ -72,6 +76,7 @@ BIDIR_JMP_TABLE avg_avx2
BIDIR_JMP_TABLE w_avg_avx2
BIDIR_JMP_TABLE mask_avx2
BIDIR_JMP_TABLE w_mask_420_avx2
+BIDIR_JMP_TABLE blend_avx2, 2, 4, 8, 16, 32, 64, 128
%macro BASE_JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - %3)
@@ -3281,7 +3286,410 @@ cglobal w_mask_420, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
jg .w128_loop
RET
INIT_YMM avx2
cglobal blend, 3, 7, 6, dst, ds, tmp, w, h, mask, ms
%define base r6-blend_avx2_table
lea r6, [blend_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
movifnidn maskq, maskmp
movsxd wq, dword [r6+wq*4]
vpbroadcastd m4, [base+pb_64]
vpbroadcastd m5, [base+pw_512]
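; Every path below uses the same core: punpcklbw interleaves dst and tmp
; bytes, pmaddubsw with interleaved (64-m, m) weights computes
; dst*(64-m) + tmp*m, and pmulhrsw against pw_512 rounds and shifts:
; (x*512 + 0x4000) >> 15 == (x + 32) >> 6.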
add wq, r6
mov msq, msmp
jmp wq
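; Each width has three variants, selected on the mask stride in msq:
; _s0 (stride 0: one mask row shared by all rows, weights hoisted out of
; the loop), _s1 (stride 1: one mask byte per row, splatted with
; pshufb/punpck), and a full-stride variant (stride == width: a fresh
; mask row per row of pixels).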
.w2:
cmp msq, 1
jb .w2_s0
je .w2_s1
.w2_s2:
movd xm1, [maskq]
movd xm0, [dstq+dsq*0]
pinsrw xm0, [dstq+dsq*1], 1
psubb xm2, xm4, xm1
punpcklbw xm2, xm1
movd xm1, [tmpq]
add maskq, 2*2
add tmpq, 2*2
punpcklbw xm0, xm1
pmaddubsw xm0, xm2
pmulhrsw xm0, xm5
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w2_s2
RET
.w2_s1:
movd xm1, [maskq]
movd xm0, [dstq+dsq*0]
psubb xm2, xm4, xm1
punpcklbw xm2, xm1
pinsrw xm0, [dstq+dsq*1], 1
movd xm1, [tmpq]
punpcklwd xm2, xm2
add maskq, 2
add tmpq, 2*2
punpcklbw xm0, xm1
pmaddubsw xm0, xm2
pmulhrsw xm0, xm5
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w2_s1
RET
.w2_s0:
vpbroadcastw xm0, [maskq]
psubb xm4, xm0
punpcklbw xm4, xm0
.w2_s0_loop:
movd xm0, [dstq+dsq*0]
pinsrw xm0, [dstq+dsq*1], 1
movd xm1, [tmpq]
add tmpq, 2*2
punpcklbw xm0, xm1
pmaddubsw xm0, xm4
pmulhrsw xm0, xm5
packuswb xm0, xm0
pextrw [dstq+dsq*0], xm0, 0
pextrw [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w2_s0_loop
RET
ALIGN function_align
.w4:
cmp msq, 1
jb .w4_s0
je .w4_s1
.w4_s4:
movq xm1, [maskq]
movd xm0, [dstq+dsq*0]
pinsrd xm0, [dstq+dsq*1], 1
psubb xm2, xm4, xm1
punpcklbw xm2, xm1
movq xm1, [tmpq]
add maskq, 4*2
add tmpq, 4*2
punpcklbw xm0, xm1
pmaddubsw xm0, xm2
pmulhrsw xm0, xm5
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w4_s4
RET
.w4_s1:
movq xm3, [blend_shuf]
.w4_s1_loop:
movd xm1, [maskq]
movd xm0, [dstq+dsq*0]
pshufb xm1, xm3
psubb xm2, xm4, xm1
pinsrd xm0, [dstq+dsq*1], 1
punpcklbw xm2, xm1
movq xm1, [tmpq]
add maskq, 2
add tmpq, 4*2
punpcklbw xm0, xm1
pmaddubsw xm0, xm2
pmulhrsw xm0, xm5
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w4_s1_loop
RET
.w4_s0:
vpbroadcastd xm0, [maskq]
psubb xm4, xm0
punpcklbw xm4, xm0
.w4_s0_loop:
movd xm0, [dstq+dsq*0]
pinsrd xm0, [dstq+dsq*1], 1
movq xm1, [tmpq]
add tmpq, 4*2
punpcklbw xm0, xm1
pmaddubsw xm0, xm4
pmulhrsw xm0, xm5
packuswb xm0, xm0
movd [dstq+dsq*0], xm0
pextrd [dstq+dsq*1], xm0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w4_s0_loop
RET
ALIGN function_align
.w8:
cmp msq, 1
jb .w8_s0
je .w8_s1
.w8_s8:
movq xm1, [maskq+8*1]
vinserti128 m1, [maskq+8*0], 1
vpbroadcastq m2, [dstq+dsq*0]
movq xm0, [dstq+dsq*1]
vpblendd m0, m2, 0x30
psubb m2, m4, m1
punpcklbw m2, m1
movq xm1, [tmpq+8*1]
vinserti128 m1, [tmpq+8*0], 1
add maskq, 8*2
add tmpq, 8*2
punpcklbw m0, m1
pmaddubsw m0, m2
pmulhrsw m0, m5
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movhps [dstq+dsq*0], xm0
movq [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w8_s8
RET
.w8_s1:
vpbroadcastd m0, [blend_shuf+0]
vpbroadcastd xm3, [blend_shuf+4]
vpblendd m3, m0, 0xf0
.w8_s1_loop:
vpbroadcastd m0, [maskq]
vpbroadcastq m1, [dstq+dsq*0]
pshufb m0, m3
psubb m2, m4, m0
punpcklbw m2, m0
movq xm0, [dstq+dsq*1]
vpblendd m0, m1, 0x30
movq xm1, [tmpq+8*1]
vinserti128 m1, [tmpq+8*0], 1
add maskq, 2
add tmpq, 8*2
punpcklbw m0, m1
pmaddubsw m0, m2
pmulhrsw m0, m5
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movhps [dstq+dsq*0], xm0
movq [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w8_s1_loop
RET
.w8_s0:
vpbroadcastq m0, [maskq]
psubb m4, m0
punpcklbw m4, m0
.w8_s0_loop:
vpbroadcastq m2, [dstq+dsq*0]
movq xm0, [dstq+dsq*1]
vpblendd m0, m2, 0x30
movq xm1, [tmpq+8*1]
vinserti128 m1, [tmpq+8*0], 1
add tmpq, 8*2
punpcklbw m0, m1
pmaddubsw m0, m4
pmulhrsw m0, m5
vextracti128 xm1, m0, 1
packuswb xm0, xm1
movhps [dstq+dsq*0], xm0
movq [dstq+dsq*1], xm0
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w8_s0_loop
RET
ALIGN function_align
.w16:
cmp msq, 1
jb .w16_s0
WIN64_SPILL_XMM 7
je .w16_s1
.w16_s16:
mova m0, [maskq]
mova xm1, [dstq+dsq*0]
vinserti128 m1, [dstq+dsq*1], 1
psubb m3, m4, m0
punpcklbw m2, m3, m0
punpckhbw m3, m0
mova m6, [tmpq]
add maskq, 16*2
add tmpq, 16*2
punpcklbw m0, m1, m6
punpckhbw m1, m6
pmaddubsw m0, m2
pmaddubsw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w16_s16
RET
.w16_s1:
vpbroadcastd xm6, [blend_shuf]
vpbroadcastd m0, [blend_shuf+4]
vpblendd m6, m0, 0xf0
.w16_s1_loop:
vpbroadcastd m2, [maskq]
mova xm1, [dstq+dsq*0]
pshufb m2, m6
psubb m3, m4, m2
vinserti128 m1, [dstq+dsq*1], 1
punpcklbw m3, m2
mova m2, [tmpq]
add maskq, 2
add tmpq, 16*2
punpcklbw m0, m1, m2
punpckhbw m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w16_s1_loop
RET
.w16_s0:
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 6
vbroadcasti128 m0, [maskq]
psubb m4, m0
punpcklbw m3, m4, m0
punpckhbw m4, m0
.w16_s0_loop:
mova xm1, [dstq+dsq*0]
vinserti128 m1, [dstq+dsq*1], 1
mova m2, [tmpq]
add tmpq, 16*2
punpcklbw m0, m1, m2
punpckhbw m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m4
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
mova [dstq+dsq*0], xm0
vextracti128 [dstq+dsq*1], m0, 1
lea dstq, [dstq+dsq*2]
sub hd, 2
jg .w16_s0_loop
RET
ALIGN function_align
.w32:
mov wd, 32
jmp .w32_start
.w64:
mov wd, 64
jmp .w32_start
.w128:
mov wd, 128
.w32_start:
WIN64_SPILL_XMM 7
cmp msq, 1
jb .w32_s0
je .w32_s1
sub dsq, wq
.w32_s32:
mov r6d, wd
.w32_s32_loop:
mova m0, [maskq]
mova m1, [dstq]
psubb m3, m4, m0
punpcklbw m2, m3, m0
punpckhbw m3, m0
mova m6, [tmpq]
add maskq, 32
add tmpq, 32
punpcklbw m0, m1, m6
punpckhbw m1, m6
pmaddubsw m0, m2
pmaddubsw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
mova [dstq], m0
add dstq, 32
sub r6d, 32
jg .w32_s32_loop
add dstq, dsq
dec hd
jg .w32_s32
RET
.w32_s1:
sub dsq, wq
.w32_s1_loop0:
vpbroadcastb m0, [maskq]
mov r6d, wd
inc maskq
psubb m3, m4, m0
punpcklbw m3, m0
.w32_s1_loop:
mova m1, [dstq]
mova m2, [tmpq]
add tmpq, 32
punpcklbw m0, m1, m2
punpckhbw m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
mova [dstq], m0
add dstq, 32
sub r6d, 32
jg .w32_s1_loop
add dstq, dsq
dec hd
jg .w32_s1_loop0
RET
.w32_s0:
%if WIN64
PUSH r7
PUSH r8
%define regs_used 9
%endif
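; Process 32-pixel-wide columns one at a time so each column's mask,
; loaded once, is reused for every row. r6d packs h in its low byte and
; the remaining column count (scaled by 256) above it: "mov hb, r6b"
; restores h and "sub r6d, 256" steps to the next column.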
lea r6d, [hq+wq*8-256]
mov r7, dstq
mov r8, tmpq
.w32_s0_loop0:
mova m0, [maskq]
add maskq, 32
psubb m3, m4, m0
punpcklbw m2, m3, m0
punpckhbw m3, m0
.w32_s0_loop:
mova m1, [dstq]
mova m6, [tmpq]
add tmpq, wq
punpcklbw m0, m1, m6
punpckhbw m1, m6
pmaddubsw m0, m2
pmaddubsw m1, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
packuswb m0, m1
mova [dstq], m0
add dstq, dsq
dec hd
jg .w32_s0_loop
add r7, 32
add r8, 32
mov dstq, r7
mov tmpq, r8
mov hb, r6b
sub r6d, 256
jg .w32_s0_loop0
RET
cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
bottomext, rightext
; we assume that the buffer (stride) is larger than width, so we can
@@ -3475,4 +3883,5 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
.end:
RET
%endif ; ARCH_X86_64
src/x86/mc_init_tmpl.c
@@ -54,6 +54,7 @@ decl_avg_fn(dav1d_avg_avx2);
decl_w_avg_fn(dav1d_w_avg_avx2);
decl_mask_fn(dav1d_mask_avx2);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_blend_fn(dav1d_blend_avx2);
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
@@ -96,6 +97,7 @@ void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
+c->blend = dav1d_blend_avx2;
c->warp8x8 = dav1d_warp_affine_8x8_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
tests/checkasm/mc.c
@@ -237,12 +237,12 @@ static void check_w_mask(Dav1dMCDSPContext *const c) {
}
static void check_blend(Dav1dMCDSPContext *const c) {
-ALIGN_STK_32(pixel, tmp, 128 * 128,);
-ALIGN_STK_32(pixel, c_dst, 128 * 128,);
-ALIGN_STK_32(pixel, a_dst, 128 * 128,);
-ALIGN_STK_32(uint8_t, mask, 128 * 128,);
+ALIGN_STK_32(pixel, tmp, 128 * 32,);
+ALIGN_STK_32(pixel, c_dst, 128 * 32,);
+ALIGN_STK_32(pixel, a_dst, 128 * 32,);
+ALIGN_STK_32(uint8_t, mask, 128 * 32,);
-for (int i = 0; i < 128 * 128; i++) {
+for (int i = 0; i < 128 * 32; i++) {
tmp[i] = rand() & ((1 << BITDEPTH) - 1);
mask[i] = rand() % 65;
}
@@ -252,9 +252,11 @@
for (int w = 2; w <= 128; w <<= 1) {
const ptrdiff_t dst_stride = w * sizeof(pixel);
+const int h_min = (w == 128) ? 4 : 2;
+const int h_max = (w > 32) ? 32 : (w == 2) ? 64 : 128;
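/* heights are bounded per width so that w * h never exceeds 128 * 32,
   matching the shrunken scratch buffers above */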
for (int ms = 0; ms <= w; ms += ms ? w - 1 : 1)
if (check_func(c->blend, "blend_w%d_ms%d_%dbpc", w, ms, BITDEPTH))
-for (int h = imax(w / 8, 2); h <= imin(w * 8, 128); h <<= 1) {
+for (int h = h_min; h <= h_max; h <<= 1) {
for (int i = 0; i < w * h; i++)
c_dst[i] = a_dst[i] = rand() & ((1 << BITDEPTH) - 1);