Commit 70fb01d8 authored by Ronald S. Bultje

Make per-width versions of cfl_ac

Also use aligned reads and writes in sub_loop, and integrate sum_loop into
the main loop.

before:
cfl_ac_420_w4_8bpc_c: 367.4
cfl_ac_420_w4_8bpc_avx2: 72.8
cfl_ac_420_w8_8bpc_c: 621.6
cfl_ac_420_w8_8bpc_avx2: 85.1
cfl_ac_420_w16_8bpc_c: 983.4
cfl_ac_420_w16_8bpc_avx2: 141.0

after:
cfl_ac_420_w4_8bpc_c: 376.2
cfl_ac_420_w4_8bpc_avx2: 28.5
cfl_ac_420_w8_8bpc_c: 607.2
cfl_ac_420_w8_8bpc_avx2: 29.9
cfl_ac_420_w16_8bpc_c: 962.1
cfl_ac_420_w16_8bpc_avx2: 48.8
parent e2aa2d14
...@@ -67,6 +67,18 @@ ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13 ...@@ -67,6 +67,18 @@ ipred_v_shuf: db 0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13
ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0 db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; w=8, w_pad=1 as well as second half of previous one
cfl_ac_w8_pad1_shuffle: db 0, 1, 2, 3, 4, 5
times 5 db 6, 7
; w=16,w_pad=2
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
times 8 db 14, 15
; w=16,w_pad=3
db 0, 1, 2, 3, 4, 5
times 13 db 6, 7
pb_1: times 4 db 1 pb_1: times 4 db 1
pb_2: times 4 db 2 pb_2: times 4 db 2
pb_128: times 4 db 128 pb_128: times 4 db 128
...@@ -102,6 +114,7 @@ JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64 ...@@ -102,6 +114,7 @@ JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4 s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
cextern filter_intra_taps cextern filter_intra_taps
...@@ -1784,99 +1797,185 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha ...@@ -1784,99 +1797,185 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn acq, acmp movifnidn acq, acmp
jmp wq jmp wq
cglobal ipred_cfl_ac_420, 6, 10, 5, ac, y, stride, wpad, hpad, w, h cglobal ipred_cfl_ac_420, 4, 9, 5, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
shl wpadd, 2 movifnidn hpadd, hpadm
movifnidn wd, wm
mov hd, hm
mov szd, wd
mov ac_bakq, acq
imul szd, hd
shl hpadd, 2 shl hpadd, 2
mov r9d, hm sub hd, hpadd
mov r6d, wd vpbroadcastd m2, [pb_2]
movsxd wq, wd pxor m4, m4
add yq, strideq cmp wd, 8
mov r7, acq jg .w16
sub r6d, wpadd je .w8
sub r9d, hpadd ; fall-through
mov r8d, r9d
vpbroadcastd xm2, [pb_2] DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak
.dec_rows: .w4:
mov r3, yq lea stride3q, [strideq*3]
xor r4, r4 .w4_loop:
sub r3, strideq movq xm0, [yq]
.dec_cols: movq xm1, [yq+strideq]
movq xm0, [r3+r4*2] movhps xm0, [yq+strideq*2]
movq xm1, [yq+r4*2] movhps xm1, [yq+stride3q]
pmaddubsw xm0, xm2 pmaddubsw xm0, xm2
pmaddubsw xm1, xm2 pmaddubsw xm1, xm2
paddw xm0, xm1 paddw xm0, xm1
movq [r7+r4*2], xm0 mova [acq], xm0
add r4, 4 paddw xm4, xm0
cmp r6d, r4d lea yq, [yq+strideq*4]
jg .dec_cols add acq, 16
lea r7, [r7+wq*2] sub hd, 2
jg .w4_loop
test hpadd, hpadd
jz .calc_avg
vpermq m0, m0, q1111
.w4_hpad_loop:
mova [acq], m0
paddw m4, m0
add acq, 32
sub hpadd, 4
jg .w4_hpad_loop
jmp .calc_avg
.w8:
lea stride3q, [strideq*3]
test wpadd, wpadd
jnz .w8_wpad
.w8_loop:
mova xm0, [yq]
mova xm1, [yq+strideq]
vinserti128 m0, [yq+strideq*2], 1
vinserti128 m1, [yq+stride3q], 1
pmaddubsw m0, m2
pmaddubsw m1, m2
paddw m0, m1
mova [acq], m0
paddw m4, m0
lea yq, [yq+strideq*4]
add acq, 32
sub hd, 2
jg .w8_loop
test hpadd, hpadd
jz .calc_avg
jmp .w8_hpad
.w8_wpad:
vbroadcasti128 m3, [cfl_ac_w8_pad1_shuffle]
.w8_wpad_loop:
movq xm0, [yq]
movq xm1, [yq+strideq]
vinserti128 m0, [yq+strideq*2], 1
vinserti128 m1, [yq+stride3q], 1
pmaddubsw m0, m2
pmaddubsw m1, m2
paddw m0, m1
pshufb m0, m3
mova [acq], m0
paddw m4, m0
lea yq, [yq+strideq*4]
add acq, 32
sub hd, 2
jg .w8_wpad_loop
test hpadd, hpadd
jz .calc_avg
.w8_hpad:
vpermq m0, m0, q3232
.w8_hpad_loop:
mova [acq], m0
paddw m4, m0
add acq, 32
sub hpadd, 2
jg .w8_hpad_loop
jmp .calc_avg
.w16:
test wpadd, wpadd
jnz .w16_wpad
.w16_loop:
mova m0, [yq]
mova m1, [yq+strideq]
pmaddubsw m0, m2
pmaddubsw m1, m2
paddw m0, m1
mova [acq], m0
paddw m4, m0
lea yq, [yq+strideq*2]
add acq, 32
dec hd
jg .w16_loop
test hpadd, hpadd
jz .calc_avg
jmp .w16_hpad_loop
.w16_wpad:
DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak
lea iptrq, [ipred_cfl_ac_420_avx2_table]
shl wpadd, 2
mova m3, [iptrq+cfl_ac_w16_pad_shuffle- \
ipred_cfl_ac_420_avx2_table+wpadq*8-32]
movsxd wpadq, [iptrq+wpadq+4]
add iptrq, wpadq
jmp iptrq
.w16_pad3:
vpbroadcastq m0, [yq]
vpbroadcastq m1, [yq+strideq]
jmp .w16_wpad_end
.w16_pad2:
vbroadcasti128 m0, [yq]
vbroadcasti128 m1, [yq+strideq]
jmp .w16_wpad_end
.w16_pad1:
mova m0, [yq]
mova m1, [yq+strideq]
; fall-through
.w16_wpad_end:
pmaddubsw m0, m2
pmaddubsw m1, m2
paddw m0, m1
pshufb m0, m3
mova [acq], m0
paddw m4, m0
lea yq, [yq+strideq*2] lea yq, [yq+strideq*2]
dec r8d add acq, 32
jg .dec_rows dec hd
cmp r6d, wd jz .w16_wpad_done
je .wpad_end jmp iptrq
mov r7, acq .w16_wpad_done:
lea r1, [r6q+r6q] test hpadd, hpadd
.wpad_rows: jz .calc_avg
vpbroadcastw xm0, [r7+r1-2] .w16_hpad_loop:
mov r2q, r6q mova [acq], m0
.wpad_cols: paddw m4, m0
movq [r7+r2q*2], xm0 add acq, 32
add r2q, 4 dec hpadd
cmp wd, r2d jg .w16_hpad_loop
jg .wpad_cols ; fall-through
lea r7, [r7+wq*2]
dec r9d .calc_avg:
jg .wpad_rows vpbroadcastd m2, [pw_1]
.wpad_end: pmaddwd m0, m4, m2
bsf r3d, hm vextracti128 xm1, m0, 1
shlx r6d, wd, r3d tzcnt r1d, szd
neg wd
bsf r3d, r6d
movsxd wq, wd
add wq, wq
movsxd r2q, r6d
lea r2q, [acq+r2q*2]
.hpad_loop:
cmp r2q, r7
jbe .hpad_end
mov r1, [r7+wq]
add r7, 8
mov [r7-8], r1
jmp .hpad_loop
.hpad_end:
mov r1, acq
pxor m1, m1
vpbroadcastd m3, [pw_1]
.sum_loop:
movdqu m0, [r1]
add r1, 32
cmp r2q, r1
pmaddwd m0, m3
paddd m1, m0
ja .sum_loop
vextracti128 xm0, m1, 1
sar r6d, 1
movd xm4, r6d
mov r6d, r3d
paddd xm0, xm1 paddd xm0, xm1
movd xm2, r1d
movd xm3, szd
punpckhqdq xm1, xm0, xm0 punpckhqdq xm1, xm0, xm0
paddd xm1, xm0
vbroadcastss xm0, xm4
psrlq xm2, xm1, 32
movq xm4, r6q
paddd xm0, xm2
paddd xm0, xm1 paddd xm0, xm1
psrld xm0, xm4 psrad xm3, 1
psrlq xm1, xm0, 32
paddd xm0, xm3
paddd xm0, xm1
psrad xm0, xm2
vpbroadcastw m0, xm0 vpbroadcastw m0, xm0
.sub_loop: .sub_loop:
movdqu m1, [acq] mova m1, [ac_bakq]
add acq, 32
psubw m1, m0 psubw m1, m0
movdqu [acq-32], m1 mova [ac_bakq], m1
cmp r2q, acq add ac_bakq, 32
ja .sub_loop sub szd, 16
jg .sub_loop
RET RET
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
......
...@@ -120,17 +120,18 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) { ...@@ -120,17 +120,18 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
{ {
for (int h = imax(w / 4, 4); h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) { for (int h = imax(w / 4, 4); h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) {
const ptrdiff_t stride = 32 * sizeof(pixel); const ptrdiff_t stride = 32 * sizeof(pixel);
const int w_pad = rand() & ((w >> 2) - 1); for (int w_pad = (w >> 2) - 1; w_pad >= 0; w_pad--) {
const int h_pad = rand() & ((h >> 2) - 1); for (int h_pad = (h >> 2) - 1; h_pad >= 0; h_pad--) {
for (int y = 0; y < (h << ss_ver); y++)
for (int y = 0; y < (h << ss_ver); y++) for (int x = 0; x < (w << ss_hor); x++)
for (int x = 0; x < (w << ss_hor); x++) luma[y * 32 + x] = rand() & ((1 << BITDEPTH) - 1);
luma[y * 32 + x] = rand() & ((1 << BITDEPTH) - 1);
call_ref(c_dst, luma, stride, w_pad, h_pad, w, h);
call_ref(c_dst, luma, stride, w_pad, h_pad, w, h); call_new(a_dst, luma, stride, w_pad, h_pad, w, h);
call_new(a_dst, luma, stride, w_pad, h_pad, w, h); if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst))) fail();
fail(); }
}
bench_new(a_dst, luma, stride, 0, 0, w, h); bench_new(a_dst, luma, stride, 0, 0, w, h);
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment