Commit 205b723e authored by Henrik Gramner, committed by Henrik Gramner

Add SGR optimizations

parent 33ce3829
Pipeline #4030 passed with stages in 5 minutes and 24 seconds
@@ -446,11 +446,11 @@ static void selfguided_filter(coef *dst, const pixel *src,
             const unsigned p = imax(a * n - b * b, 0);
             const unsigned z = (p * s + (1 << 19)) >> 20;
 
-            const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
+            const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
 
             // This is where we invert A and B, so that B is of size coef.
-            AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
-            BB[i] = x;
+            AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
+            BB[i] = 256 - x;
         }
         AA += step * REST_UNIT_STRIDE;
         BB += step * REST_UNIT_STRIDE;
......
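Note: comparing the two tables in this commit, each entry of the new dav1d_sgr_x_by_x equals 256 minus the corresponding dav1d_sgr_x_by_xplus1 entry, which is what lets the inverted A/B update above use x directly while keeping the table in bytes. A minimal C sketch of that equivalence (the table name, imin-style clamp and sgr_one_by_x usage mirror the diff; the function name and argument types are illustrative only):

    #include <stdint.h>

    extern const uint8_t dav1d_sgr_x_by_x[256]; /* new byte table from this commit */

    /* Sketch: the old and new formulations produce identical AA/BB outputs when
     * x_by_xplus1[z] == 256 - x_by_x[z], which holds for every table entry. */
    static void sgr_invert_ab(int *AA, int *BB, int i, unsigned z, int sgr_one_by_x) {
        const unsigned x = dav1d_sgr_x_by_x[z < 255 ? z : 255];
        const int x_old = 256 - x; /* what dav1d_sgr_x_by_xplus1[z] used to hold */
        /* old: AA[i] = (((1U << 8) - x_old) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
         *      BB[i] = x_old; */
        AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12; /* (1 << 8) - x_old == x */
        BB[i] = 256 - x;                                      /* == x_old */
    }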
@@ -502,25 +502,25 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
     { 2, 0, 22, -1 },
 };
 
-const int dav1d_sgr_x_by_xplus1[256] = {
-    1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
-    240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
-    248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
-    250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
-    252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
-    253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
-    253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
-    256
-};
+const uint8_t dav1d_sgr_x_by_x[256] = {
+    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17,
+    16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9,
+    8, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+    6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0
+};
 
 const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
......
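Note: the replacement table fits in a uint8_t because its entries are the 256-complement of the old int table. The values shown are consistent with a generator of the form round(256 / (z + 1)) with the two endpoints special-cased; the actual generator is not part of this commit, so the sketch below is only one way to reproduce and verify the numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch: print a 256-entry table matching dav1d_sgr_x_by_x above.
     * Assumed formula (not from the commit): round(256 / (z + 1)), with
     * z == 0 clamped to 255 and z == 255 forced to 0, mirroring the 1 and
     * 256 endpoints of the old dav1d_sgr_x_by_xplus1 table. */
    int main(void) {
        for (int z = 0; z < 256; z++) {
            int v;
            if (z == 0)        v = 255;
            else if (z == 255) v = 0;
            else               v = (256 + (z + 1) / 2) / (z + 1);
            printf("%3d,%s", v, z % 15 == 14 ? "\n" : " ");
        }
        putchar('\n');
        return 0;
    }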
@@ -107,7 +107,7 @@ static const unsigned interintra_allowed_mask =
 
 extern const Dav1dWarpedMotionParams dav1d_default_wm_params;
 extern const int16_t dav1d_sgr_params[16][4];
-extern const int dav1d_sgr_x_by_xplus1[256];
+extern const uint8_t dav1d_sgr_x_by_x[256];
 extern const int8_t dav1d_mc_subpel_filters[5][15][8];
 extern const int8_t dav1d_mc_warp_filter[193][8];
......
@@ -42,14 +42,12 @@ pw_2048: times 2 dw 2048
 pw_16380: times 2 dw 16380
 pw_0_128: dw 0, 128
 pw_5_6: dw 5, 6
-pw_82: times 2 dw 82
-pw_91_5: dw 91, 5
 pd_6: dd 6
-pd_255: dd 255
 pd_1024: dd 1024
-pd_0x80000: dd 0x80000
+pd_0xf0080029: dd 0xf0080029
+pd_0xf00801c7: dd 0xf00801c7
 
-cextern sgr_x_by_xplus1
+cextern sgr_x_by_x
 
 SECTION .text
@@ -477,76 +475,65 @@ cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
     RET
 
 INIT_YMM avx2
-cglobal sgr_calc_ab1, 4, 6, 14, a, b, w, h, s
+cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
     sub aq, (384+16-1)*4
     sub bq, (384+16-1)*2
     add hd, 2
-    lea r5, [sgr_x_by_xplus1]
-    pxor m6, m6
-    vpbroadcastd m7, [pw_91_5]
+    lea r5, [sgr_x_by_x-0xf03]
 %ifidn sd, sm
-    movd xm8, sd
-    vpbroadcastd m8, xm8
+    movd xm6, sd
+    vpbroadcastd m6, xm6
 %else
-    vpbroadcastd m8, sm
+    vpbroadcastd m6, sm
 %endif
-    vpbroadcastd m9, [pd_0x80000]
-    vpbroadcastd m10, [pd_255]
-    psrad m12, m9, 8 ; pd_2048
-    psrad m11, m9, 11 ; pd_256
-    pcmpeqb m13, m13
+    vpbroadcastd m8, [pd_0xf00801c7]
+    vpbroadcastd m9, [pw_256]
+    pcmpeqb m7, m7
+    psrld m10, m9, 13 ; pd_2048
     DEFINE_ARGS a, b, w, h, x
 .loop_y:
     mov xq, -2
 .loop_x:
-    movu xm0, [aq+xq*4+ 0]
-    movu xm1, [aq+xq*4+16]
-    vinserti128 m0, [aq+xq*4+ 0+(384+16)*4], 1
-    vinserti128 m1, [aq+xq*4+16+(384+16)*4], 1
-    movu xm2, [bq+xq*2]
-    vinserti128 m2, [bq+xq*2+(384+16)*2], 1
-    pslld m3, m0, 3
-    pslld m4, m1, 3
-    paddd m3, m0 ; aa * 9 [first half]
-    paddd m4, m1 ; aa * 9 [second half]
-    punpcklwd m0, m6, m2
-    punpckhwd m2, m6, m2
-    pmaddwd m1, m0, m0
-    pmaddwd m5, m2, m2
-    pmaddwd m0, m7
-    pmaddwd m2, m7
-    psubd m3, m1 ; p = aa * 9 - bb * bb [first half]
-    psubd m4, m5 ; p = aa * 9 - bb * bb [second half]
-    pmulld m3, m8
-    pmulld m4, m8
-    paddd m3, m9
-    paddd m4, m9
-    psrld m3, 20 ; z [first half]
-    psrld m4, 20 ; z [second half]
-    pminsd m3, m10
-    pminsd m4, m10
-    mova m5, m13
-    vpgatherdd m1, [r5+m3*4], m5 ; xx [first half]
-    mova m5, m13
-    vpgatherdd m3, [r5+m4*4], m5 ; xx [second half]
-    psubd m5, m11, m1
-    psubd m4, m11, m3
-    packssdw m1, m3
-    pmullw m5, m7
-    pmullw m4, m7
-    pmaddwd m5, m0
-    pmaddwd m4, m2
-    paddd m5, m12
-    paddd m4, m12
-    psrad m5, 12
-    psrad m4, 12
-    movu [bq+xq*2], xm1
-    vextracti128 [bq+xq*2+(384+16)*2], m1, 1
-    movu [aq+xq*4+ 0], xm5
-    movu [aq+xq*4+16], xm4
-    vextracti128 [aq+xq*4+ 0+(384+16)*4], m5, 1
-    vextracti128 [aq+xq*4+16+(384+16)*4], m4, 1
+    pmovzxwd m0, [bq+xq*2]
+    pmovzxwd m1, [bq+xq*2+(384+16)*2]
+    movu m2, [aq+xq*4]
+    movu m3, [aq+xq*4+(384+16)*4]
+    pslld m4, m2, 3
+    pslld m5, m3, 3
+    paddd m2, m4 ; aa * 9
+    paddd m3, m5
+    pmaddwd m4, m0, m0
+    pmaddwd m5, m1, m1
+    pmaddwd m0, m8
+    pmaddwd m1, m8
+    psubd m2, m4 ; p = aa * 9 - bb * bb
+    psubd m3, m5
+    pmulld m2, m6
+    pmulld m3, m6
+    paddusw m2, m8
+    paddusw m3, m8
+    psrld m2, 20 ; z
+    psrld m3, 20
+    mova m5, m7
+    vpgatherdd m4, [r5+m2], m5 ; xx
+    mova m5, m7
+    vpgatherdd m2, [r5+m3], m5
+    psrld m4, 24
+    psrld m2, 24
+    pmulld m0, m4
+    pmulld m1, m2
+    packssdw m4, m2
+    psubw m4, m9, m4
+    vpermq m4, m4, q3120
+    paddd m0, m10
+    paddd m1, m10
+    psrld m0, 12
+    psrld m1, 12
+    movu [bq+xq*2], xm4
+    vextracti128 [bq+xq*2+(384+16)*2], m4, 1
+    movu [aq+xq*4], m0
+    movu [aq+xq*4+(384+16)*4], m1
     add xd, 8
     cmp xd, wd
     jl .loop_x
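Note: per lane, the reworked sgr_calc_ab1 loop above computes roughly the scalar C below (the function and argument names are illustrative, not the dav1d API). The packed constant pd_0xf00801c7 appears to serve double duty: its low word 0x1c7 = 455 is the 12-bit reciprocal of the 9-pixel box used by the pmaddwd, while the 0xf008 high word supplies the (1 << 19) rounding bias and, through the paddusw saturation, clamps z to 255. The gather base of sgr_x_by_x-0xf03 combined with that 0xf00-biased index seems to place the wanted byte in the top byte of each gathered dword, which the psrld by 24 then extracts.

    #include <stdint.h>

    extern const uint8_t dav1d_sgr_x_by_x[256];

    /* Rough scalar model of one sgr_calc_ab1 lane (sketch only):
     * a is the 3x3 sum of squares, b the 3x3 sum, s the strength parameter. */
    static void calc_ab1_lane(uint32_t *a, uint16_t *b, unsigned s) {
        const unsigned bb = *b;
        const unsigned p  = *a * 9 > bb * bb ? *a * 9 - bb * bb : 0;
        const unsigned z  = (p * s + (1 << 19)) >> 20;           /* pmulld + paddusw + psrld 20 */
        const unsigned x  = dav1d_sgr_x_by_x[z < 255 ? z : 255]; /* vpgatherdd + psrld 24 */
        *a = (x * bb * 455 + (1 << 11)) >> 12;                   /* pmaddwd 455, pmulld, +2048, >>12 */
        *b = 256 - x;                                            /* psubw from pw_256 */
    }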
@@ -903,78 +890,67 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
     jmp .loop_y_noload
 
 INIT_YMM avx2
-cglobal sgr_calc_ab2, 4, 6, 14, a, b, w, h, s
+cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
     sub aq, (384+16-1)*4
     sub bq, (384+16-1)*2
     add hd, 2
-    lea r5, [sgr_x_by_xplus1]
-    pxor m6, m6
-    vpbroadcastd m7, [pw_82]
+    lea r5, [sgr_x_by_x-0xf03]
 %ifidn sd, sm
-    movd xm8, sd
-    vpbroadcastd m8, xm8
+    movd xm6, sd
+    vpbroadcastd m6, xm6
 %else
-    vpbroadcastd m8, sm
+    vpbroadcastd m6, sm
 %endif
-    vpbroadcastd m9, [pd_0x80000]
-    vpbroadcastd m10, [pd_255]
-    psrad m12, m9, 8 ; pd_2048
-    psrad m11, m9, 11 ; pd_256
-    pcmpeqb m13, m13
+    vpbroadcastd m8, [pd_0xf0080029]
+    vpbroadcastd m9, [pw_256]
+    pcmpeqb m7, m7
+    psrld m10, m9, 15 ; pd_512
     DEFINE_ARGS a, b, w, h, x
 .loop_y:
     mov xq, -2
 .loop_x:
-    movu xm0, [aq+xq*4+ 0]
-    movu xm1, [aq+xq*4+16]
-    vinserti128 m0, [aq+xq*4+32], 1
-    vinserti128 m1, [aq+xq*4+48], 1
-    movu m2, [bq+xq*2]
-    pslld m3, m0, 5 ; aa * 32 [first half]
-    pslld m4, m1, 5 ; aa * 32 [second half]
-    paddd m3, m0 ; aa * 33 [first half]
-    paddd m4, m1 ; aa * 33 [second half]
-    pslld m0, 3 ; aa * 8 [first half]
-    pslld m1, 3 ; aa * 8 [second half]
-    psubd m3, m0 ; aa * 25 [first half]
-    psubd m4, m1 ; aa * 25 [second half]
-    punpcklwd m0, m2, m6
-    punpckhwd m2, m6
-    pmaddwd m1, m0, m0
-    pmaddwd m5, m2, m2
-    paddw m0, m0
-    paddw m2, m2
-    psubd m3, m1 ; p = aa * 25 - bb * bb [first half]
-    psubd m4, m5 ; p = aa * 25 - bb * bb [second half]
-    pmulld m3, m8
-    pmulld m4, m8
-    paddd m3, m9
-    paddd m4, m9
-    psrld m3, 20 ; z [first half]
-    psrld m4, 20 ; z [second half]
-    pminsd m3, m10
-    pminsd m4, m10
-    mova m5, m13
-    vpgatherdd m1, [r5+m3*4], m5 ; xx [first half]
-    mova m5, m13
-    vpgatherdd m3, [r5+m4*4], m5 ; xx [second half]
-    psubd m5, m11, m1
-    psubd m4, m11, m3
-    packssdw m1, m3
-    pmullw m5, m7
-    pmullw m4, m7
-    pmaddwd m5, m0
-    pmaddwd m4, m2
-    paddd m5, m12
-    paddd m4, m12
-    psrad m5, 12
-    psrad m4, 12
-    movu [bq+xq*2], m1
-    movu [aq+xq*4+ 0], xm5
-    movu [aq+xq*4+16], xm4
-    vextracti128 [aq+xq*4+32], m5, 1
-    vextracti128 [aq+xq*4+48], m4, 1
+    pmovzxwd m0, [bq+xq*2+ 0]
+    pmovzxwd m1, [bq+xq*2+16]
+    movu m2, [aq+xq*4+ 0]
+    movu m3, [aq+xq*4+32]
+    pslld m4, m2, 3 ; aa * 8
+    pslld m5, m3, 3
+    paddd m2, m4 ; aa * 9
+    paddd m3, m5
+    paddd m4, m4 ; aa * 16
+    paddd m5, m5
+    paddd m2, m4 ; aa * 25
+    paddd m3, m5
+    pmaddwd m4, m0, m0
+    pmaddwd m5, m1, m1
+    psubd m2, m4 ; p = aa * 25 - bb * bb
+    psubd m3, m5
+    pmulld m2, m6
+    pmulld m3, m6
+    paddusw m2, m8
+    paddusw m3, m8
+    psrld m2, 20 ; z
+    psrld m3, 20
+    mova m5, m7
+    vpgatherdd m4, [r5+m2], m5 ; xx
+    mova m5, m7
+    vpgatherdd m2, [r5+m3], m5
+    psrld m4, 24
+    psrld m2, 24
+    packssdw m3, m4, m2
+    pmullw m4, m8
+    pmullw m2, m8
+    psubw m3, m9, m3
+    vpermq m3, m3, q3120
+    pmaddwd m0, m4
+    pmaddwd m1, m2
+    paddd m0, m10
+    paddd m1, m10
+    psrld m0, 10
+    psrld m1, 10
+    movu [bq+xq*2], m3
+    movu [aq+xq*4+ 0], m0
+    movu [aq+xq*4+32], m1
     add xd, 16
     cmp xd, wd
     jl .loop_x
......
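Note: sgr_calc_ab2 applies the same restructuring to the 5x5 box path, using the same byte-table gather but with pd_0xf0080029 (low word 0x29 = 41) and the pd_512 rounding bias with a final shift by 10 in place of 455, 2048 and 12. That appears equivalent to the generic form with a 25-pixel reciprocal of 164, since 164 = 41 * 4 and 2048 = 512 * 4; the 164 value is assumed from the usual round(2^12 / 25) and is not shown in this diff. A small check of that identity over the relevant SGR ranges:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* (x * b * 164 + 2048) >> 12 == (x * b * 41 + 512) >> 10 for all inputs,
         * because 164 == 41 * 4 and 2048 == 512 * 4. Spot-check the SGR ranges. */
        for (uint32_t x = 0; x <= 255; x++)
            for (uint32_t b = 0; b <= 255 * 25; b += 37)
                assert(((x * b * 164 + 2048) >> 12) == ((x * b * 41 + 512) >> 10));
        return 0;
    }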