Commit 2e8a3a21 authored by Martin Storsjö

arm64: msac: Add handwritten versions of msac_decode_bool functions

GCC                     Cortex A53   A72   A73
msac_decode_bool_c:           29.9  17.9  23.2
msac_decode_bool_neon:        27.4  15.3  20.4
msac_decode_bool_adapt_c:     49.2  26.8  31.0
msac_decode_bool_adapt_neon:  38.2  22.2  25.4
msac_decode_bool_equi_c:      26.6  16.8  19.4
msac_decode_bool_equi_neon:   23.9  13.7  15.7

Clang                   Cortex A53   A72   A73
msac_decode_bool_c:           28.0  16.4  23.1
msac_decode_bool_neon:        26.9  14.6  21.0
msac_decode_bool_adapt_c:     46.8  25.1  31.4
msac_decode_bool_adapt_neon:  36.2  19.0  26.2
msac_decode_bool_equi_c:      23.7  13.4  18.8
msac_decode_bool_equi_neon:   23.7  11.3  14.2

This is as fast as, or faster than, what either GCC or Clang
produces.
parent 84f938ec
Pipeline #6980 passed with stages
in 8 minutes and 11 seconds
......@@ -215,6 +215,7 @@ L(renorm):
eor w5, w5, #16 // d = clz(rng) ^ 16
mvn x7, x7 // ~dif
add x7, x7, x3, lsl #48 // ~dif + (v << 48)
L(renorm2):
lsl w4, w4, w5 // rng << d
subs w6, w6, w5 // cnt -= d
lsl x7, x7, x5 // (~dif + (v << 48)) << d
......@@ -278,3 +279,86 @@ function msac_decode_symbol_adapt16_neon, export=1
decode_update .8h, .16b, 16
b L(renorm)
endfunc
// msac_decode_bool_equi_neon: x0 = pointer to the decoder context.
// Decodes one bit using a split point derived only from the current range:
//   v = ((rng & 0xff00) + 8) >> 1
// The decoded bit is left in w15 and the new range / inverted dif are handed
// to the shared L(renorm2) tail in w4 / x7, with cnt in w6 and the shift in w5.
// NOTE(review): the tail is only partly visible here; it presumably stores the
// updated state, releases the 48-byte stack area and returns w15 - confirm.
function msac_decode_bool_equi_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
// Reserve 48 bytes of stack; nothing in this function uses them.
// NOTE(review): presumably done to match the frame the shared tail expects.
sub sp, sp, #48
ldr x7, [x0, #DIF]
bic w4, w5, #0xff // r &= 0xff00
add w4, w4, #8
// w4 still holds 2*v here, so a shift by 47 gives v << 48.
subs x8, x7, x4, lsl #47 // dif - vw
// The next two instructions do not set flags, so the result of the subs
// above is still what cset/csel test below.
lsr w4, w4, #1 // v
sub w5, w5, w4 // r - v
// w15 = 1 when dif < vw (unsigned), else 0. Note that "ret" in the two
// comments below refers to the opposite condition (hs: dif >= vw).
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
// Same three steps as the visible start of L(renorm), minus its
// "add (v << 48)", which is why the branch goes to L(renorm2) instead.
clz w5, w4 // clz(rng)
mvn x7, x7 // ~dif
// For a 32-bit clz result in 16..31, xor with 16 is the same as subtracting 16.
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
// msac_decode_bool_neon: x0 = pointer to the decoder context, w1 = f.
// Decodes one bit using a caller-supplied probability f:
//   v = (((rng >> 8) * (f & ~63)) >> 7) + 4
// The decoded bit is left in w15 and the new range / inverted dif are handed
// to the shared L(renorm2) tail in w4 / x7, with cnt in w6 and the shift in w5.
// NOTE(review): the tail is only partly visible here; it presumably stores the
// updated state, releases the 48-byte stack area and returns w15 - confirm.
function msac_decode_bool_neon, export=1
ldp w5, w6, [x0, #RNG] // + CNT
// Reserve 48 bytes of stack; nothing in this function uses them.
// NOTE(review): presumably done to match the frame the shared tail expects.
sub sp, sp, #48
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
bic w1, w1, #0x3f // f &= ~63
mul w4, w4, w1
lsr w4, w4, #7
add w4, w4, #4 // v
subs x8, x7, x4, lsl #48 // dif - vw
// sub does not set flags, so cset/csel below still test the subs result.
sub w5, w5, w4 // r - v
// w15 = 1 when dif < vw (unsigned), else 0. Note that "ret" in the two
// comments below refers to the opposite condition (hs: dif >= vw).
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
// Same three steps as the visible start of L(renorm), minus its
// "add (v << 48)", which is why the branch goes to L(renorm2) instead.
clz w5, w4 // clz(rng)
mvn x7, x7 // ~dif
// For a 32-bit clz result in 16..31, xor with 16 is the same as subtracting 16.
eor w5, w5, #16 // d = clz(rng) ^ 16
b L(renorm2)
endfunc
// msac_decode_bool_adapt_neon: x0 = pointer to the decoder context,
// x1 = pointer to two 16-bit values, cdf[0] (probability) and cdf[1] (count).
// Decodes one bit with
//   v = (((rng >> 8) * (cdf[0] & ~63)) >> 7) + 4
// and, when the context's ALLOW_UPDATE_CDF field is non-zero, adapts cdf[0]
// towards the decoded bit and bumps the count in cdf[1].
// The decoded bit is left in w15 and the new range / inverted dif are handed
// to the shared L(renorm2) tail in w4 / x7, with cnt in w6 and the shift in w5.
// NOTE(review): the tail is only partly visible here; it presumably stores the
// updated state, releases the 48-byte stack area and returns w15 - confirm.
function msac_decode_bool_adapt_neon, export=1
// One 32-bit load fetches both halfwords: low 16 bits = cdf[0],
// high 16 bits = cdf[1] (they are split apart again further down).
ldr w9, [x1] // cdf[0-1]
ldp w5, w6, [x0, #RNG] // + CNT
// Reserve 48 bytes of stack; nothing in this function uses them.
// NOTE(review): presumably done to match the frame the shared tail expects.
sub sp, sp, #48
ldr x7, [x0, #DIF]
lsr w4, w5, #8 // r >> 8
// The 0xffc0 mask both drops the cdf[1] half and clears the low 6 bits.
and w2, w9, #0xffc0 // f &= ~63
mul w4, w4, w2
lsr w4, w4, #7
add w4, w4, #4 // v
subs x8, x7, x4, lsl #48 // dif - vw
// sub and ldr do not set flags, so cset/csel below still test the subs result.
sub w5, w5, w4 // r - v
// w15 = 1 when dif < vw (unsigned), else 0. Note that "ret" in the two
// comments below refers to the opposite condition (hs: dif >= vw).
cset w15, lo
csel w4, w5, w4, hs // if (ret) v = r - v;
csel x7, x8, x7, hs // if (ret) dif = dif - vw;
ldr w10, [x0, #ALLOW_UPDATE_CDF]
// Same three steps as the visible start of L(renorm), minus its
// "add (v << 48)", which is why the branches go to L(renorm2) instead.
clz w5, w4 // clz(rng)
mvn x7, x7 // ~dif
// For a 32-bit clz result in 16..31, xor with 16 is the same as subtracting 16.
eor w5, w5, #16 // d = clz(rng) ^ 16
// Skip the CDF adaptation entirely when updates are disabled.
cbz w10, L(renorm2)
lsr w2, w9, #16 // count = cdf[1]
and w9, w9, #0xffff // cdf[0]
// NOTE(review): count >> 5 only equals (count >= 32) while count < 64;
// presumably the count saturates at 32 - confirm.
sub w3, w2, w2, lsr #5 // count - (count >= 32)
lsr w2, w2, #4 // count >> 4
add w10, w3, #1 // count + (count < 32)
// add and or give the same result here as long as count >> 4 is below 4.
add w2, w2, #4 // rate = (count >> 4) | 4
// With bit = w15: bit == 0 leaves cdf[0] unchanged in both steps below,
// bit == 1 subtracts 1 and then a further 32768.
sub w9, w9, w15 // cdf[0] -= bit
sub w11, w9, w15, lsl #15 // {cdf[0], cdf[0] - 32769}
// Arithmetic shift: the bit == 1 value can be negative.
asr w11, w11, w2 // {cdf[0], cdf[0] - 32769} >> rate
sub w9, w9, w11 // cdf[0]
// Write back the adapted probability and the new count.
strh w9, [x1]
strh w10, [x1, #2]
b L(renorm2)
endfunc
......@@ -61,12 +61,15 @@ unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
/* Prototypes for the hand-written arm64 boolean decoders.
 * NOTE(review): the assembly defines these as msac_decode_bool*_neon with
 * export=1; presumably the function macro adds the dav1d_ prefix - confirm. */
/* Decodes one bit using the probability in cdf[0]; the assembly rewrites
 * cdf[0] and cdf[1] when the context's ALLOW_UPDATE_CDF field is non-zero. */
unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
/* Decodes one bit with a split derived from the current range only. */
unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
/* Decodes one bit using the probability f; its low 6 bits are ignored. */
unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
/* Map the generic decoder names onto the implementations used in this
 * branch of the architecture #if chain. */
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
/* NOTE(review): this is a diff rendering whose +/- markers were lost. Going by
 * the hunk header (12 old lines, 15 new lines), the three _c mappings directly
 * below are the lines the commit removes, and the three _neon mappings after
 * them are their replacements - they are not meant to coexist. */
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_c
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_c
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_c
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon
#elif ARCH_X86_64 && HAVE_ASM
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
......
......@@ -171,6 +171,9 @@ void checkasm_check_msac(void) {
c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon;
c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon;
c.symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_neon;
c.bool_adapt = dav1d_msac_decode_bool_adapt_neon;
c.bool_equi = dav1d_msac_decode_bool_equi_neon;
c.bool = dav1d_msac_decode_bool_neon;
}
#elif ARCH_X86_64 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment