Commit fa1b2651 authored by Henrik Gramner, committed by Henrik Gramner

x86-64: Add msac_decode_symbol_adapt SSE2 asm

Also make various minor optimizations/style fixes to the MSAC C functions.
parent 44d0de41
Pipeline #6311 passed with stages
in 6 minutes and 2 seconds
......@@ -813,7 +813,7 @@ static const uint16_t default_mv_joint_cdf[N_MV_JOINTS + 1] = {
AOM_CDF4(4096, 11264, 19328)
};
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1] = {
static const uint16_t default_kf_y_mode_cdf[5][5][N_INTRA_PRED_MODES + 1 + 2] = {
{
{ AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
24189, 28165, 29093, 30466) },
......
......@@ -34,11 +34,13 @@
#include "src/ref.h"
#include "src/thread_data.h"
/* Buffers padded to [8] or [16] for SIMD where needed. */
typedef struct CdfModeContext {
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1];
uint16_t y_mode[4][N_INTRA_PRED_MODES + 1 + 2];
uint16_t use_filter_intra[N_BS_SIZES][2];
uint16_t filter_intra[5 + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1];
uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 1 + 1];
uint16_t angle_delta[8][8];
uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1];
uint16_t newmv_mode[6][2];
......@@ -66,7 +68,7 @@ typedef struct CdfModeContext {
uint16_t txtp_intra[3][N_TX_SIZES][N_INTRA_PRED_MODES][N_TX_TYPES + 1];
uint16_t skip[3][2];
uint16_t skip_mode[3][2];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1];
uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 1 + 5];
uint16_t seg_pred[3][2];
uint16_t seg_id[3][DAV1D_MAX_SEGMENTS + 1];
uint16_t cfl_sign[8 + 1];
......@@ -88,12 +90,12 @@ typedef struct CdfModeContext {
typedef struct CdfCoefContext {
uint16_t skip[N_TX_SIZES][13][2];
uint16_t eob_bin_16[2][2][6];
uint16_t eob_bin_32[2][2][7];
uint16_t eob_bin_32[2][2][7 + 1];
uint16_t eob_bin_64[2][2][8];
uint16_t eob_bin_128[2][2][9];
uint16_t eob_bin_256[2][2][10];
uint16_t eob_bin_512[2][2][11];
uint16_t eob_bin_1024[2][2][12];
uint16_t eob_bin_256[2][2][10 + 6];
uint16_t eob_bin_512[2][2][11 + 5];
uint16_t eob_bin_1024[2][2][12 + 4];
uint16_t eob_hi_bit[N_TX_SIZES][2][11 /*22*/][2];
uint16_t eob_base_tok[N_TX_SIZES][2][4][4];
uint16_t base_tok[N_TX_SIZES][2][41][5];
......@@ -102,7 +104,7 @@ typedef struct CdfCoefContext {
} CdfCoefContext;
typedef struct CdfMvComponent {
uint16_t classes[11 + 1];
uint16_t classes[11 + 1 + 4];
uint16_t class0[2];
uint16_t classN[10][2];
uint16_t class0_fp[2][4 + 1];
......@@ -119,7 +121,7 @@ typedef struct CdfMvContext {
typedef struct CdfContext {
CdfModeContext m;
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1];
uint16_t kfym[5][5][N_INTRA_PRED_MODES + 1 + 2];
CdfCoefContext coef;
CdfMvContext mv, dmv;
} CdfContext;
......
This diff is collapsed.
......@@ -119,6 +119,7 @@ if is_asm_enabled
# NASM source files
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
)
if dav1d_bitdepths.contains('8')
......
......@@ -58,8 +58,8 @@ static inline void ctx_refill(MsacContext *s) {
* necessary), and stores them back in the decoder context.
* dif: The new value of dif.
* rng: The new value of the range. */
static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
const uint16_t d = 15 - (31 ^ clz(rng));
static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
const int d = 15 ^ (31 ^ clz(rng));
assert(rng <= 65535U);
s->cnt -= d;
s->dif = ((dif + 1) << d) - 1; /* Shift in 1s in the LSBs */
......@@ -69,18 +69,17 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, uint32_t rng) {
}
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
ec_win vw, dif = s->dif;
unsigned ret, v, r = s->rng;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
// When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
// replace the multiply with a simple shift.
v = ((r >> 8) << 7) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
vw = (ec_win)v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, (unsigned) v);
ctx_norm(s, dif, v);
return !ret;
}
......@@ -88,59 +87,57 @@ unsigned dav1d_msac_decode_bool_equi(MsacContext *const s) {
* f: The probability that the bit is one
* Return: The value decoded (0 or 1). */
unsigned dav1d_msac_decode_bool(MsacContext *const s, const unsigned f) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
ec_win vw, dif = s->dif;
unsigned ret, v, r = s->rng;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
vw = (ec_win)v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, (unsigned) v);
ctx_norm(s, dif, v);
return !ret;
}
unsigned dav1d_msac_decode_bools(MsacContext *const c, const unsigned l) {
int v = 0;
for (int n = (int) l - 1; n >= 0; n--)
v = (v << 1) | dav1d_msac_decode_bool_equi(c);
unsigned dav1d_msac_decode_bools(MsacContext *const s, unsigned n) {
unsigned v = 0;
while (n--)
v = (v << 1) | dav1d_msac_decode_bool_equi(s);
return v;
}
int dav1d_msac_decode_subexp(MsacContext *const c, const int ref,
int dav1d_msac_decode_subexp(MsacContext *const s, const int ref,
const int n, const unsigned k)
{
int i = 0;
int a = 0;
int b = k;
while ((2 << b) < n) {
if (!dav1d_msac_decode_bool_equi(c)) break;
if (!dav1d_msac_decode_bool_equi(s)) break;
b = k + i++;
a = (1 << b);
}
const unsigned v = dav1d_msac_decode_bools(c, b) + a;
const unsigned v = dav1d_msac_decode_bools(s, b) + a;
return ref * 2 <= n ? inv_recenter(ref, v) :
n - 1 - inv_recenter(n - 1 - ref, v);
}
int dav1d_msac_decode_uniform(MsacContext *const c, const unsigned n) {
int dav1d_msac_decode_uniform(MsacContext *const s, const unsigned n) {
assert(n > 0);
const int l = ulog2(n) + 1;
assert(l > 1);
const unsigned m = (1 << l) - n;
const unsigned v = dav1d_msac_decode_bools(c, l - 1);
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(c);
const unsigned v = dav1d_msac_decode_bools(s, l - 1);
return v < m ? v : (v << 1) - m + dav1d_msac_decode_bool_equi(s);
}
/* Decodes a symbol given an inverse cumulative distribution function (CDF)
* table in Q15. */
static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
const unsigned n_symbols)
const size_t n_symbols)
{
ec_win u, v = s->rng, r = s->rng >> 8;
const ec_win c = s->dif >> (EC_WIN_SIZE - 16);
unsigned ret = 0;
const unsigned c = s->dif >> (EC_WIN_SIZE - 16);
unsigned u, v = s->rng, r = s->rng >> 8, ret = 0;
assert(!cdf[n_symbols - 1]);
......@@ -153,39 +150,34 @@ static unsigned decode_symbol(MsacContext *const s, const uint16_t *const cdf,
assert(u <= s->rng);
ctx_norm(s, s->dif - (v << (EC_WIN_SIZE - 16)), (unsigned) (u - v));
ctx_norm(s, s->dif - ((ec_win)v << (EC_WIN_SIZE - 16)), u - v);
return ret - 1;
}
static void update_cdf(uint16_t *const cdf, const unsigned val,
const unsigned n_symbols)
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *const s,
uint16_t *const cdf,
const size_t n_symbols)
{
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
unsigned i;
for (i = 0; i < val; i++)
cdf[i] += (32768 - cdf[i]) >> rate;
for (; i < n_symbols - 1; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *const c,
uint16_t *const cdf,
const unsigned n_symbols)
{
const unsigned val = decode_symbol(c, cdf, n_symbols);
if(c->allow_update_cdf)
update_cdf(cdf, val, n_symbols);
const unsigned val = decode_symbol(s, cdf, n_symbols);
if (s->allow_update_cdf) {
const unsigned count = cdf[n_symbols];
const int rate = ((count >> 4) | 4) + (n_symbols > 3);
unsigned i;
for (i = 0; i < val; i++)
cdf[i] += (32768 - cdf[i]) >> rate;
for (; i < n_symbols - 1; i++)
cdf[i] -= cdf[i] >> rate;
cdf[n_symbols] = count + (count < 32);
}
return val;
}
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const c,
unsigned dav1d_msac_decode_bool_adapt(MsacContext *const s,
uint16_t *const cdf)
{
const unsigned bit = dav1d_msac_decode_bool(c, *cdf);
const unsigned bit = dav1d_msac_decode_bool(s, *cdf);
if(c->allow_update_cdf){
if (s->allow_update_cdf) {
// update_cdf() specialized for boolean CDFs
const unsigned count = cdf[1];
const int rate = (count >> 4) | 4;
......
......@@ -38,20 +38,37 @@ typedef struct MsacContext {
const uint8_t *buf_pos;
const uint8_t *buf_end;
ec_win dif;
uint16_t rng;
unsigned rng;
int cnt;
int allow_update_cdf;
} MsacContext;
void dav1d_msac_init(MsacContext *c, const uint8_t *data, size_t sz,
void dav1d_msac_init(MsacContext *s, const uint8_t *data, size_t sz,
int disable_cdf_update_flag);
unsigned dav1d_msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
const unsigned n_symbols);
unsigned dav1d_msac_decode_bool_equi(MsacContext *const s);
unsigned dav1d_msac_decode_symbol_adapt_c(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_bool_equi(MsacContext *s);
unsigned dav1d_msac_decode_bool(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bools(MsacContext *c, unsigned l);
int dav1d_msac_decode_subexp(MsacContext *c, int ref, int n, unsigned k);
int dav1d_msac_decode_uniform(MsacContext *c, unsigned n);
unsigned dav1d_msac_decode_bools(MsacContext *s, unsigned n);
int dav1d_msac_decode_subexp(MsacContext *s, int ref, int n, unsigned k);
int dav1d_msac_decode_uniform(MsacContext *s, unsigned n);
/* Supported n_symbols ranges: adapt4: 1-5, adapt8: 1-8, adapt16: 4-16 */
#if ARCH_X86_64 && HAVE_ASM
unsigned dav1d_msac_decode_symbol_adapt4_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt8_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_symbol_adapt16_sse2(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_sse2
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_sse2
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_sse2
#else
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt_c
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt_c
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt_c
#endif
#endif /* DAV1D_SRC_MSAC_H */
......@@ -107,7 +107,9 @@ static int decode_coefs(Dav1dTileContext *const t,
uint16_t *const txtp_cdf = intra ?
ts->cdf.m.txtp_intra[set_idx][t_dim->min][y_mode_nofilt] :
ts->cdf.m.txtp_inter[set_idx][t_dim->min];
idx = dav1d_msac_decode_symbol_adapt(&ts->msac, txtp_cdf, set_cnt);
idx = (set_cnt <= 8 ? dav1d_msac_decode_symbol_adapt8 :
dav1d_msac_decode_symbol_adapt16)(&ts->msac, txtp_cdf, set_cnt);
if (dbg)
printf("Post-txtp[%d->%d][%d->%d][%d][%d->%d]: r=%d\n",
set, set_idx, tx, t_dim->min, intra ? (int)y_mode_nofilt : -1,
......@@ -122,19 +124,19 @@ static int decode_coefs(Dav1dTileContext *const t,
const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
const int is_1d = tx_class != TX_CLASS_2D;
switch (tx2dszctx) {
#define case_sz(sz, bin) \
#define case_sz(sz, bin, ns) \
case sz: { \
uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma][is_1d]; \
eob_bin = dav1d_msac_decode_symbol_adapt(&ts->msac, eob_bin_cdf, 5 + sz); \
eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 5 + sz); \
break; \
}
case_sz(0, 16);
case_sz(1, 32);
case_sz(2, 64);
case_sz(3, 128);
case_sz(4, 256);
case_sz(5, 512);
case_sz(6, 1024);
case_sz(0, 16, 4);
case_sz(1, 32, 8);
case_sz(2, 64, 8);
case_sz(3, 128, 8);
case_sz(4, 256, 16);
case_sz(5, 512, 16);
case_sz(6, 1024, 16);
#undef case_sz
}
if (dbg)
......@@ -179,8 +181,8 @@ static int decode_coefs(Dav1dTileContext *const t,
uint16_t *const lo_cdf = is_last ?
ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma][ctx] :
ts->cdf.coef.base_tok[t_dim->ctx][chroma][ctx];
int tok = dav1d_msac_decode_symbol_adapt(&ts->msac, lo_cdf,
4 - is_last) + is_last;
int tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf,
4 - is_last) + is_last;
if (dbg)
printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n",
t_dim->ctx, chroma, ctx, i, rc, tok, ts->msac.rng);
......@@ -190,7 +192,7 @@ static int decode_coefs(Dav1dTileContext *const t,
if (tok == 3) {
const int br_ctx = get_br_ctx(levels, rc, tx, tx_class);
do {
const int tok_br = dav1d_msac_decode_symbol_adapt(&ts->msac,
const int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac,
br_cdf[br_ctx], 4);
if (dbg)
printf("Post-hi_tok[%d][%d][%d][%d=%d=%d->%d]: r=%d\n",
......
; Copyright © 2019, VideoLAN and dav1d authors
; Copyright © 2019, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 64 ; avoids cacheline splits
; Unnamed table holding 4*(15-k) for k = 0..15. It must stay immediately in
; front of pw_0xff00: the decoders load from [pw_0xff00 - 2*n_symbols], so
; lane i of that load receives 4*(n_symbols-1-i) for every i < n_symbols,
; and any lanes at or beyond n_symbols read into pw_0xff00 itself.
dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
; Word mask that keeps only the high byte of each 16-bit lane (used on rng).
pw_0xff00: times 8 dw 0xff00
; Subtracted from the table load in the 16-symbol decoder to derive the
; per-lane offsets for lanes 8..15 from those of lanes 0..7.
pw_32: times 8 dw 32
; Field offsets used to access the decoder context from assembly: two
; pointers, a 64-bit dif, then three 32-bit fields.
; NOTE(review): this has to mirror the C MsacContext layout (buf_pos, buf_end,
; dif, rng, cnt, allow_update_cdf) and assumes ec_win is 64 bits wide -
; confirm whenever the C struct changes.
struc msac
.buf: resq 1
.end: resq 1
.dif: resq 1
.rng: resd 1
.cnt: resd 1
.update_cdf: resd 1
endstruc
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
SECTION .text
; Scratch memory and temp-register selection per ABI.
; 'buf' is a small stack scratch area addressed relative to rsp without
; adjusting rsp. The decoders below store rng and the vector of candidate
; range boundaries there, then read back the entries for the decoded symbol.
; t0 is the register that holds the context pointer once ecx/cl is needed as
; a shift count (see 'movifnidn t0, sq' in the renormalization code).
; NOTE(review): DECLARE_REG_TMP presumably aliases t0 to r3 on WIN64 and to
; r0 elsewhere, per x86inc - confirm against x86inc.asm.
%if WIN64
DECLARE_REG_TMP 3
%define buf rsp+8 ; shadow space
%else
DECLARE_REG_TMP 0
%define buf rsp-40 ; red zone
%endif
INIT_XMM sse2
; unsigned msac_decode_symbol_adapt4(MsacContext *s, uint16_t *cdf,
;                                    size_t n_symbols)
;
; Decodes one symbol using four CDF lanes at once (cdf is read and written
; with 8-byte movq accesses), optionally adapts the CDF, then renormalizes
; the decoder state and refills dif from the input buffer when needed.
; The C header lists the supported n_symbols range for this variant as 1-5.
; Returns the decoded symbol index in eax.
;
; The labels .renorm and .renorm2 are also jump targets for the adapt8 and
; adapt16 variants, so their register/stack contracts (documented at the
; labels) must be preserved.
cglobal msac_decode_symbol_adapt4, 3, 7, 6, s, cdf, ns
movd m2, [sq+msac.rng]
movq m1, [cdfq]
lea rax, [pw_0xff00]
movq m3, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
; r4 keeps +n_symbols (index of the adaptation counter cdf[n_symbols]);
; ns becomes -n_symbols for addressing the offset table in front of pw_0xff00.
mov r4d, nsd
neg nsq
; Broadcast the low word of rng and store it at buf+12/buf+14. The word at
; buf+14 sits directly below the v[] array stored at buf+16, so it acts as
; the upper bound "u" when symbol 0 is decoded.
pshuflw m2, m2, q0000
movd [buf+12], m2
pand m2, [rax]
; m0 keeps the unmodified cdf for the adaptation step below.
mova m0, m1
; ((cdf >> 6) << 7) * (rng & 0xff00) >> 16 == ((cdf >> 6) * (rng >> 8)) >> 1
psrlw m1, 6
psllw m1, 7
pmulhuw m1, m2
; Per-lane minimum-probability offset 4*(n_symbols-1-i).
movq m2, [rax+nsq*2]
; c = top 16 bits of dif, broadcast to the low four lanes.
pshuflw m3, m3, q3333
paddw m1, m2
; v[i] for every lane is kept on the stack; .renorm picks v and u from it.
mova [buf+16], m1
; Saturating v - c is zero exactly where c >= v.
psubusw m1, m3
pxor m2, m2
pcmpeqw m1, m2 ; c >= v
; Two mask bits per 16-bit lane.
pmovmskb eax, m1
test r3d, r3d
jz .renorm ; !allow_update_cdf
; update_cdf:
movzx r3d, word [cdfq+r4*2] ; count
; m2 = all ones (-1 in every lane).
pcmpeqw m2, m2
mov r2d, r3d
shr r3d, 4
; cmp sets CF when n_symbols < 4; sbb with -5 then adds 5 - CF.
cmp r4d, 4
sbb r3d, -5 ; (count >> 4) + (n_symbols > 3) + 4
cmp r2d, 32
adc r2d, 0 ; count + (count < 32)
movd m3, r3d
; avg(0xffff, mask): mask lanes of 0xffff stay 0xffff, zero lanes give 0x8000.
pavgw m2, m1 ; i >= val ? -1 : 32768
psubw m2, m0 ; for (i = 0; i < val; i++)
psubw m0, m1 ; cdf[i] += (32768 - cdf[i]) >> rate;
psraw m2, m3 ; for (; i < n_symbols - 1; i++)
paddw m0, m2 ; cdf[i] += (( -1 - cdf[i]) >> rate) + 1;
; The 8-byte store may cover the counter slot (when n_symbols < 4), so the
; updated counter is written afterwards.
movq [cdfq], m0
mov [cdfq+r4*2], r2w
; Entry contract (shared with adapt8): eax = pmovmskb mask with two bits per
; lane, sq = context, word [buf+14] = rng, v[] stored from buf+16.
; NOTE(review): tzcnt is used in an sse2 function; on CPUs without BMI1 it
; presumably executes as bsf, which matches only for a non-zero mask. The mask
; looks non-zero because the terminating cdf entry is 0 (so its v is 0) -
; confirm.
.renorm:
; eax = 2 * symbol index = byte offset of v[symbol] within the v[] array.
tzcnt eax, eax
mov r4, [sq+msac.dif]
movzx r1d, word [buf+rax+16] ; v
movzx r2d, word [buf+rax+14] ; u
; Convert the byte offset into the symbol index (the return value).
shr eax, 1
; Entry contract (shared with adapt16): r4 = dif, r1d = v, r2d = u,
; eax = return value, sq = context.
.renorm2:
; Work on ~dif: adding v << 48 to the complement equals complementing
; dif - (v << 48), and left-shifting the complement shifts in zeros that
; become ones after the final 'not' (dif = ((dif + 1) << d) - 1).
not r4
sub r2d, r1d ; rng
shl r1, 48
add r4, r1 ; ~dif
mov r1d, [sq+msac.cnt]
; ecx/cl is needed as the shift count from here on; the context pointer is
; accessed through t0 for the remainder of the function.
movifnidn t0, sq
; d = 15 ^ (index of the highest set bit of the new range).
bsr ecx, r2d
xor ecx, 15 ; d
shl r2d, cl
shl r4, cl
mov [t0+msac.rng], r2d
not r4
; cnt -= d; a negative result means dif needs more input bits.
sub r1d, ecx
jge .end ; no refill required
; refill:
mov r2, [t0+msac.buf]
mov rcx, [t0+msac.end]
; Take the byte-wise path when buf + 8 would pass the end of the buffer, so
; the 8-byte load below never reads beyond buf_end.
lea r5, [r2+8]
cmp r5, rcx
jg .refill_eob
mov r2, [r2]
lea ecx, [r1+23]
add r1d, 16
shr ecx, 3 ; shift_bytes
; Treat the eight loaded bytes as a big-endian value.
bswap r2
; New buffer position = old position + 8 - shift_bytes.
sub r5, rcx
shl ecx, 3 ; shift_bits
shr r2, cl
sub ecx, r1d ; shift_bits - 16 - cnt
mov r1d, 48
shl r2, cl
mov [t0+msac.buf], r5
sub r1d, ecx ; cnt + 64 - shift_bits
; Merge the newly read bits into dif.
xor r4, r2
.end:
mov [t0+msac.cnt], r1d
mov [t0+msac.dif], r4
RET
.refill_eob: ; avoid overreading the input buffer
; r5 = buf_end, r2 = current buffer position, ecx = c = 40 - cnt.
mov r5, rcx
mov ecx, 40
sub ecx, r1d ; c
; Consume one byte per iteration, xor-ing it into dif at bit position c,
; until either the buffer end is reached or c drops below zero.
.refill_eob_loop:
cmp r2, r5
jge .refill_eob_end ; eob reached
movzx r1d, byte [r2]
inc r2
shl r1, cl
xor r4, r1
sub ecx, 8
jge .refill_eob_loop
.refill_eob_end:
; cnt = 40 - c
mov r1d, 40
sub r1d, ecx
mov [t0+msac.buf], r2
mov [t0+msac.dif], r4
mov [t0+msac.cnt], r1d
RET
; unsigned msac_decode_symbol_adapt8(MsacContext *s, uint16_t *cdf,
;                                    size_t n_symbols)
;
; Eight-lane version of msac_decode_symbol_adapt4: the CDF is read and
; written with 16-byte movu accesses and rng/c are broadcast to all eight
; word lanes. The C header lists the supported n_symbols range as 1-8.
; The stack layout (rng word at buf+14, v[] from buf+16) matches adapt4, so
; both exits jump to adapt4's .renorm, which finishes decoding and returns.
cglobal msac_decode_symbol_adapt8, 3, 7, 6, s, cdf, ns
movd m2, [sq+msac.rng]
movu m1, [cdfq]
lea rax, [pw_0xff00]
movq m3, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
; r4 = +n_symbols (counter index), ns = -n_symbols (offset-table addressing).
mov r4d, nsd
neg nsq
; Low word of rng broadcast; the copy at buf+14 serves as "u" for symbol 0.
pshuflw m2, m2, q0000
movd [buf+12], m2
punpcklqdq m2, m2
; m0 keeps the unmodified cdf for the adaptation step.
mova m0, m1
; ((cdf >> 6) << 7) * (rng & 0xff00) >> 16 == ((cdf >> 6) * (rng >> 8)) >> 1
psrlw m1, 6
pand m2, [rax]
psllw m1, 7
pmulhuw m1, m2
; Per-lane minimum-probability offset 4*(n_symbols-1-i).
movu m2, [rax+nsq*2]
; c = top 16 bits of dif, broadcast to all eight lanes.
pshuflw m3, m3, q3333
paddw m1, m2
punpcklqdq m3, m3
; v[0..7] kept on the stack for .renorm.
mova [buf+16], m1
; Lanes become all-ones where c >= v (saturating v - c == 0).
psubusw m1, m3
pxor m2, m2
pcmpeqw m1, m2
; Two mask bits per 16-bit lane, as .renorm expects.
pmovmskb eax, m1
test r3d, r3d
; Adaptation disabled: go straight to the shared renormalization code.
jz m(msac_decode_symbol_adapt4).renorm
; CDF adaptation, same arithmetic as in adapt4:
;   rate = (count >> 4) + (n_symbols > 3) + 4, count += (count < 32)
movzx r3d, word [cdfq+r4*2]
pcmpeqw m2, m2
mov r2d, r3d
shr r3d, 4
cmp r4d, 4 ; may be called with n_symbols < 4
sbb r3d, -5
cmp r2d, 32
adc r2d, 0
movd m3, r3d
; m2 = (mask ? -1 : 32768) - cdf, shifted right arithmetically by rate;
; m0 = cdf - mask (i.e. cdf + 1 in lanes where c >= v).
pavgw m2, m1
psubw m2, m0
psubw m0, m1
psraw m2, m3
paddw m0, m2
; The 16-byte store may cover the counter slot (n_symbols < 8), so the
; updated counter is written afterwards.
movu [cdfq], m0
mov [cdfq+r4*2], r2w
jmp m(msac_decode_symbol_adapt4).renorm
; unsigned msac_decode_symbol_adapt16(MsacContext *s, uint16_t *cdf,
;                                     size_t n_symbols)
;
; Sixteen-lane version of the symbol decoder: the CDF is processed as two
; 16-byte vectors (cdf[0..7] and cdf[8..15]). The C header lists the
; supported n_symbols range as 4-16; the rate computation below relies on
; n_symbols >= 4. The stack layout differs from adapt4/adapt8 (rng word at
; buf-2, v[] from buf), so this function has its own .renorm prologue and
; then joins adapt4 at .renorm2.
cglobal msac_decode_symbol_adapt16, 3, 7, 6, s, cdf, ns
movd m4, [sq+msac.rng]
movu m2, [cdfq]
lea rax, [pw_0xff00]
movu m3, [cdfq+16]
movq m5, [sq+msac.dif]
mov r3d, [sq+msac.update_cdf]
; r4 = +n_symbols (counter index), ns = -n_symbols (offset-table addressing).
mov r4d, nsd
neg nsq
%if WIN64
sub rsp, 48 ; need 36 bytes, shadow space is only 32
%endif
; Low word of rng broadcast; the copy at buf-2 sits directly below v[0] and
; serves as "u" when symbol 0 is decoded.
pshuflw m4, m4, q0000
movd [buf-4], m4
punpcklqdq m4, m4
; m0/m1 keep the unmodified cdf halves for the adaptation step.
mova m0, m2
psrlw m2, 6
mova m1, m3
psrlw m3, 6
pand m4, [rax]
; ((cdf >> 6) << 7) * (rng & 0xff00) >> 16 == ((cdf >> 6) * (rng >> 8)) >> 1
psllw m2, 7
psllw m3, 7
pmulhuw m2, m4
pmulhuw m3, m4
; Offsets 4*(n_symbols-1-i) for lanes 0..7 ...
movu m4, [rax+nsq*2]
; c = top 16 bits of dif, broadcast to all lanes (completed by punpcklqdq).
pshuflw m5, m5, q3333
paddw m2, m4
; ... and for lanes 8..15 by subtracting 4*8 = 32 (pw_32 addressed relative
; to rax, which holds the address of pw_0xff00).
psubw m4, [rax-pw_0xff00+pw_32]
punpcklqdq m5, m5
paddw m3, m4
; v[0..15] kept on the stack for .renorm.
mova [buf], m2
mova [buf+16], m3
; Lanes become all-ones where c >= v (saturating v - c == 0).
psubusw m2, m5
psubusw m3, m5
pxor m4, m4
pcmpeqw m2, m4
pcmpeqw m3, m4
; Pack the sixteen word masks into bytes so pmovmskb yields one bit per
; symbol. NOTE(review): the three-operand form is presumably expanded by
; x86inc into a move plus a two-operand packsswb - confirm.
packsswb m5, m2, m3
pmovmskb eax, m5
test r3d, r3d
jz .renorm
; CDF adaptation: r3 = count, r2 = rate.
movzx r3d, word [cdfq+r4*2]
pcmpeqw m4, m4
mova m5, m4
; (count + 80) >> 4 == (count >> 4) + 5, i.e. the rate for n_symbols > 3.
lea r2d, [r3+80] ; only support n_symbols >= 4
shr r2d, 4
; count + (count < 32)
cmp r3d, 32
adc r3d, 0
; Per half: m4/m5 = ((mask ? -1 : 32768) - cdf) >> rate (arithmetic),
; m0/m1 = cdf - mask (cdf + 1 in lanes where c >= v), then summed.
pavgw m4, m2
pavgw m5, m3
psubw m4, m0
psubw m0, m2
movd m2, r2d
psubw m5, m1
psubw m1, m3
psraw m4, m2
psraw m5, m2
paddw m0, m4
paddw m1, m5
; The vector stores may cover the counter slot (n_symbols < 16), so the
; updated counter is written afterwards.
movu [cdfq], m0
movu [cdfq+16], m1
mov [cdfq+r4*2], r3w
.renorm:
; One mask bit per symbol here, so eax is the symbol index directly.
tzcnt eax, eax
mov r4, [sq+msac.dif]
; v = v[symbol], u = v[symbol-1] (or the rng word at buf-2 for symbol 0).
movzx r1d, word [buf+rax*2]
movzx r2d, word [buf+rax*2-2]
%if WIN64
add rsp, 48
%endif
; Continue with r4 = dif, r1d = v, r2d = u, eax = return value.
jmp m(msac_decode_symbol_adapt4).renorm2
%endif
......@@ -62,6 +62,7 @@ static const struct {
const char *name;
void (*func)(void);
} tests[] = {
{ "msac", checkasm_check_msac },
#if CONFIG_8BPC
{ "cdef_8bpc", checkasm_check_cdef_8bpc },
{ "ipred_8bpc", checkasm_check_ipred_8bpc },
......
......@@ -57,6 +57,7 @@ int xor128_rand(void);
name##_8bpc(void); \
name##_16bpc(void)
void checkasm_check_msac(void);
decl_check_bitfns(void checkasm_check_cdef);
decl_check_bitfns(void checkasm_check_ipred);
decl_check_bitfns(void checkasm_check_itx);
......
/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "tests/checkasm/checkasm.h"
#include "src/cpu.h"
#include "src/msac.h"
#include <string.h>
/* The normal code doesn't use function pointers */
/* Common signature of the symbol decoders under test; it matches the
 * dav1d_msac_decode_symbol_adapt_c and *_sse2 prototypes in src/msac.h. */
typedef unsigned (*decode_symbol_adapt_fn)(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
/* Test-local table with one decoder entry per variant (adapt4, adapt8,
 * adapt16). */
typedef struct {
decode_symbol_adapt_fn symbol_adapt4;
decode_symbol_adapt_fn symbol_adapt8;
decode_symbol_adapt_fn symbol_adapt16;
} MsacDSPContext;
/* Builds a random CDF for an n-entry alphabet inside cdf[0..16].
 *
 * Slots n+1..16 are padding and receive arbitrary random values. Slots n-1
 * and n are cleared (the symbol decoder asserts cdf[n_symbols - 1] == 0 and
 * reads its adaptation counter from cdf[n_symbols]; presumably n is passed
 * as n_symbols). The remaining slots are filled from the back so that each
 * entry is at least one larger than its successor, assuming rnd() returns a
 * non-negative value. The caller must pass n >= 1 and a buffer of at least
 * 17 entries. */
static void randomize_cdf(uint16_t *const cdf, int n) {
    /* Randomize the padding, from the top slot down to n + 1. */
    int pad = 16;
    while (pad > n) {
        cdf[pad] = rnd();
        pad--;
    }

    /* Terminating probability and adaptation counter both start at zero. */
    cdf[n - 1] = 0;
    cdf[n] = 0;

    /* Walk backwards; 'span' bounds the random increment so that enough
     * headroom below 32768 remains for the entries still to be written. */
    for (int i = n - 1; i > 0; i--) {
        const int span = 32768 - cdf[i] - i;
        cdf[i - 1] = cdf[i] + rnd() % span + 1;
    }
}
/* memcmp() on structs can have weird behavior due to padding etc. */
static int msac_cmp(const MsacContext *const a, const MsacContext *const b) {
return a->buf_pos != b->buf_pos || a->buf_end != b->buf_end ||
a->dif != b->dif || a->rng != b->rng || a->cnt != b->cnt ||