Commit 93c4bea2 authored by Henrik Gramner's avatar Henrik Gramner

x86: Add pal_pred AVX2 asm

parent 0ba64ee5
......@@ -70,7 +70,7 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn);
*/
#define decl_pal_pred_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
const uint8_t *idx, const int w, const int h)
const uint8_t *idx, int w, int h)
typedef decl_pal_pred_fn(*pal_pred_fn);
typedef struct Dav1dIntraPredDSPContext {
......
......@@ -93,6 +93,7 @@ JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
SECTION .text
......@@ -1515,4 +1516,82 @@ cglobal ipred_cfl_128, 3, 7, 6, dst, stride, tl, w, h, ac, alpha
movifnidn acq, acmp
jmp wq
cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
lea r2, [pal_pred_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r2+wq*4]
packuswb m4, m4
add wq, r2
lea r2, [strideq*3]
jmp wq
.w4:
pshufb xm0, xm4, [idxq]
add idxq, 16
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
pextrd [dstq+strideq*2], xm0, 2
pextrd [dstq+r2 ], xm0, 3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb xm0, xm4, [idxq+16*0]
pshufb xm1, xm4, [idxq+16*1]
add idxq, 16*2
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r2 ], xm1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
add idxq, 32*2
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], xm1
vextracti128 [dstq+r2 ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
mova [dstq+strideq*0+32*0], m0
mova [dstq+strideq*0+32*1], m1
mova [dstq+strideq*1+32*0], m2
mova [dstq+strideq*1+32*1], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET
%endif
......@@ -44,6 +44,8 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_128_avx2);
decl_cfl_pred_fn(dav1d_ipred_cfl_top_avx2);
decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
decl_pal_pred_fn(dav1d_pal_pred_avx2);
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
......@@ -65,5 +67,7 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_avx2;
c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_avx2;
c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_avx2;
c->pal_pred = dav1d_pal_pred_avx2;
#endif
}
......@@ -142,10 +142,42 @@ static void check_cfl_pred(Dav1dIntraPredDSPContext *const c) {
report("cfl_pred");
}
static void check_pal_pred(Dav1dIntraPredDSPContext *const c) {
ALIGN_STK_32(pixel, c_dst, 64 * 64,);
ALIGN_STK_32(pixel, a_dst, 64 * 64,);
ALIGN_STK_32(uint8_t, idx, 64 * 64,);
ALIGN_STK_16(uint16_t, pal, 8,);
declare_func(void, pixel *dst, ptrdiff_t stride, const uint16_t *pal,
const uint8_t *idx, int w, int h);
for (int w = 4; w <= 64; w <<= 1)
if (check_func(c->pal_pred, "pal_pred_w%d_%dbpc", w, BITDEPTH))
for (int h = imax(w / 4, 4); h <= imin(w * 4, 64); h <<= 1)
{
const ptrdiff_t stride = w * sizeof(pixel);
for (int i = 0; i < 8; i++)
pal[i] = rand() & ((1 << BITDEPTH) - 1);
for (int i = 0; i < w * h; i++)
idx[i] = rand() & 7;
call_ref(c_dst, stride, pal, idx, w, h);
call_new(a_dst, stride, pal, idx, w, h);
if (memcmp(c_dst, a_dst, w * h * sizeof(*c_dst)))
fail();
bench_new(a_dst, stride, pal, idx, w, h);
}
report("pal_pred");
}
void bitfn(checkasm_check_ipred)(void) {
Dav1dIntraPredDSPContext c;
bitfn(dav1d_intra_pred_dsp_init)(&c);
check_intra_pred(&c);
check_cfl_pred(&c);
check_pal_pred(&c);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment