Commit 71e13008 authored by Xuefeng Jiang's avatar Xuefeng Jiang Committed by Henrik Gramner

Add SSSE3 implementation for dav1d_ipred_v and dav1d_ipred_dc

Cycle times:
intra_pred_dc_w4_8bpc_c: 1051.4
intra_pred_dc_w4_8bpc_ssse3: 58.8
intra_pred_dc_w8_8bpc_c: 1587.6
intra_pred_dc_w8_8bpc_ssse3: 75.3
intra_pred_dc_w16_8bpc_c: 2526.2
intra_pred_dc_w16_8bpc_ssse3: 103.5
intra_pred_dc_w32_8bpc_c: 2646.6
intra_pred_dc_w32_8bpc_ssse3: 179.5
intra_pred_dc_w64_8bpc_c: 4084.6
intra_pred_dc_w64_8bpc_ssse3: 356.1
intra_pred_v_w4_8bpc_c: 468.5
intra_pred_v_w4_8bpc_ssse3: 46.8
intra_pred_v_w8_8bpc_c: 839.1
intra_pred_v_w8_8bpc_ssse3: 56.7
intra_pred_v_w16_8bpc_c: 1750.5
intra_pred_v_w16_8bpc_ssse3: 73.0
intra_pred_v_w32_8bpc_c: 1552.5
intra_pred_v_w32_8bpc_ssse3: 135.4
intra_pred_v_w64_8bpc_c: 2463.6
intra_pred_v_w64_8bpc_ssse3: 305.6
parent 8c5d34c8
Pipeline #3779 passed with stages
in 5 minutes and 11 seconds
......@@ -51,7 +51,9 @@ decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
decl_pal_pred_fn(dav1d_pal_pred_avx2);
decl_angular_ipred_fn(dav1d_ipred_dc_ssse3);
decl_angular_ipred_fn(dav1d_ipred_h_ssse3);
decl_angular_ipred_fn(dav1d_ipred_v_ssse3);
void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
......@@ -59,7 +61,9 @@ void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
c->intra_pred[DC_PRED] = dav1d_ipred_dc_ssse3;
c->intra_pred[HOR_PRED] = dav1d_ipred_h_ssse3;
c->intra_pred[VERT_PRED] = dav1d_ipred_v_ssse3;
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
......@@ -39,11 +39,18 @@ SECTION_RODATA 16
%endrep
%endmacro
JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
%define ipred_dc_splat_ssse3_table (ipred_dc_ssse3_table + 10*4)
SECTION .text
JMP_TABLE ipred_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc, ssse3, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
SECTION .text
;---------------------------------------------------------------------------------------
;int dav1d_ipred_h_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
%macro IPRED_SET 3 ; width, stride, stride size pshuflw_imm8
pshuflw m1, m0, %3 ; extend 8 byte for 2 pos
punpcklqdq m1, m1
......@@ -110,3 +117,262 @@ cglobal ipred_h, 3, 6, 2, dst, stride, tl, w, h, stride3
IPRED_H 32
.w64:
IPRED_H 64
;---------------------------------------------------------------------------------------
;int dav1d_ipred_v_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_v, 3, 7, 6, dst, stride, tl, w, h, stride3
LEA r5, ipred_dc_splat_ssse3_table
tzcnt wd, wm
movu m0, [tlq+ 1]
movu m1, [tlq+17]
movu m2, [tlq+33]
movu m3, [tlq+49]
movifnidn hd, hm
movsxd wq, [r5+wq*4]
add wq, r5
lea stride3q, [strideq*3]
jmp wq
;---------------------------------------------------------------------------------------
;int dav1d_ipred_dc_ssse3(pixel *dst, const ptrdiff_t stride, const pixel *const topleft,
; const int width, const int height, const int a);
;---------------------------------------------------------------------------------------
cglobal ipred_dc, 3, 7, 6, dst, stride, tl, w, h, stride3
movifnidn hd, hm
movifnidn wd, wm
tzcnt r6d, hd
lea r5d, [wq+hq]
movd m4, r5d
tzcnt r5d, r5d
movd m5, r5d
LEA r5, ipred_dc_ssse3_table
tzcnt wd, wd
movsxd r6, [r5+r6*4]
movsxd wq, [r5+wq*4+20]
pcmpeqd m3, m3
psrlw m4, 1 ; dc = (width + height) >> 1;
add r6, r5
add wq, r5
lea stride3q, [strideq*3]
jmp r6
.h4:
movd m0, [tlq-4]
pmaddubsw m0, m3
jmp wq
.w4:
movd m1, [tlq+1]
pmaddubsw m1, m3
psubw m0, m4
paddw m0, m1
pmaddwd m0, m3
cmp hd, 4
jg .w4_mul
psrlw m0, 3 ; dc >>= ctz(width + height);
jmp .w4_end
.w4_mul:
punpckhqdq m1, m0, m0
paddw m0, m1
psrlq m1, m0, 32
paddw m0, m1
psrlw m0, 2
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 8
cmovz r6d, r2d
movd m5, r6d
pmulhuw m0, m5
.w4_end:
pxor m1, m1
pshufb m0, m1
.s4:
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m0
movd [dstq+strideq*2], m0
movd [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s4
RET
ALIGN function_align
.h8:
movq m0, [tlq-8]
pmaddubsw m0, m3
jmp wq
.w8:
movq m1, [tlq+1]
pmaddubsw m1, m3
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
paddw m0, m1
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 8
je .w8_end
mov r6d, 0x5556
mov r2d, 0x3334
cmp hd, 32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w8_end:
pxor m1, m1
pshufb m0, m1
.s8:
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
movq [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s8
RET
ALIGN function_align
.h16:
mova m0, [tlq-16]
pmaddubsw m0, m3
jmp wq
.w16:
movu m1, [tlq+1]
pmaddubsw m1, m3
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 16
je .w16_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 8|32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w16_end:
pxor m1, m1
pshufb m0, m1
.s16:
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m0
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s16
RET
ALIGN function_align
.h32:
mova m0, [tlq-32]
pmaddubsw m0, m3
mova m2, [tlq-16]
pmaddubsw m2, m3
paddw m0, m2
jmp wq
.w32:
movu m1, [tlq+1]
pmaddubsw m1, m3
movu m2, [tlq+17]
pmaddubsw m2, m3
paddw m1, m2
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 32
je .w32_end
lea r2d, [hq*2]
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 64|16
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w32_end:
pxor m1, m1
pshufb m0, m1
mova m1, m0
.s32:
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+strideq], m0
mova [dstq+strideq+16], m1
mova [dstq+strideq*2], m0
mova [dstq+strideq*2+16], m1
mova [dstq+stride3q], m0
mova [dstq+stride3q+16], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .s32
RET
ALIGN function_align
.h64:
mova m0, [tlq-64]
mova m1, [tlq-48]
pmaddubsw m0, m3
pmaddubsw m1, m3
paddw m0, m1
mova m1, [tlq-32]
pmaddubsw m1, m3
paddw m0, m1
mova m1, [tlq-16]
pmaddubsw m1, m3
paddw m0, m1
jmp wq
.w64:
movu m1, [tlq+ 1]
movu m2, [tlq+17]
pmaddubsw m1, m3
pmaddubsw m2, m3
paddw m1, m2
movu m2, [tlq+33]
pmaddubsw m2, m3
paddw m1, m2
movu m2, [tlq+49]
pmaddubsw m2, m3
paddw m1, m2
paddw m0, m1
psubw m4, m0
punpckhqdq m0, m0
psubw m0, m4
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
pmaddwd m0, m3
psrlw m0, m5
cmp hd, 64
je .w64_end
mov r6d, 0x5556
mov r2d, 0x3334
test hd, 32
cmovz r6d, r2d
movd m1, r6d
pmulhuw m0, m1
.w64_end:
pxor m1, m1
pshufb m0, m1
mova m1, m0
mova m2, m0
mova m3, m0
.s64:
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
mova [dstq+strideq], m0
mova [dstq+strideq+16], m1
mova [dstq+strideq+32], m2
mova [dstq+strideq+48], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .s64
RET
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment