Commit 87a377e9, authored by Liwei Wang, committed by Henrik Gramner

Add SSSE3 implementation for the 4x4 blocks in itx

Cycle times:
inv_txfm_add_4x4_adst_adst_0_8bpc_c: 445.9
inv_txfm_add_4x4_adst_adst_0_8bpc_ssse3: 23.7
inv_txfm_add_4x4_adst_adst_1_8bpc_c: 443.7
inv_txfm_add_4x4_adst_adst_1_8bpc_ssse3: 52.6
inv_txfm_add_4x4_adst_dct_0_8bpc_c: 474.5
inv_txfm_add_4x4_adst_dct_0_8bpc_ssse3: 23.9
inv_txfm_add_4x4_adst_dct_1_8bpc_c: 482.0
inv_txfm_add_4x4_adst_dct_1_8bpc_ssse3: 51.1
inv_txfm_add_4x4_adst_flipadst_0_8bpc_c: 587.2
inv_txfm_add_4x4_adst_flipadst_0_8bpc_ssse3: 24.0
inv_txfm_add_4x4_adst_flipadst_1_8bpc_c: 457.2
inv_txfm_add_4x4_adst_flipadst_1_8bpc_ssse3: 52.8
inv_txfm_add_4x4_adst_identity_0_8bpc_c: 412.4
inv_txfm_add_4x4_adst_identity_0_8bpc_ssse3: 43.3
inv_txfm_add_4x4_adst_identity_1_8bpc_c: 412.0
inv_txfm_add_4x4_adst_identity_1_8bpc_ssse3: 43.3
inv_txfm_add_4x4_dct_adst_0_8bpc_c: 467.4
inv_txfm_add_4x4_dct_adst_0_8bpc_ssse3: 23.2
inv_txfm_add_4x4_dct_adst_1_8bpc_c: 588.3
inv_txfm_add_4x4_dct_adst_1_8bpc_ssse3: 48.6
inv_txfm_add_4x4_dct_dct_0_8bpc_c: 611.5
inv_txfm_add_4x4_dct_dct_0_8bpc_ssse3: 23.1
inv_txfm_add_4x4_dct_dct_1_8bpc_c: 576.2
inv_txfm_add_4x4_dct_dct_1_8bpc_ssse3: 47.6
inv_txfm_add_4x4_dct_flipadst_0_8bpc_c: 479.5
inv_txfm_add_4x4_dct_flipadst_0_8bpc_ssse3: 23.4
inv_txfm_add_4x4_dct_flipadst_1_8bpc_c: 549.3
inv_txfm_add_4x4_dct_flipadst_1_8bpc_ssse3: 48.3
inv_txfm_add_4x4_dct_identity_0_8bpc_c: 576.9
inv_txfm_add_4x4_dct_identity_0_8bpc_ssse3: 25.4
inv_txfm_add_4x4_dct_identity_1_8bpc_c: 610.7
inv_txfm_add_4x4_dct_identity_1_8bpc_ssse3: 25.1
inv_txfm_add_4x4_flipadst_adst_0_8bpc_c: 532.8
inv_txfm_add_4x4_flipadst_adst_0_8bpc_ssse3: 23.8
inv_txfm_add_4x4_flipadst_adst_1_8bpc_c: 666.7
inv_txfm_add_4x4_flipadst_adst_1_8bpc_ssse3: 61.0
inv_txfm_add_4x4_flipadst_dct_0_8bpc_c: 539.6
inv_txfm_add_4x4_flipadst_dct_0_8bpc_ssse3: 23.8
inv_txfm_add_4x4_flipadst_dct_1_8bpc_c: 484.6
inv_txfm_add_4x4_flipadst_dct_1_8bpc_ssse3: 51.1
inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_c: 503.1
inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_ssse3: 23.9
inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_c: 463.0
inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_ssse3: 54.0
inv_txfm_add_4x4_flipadst_identity_0_8bpc_c: 719.9
inv_txfm_add_4x4_flipadst_identity_0_8bpc_ssse3: 43.0
inv_txfm_add_4x4_flipadst_identity_1_8bpc_c: 456.8
inv_txfm_add_4x4_flipadst_identity_1_8bpc_ssse3: 44.1
inv_txfm_add_4x4_identity_adst_0_8bpc_c: 422.8
inv_txfm_add_4x4_identity_adst_0_8bpc_ssse3: 42.4
inv_txfm_add_4x4_identity_adst_1_8bpc_c: 417.1
inv_txfm_add_4x4_identity_adst_1_8bpc_ssse3: 42.3
inv_txfm_add_4x4_identity_dct_0_8bpc_c: 435.4
inv_txfm_add_4x4_identity_dct_0_8bpc_ssse3: 25.7
inv_txfm_add_4x4_identity_dct_1_8bpc_c: 434.1
inv_txfm_add_4x4_identity_dct_1_8bpc_ssse3: 25.3
inv_txfm_add_4x4_identity_flipadst_0_8bpc_c: 528.1
inv_txfm_add_4x4_identity_flipadst_0_8bpc_ssse3: 40.9
inv_txfm_add_4x4_identity_flipadst_1_8bpc_c: 720.0
inv_txfm_add_4x4_identity_flipadst_1_8bpc_ssse3: 41.8
inv_txfm_add_4x4_identity_identity_0_8bpc_c: 383.2
inv_txfm_add_4x4_identity_identity_0_8bpc_ssse3: 28.3
inv_txfm_add_4x4_identity_identity_1_8bpc_c: 378.9
inv_txfm_add_4x4_identity_identity_1_8bpc_ssse3: 28.2
inv_txfm_add_4x4_wht_wht_0_8bpc_c: 271.5
inv_txfm_add_4x4_wht_wht_0_8bpc_ssse3: 34.0
inv_txfm_add_4x4_wht_wht_1_8bpc_c: 266.0
inv_txfm_add_4x4_wht_wht_1_8bpc_ssse3: 33.9
parent 6f2f0188
Pipeline #3315 passed with stages
in 8 minutes and 55 seconds
...@@ -77,7 +77,7 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2); ...@@ -77,7 +77,7 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2); decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2); decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_4x4_ssse3); decl_itx17_fns(4, 4, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
...@@ -115,15 +115,13 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { ...@@ -115,15 +115,13 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx16_fn(pfx, w, h, ext); \ assign_itx16_fn(pfx, w, h, ext); \
assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext) assign_itx_fn(pfx, w, h, wht_wht, WHT_WHT, ext)
#define assign_itx_ssse3_fn_8b() \
c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_ssse3;
const unsigned flags = dav1d_get_cpu_flags(); const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8 #if BITDEPTH == 8
assign_itx_ssse3_fn_8b(); assign_itx17_fn(, 4, 4, ssse3);
#endif #endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
...@@ -35,9 +35,22 @@ qw_2896x8: times 8 dw 2896*8 ...@@ -35,9 +35,22 @@ qw_2896x8: times 8 dw 2896*8
qw_1567_m3784: times 4 dw 1567, -3784 qw_1567_m3784: times 4 dw 1567, -3784
qw_3784_1567: times 4 dw 3784, 1567 qw_3784_1567: times 4 dw 3784, 1567
; Word constants for the 4x4 ADST and identity kernels.
; The two-value tables repeat a coefficient pair four times (16 bytes) so they
; can be fed to pmaddwd against interleaved input pairs in IADST4_1D_PACKED.
; -6688 is -2 * 3344. qw_3344x8 and qw_5793x4 broadcast a single word to all
; eight lanes and are used as pmulhrsw multipliers.
qw_1321_3803: times 4 dw 1321, 3803
qw_2482_m1321: times 4 dw 2482, -1321
qw_3344_2482: times 4 dw 3344, 2482
qw_3344_m3803: times 4 dw 3344, -3803
qw_m6688_m3803: times 4 dw -6688, -3803
qw_3344x8: times 8 dw 3344*8
qw_5793x4: times 8 dw 5793*4
pd_2048: times 4 dd 2048 pd_2048: times 4 dd 2048
qw_2048: times 8 dw 2048 qw_2048: times 8 dw 2048
; pmulhrsw multipliers for the DC-only fast paths in INV_TXFM_4X4_FN.
; The values are 8 * {1321, 2482, 3344, 3803}.
; 1a/1b: the four values repeated twice; 1b lists them in reverse order and is
;        the table the flipadst first-pass case loads.
; 2a/2b: each value replicated across four consecutive words, two values per
;        16-byte table.
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
SECTION .text SECTION .text
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX) %define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
...@@ -84,7 +97,7 @@ SECTION .text ...@@ -84,7 +97,7 @@ SECTION .text
psrlq m0, 32 psrlq m0, 32
movd [%%row_adr4], m0 ;store dst3 + out3 movd [%%row_adr4], m0 ;store dst3 + out3
RET ret
%endmacro %endmacro
...@@ -126,6 +139,34 @@ SECTION .text ...@@ -126,6 +139,34 @@ SECTION .text
paddw m0, m2 ;high: out1 ;low: out0 paddw m0, m2 ;high: out1 ;low: out0
%endmacro %endmacro
; IADST4_1D_PACKED -- one 1-D pass of the 4-point ADST on packed 16-bit lanes.
; In  : m0 = low: in0, high: in1
;       m1 = low: in2, high: in3
; Out : m0 = low: out0, high: out1
;       m1 = low: out2 (only the low half is labelled as a result)
;       m2 = out3 in both halves
; Uses m3, m4 and m5 as scratch, so six vector registers (m0-m5) are touched.
; out0/out1/out3 are built from 32-bit pmaddwd sums, biased with pd_2048 and
; shifted right by 12; out2 goes through pmulhrsw with qw_3344x8 instead.
%macro IADST4_1D_PACKED 0
punpcklwd m2, m0, m1 ;unpacked in0 in2
punpckhwd m3, m0, m1 ;unpacked in1 in3
psubw m0, m1 ;low: in0 - in2
punpckhqdq m1, m1 ;copy in3 into both halves
paddw m1, m0 ;low: in0 - in2 + in3
pmaddwd m0, m2, [qw_1321_3803] ;1321 * in0 + 3803 * in2
pmaddwd m2, [qw_2482_m1321] ;2482 * in0 - 1321 * in2
pmaddwd m4, m3, [qw_3344_2482] ;3344 * in1 + 2482 * in3
pmaddwd m5, m3, [qw_3344_m3803] ;3344 * in1 - 3803 * in3
paddd m4, m0 ;t0 + t3
pmaddwd m3, [qw_m6688_m3803] ;-2 * 3344 * in1 - 3803 * in3
pmulhrsw m1, [qw_3344x8] ;low: out2
mova m0, [pd_2048] ;rounding bias for the >> 12 below
paddd m2, m0 ;bias the 2482 * in0 - 1321 * in2 term
paddd m0, m4 ;t0 + t3 + 2048
paddd m5, m2 ;t1 + t3 + 2048
paddd m2, m4 ;partial sum, completed by the next add
paddd m2, m3 ;t0 + t1 - t3 + 2048
psrad m0, 12 ;out0
psrad m5, 12 ;out1
psrad m2, 12 ;out3
packssdw m0, m5 ;high: out1 ;low: out0
packssdw m2, m2 ;high: out3 ;low: out3
%endmacro
%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size %macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2 cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, coeff, eob, tx2
...@@ -146,29 +187,57 @@ ALIGN function_align ...@@ -146,29 +187,57 @@ ALIGN function_align
; INV_TXFM_4X4_FN type1, type2[, fast_thresh]   (fast_thresh defaults to -1)
; Forwards to INV_TXFM_FN with size 4x4 and then, depending on the type pair
; and on fast_thresh >= 0, emits a fast path that finishes through the
; .end / .end2 labels of iadst_4x4_internal or i<type2>_4x4_internal.
; NOTE(review): this is a side-by-side diff rendering. On every line below the
; pre-change column is followed, on the same line, by the post-change column,
; so the text is not assemblable as shown; read the two columns separately.
%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh %macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x4 INV_TXFM_FN %1, %2, %3, 4x4
%ifidn %1_%2, dct_identity %ifidn %1_%2, dct_identity
%elifidn %1_%2, identity_dct mova m0, [qw_2896x8]
%elif %3 >= 0 pmulhrsw m0, [coeffq]
paddw m0, m0
pmulhrsw m0, [qw_5793x4]
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
call m(iadst_4x4_internal).end
RET
%elifidn %1_%2, identity_dct
mova m1, [coeffq+16*0]
mova m2, [coeffq+16*1]
punpcklwd m0, m1, m2
punpckhwd m1, m2
punpcklwd m0, m1
punpcklqdq m0, m0
paddw m0, m0
pmulhrsw m0, [qw_5793x4]
pmulhrsw m0, [qw_2896x8]
mova m1, m0
call m(iadst_4x4_internal).end
RET
%elif %3 >= 0
pshuflw m0, [coeffq], q0000 pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0 punpcklqdq m0, m0
%ifidn %1, dct %ifidn %1, dct
mova m1, [qw_2896x8] mova m1, [qw_2896x8]
pmulhrsw m0, m1 pmulhrsw m0, m1
%elifidn %1, adst %elifidn %1, adst
%elifidn %1, flipadst pmulhrsw m0, [iadst4_dconly1a]
%endif %elifidn %1, flipadst
mov [coeffq], eobd ;0 pmulhrsw m0, [iadst4_dconly1b]
%ifidn %2, dct %endif
%ifnidn %1, dct mov [coeffq], eobd ;0
pmulhrsw m0, [qw_2896x8] %ifidn %2, dct
%else %ifnidn %1, dct
pmulhrsw m0, m1 pmulhrsw m0, [qw_2896x8]
%endif %else
mova m1, m0 pmulhrsw m0, m1
ITX4_END 0, 1, 2, 3 %endif
%else ; adst / flipadst mova m1, m0
%endif call m(iadst_4x4_internal).end2
%endif RET
%else ; adst / flipadst
pmulhrsw m1, m0, [iadst4_dconly2b]
pmulhrsw m0, [iadst4_dconly2a]
call m(i%2_4x4_internal).end2
RET
%endif
%endif
%endmacro
...@@ -197,3 +266,129 @@ cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2 ...@@ -197,3 +266,129 @@ cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
ITX4_END 0, 1, 3, 2 ITX4_END 0, 1, 3, 2
INV_TXFM_4X4_FN dct, dct, 0 INV_TXFM_4X4_FN dct, dct, 0
; iadst_4x4_internal -- two-pass ADST worker for 4x4 blocks (six vector regs).
; Entry  : first pass. Loads two 16-byte rows of 16-bit coefficients from
;          coeffq, runs .main, re-interleaves the outputs into the m0/m1 input
;          layout that .main expects, then tail-jumps through tx2q.
;          NOTE(review): tx2q is presumably the second-pass address set up by
;          the INV_TXFM_FN entry code, which is not visible here -- confirm.
; .pass2 : second pass; packs out2/out3 into m1 and falls through to .end.
; .end   : zeroes the 32 bytes at coeffq and falls through to .end2.
; .end2  : ITX4_END 0, 1, 2, 3 (macro defined elsewhere; presumably adds the
;          result rows to dst and returns -- confirm).
; .main  : IADST4_1D_PACKED + ret; also called by iflipadst_4x4_internal.
; .end and .end2 are additionally used as call/jmp targets by the
; INV_TXFM_4X4_FN fast paths and by iidentity_4x4_internal.pass2.
cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call .main
; after .main: m0 = out0/out1, m1 = low: out2, m2 = out3
punpckhwd m3, m0, m2
punpcklwd m0, m1
punpckhwd m1, m0, m3 ;high: in3 ;low :in2
punpcklwd m0, m3 ;high: in1 ;low: in0
jmp tx2q
.pass2:
call .main
punpcklqdq m1, m2 ;out2 out3
.end:
; clear the coefficient buffer (2 x 16 bytes)
pxor m2, m2
mova [coeffq+16*0], m2
mova [coeffq+16*1], m2
.end2:
ITX4_END 0, 1, 2, 3
ALIGN function_align
.main:
IADST4_1D_PACKED
ret
; Entry points generated by INV_TXFM_4X4_FN (type1, type2, fast_thresh).
; Each line yields an inv_txfm_add_<type1>_<type2>_4x4 function via
; INV_TXFM_FN; fast_thresh = 0 satisfies the macro's "%3 >= 0" fast-path test.
INV_TXFM_4X4_FN adst, adst, 0
INV_TXFM_4X4_FN dct, adst, 0
INV_TXFM_4X4_FN adst, dct, 0
; iflipadst_4x4_internal -- flipped-ADST worker for 4x4 blocks.
; Reuses iadst_4x4_internal.main for the arithmetic in both passes; what
; differs from iadst_4x4_internal is the first-pass interleave order and the
; reversed argument list passed to ITX4_END (3, 2, 1, 0 instead of 0, 1, 2, 3).
; NOTE(review): the swapped interleave operands and reversed ITX4_END
; arguments presumably implement the "flip" -- confirm against ITX4_END,
; which is defined outside this hunk.
; .pass2 / .end / .end2 mirror the labels of iadst_4x4_internal.
cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call m(iadst_4x4_internal).main
; after .main: m0 = out0/out1, m1 = low: out2, m2 = out3
punpcklwd m1, m0
punpckhwd m2, m0
punpcklwd m0, m2, m1 ;high: in3 ;low :in2
punpckhwd m2, m1 ;high: in1 ;low: in0
mova m1, m2
jmp tx2q
.pass2:
call m(iadst_4x4_internal).main
punpcklqdq m1, m2 ;out2 out3
.end:
; clear the coefficient buffer (2 x 16 bytes)
pxor m2, m2
mova [coeffq+16*0], m2
mova [coeffq+16*1], m2
.end2:
ITX4_END 3, 2, 1, 0
; Entry points involving flipadst, generated by INV_TXFM_4X4_FN
; (type1, type2, fast_thresh). Each line yields an
; inv_txfm_add_<type1>_<type2>_4x4 function via INV_TXFM_FN.
INV_TXFM_4X4_FN flipadst, flipadst, 0
INV_TXFM_4X4_FN flipadst, dct, 0
INV_TXFM_4X4_FN flipadst, adst, 0
INV_TXFM_4X4_FN dct, flipadst, 0
INV_TXFM_4X4_FN adst, flipadst, 0
; iidentity_4x4_internal -- identity-transform worker for 4x4 blocks.
; Both passes only rescale: each word is doubled (paddw) and then multiplied
; with pmulhrsw by qw_5793x4 (5793*4), i.e. a net factor of
; 2 * (5793*4) / 32768 = 5793/4096 with pmulhrsw rounding.
; Entry  : first pass. Loads two 16-byte rows from coeffq, rescales, applies
;          the same word interleave as iadst_4x4_internal's first pass and
;          tail-jumps through tx2q.
; .pass2 : rescales m0/m1 again and jumps to iadst_4x4_internal.end, which
;          zeroes the coefficient buffer and runs ITX4_END 0, 1, 2, 3.
cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [qw_5793x4]
paddw m0, m0 ;x * 2
paddw m1, m1
pmulhrsw m0, m2 ;(x * 2) * (5793*4) / 32768, rounded
pmulhrsw m1, m2
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhwd m1, m0, m2 ;high: in3 ;low :in2
punpcklwd m0, m2 ;high: in1 ;low: in0
jmp tx2q
.pass2:
mova m2, [qw_5793x4]
paddw m0, m0
paddw m1, m1
pmulhrsw m0, m2
pmulhrsw m1, m2
jmp m(iadst_4x4_internal).end
; Entry points involving identity, generated by INV_TXFM_4X4_FN
; (type1, type2, fast_thresh). Where the third argument is omitted,
; fast_thresh takes the macro default of -1. The identity/dct and dct/identity
; pairs pass 3; the macro selects their dedicated code by the type pair
; (identity_dct / dct_identity) before it ever tests fast_thresh.
INV_TXFM_4X4_FN identity, identity
INV_TXFM_4X4_FN identity, dct, 3
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN dct, identity, 3
INV_TXFM_4X4_FN adst, identity
INV_TXFM_4X4_FN flipadst, identity
; IWHT4_1D_PACKED -- one 1-D pass of the 4-point WHT on packed 16-bit lanes.
; In  : m0 = low: in0, high: in1
;       m1 = low: in2, high: in3
; Out : m0 = high: out0
;       m1 = low: out2, high: out1
;       m2 = low: out3
; Uses m3 as scratch. Only adds, subtracts and one arithmetic shift by 1.
%macro IWHT4_1D_PACKED 0
punpckhqdq m3, m0, m1 ;low: in1 high: in3
punpcklqdq m0, m1 ;low: in0 high: in2
psubw m2, m0, m3 ;low: in0 - in1 high: in2 - in3
paddw m0, m3 ;low: in0 + in1 high: in2 + in3
punpckhqdq m2, m2 ;t2 t2
punpcklqdq m0, m0 ;t0 t0
psubw m1, m0, m2 ;t0 - t2
psraw m1, 1 ;t4 t4
psubw m1, m3 ;low: t1/out2 high: t3/out1
psubw m0, m1 ;high: out0
paddw m2, m1 ;low: out3
%endmacro
; inv_txfm_add_wht_wht_4x4(dst, stride, coeff)
; Complete 4x4 WHT x WHT inverse transform in one function: no tx2 dispatch
; and no eob argument, unlike the *_4x4_internal workers.
; Steps: load the coefficients from coeffq as two 16-byte rows, zero the
; 32-byte coefficient buffer, shift every coefficient right by 2 (arithmetic),
; run IWHT4_1D_PACKED, regroup the outputs with word/dword interleaves, run
; IWHT4_1D_PACKED again, and hand the rows to ITX4_END.
; NOTE(review): ITX4_END is defined outside this hunk; the meaning of its
; fifth argument (0) cannot be determined from here -- confirm.
cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
; clear the coefficient buffer now that both rows are in registers
pxor m2, m2
mova [coeffq+16*0], m2
mova [coeffq+16*1], m2
psraw m0, 2
psraw m1, 2
IWHT4_1D_PACKED
; first-pass results: m0 high = out0, m1 = out2/out1, m2 low = out3
punpckhwd m0, m1
punpcklwd m3, m1, m2
punpckhdq m1, m0, m3
punpckldq m0, m3
IWHT4_1D_PACKED
; m0 = low: out0 (old high half of m0), high: out3 (low half of m2)
shufpd m0, m2, 0x01
ITX4_END 0, 3, 2, 1, 0
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment