Commit 1703f21f authored by Liwei Wang, committed by Henrik Gramner

Add SSSE3 implementation for the 4x8 and 8x4 blocks in itx

Cycle times:
inv_txfm_add_4x8_adst_adst_0_8bpc_c: 1167.6
inv_txfm_add_4x8_adst_adst_0_8bpc_ssse3: 114.6
inv_txfm_add_4x8_adst_adst_1_8bpc_c: 1167.2
inv_txfm_add_4x8_adst_adst_1_8bpc_ssse3: 114.1
inv_txfm_add_4x8_adst_dct_0_8bpc_c: 1174.7
inv_txfm_add_4x8_adst_dct_0_8bpc_ssse3: 34.8
inv_txfm_add_4x8_adst_dct_1_8bpc_c: 1158.0
inv_txfm_add_4x8_adst_dct_1_8bpc_ssse3: 101.0
inv_txfm_add_4x8_adst_flipadst_0_8bpc_c: 1150.9
inv_txfm_add_4x8_adst_flipadst_0_8bpc_ssse3: 115.8
inv_txfm_add_4x8_adst_flipadst_1_8bpc_c: 1157.6
inv_txfm_add_4x8_adst_flipadst_1_8bpc_ssse3: 115.8
inv_txfm_add_4x8_adst_identity_0_8bpc_c: 848.4
inv_txfm_add_4x8_adst_identity_0_8bpc_ssse3: 59.1
inv_txfm_add_4x8_adst_identity_1_8bpc_c: 850.1
inv_txfm_add_4x8_adst_identity_1_8bpc_ssse3: 59.1
inv_txfm_add_4x8_dct_adst_0_8bpc_c: 1205.6
inv_txfm_add_4x8_dct_adst_0_8bpc_ssse3: 107.0
inv_txfm_add_4x8_dct_adst_1_8bpc_c: 1183.7
inv_txfm_add_4x8_dct_adst_1_8bpc_ssse3: 107.0
inv_txfm_add_4x8_dct_dct_0_8bpc_c: 1227.0
inv_txfm_add_4x8_dct_dct_0_8bpc_ssse3: 34.6
inv_txfm_add_4x8_dct_dct_1_8bpc_c: 1229.7
inv_txfm_add_4x8_dct_dct_1_8bpc_ssse3: 96.1
inv_txfm_add_4x8_dct_flipadst_0_8bpc_c: 1188.2
inv_txfm_add_4x8_dct_flipadst_0_8bpc_ssse3: 109.3
inv_txfm_add_4x8_dct_flipadst_1_8bpc_c: 1192.7
inv_txfm_add_4x8_dct_flipadst_1_8bpc_ssse3: 109.9
inv_txfm_add_4x8_dct_identity_0_8bpc_c: 878.4
inv_txfm_add_4x8_dct_identity_0_8bpc_ssse3: 31.9
inv_txfm_add_4x8_dct_identity_1_8bpc_c: 879.0
inv_txfm_add_4x8_dct_identity_1_8bpc_ssse3: 54.8
inv_txfm_add_4x8_flipadst_adst_0_8bpc_c: 1181.8
inv_txfm_add_4x8_flipadst_adst_0_8bpc_ssse3: 114.7
inv_txfm_add_4x8_flipadst_adst_1_8bpc_c: 1203.0
inv_txfm_add_4x8_flipadst_adst_1_8bpc_ssse3: 114.5
inv_txfm_add_4x8_flipadst_dct_0_8bpc_c: 1203.6
inv_txfm_add_4x8_flipadst_dct_0_8bpc_ssse3: 34.1
inv_txfm_add_4x8_flipadst_dct_1_8bpc_c: 1204.4
inv_txfm_add_4x8_flipadst_dct_1_8bpc_ssse3: 100.2
inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_c: 1180.6
inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_ssse3: 117.1
inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_c: 1178.7
inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_ssse3: 116.8
inv_txfm_add_4x8_flipadst_identity_0_8bpc_c: 871.3
inv_txfm_add_4x8_flipadst_identity_0_8bpc_ssse3: 69.0
inv_txfm_add_4x8_flipadst_identity_1_8bpc_c: 872.3
inv_txfm_add_4x8_flipadst_identity_1_8bpc_ssse3: 70.0
inv_txfm_add_4x8_identity_adst_0_8bpc_c: 1125.2
inv_txfm_add_4x8_identity_adst_0_8bpc_ssse3: 98.7
inv_txfm_add_4x8_identity_adst_1_8bpc_c: 1092.6
inv_txfm_add_4x8_identity_adst_1_8bpc_ssse3: 99.6
inv_txfm_add_4x8_identity_dct_0_8bpc_c: 1139.4
inv_txfm_add_4x8_identity_dct_0_8bpc_ssse3: 38.8
inv_txfm_add_4x8_identity_dct_1_8bpc_c: 1111.0
inv_txfm_add_4x8_identity_dct_1_8bpc_ssse3: 84.1
inv_txfm_add_4x8_identity_flipadst_0_8bpc_c: 1112.4
inv_txfm_add_4x8_identity_flipadst_0_8bpc_ssse3: 100.7
inv_txfm_add_4x8_identity_flipadst_1_8bpc_c: 1098.7
inv_txfm_add_4x8_identity_flipadst_1_8bpc_ssse3: 100.8
inv_txfm_add_4x8_identity_identity_0_8bpc_c: 791.6
inv_txfm_add_4x8_identity_identity_0_8bpc_ssse3: 43.9
inv_txfm_add_4x8_identity_identity_1_8bpc_c: 797.0
inv_txfm_add_4x8_identity_identity_1_8bpc_ssse3: 43.8
inv_txfm_add_8x4_adst_adst_0_8bpc_c: 1102.8
inv_txfm_add_8x4_adst_adst_0_8bpc_ssse3: 108.7
inv_txfm_add_8x4_adst_adst_1_8bpc_c: 1101.8
inv_txfm_add_8x4_adst_adst_1_8bpc_ssse3: 108.9
inv_txfm_add_8x4_adst_dct_0_8bpc_c: 1146.9
inv_txfm_add_8x4_adst_dct_0_8bpc_ssse3: 98.7
inv_txfm_add_8x4_adst_dct_1_8bpc_c: 1157.9
inv_txfm_add_8x4_adst_dct_1_8bpc_ssse3: 98.9
inv_txfm_add_8x4_adst_flipadst_0_8bpc_c: 1144.6
inv_txfm_add_8x4_adst_flipadst_0_8bpc_ssse3: 111.4
inv_txfm_add_8x4_adst_flipadst_1_8bpc_c: 1128.2
inv_txfm_add_8x4_adst_flipadst_1_8bpc_ssse3: 112.4
inv_txfm_add_8x4_adst_identity_0_8bpc_c: 1051.1
inv_txfm_add_8x4_adst_identity_0_8bpc_ssse3: 87.1
inv_txfm_add_8x4_adst_identity_1_8bpc_c: 1059.2
inv_txfm_add_8x4_adst_identity_1_8bpc_ssse3: 87.7
inv_txfm_add_8x4_dct_adst_0_8bpc_c: 1130.2
inv_txfm_add_8x4_dct_adst_0_8bpc_ssse3: 29.0
inv_txfm_add_8x4_dct_adst_1_8bpc_c: 1130.1
inv_txfm_add_8x4_dct_adst_1_8bpc_ssse3: 89.2
inv_txfm_add_8x4_dct_dct_0_8bpc_c: 1186.0
inv_txfm_add_8x4_dct_dct_0_8bpc_ssse3: 26.3
inv_txfm_add_8x4_dct_dct_1_8bpc_c: 1172.2
inv_txfm_add_8x4_dct_dct_1_8bpc_ssse3: 78.8
inv_txfm_add_8x4_dct_flipadst_0_8bpc_c: 1154.7
inv_txfm_add_8x4_dct_flipadst_0_8bpc_ssse3: 29.1
inv_txfm_add_8x4_dct_flipadst_1_8bpc_c: 1150.2
inv_txfm_add_8x4_dct_flipadst_1_8bpc_ssse3: 92.2
inv_txfm_add_8x4_dct_identity_0_8bpc_c: 1078.7
inv_txfm_add_8x4_dct_identity_0_8bpc_ssse3: 29.2
inv_txfm_add_8x4_dct_identity_1_8bpc_c: 1090.1
inv_txfm_add_8x4_dct_identity_1_8bpc_ssse3: 72.2
inv_txfm_add_8x4_flipadst_adst_0_8bpc_c: 1111.6
inv_txfm_add_8x4_flipadst_adst_0_8bpc_ssse3: 108.6
inv_txfm_add_8x4_flipadst_adst_1_8bpc_c: 1112.1
inv_txfm_add_8x4_flipadst_adst_1_8bpc_ssse3: 107.6
inv_txfm_add_8x4_flipadst_dct_0_8bpc_c: 1163.0
inv_txfm_add_8x4_flipadst_dct_0_8bpc_ssse3: 98.3
inv_txfm_add_8x4_flipadst_dct_1_8bpc_c: 1160.0
inv_txfm_add_8x4_flipadst_dct_1_8bpc_ssse3: 99.6
inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_c: 1137.9
inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_ssse3: 112.0
inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_c: 1140.0
inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_ssse3: 112.0
inv_txfm_add_8x4_flipadst_identity_0_8bpc_c: 1057.2
inv_txfm_add_8x4_flipadst_identity_0_8bpc_ssse3: 88.1
inv_txfm_add_8x4_flipadst_identity_1_8bpc_c: 1058.3
inv_txfm_add_8x4_flipadst_identity_1_8bpc_ssse3: 87.1
inv_txfm_add_8x4_identity_adst_0_8bpc_c: 794.0
inv_txfm_add_8x4_identity_adst_0_8bpc_ssse3: 60.6
inv_txfm_add_8x4_identity_adst_1_8bpc_c: 793.4
inv_txfm_add_8x4_identity_adst_1_8bpc_ssse3: 60.6
inv_txfm_add_8x4_identity_dct_0_8bpc_c: 838.4
inv_txfm_add_8x4_identity_dct_0_8bpc_ssse3: 27.4
inv_txfm_add_8x4_identity_dct_1_8bpc_c: 838.5
inv_txfm_add_8x4_identity_dct_1_8bpc_ssse3: 52.0
inv_txfm_add_8x4_identity_flipadst_0_8bpc_c: 825.3
inv_txfm_add_8x4_identity_flipadst_0_8bpc_ssse3: 66.7
inv_txfm_add_8x4_identity_flipadst_1_8bpc_c: 831.7
inv_txfm_add_8x4_identity_flipadst_1_8bpc_ssse3: 66.7
inv_txfm_add_8x4_identity_identity_0_8bpc_c: 768.6
inv_txfm_add_8x4_identity_identity_0_8bpc_ssse3: 40.0
inv_txfm_add_8x4_identity_identity_1_8bpc_c: 743.3
inv_txfm_add_8x4_identity_identity_1_8bpc_ssse3: 39.9
parent bd8ce19e
Pipeline #3731 passed with stages
in 5 minutes and 5 seconds
......@@ -78,6 +78,8 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
decl_itx17_fns(4, 4, ssse3);
decl_itx16_fns(4, 8, ssse3);
decl_itx16_fns(8, 4, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
......@@ -121,7 +123,9 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8
assign_itx17_fn(, 4, 4, ssse3);
assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
......@@ -29,22 +29,38 @@
SECTION_RODATA 16
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
pw_2896x8: times 8 dw 2896*8
pw_1567_m3784: times 4 dw 1567, -3784
pw_3784_1567: times 4 dw 3784, 1567
deint_shuf1: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
deint_shuf2: db 8, 9, 0, 1, 10, 11, 2, 3, 12, 13, 4, 5, 14, 15, 6, 7
%macro COEF_PAIR 2
pw_%1_m%2: times 4 dw %1, -%2
pw_%2_%1: times 4 dw %2, %1
%endmacro
;adst4
pw_1321_3803: times 4 dw 1321, 3803
pw_2482_m1321: times 4 dw 2482, -1321
pw_3344_2482: times 4 dw 3344, 2482
pw_3344_m3803: times 4 dw 3344, -3803
pw_m6688_m3803: times 4 dw -6688, -3803
pw_3344x8: times 8 dw 3344*8
pw_5793x4: times 8 dw 5793*4
COEF_PAIR 1567, 3784
COEF_PAIR 799, 4017
COEF_PAIR 3406, 2276
COEF_PAIR 401, 4076
COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189
COEF_PAIR 3784, 1567
pd_2048: times 4 dd 2048
pw_2048: times 8 dw 2048
pw_4096: times 8 dw 4096
pw_2896x8: times 8 dw 2896*8
pw_3344x8: times 8 dw 3344*8
pw_5793x4: times 8 dw 5793*4
iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
......@@ -61,14 +77,10 @@ SECTION .text
%define o(x) r5-$$+x ; PIC
%endif
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
mova m2, [o(pw_%5)]
pmulhrsw m0, m2
pmulhrsw m1, m2
%endif
%macro WRITE_4X4 9 ;src[1-2], tmp[1-3], row[1-4]
lea r2, [dstq+strideq*2]
%assign %%i 1
%rotate 5
%rep 4
%if %1 & 2
CAT_XDEFINE %%row_adr, %%i, r2 + strideq*(%1&1)
......@@ -79,33 +91,43 @@ SECTION .text
%rotate 1
%endrep
movd m2, [%%row_adr1] ;dst0
movd m4, [%%row_adr2] ;dst1
punpckldq m2, m4 ;high: dst1 :low: dst0
movd m3, [%%row_adr3] ;dst2
movd m4, [%%row_adr4] ;dst3
punpckldq m3, m4 ;high: dst3 :low: dst2
movd m%3, [%%row_adr1] ;dst0
movd m%5, [%%row_adr2] ;dst1
punpckldq m%3, m%5 ;high: dst1 :low: dst0
movd m%4, [%%row_adr3] ;dst2
movd m%5, [%%row_adr4] ;dst3
punpckldq m%4, m%5 ;high: dst3 :low: dst2
pxor m4, m4
punpcklbw m2, m4 ;extend byte to word
punpcklbw m3, m4 ;extend byte to word
pxor m%5, m%5
punpcklbw m%3, m%5 ;extend byte to word
punpcklbw m%4, m%5 ;extend byte to word
paddw m%1, m%3 ;high: dst1 + out1 ;low: dst0 + out0
paddw m%2, m%4 ;high: dst3 + out3 ;low: dst2 + out2
paddw m0, m2 ;high: dst1 + out1 ;low: dst0 + out0
paddw m1, m3 ;high: dst3 + out3 ;low: dst2 + out2
packuswb m%1, m%2 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
packuswb m0, m1 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
movd [%%row_adr1], m%1 ;store dst0 + out0
pshuflw m%2, m%1, q1032
movd [%%row_adr2], m%2 ;store dst1 + out1
punpckhqdq m%1, m%1
movd [%%row_adr3], m%1 ;store dst2 + out2
psrlq m%1, 32
movd [%%row_adr4], m%1 ;store dst3 + out3
%endmacro
movd [%%row_adr1], m0 ;store dst0 + out0
pshuflw m1, m0, q1032
movd [%%row_adr2], m1 ;store dst1 + out1
punpckhqdq m0, m0
movd [%%row_adr3], m0 ;store dst2 + out2
psrlq m0, 32
movd [%%row_adr4], m0 ;store dst3 + out3
%macro ITX4_END 4-5 2048 ; row[1-4], rnd
%if %5
mova m2, [o(pw_%5)]
pmulhrsw m0, m2
pmulhrsw m1, m2
%endif
WRITE_4X4 0, 1, 2, 3, 4, %1, %2, %3, %4
ret
%endmacro
; flags: 1 = swap, 2: coef_regs
%macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
......@@ -113,7 +135,7 @@ SECTION .text
pmaddwd m%1, m%5
%elif %6 & 1
pmaddwd m%2, m%1, [o(pw_%5_%4)]
pmaddwd m%1, [pw_%4_m%5]
pmaddwd m%1, [o(pw_%4_m%5)]
%else
pmaddwd m%2, m%1, [o(pw_%4_m%5)]
pmaddwd m%1, [o(pw_%5_%4)]
......@@ -126,24 +148,25 @@ SECTION .text
%endmacro
%macro IDCT4_1D_PACKED 0-1 ;pw_2896x8
punpckhwd m2, m0, m1 ;unpacked in1 in3
punpckhwd m2, m0, m1 ;unpacked in1 in3
psubw m3, m0, m1
paddw m0, m1
punpcklqdq m0, m3 ;high: in0-in2 ;low: in0+in2
punpcklqdq m0, m3 ;high: in0-in2 ;low: in0+in2
mova m3, [o(pd_2048)]
ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
ITX_MUL2X_PACK 2, 1, 3, 1567, 3784
%if %0 == 1
pmulhrsw m0, m%1
%else
pmulhrsw m0, [o(pw_2896x8)] ;high: t1 ;low: t0
pmulhrsw m0, [o(pw_2896x8)] ;high: t1 ;low: t0
%endif
psubsw m1, m0, m2 ;high: out2 ;low: out3
paddsw m0, m2 ;high: out1 ;low: out0
psubsw m1, m0, m2 ;high: out2 ;low: out3
paddsw m0, m2 ;high: out1 ;low: out0
%endmacro
%macro IADST4_1D_PACKED 0
punpcklwd m2, m0, m1 ;unpacked in0 in2
punpckhwd m3, m0, m1 ;unpacked in1 in3
......@@ -172,8 +195,8 @@ SECTION .text
packssdw m2, m2 ;high: out3 ;low: out3
%endmacro
%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
cglobal inv_txfm_add_%1_%2_%4, 4, 6, 0, dst, stride, coeff, eob, tx2
%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
%undef cmp
%if ARCH_X86_32
LEA r5, $$
......@@ -193,7 +216,7 @@ ALIGN function_align
%endmacro
%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x4
INV_TXFM_FN %1, %2, %3, 4x4, 6
%ifidn %1_%2, dct_identity
mova m0, [o(pw_2896x8)]
pmulhrsw m0, [coeffq]
......@@ -254,7 +277,7 @@ INV_TXFM_4X4_FN dct, adst, 0
INV_TXFM_4X4_FN dct, flipadst, 0
INV_TXFM_4X4_FN dct, identity, 3
cglobal idct_4x4_internal, 0, 0, 4, dst, stride, coeff, eob, tx2
cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0] ;high: in1 ;low: in0
mova m1, [coeffq+16*1] ;high: in3 ;low in2
......@@ -281,7 +304,7 @@ INV_TXFM_4X4_FN adst, adst, 0
INV_TXFM_4X4_FN adst, flipadst, 0
INV_TXFM_4X4_FN adst, identity
cglobal iadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call .main
......@@ -313,7 +336,7 @@ INV_TXFM_4X4_FN flipadst, adst, 0
INV_TXFM_4X4_FN flipadst, flipadst, 0
INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
call m(iadst_4x4_internal).main
......@@ -341,7 +364,7 @@ INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
cglobal iidentity_4x4_internal, 0, 0, 6, dst, stride, coeff, eob, tx2
cglobal iidentity_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [o(pw_5793x4)]
......@@ -398,3 +421,692 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
shufpd m0, m2, 0x01
ITX4_END 0, 3, 2, 1, 0
; One-dimensional 8-point inverse DCT operating on packed registers.
; Input layout, as implied by the unpack annotations below (low half | high half):
;   m0 = in0 | in1,  m1 = in2 | in3,  m2 = in4 | in5,  m3 = in6 | in7
; Output layout, per the annotations on the final butterflies:
;   m0 = out0 | out1,  m1 = out3 | out2,  m2 = out4 | out5,  m3 = out7 | out6
; m4, m5 and m6 are overwritten; m6 first holds pd_2048 (the rnd argument
; passed to ITX_MUL2X_PACK) and is then reloaded with pw_2896x8.
%macro IDCT8_1D_PACKED 0
mova m6, [o(pd_2048)]
punpckhwd m5, m0, m3 ;unpacked in1 in7
punpckhwd m4, m2, m1 ;unpacked in5 in3
punpcklwd m1, m3 ;unpacked in2 in6
psubw m3, m0, m2
paddw m0, m2
punpcklqdq m0, m3 ;low: in0+in4 high: in0-in4
ITX_MUL2X_PACK 5, 2, 6, 799, 4017, 1 ;low: t4a high: t7a
ITX_MUL2X_PACK 4, 2, 6, 3406, 2276, 1 ;low: t5a high: t6a
ITX_MUL2X_PACK 1, 2, 6, 1567, 3784 ;low: t3 high: t2
; pd_2048 is no longer needed; reuse m6 for the pw_2896x8 multiplier
mova m6, [o(pw_2896x8)]
psubsw m2, m5, m4 ;low: t5a high: t6a
paddsw m5, m4 ;low: t4 high: t7
punpckhqdq m4, m2, m2 ;low: t6a high: t6a
psubw m3, m4, m2 ;low: t6a - t5a
paddw m4, m2 ;low: t6a + t5a
punpcklqdq m4, m3 ;low: t6a + t5a high: t6a - t5a
pmulhrsw m0, m6 ;low: t0 high: t1
pmulhrsw m4, m6 ;low: t6 high: t5
shufps m2, m5, m4, q1032 ;low: t7 high: t6
shufps m5, m4, q3210 ;low: t4 high: t5
psubsw m4, m0, m1 ;low: tmp3 high: tmp2
paddsw m0, m1 ;low: tmp0 high: tmp1
psubsw m3, m0, m2 ;low: out7 high: out6
paddsw m0, m2 ;low: out0 high: out1
psubsw m2, m4, m5 ;low: out4 high: out5
paddsw m1, m4, m5 ;low: out3 high: out2
%endmacro
; Rotate two registers of words by a coefficient pair, with rounding:
;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; Arguments (all register numbers unless noted):
;   %1, %2 : src1/src2 on entry, dst1/dst2 on exit
;   %3, %4 : scratch registers, overwritten
;   %5     : register holding the dword rounding constant added before the shift
;   %6, %7 : coef1, coef2. When %7 < 8 both are treated as register numbers
;            whose contents are fed directly to pmaddwd; otherwise they are
;            numeric constants used to form the table names pw_%7_%6 and
;            pw_%6_m%7.
; NOTE(review): in the register form, m%7 presumably has to hold the
; (coef2, coef1) word pairs and m%6 the (coef1, -coef2) pairs, mirroring the
; two tables used in the memory form - confirm against callers.
%macro ITX_MULSUB_2W 7 ; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
; interleave src1/src2 words so each pmaddwd lane sees one (src1, src2) pair
punpckhwd m%3, m%1, m%2
punpcklwd m%1, m%2
%if %7 < 8
pmaddwd m%2, m%7, m%1
pmaddwd m%4, m%7, m%3
%else
mova m%2, [o(pw_%7_%6)]
pmaddwd m%4, m%3, m%2
pmaddwd m%2, m%1
%endif
paddd m%4, m%5
paddd m%2, m%5
psrad m%4, 12
psrad m%2, 12
packssdw m%2, m%4 ;dst2
%if %7 < 8
pmaddwd m%3, m%6
pmaddwd m%1, m%6
%else
mova m%4, [o(pw_%6_m%7)]
pmaddwd m%3, m%4
pmaddwd m%1, m%4
%endif
paddd m%3, m%5
paddd m%1, m%5
psrad m%3, 12
psrad m%1, 12
packssdw m%1, m%3 ;dst1
%endmacro
; One-dimensional 4-point inverse DCT on four separate (unpacked) registers.
;   %1-%4 : register numbers of the four inputs; on exit they hold
;           out0, out1, out2, out3 respectively (see annotations below)
;   %5,%6 : scratch registers, overwritten (m%6 is loaded with pw_2896x8)
;   %7    : register number passed to ITX_MULSUB_2W as its rounding register
;           (named pd_2048 in the parameter list)
%macro IDCT4_1D 7 ; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W %2, %4, %5, %6, %7, 1567, 3784 ;t2, t3
mova m%6, [o(pw_2896x8)]
paddw m%5, m%1, m%3
psubw m%1, m%3
pmulhrsw m%1, m%6 ;t1
pmulhrsw m%5, m%6 ;t0
psubsw m%3, m%1, m%2 ;out2
paddsw m%2, m%1 ;out1
paddsw m%1, m%5, m%4 ;out0
psubsw m%5, m%4 ;out3
; move out3 from the scratch register into its final destination
mova m%4, m%5
%endmacro
; One-dimensional 4-point inverse ADST on four unpacked registers.
; Inputs:  m0 = in0, m1 = in1, m2 = in2, m3 = in3 (per the annotations below).
; Outputs: m0 = out0, m1 = out1, m2 = out2, m3 = out3.
; m4-m7 are overwritten as scratch. The low word halves are processed first,
; then the same sequence is repeated for the high halves saved in m6/m7, and
; the two sets of results are packed back together at the end.
%macro IADST4_1D 0
mova m4, m2
psubw m2, m0, m4
paddw m2, m3 ;low: in0 - in2 + in3
punpckhwd m6, m0, m4 ;unpacked in0 in2
punpckhwd m7, m1, m3 ;unpacked in1 in3
punpcklwd m0, m4 ;unpacked in0 in2
punpcklwd m1, m3 ;unpacked in1 in3
; ---- low halves ----
pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m3, m4 ;t0 + t3
pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
pmulhrsw m2, [o(pw_3344x8)] ;out2
mova m4, [o(pd_2048)]
paddd m0, m4
paddd m4, m3 ;t0 + t3 + 2048
paddd m5, m0 ;t1 + t3 + 2048
paddd m3, m0
paddd m3, m1 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m3, 12 ;out3
packssdw m0, m4, m5 ;low: out0 high: out1
; ---- high halves (same computation on m6/m7) ----
pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2
pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2
pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3
pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3
paddd m1, m4 ;t0 + t3
pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3
mova m4, [o(pd_2048)]
paddd m6, m4
paddd m4, m1 ;t0 + t3 + 2048
paddd m5, m6 ;t1 + t3 + 2048
paddd m1, m6
paddd m1, m7 ;t0 + t1 - t3 + 2048
psrad m4, 12 ;out0
psrad m5, 12 ;out1
psrad m1, 12 ;out3
; ---- merge low-half and high-half results ----
packssdw m3, m1 ;out3
packssdw m4, m5 ;low: out0 high: out1
punpckhqdq m1, m0, m4 ;out1
punpcklqdq m0, m4 ;out0
%endmacro
; One-dimensional 8-point inverse ADST operating on packed registers.
; Input layout, as implied by the unpack annotations below (low half | high half):
;   m0 = in1 | in0,  m1 = in3 | in2,  m2 = in4 | in5,  m3 = in6 | in7
; Output layout, per the annotations (odd outputs are left negated):
;   m0 = out0 | -out1,  m1 = out2 | -out3,  m2 = out4 | -out5,  m3 = out6 | -out7
; m4-m6 are overwritten; m6 holds pd_2048 for every ITX_MUL2X_PACK call and
; m5 is finally reloaded with pw_2896x8.
%macro IADST8_1D_PACKED 0
mova m6, [o(pd_2048)]
punpckhwd m4, m3, m0 ;unpacked in7 in0
punpckhwd m5, m2, m1 ;unpacked in5 in2
punpcklwd m1, m2 ;unpacked in3 in4
punpcklwd m0, m3 ;unpacked in1 in6
ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a
ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a
ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a
ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a
psubsw m3, m4, m1 ;low: t4 high: t5
paddsw m4, m1 ;low: t0 high: t1
psubsw m2, m5, m0 ;low: t6 high: t7
paddsw m5, m0 ;low: t2 high: t3
; build interleaved word pairs from m3 (t4|t5) and m2 (t6|t7) for the next rotations
shufps m1, m3, m2, q1032
punpckhwd m2, m1
punpcklwd m3, m1
ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a
ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a
psubsw m1, m4, m5 ;low: t2 high: t3
paddsw m4, m5 ;low: out0 high: -out7
psubsw m5, m3, m2 ;low: t7 high: t6
paddsw m3, m2 ;low: out6 high: -out1
shufps m0, m4, m3, q3210 ;low: out0 high: -out1
shufps m3, m4, q3210 ;low: out6 high: -out7
shufps m4, m1, m5, q1032 ;low: t3 high: t7
shufps m1, m5, q3210 ;low: t2 high: t6
mova m5, [o(pw_2896x8)]
psubw m2, m1, m4 ;low: t2-t3 high: t6-t7
paddw m1, m4 ;low: t2+t3 high: t6+t7
pmulhrsw m2, m5 ;low: out4 high: -out5
shufps m1, m1, q1032
pmulhrsw m1, m5 ;low: out2 high: -out3
%endmacro
; Add a 4x8 block of residuals to the destination by issuing WRITE_4X4 twice.
; The first call uses m0/m1 as sources, then dstq is advanced by four rows
; and the second call uses m2/m3. m4, m5 and m6 are passed as the WRITE_4X4
; temporaries in both calls, so they are overwritten.
;   %1-%4 : row selectors forwarded unchanged to both WRITE_4X4 calls
; Side effect: dstq is left pointing four rows below its value on entry.
%macro WRITE_4X8 4 ;row[1-4]
WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4
lea dstq, [dstq+strideq*4]
WRITE_4X4 2, 3, 4, 5, 6, %1, %2, %3, %4
%endmacro
; Re-arrange m0-m3 between the two passes of the 4x8 transforms: a word
; interleave followed by a dword interleave. The result is the packed layout
; annotated below (m0 = in0 | in1, m1 = in2 | in3, m2 = in4 | in5,
; m3 = in6 | in7, low half | high half). m4 is overwritten as scratch.
%macro INV_4X8 0
; stage 1: interleave 16-bit words of (m2, m3) and (m0, m1)
punpckhwd m4, m2, m3
punpcklwd m2, m3
punpckhwd m3, m0, m1
punpcklwd m0, m1
; stage 2: interleave 32-bit dwords of the stage-1 results
punpckhdq m1, m0, m2 ;low: in2 high: in3
punpckldq m0, m2 ;low: in0 high: in1
punpckldq m2, m3, m4 ;low: in4 high: in5
punpckhdq m3, m4 ;low: in6 high: in7
%endmacro
; Emit the 4x8 entry point for one (type1, type2) transform pair.
;   %1, %2 : transform type names, forwarded to INV_TXFM_FN
;   %3     : fast_thresh, default -1. When it is >= 0 an additional
;            shortcut body, selected by the type pair, is assembled here.
; INV_TXFM_FN is invoked with size 4x8 and 8 as its xmm/stack argument.
; NOTE(review): how INV_TXFM_FN uses fast_thresh to branch into the shortcut
; body is not visible in this chunk - confirm against its definition.
; Every shortcut fills m0-m3 and then calls into iadst_4x8_internal's shared
; tail (.end3 zeroes the coefficient buffer first, .end4 only writes pixels).
%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x8, 8
%if %3 >= 0
%ifidn %1_%2, dct_identity
; scale the first 16 bytes of coefficients by pw_2896x8 twice, then by pw_4096
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
pmulhrsw m0, m1
pmulhrsw m0, [o(pw_4096)]
; self-interleaves: replicate each word four times across m0-m3
punpckhwd m2, m0, m0
punpcklwd m0, m0
punpckhdq m1, m0, m0
punpckldq m0, m0
punpckhdq m3, m2, m2
punpckldq m2, m2
call m(iadst_4x8_internal).end3
RET
%elifidn %1_%2, identity_dct
; gather the leading words of the four 16-byte coefficient rows into m0
movd m0, [coeffq+16*0]
punpcklwd m0, [coeffq+16*1]
movd m1, [coeffq+16*2]
punpcklwd m1, [coeffq+16*3]
mova m2, [o(pw_2896x8)]
punpckldq m0, m1
pmulhrsw m0, m2
paddw m0, m0
pmulhrsw m0, [o(pw_5793x4)]
pmulhrsw m0, m2
pmulhrsw m0, [o(pw_2048)]
; duplicate the low qword and use the same values for all four registers
punpcklqdq m0, m0
mova m1, m0
mova m2, m0
mova m3, m0
call m(iadst_4x8_internal).end3
RET
%elifidn %1_%2, dct_dct
; broadcast the first coefficient word to every lane of m0
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
; NOTE(review): stores eob over the first coefficient dword; presumably eob
; is 0 on this path so this clears the coefficient - confirm.
mov [coeffq], eobd
pmulhrsw m0, m1
pmulhrsw m0, m1
pmulhrsw m0, [o(pw_2048)]
mova m1, m0
mova m2, m0
mova m3, m0
call m(iadst_4x8_internal).end4
RET
%else ; adst_dct / flipadst_dct
; broadcast the first coefficient word to every lane of m0
pshuflw m0, [coeffq], q0000
punpcklqdq m0, m0
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
; per-lane multipliers: table 1b holds the same four values as 1a in reverse order
%ifidn %1, adst
pmulhrsw m0, [o(iadst4_dconly1a)]
%else ; flipadst
pmulhrsw m0, [o(iadst4_dconly1b)]
%endif
; NOTE(review): same eob store as in the dct_dct branch - confirm eob is 0 here.
mov [coeffq], eobd
pmulhrsw m0, m1
pmulhrsw m0, [o(pw_2048)]
mova m1, m0
mova m2, m0
mova m3, m0
call m(iadst_4x8_internal).end4
RET
%endif
%endif
%endmacro
; 4x8 entry points whose type1 is dct; the optional third argument is the
; fast_thresh passed to INV_TXFM_4X8_FN (omitted means -1, no shortcut body).
INV_TXFM_4X8_FN dct, dct, 0
INV_TXFM_4X8_FN dct, identity, 7
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
; Shared dct worker for 4x8 blocks.
; First pass (function entry): load the four 16-byte coefficient rows scaled
; by pw_2896x8 into m0-m3, run m(idct_8x4_internal).main, re-arrange with
; iadst_4x8_internal's .inversion (INV_4X8), then jump to the address in tx2q.
; NOTE(review): m(idct_8x4_internal).main is defined outside this chunk;
; presumably it performs the first-pass transform on m0-m3 - confirm.
cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m3, [o(pw_2896x8)]
pmulhrsw m0, m3, [coeffq+16*0]
pmulhrsw m1, m3, [coeffq+16*1]
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
call m(idct_8x4_internal).main
call m(iadst_4x8_internal).inversion
jmp tx2q
; Second pass: 8-point inverse DCT, then hand over to the shared tail in
; iadst_4x8_internal with m4 = pw_2048 as the final multiplier.
.pass2:
call .main
; IDCT8_1D_PACKED leaves m1 = out3|out2 and m3 = out7|out6; swap the 64-bit
; halves so every register is in ascending order
shufps m1, m1, q1032
shufps m3, m3, q1032
mova m4, [o(pw_2048)]
jmp m(iadst_4x8_internal).end2
ALIGN function_align
; callable body: 8-point inverse DCT on packed m0-m3 (clobbers m4-m6)
.main:
IDCT8_1D_PACKED
ret
; 4x8 entry points whose type1 is adst; the optional third argument is the
; fast_thresh passed to INV_TXFM_4X8_FN (omitted means -1, no shortcut body).
INV_TXFM_4X8_FN adst, dct, 0
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
; Shared adst worker for 4x8 blocks. It also hosts the common output tail
; (.end/.end2/.end3/.end4) and the .inversion helper that the other 4x8
; workers jump to or call.
; First pass (function entry): load the four 16-byte coefficient rows scaled
; by pw_2896x8 into m0-m3, run m(iadst_8x4_internal).main, re-arrange with
; .inversion, then jump to the address in tx2q.
; NOTE(review): m(iadst_8x4_internal).main is defined outside this chunk;
; presumably it performs the first-pass transform on m0-m3 - confirm.
cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m3, [o(pw_2896x8)]
pmulhrsw m0, m3, [coeffq+16*0]
pmulhrsw m1, m3, [coeffq+16*1]
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
call m(iadst_8x4_internal).main
call .inversion
jmp tx2q
.pass2:
; swap the 64-bit halves of m0/m1 to match the input layout expected by
; IADST8_1D_PACKED (m0 = in1|in0, m1 = in3|in2)
shufps m0, m0, q1032
shufps m1, m1, q1032
call .main
; m4 = +2048 in every word, m5 = -2048 in every word
mova m4, [o(pw_2048)]
pxor m5, m5
psubw m5, m4
; .end: combine the low qwords of m4 and m5 into one multiplier (low half
; from m4, high half from m5). Arriving from .pass2 this gives +2048 | -2048,
; which undoes the negation IADST8_1D_PACKED leaves on the high halves.
.end:
punpcklqdq m4, m5
; .end2: apply the final pmulhrsw multiplier in m4 to all four registers
.end2:
pmulhrsw m0, m4
pmulhrsw m1, m4
pmulhrsw m2, m4
pmulhrsw m3, m4
; .end3: zero the 64 bytes of coefficients at coeffq
.end3:
pxor m5, m5
mova [coeffq+16*0], m5
mova [coeffq+16*1], m5
mova [coeffq+16*2], m5
mova [coeffq+16*3], m5
; .end4: add m0-m3 to the destination pixels and return
.end4:
WRITE_4X8 0, 1, 2, 3
RET
ALIGN function_align
; callable body: 8-point inverse ADST on packed m0-m3 (clobbers m4-m6)
.main:
IADST8_1D_PACKED
ret
ALIGN function_align
; callable body: inter-pass re-arrangement of m0-m3 (clobbers m4)
.inversion:
INV_4X8
ret
; 4x8 entry points whose type1 is flipadst; the optional third argument is the
; fast_thresh passed to INV_TXFM_4X8_FN (omitted means -1, no shortcut body).
INV_TXFM_4X8_FN flipadst, dct, 0
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
; Shared flipadst worker for 4x8 blocks.
; First pass (function entry): same load/scale and m(iadst_8x4_internal).main
; call as the adst worker, but the re-arrangement is done inline with the
; source registers taken in the opposite order (m3,m2 / m1,m0) instead of
; calling .inversion.
; NOTE(review): m(iadst_8x4_internal).main is defined outside this chunk - confirm
; its output layout against the interleave below.
cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m3, [o(pw_2896x8)]
pmulhrsw m0, m3, [coeffq+16*0]
pmulhrsw m1, m3, [coeffq+16*1]
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
call m(iadst_8x4_internal).main
punpcklwd m4, m3, m2
punpckhwd m3, m2
punpcklwd m5, m1, m0
punpckhwd m1, m0
punpckldq m2, m3, m1 ;low: in4 high: in5
punpckhdq m3, m1 ;low: in6 high: in7
punpckldq m0, m4, m5 ;low: in0 high: in1
punpckhdq m1, m4, m5 ;low: in2 high: in3
jmp tx2q
.pass2:
; swap the 64-bit halves of m0/m1 to match the input layout expected by
; IADST8_1D_PACKED, then run the shared 8-point inverse ADST
shufps m0, m0, q1032
shufps m1, m1, q1032
call m(iadst_4x8_internal).main
; reverse the register order (m0<->m3, m1<->m2) and swap the 64-bit halves
; inside each register
mova m4, m0
mova m5, m1
pshufd m0, m3, q1032
pshufd m1, m2, q1032
pshufd m2, m5, q1032
pshufd m3, m4, q1032
; m5 = +2048, m4 = -2048: the opposite assignment to iadst_4x8_internal's
; .pass2, so .end builds a -2048 | +2048 (low | high) multiplier
mova m5, [o(pw_2048)]
pxor m4, m4
psubw m4, m5
jmp m(iadst_4x8_internal).end
; 4x8 entry points whose type1 is identity; the optional third argument is the
; fast_thresh passed to INV_TXFM_4X8_FN (omitted means -1, no shortcut body).
INV_TXFM_4X8_FN identity, dct, 3
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
; Shared identity worker for 4x8 blocks.
; First pass (function entry): load the four 16-byte coefficient rows scaled
; by pw_2896x8, double them, scale by pw_5793x4, re-arrange with
; iadst_4x8_internal's .inversion, then jump to the address in tx2q.
cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m3, [o(pw_2896x8)]
pmulhrsw m0, m3, [coeffq+16*0]
pmulhrsw m1, m3, [coeffq+16*1]
pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3]
mova m5, [o(pw_5793x4)]
; double each value before the pw_5793x4 multiply
paddw m0, m0
paddw m1, m1
paddw m2, m2
paddw m3, m3
pmulhrsw m0, m5
pmulhrsw m1, m5
pmulhrsw m2, m5
pmulhrsw m3, m5
call m(iadst_4x8_internal).inversion
jmp tx2q
; Second pass: no transform of its own; the shared tail is entered with
; m4 = pw_4096 as the final multiplier.
.pass2:
mova m4, [o(pw_4096)]
jmp m(iadst_4x8_internal).end2
; Add two rows of eight word residuals to the destination pixels.
;   %1, %2 : registers holding the residuals for the rows at dstq and
;            dstq+strideq
;   %3-%5  : scratch registers (all overwritten); they must be distinct from
;            each other and from %1/%2
; Each row is loaded as 8 bytes, widened to words, summed with its residuals,
; then both rows are packed back to bytes with unsigned saturation and stored.
%macro WRITE_8X2 5 ; coefs[1-2], tmp[1-3]
    pxor                 m%5, m%5                ; zero register for the byte->word widening
    movq                 m%3, [dstq]             ; row 0 pixels
    movq                 m%4, [dstq+strideq]     ; row 1 pixels
    punpcklbw            m%3, m%5                ; row 0: bytes -> words
    punpcklbw            m%4, m%5                ; row 1: bytes -> words
    paddw                m%3, m%1                ; row 0 + residuals
    paddw                m%4, m%2                ; row 1 + residuals
    packuswb             m%3, m%4                ; low qword: row 0, high qword: row 1
    movq              [dstq], m%3
    punpckhqdq           m%3, m%3                ; bring row 1 down to the low qword
    movq      [dstq+strideq], m%3
%endmacro
; Add four rows of eight word residuals to the destination pixels by issuing
; WRITE_8X2 twice.
;   %1-%4 : registers holding the residuals for rows 0-3
;   %5-%7 : scratch registers forwarded to WRITE_8X2 (overwritten)
; Side effect: dstq is left pointing two rows below its value on entry.
;
; The macro parameters are now forwarded to WRITE_8X2. Previously they were
; ignored and registers 0,1,4,5,6 / 2,3,4,5,6 were hard-coded, so any caller
; passing a different register assignment silently got the wrong result.
; Callers that pass 0, 1, 2, 3, 4, 5, 6 expand to exactly the same code as before.
%macro WRITE_8X4 7 ;coefs[1-4], tmp[1-3]
    WRITE_8X2             %1, %2, %5, %6, %7
    lea                 dstq, [dstq+strideq*2]
    WRITE_8X2             %3, %4, %5, %6, %7
%endmacro
%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x4, 8
%if %3 >= 0
%ifidn %1_%2, dct_identity
mova m0, [o(pw_2896x8)]
pmulhrsw m1, m0, [coeffq]
pmulhrsw m1, m0
paddw m1, m1
pmulhrsw m1, [o(pw_5793x4)]
pmulhrsw m1, [o(pw_2048)]
punpcklwd m1, m1
punpckhdq m2, m1, m1
punpckldq m1, m1
punpckhdq m3, m2, m2
punpckldq m2, m2
punpckldq m0, m1, m1
punpckhdq m1, m1
%elifidn %1_%2, identity_dct
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m5, m2, m3
punpcklwd m2, m3
punpcklwd m0, m4
punpcklwd m2, m5
punpcklqdq m0, m2
mova m4, [o(pw_2896x8)]
pmulhrsw m0, m4
paddw m0, m0
pmulhrsw m0, m4
pmulhrsw m0, [o(pw_2048)]
mova m1, m0
mova m2, m0
mova m3, m0
%else