Commit a532e5ae authored by Liwei Wang, committed by Henrik Gramner

Add SSSE3 implementation for the 8x16 and 16x8 blocks in itx

Cycle times:
inv_txfm_add_8x16_adst_adst_0_8bpc_c: 5063.0
inv_txfm_add_8x16_adst_adst_0_8bpc_ssse3: 406.8
inv_txfm_add_8x16_adst_adst_1_8bpc_c: 5051.2
inv_txfm_add_8x16_adst_adst_1_8bpc_ssse3: 407.3
inv_txfm_add_8x16_adst_adst_2_8bpc_c: 5065.4
inv_txfm_add_8x16_adst_adst_2_8bpc_ssse3: 407.9
inv_txfm_add_8x16_adst_dct_0_8bpc_c: 5201.1
inv_txfm_add_8x16_adst_dct_0_8bpc_ssse3: 354.8
inv_txfm_add_8x16_adst_dct_1_8bpc_c: 5214.8
inv_txfm_add_8x16_adst_dct_1_8bpc_ssse3: 354.8
inv_txfm_add_8x16_adst_dct_2_8bpc_c: 5225.0
inv_txfm_add_8x16_adst_dct_2_8bpc_ssse3: 355.1
inv_txfm_add_8x16_adst_flipadst_0_8bpc_c: 7135.9
inv_txfm_add_8x16_adst_flipadst_0_8bpc_ssse3: 409.7
inv_txfm_add_8x16_adst_flipadst_1_8bpc_c: 8354.4
inv_txfm_add_8x16_adst_flipadst_1_8bpc_ssse3: 409.2
inv_txfm_add_8x16_adst_flipadst_2_8bpc_c: 7198.7
inv_txfm_add_8x16_adst_flipadst_2_8bpc_ssse3: 409.7
inv_txfm_add_8x16_adst_identity_0_8bpc_c: 3936.5
inv_txfm_add_8x16_adst_identity_0_8bpc_ssse3: 262.0
inv_txfm_add_8x16_adst_identity_1_8bpc_c: 4617.8
inv_txfm_add_8x16_adst_identity_1_8bpc_ssse3: 261.4
inv_txfm_add_8x16_adst_identity_2_8bpc_c: 3895.1
inv_txfm_add_8x16_adst_identity_2_8bpc_ssse3: 262.1
inv_txfm_add_8x16_dct_adst_0_8bpc_c: 5203.9
inv_txfm_add_8x16_dct_adst_0_8bpc_ssse3: 355.1
inv_txfm_add_8x16_dct_adst_1_8bpc_c: 5200.8
inv_txfm_add_8x16_dct_adst_1_8bpc_ssse3: 355.4
inv_txfm_add_8x16_dct_adst_2_8bpc_c: 5208.2
inv_txfm_add_8x16_dct_adst_2_8bpc_ssse3: 355.1
inv_txfm_add_8x16_dct_dct_0_8bpc_c: 5270.8
inv_txfm_add_8x16_dct_dct_0_8bpc_ssse3: 57.0
inv_txfm_add_8x16_dct_dct_1_8bpc_c: 5280.9
inv_txfm_add_8x16_dct_dct_1_8bpc_ssse3: 303.2
inv_txfm_add_8x16_dct_dct_2_8bpc_c: 5275.9
inv_txfm_add_8x16_dct_dct_2_8bpc_ssse3: 302.4
inv_txfm_add_8x16_dct_flipadst_0_8bpc_c: 5374.4
inv_txfm_add_8x16_dct_flipadst_0_8bpc_ssse3: 356.5
inv_txfm_add_8x16_dct_flipadst_1_8bpc_c: 5449.9
inv_txfm_add_8x16_dct_flipadst_1_8bpc_ssse3: 356.8
inv_txfm_add_8x16_dct_flipadst_2_8bpc_c: 5446.9
inv_txfm_add_8x16_dct_flipadst_2_8bpc_ssse3: 356.7
inv_txfm_add_8x16_dct_identity_0_8bpc_c: 3883.4
inv_txfm_add_8x16_dct_identity_0_8bpc_ssse3: 76.1
inv_txfm_add_8x16_dct_identity_1_8bpc_c: 3892.3
inv_txfm_add_8x16_dct_identity_1_8bpc_ssse3: 76.1
inv_txfm_add_8x16_dct_identity_2_8bpc_c: 4027.1
inv_txfm_add_8x16_dct_identity_2_8bpc_ssse3: 209.9
inv_txfm_add_8x16_flipadst_adst_0_8bpc_c: 7387.5
inv_txfm_add_8x16_flipadst_adst_0_8bpc_ssse3: 408.9
inv_txfm_add_8x16_flipadst_adst_1_8bpc_c: 7298.8
inv_txfm_add_8x16_flipadst_adst_1_8bpc_ssse3: 408.8
inv_txfm_add_8x16_flipadst_adst_2_8bpc_c: 7397.2
inv_txfm_add_8x16_flipadst_adst_2_8bpc_ssse3: 408.9
inv_txfm_add_8x16_flipadst_dct_0_8bpc_c: 5250.4
inv_txfm_add_8x16_flipadst_dct_0_8bpc_ssse3: 355.3
inv_txfm_add_8x16_flipadst_dct_1_8bpc_c: 5263.9
inv_txfm_add_8x16_flipadst_dct_1_8bpc_ssse3: 355.4
inv_txfm_add_8x16_flipadst_dct_2_8bpc_c: 5259.0
inv_txfm_add_8x16_flipadst_dct_2_8bpc_ssse3: 356.3
inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_c: 5448.4
inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_ssse3: 410.2
inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_c: 5402.6
inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_ssse3: 410.8
inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_c: 6479.7
inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_ssse3: 409.8
inv_txfm_add_8x16_flipadst_identity_0_8bpc_c: 3828.9
inv_txfm_add_8x16_flipadst_identity_0_8bpc_ssse3: 262.7
inv_txfm_add_8x16_flipadst_identity_1_8bpc_c: 3884.5
inv_txfm_add_8x16_flipadst_identity_1_8bpc_ssse3: 262.0
inv_txfm_add_8x16_flipadst_identity_2_8bpc_c: 3809.2
inv_txfm_add_8x16_flipadst_identity_2_8bpc_ssse3: 262.9
inv_txfm_add_8x16_identity_adst_0_8bpc_c: 4294.5
inv_txfm_add_8x16_identity_adst_0_8bpc_ssse3: 268.8
inv_txfm_add_8x16_identity_adst_1_8bpc_c: 4955.4
inv_txfm_add_8x16_identity_adst_1_8bpc_ssse3: 269.1
inv_txfm_add_8x16_identity_adst_2_8bpc_c: 4166.4
inv_txfm_add_8x16_identity_adst_2_8bpc_ssse3: 269.9
inv_txfm_add_8x16_identity_dct_0_8bpc_c: 4012.3
inv_txfm_add_8x16_identity_dct_0_8bpc_ssse3: 56.7
inv_txfm_add_8x16_identity_dct_1_8bpc_c: 4767.1
inv_txfm_add_8x16_identity_dct_1_8bpc_ssse3: 215.1
inv_txfm_add_8x16_identity_dct_2_8bpc_c: 4012.6
inv_txfm_add_8x16_identity_dct_2_8bpc_ssse3: 215.9
inv_txfm_add_8x16_identity_flipadst_0_8bpc_c: 4452.6
inv_txfm_add_8x16_identity_flipadst_0_8bpc_ssse3: 270.5
inv_txfm_add_8x16_identity_flipadst_1_8bpc_c: 4885.8
inv_txfm_add_8x16_identity_flipadst_1_8bpc_ssse3: 270.3
inv_txfm_add_8x16_identity_flipadst_2_8bpc_c: 4186.1
inv_txfm_add_8x16_identity_flipadst_2_8bpc_ssse3: 271.5
inv_txfm_add_8x16_identity_identity_0_8bpc_c: 2623.0
inv_txfm_add_8x16_identity_identity_0_8bpc_ssse3: 123.1
inv_txfm_add_8x16_identity_identity_1_8bpc_c: 2617.7
inv_txfm_add_8x16_identity_identity_1_8bpc_ssse3: 122.9
inv_txfm_add_8x16_identity_identity_2_8bpc_c: 2617.2
inv_txfm_add_8x16_identity_identity_2_8bpc_ssse3: 123.1
inv_txfm_add_16x8_adst_adst_0_8bpc_c: 5102.3
inv_txfm_add_16x8_adst_adst_0_8bpc_ssse3: 409.0
inv_txfm_add_16x8_adst_adst_1_8bpc_c: 5063.2
inv_txfm_add_16x8_adst_adst_1_8bpc_ssse3: 409.5
inv_txfm_add_16x8_adst_adst_2_8bpc_c: 5029.1
inv_txfm_add_16x8_adst_adst_2_8bpc_ssse3: 410.1
inv_txfm_add_16x8_adst_dct_0_8bpc_c: 5848.8
inv_txfm_add_16x8_adst_dct_0_8bpc_ssse3: 358.8
inv_txfm_add_16x8_adst_dct_1_8bpc_c: 5612.8
inv_txfm_add_16x8_adst_dct_1_8bpc_ssse3: 358.8
inv_txfm_add_16x8_adst_dct_2_8bpc_c: 5143.2
inv_txfm_add_16x8_adst_dct_2_8bpc_ssse3: 358.5
inv_txfm_add_16x8_adst_flipadst_0_8bpc_c: 5072.4
inv_txfm_add_16x8_adst_flipadst_0_8bpc_ssse3: 413.3
inv_txfm_add_16x8_adst_flipadst_1_8bpc_c: 5082.2
inv_txfm_add_16x8_adst_flipadst_1_8bpc_ssse3: 413.6
inv_txfm_add_16x8_adst_flipadst_2_8bpc_c: 5108.0
inv_txfm_add_16x8_adst_flipadst_2_8bpc_ssse3: 413.8
inv_txfm_add_16x8_adst_identity_0_8bpc_c: 3897.2
inv_txfm_add_16x8_adst_identity_0_8bpc_ssse3: 283.6
inv_txfm_add_16x8_adst_identity_1_8bpc_c: 3947.2
inv_txfm_add_16x8_adst_identity_1_8bpc_ssse3: 283.1
inv_txfm_add_16x8_adst_identity_2_8bpc_c: 3881.7
inv_txfm_add_16x8_adst_identity_2_8bpc_ssse3: 283.6
inv_txfm_add_16x8_dct_adst_0_8bpc_c: 5200.7
inv_txfm_add_16x8_dct_adst_0_8bpc_ssse3: 355.0
inv_txfm_add_16x8_dct_adst_1_8bpc_c: 5261.0
inv_txfm_add_16x8_dct_adst_1_8bpc_ssse3: 355.1
inv_txfm_add_16x8_dct_adst_2_8bpc_c: 5212.5
inv_txfm_add_16x8_dct_adst_2_8bpc_ssse3: 354.5
inv_txfm_add_16x8_dct_dct_0_8bpc_c: 5252.9
inv_txfm_add_16x8_dct_dct_0_8bpc_ssse3: 43.6
inv_txfm_add_16x8_dct_dct_1_8bpc_c: 5260.0
inv_txfm_add_16x8_dct_dct_1_8bpc_ssse3: 302.1
inv_txfm_add_16x8_dct_dct_2_8bpc_c: 5250.4
inv_txfm_add_16x8_dct_dct_2_8bpc_ssse3: 302.0
inv_txfm_add_16x8_dct_flipadst_0_8bpc_c: 5216.6
inv_txfm_add_16x8_dct_flipadst_0_8bpc_ssse3: 359.3
inv_txfm_add_16x8_dct_flipadst_1_8bpc_c: 5229.9
inv_txfm_add_16x8_dct_flipadst_1_8bpc_ssse3: 357.6
inv_txfm_add_16x8_dct_flipadst_2_8bpc_c: 5261.4
inv_txfm_add_16x8_dct_flipadst_2_8bpc_ssse3: 357.4
inv_txfm_add_16x8_dct_identity_0_8bpc_c: 3999.2
inv_txfm_add_16x8_dct_identity_0_8bpc_ssse3: 63.8
inv_txfm_add_16x8_dct_identity_1_8bpc_c: 4018.1
inv_txfm_add_16x8_dct_identity_1_8bpc_ssse3: 227.1
inv_txfm_add_16x8_dct_identity_2_8bpc_c: 3998.7
inv_txfm_add_16x8_dct_identity_2_8bpc_ssse3: 226.2
inv_txfm_add_16x8_flipadst_adst_0_8bpc_c: 5124.9
inv_txfm_add_16x8_flipadst_adst_0_8bpc_ssse3: 419.7
inv_txfm_add_16x8_flipadst_adst_1_8bpc_c: 5100.7
inv_txfm_add_16x8_flipadst_adst_1_8bpc_ssse3: 420.5
inv_txfm_add_16x8_flipadst_adst_2_8bpc_c: 5087.1
inv_txfm_add_16x8_flipadst_adst_2_8bpc_ssse3: 419.9
inv_txfm_add_16x8_flipadst_dct_0_8bpc_c: 5183.2
inv_txfm_add_16x8_flipadst_dct_0_8bpc_ssse3: 367.1
inv_txfm_add_16x8_flipadst_dct_1_8bpc_c: 5193.7
inv_txfm_add_16x8_flipadst_dct_1_8bpc_ssse3: 368.6
inv_txfm_add_16x8_flipadst_dct_2_8bpc_c: 5186.8
inv_txfm_add_16x8_flipadst_dct_2_8bpc_ssse3: 368.4
inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_c: 5091.3
inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_ssse3: 421.2
inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_c: 5118.5
inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_ssse3: 421.4
inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_c: 5119.0
inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_ssse3: 421.2
inv_txfm_add_16x8_flipadst_identity_0_8bpc_c: 3909.3
inv_txfm_add_16x8_flipadst_identity_0_8bpc_ssse3: 289.9
inv_txfm_add_16x8_flipadst_identity_1_8bpc_c: 3920.7
inv_txfm_add_16x8_flipadst_identity_1_8bpc_ssse3: 290.4
inv_txfm_add_16x8_flipadst_identity_2_8bpc_c: 3936.7
inv_txfm_add_16x8_flipadst_identity_2_8bpc_ssse3: 290.6
inv_txfm_add_16x8_identity_adst_0_8bpc_c: 3869.3
inv_txfm_add_16x8_identity_adst_0_8bpc_ssse3: 280.0
inv_txfm_add_16x8_identity_adst_1_8bpc_c: 3832.2
inv_txfm_add_16x8_identity_adst_1_8bpc_ssse3: 281.4
inv_txfm_add_16x8_identity_adst_2_8bpc_c: 3820.8
inv_txfm_add_16x8_identity_adst_2_8bpc_ssse3: 281.5
inv_txfm_add_16x8_identity_dct_0_8bpc_c: 3878.6
inv_txfm_add_16x8_identity_dct_0_8bpc_ssse3: 76.7
inv_txfm_add_16x8_identity_dct_1_8bpc_c: 3883.3
inv_txfm_add_16x8_identity_dct_1_8bpc_ssse3: 76.3
inv_txfm_add_16x8_identity_dct_2_8bpc_c: 3900.6
inv_txfm_add_16x8_identity_dct_2_8bpc_ssse3: 220.1
inv_txfm_add_16x8_identity_flipadst_0_8bpc_c: 3840.9
inv_txfm_add_16x8_identity_flipadst_0_8bpc_ssse3: 277.1
inv_txfm_add_16x8_identity_flipadst_1_8bpc_c: 3860.6
inv_txfm_add_16x8_identity_flipadst_1_8bpc_ssse3: 277.0
inv_txfm_add_16x8_identity_flipadst_2_8bpc_c: 3849.4
inv_txfm_add_16x8_identity_flipadst_2_8bpc_ssse3: 277.2
inv_txfm_add_16x8_identity_identity_0_8bpc_c: 2610.9
inv_txfm_add_16x8_identity_identity_0_8bpc_ssse3: 159.8
inv_txfm_add_16x8_identity_identity_1_8bpc_c: 2597.1
inv_txfm_add_16x8_identity_identity_1_8bpc_ssse3: 159.8
inv_txfm_add_16x8_identity_identity_2_8bpc_c: 2607.9
inv_txfm_add_16x8_identity_identity_2_8bpc_ssse3: 159.9
@@ -83,6 +83,8 @@ decl_itx16_fns( 8, 4, ssse3);
decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
decl_itx16_fns( 8, 16, ssse3);
decl_itx16_fns(16, 8, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
@@ -132,6 +134,8 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx16_fn(, 8, 8, ssse3);
assign_itx16_fn(R, 4, 16, ssse3);
assign_itx16_fn(R, 16, 4, ssse3);
assign_itx16_fn(R, 8, 16, ssse3);
assign_itx16_fn(R, 16, 8, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
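For context: each decl_itx16_fns(w, h, ssse3) line declares the sixteen inv_txfm_add_<type1>_<type2>_<w>x<h>_ssse3 entry points for that block size, and the matching assign_itx16_fn(R, w, h, ssse3) wires them into the rectangular-transform slots of the DSP context. Below is a minimal sketch of what one declaration/assignment pair boils down to; the typedefs, enum spellings and the expansion shown are assumptions for illustration, not copied from the tree.

#include <stddef.h>
#include <stdint.h>

/* Stand-ins for dav1d's 8bpc types, assumed for illustration only. */
typedef uint8_t pixel;
typedef int16_t coef;

/* Shape of an inverse-transform-add entry point, as suggested by the
 * argument names used in the asm (dst, stride, coeff, eob). */
typedef void (*itxfm_fn)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);

/* decl_itx16_fns(8, 16, ssse3) declares one such symbol per transform-type
 * pair (dct_dct, adst_dct, ..., identity_identity), for example: */
void dav1d_inv_txfm_add_dct_dct_8x16_ssse3(pixel *dst, ptrdiff_t dst_stride,
                                           coef *coeff, int eob);

/* assign_itx16_fn(R, 8, 16, ssse3) then fills the matching table slots,
 * conceptually:
 *   c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_ssse3;
 * and likewise for the other fifteen type combinations. */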
@@ -1147,7 +1147,7 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
jmp m(iadst_8x4_internal).end
%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x8, 8
INV_TXFM_FN %1, %2, %3, 8x8, 8, 16*4
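; the extra 16*4 argument presumably reserves four 16-byte stack slots; the 8x8
; internals below now spill intermediates to [rsp+gprsize+16*n] instead of the
; coefficient buffer, which lets the new 8x16/16x8 transforms reuse these
; routines without clobbering coefficients they still need in coeffq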
%ifidn %1_%2, dct_identity
mova m0, [o(pw_2896x8)]
pmulhrsw m0, [coeffq]
@@ -1182,6 +1182,7 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m0, m2
.end:
mov r2d, 2
lea tx2q, [o(m(inv_txfm_add_dct_dct_8x8).end3)]
.end2:
lea r3, [strideq*3]
.loop:
@@ -1189,6 +1190,8 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea dstq, [dstq+strideq*2]
dec r2d
jg .loop
jmp tx2q
.end3:
RET
%else ; identity
mova m0, [coeffq+16*0]
@@ -1219,6 +1222,7 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m4, [coeffq+16*4]
mova m5, [coeffq+16*5]
mova m6, [coeffq+16*6]
mova m7, [coeffq+16*7]
%endmacro
%macro IDCT8_1D_ODDHALF 7 ; src[1-4], tmp[1-2], pd_2048
@@ -1242,97 +1246,106 @@ INV_TXFM_8X8_FN dct, flipadst
cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_8X8_LOAD_COEFS
.pass1:
call .main
.pass1_end:
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [coeffq+16*6], m6
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [rsp+gprsize+16*1], m6
.pass1_end2:
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, [coeffq+16*7]
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, [rsp+gprsize+16*0]
.pass1_end3:
punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
mova [coeffq+16*5], m6
mova m6, [coeffq+16*6]
punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
mova [coeffq+16*7], m2
punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
mova m7, [coeffq+16*5]
punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
jmp tx2q
punpcklwd m6, m1, m5 ;10 50 11 51 12 52 13 53
punpckhwd m1, m5 ;14 54 15 55 16 56 17 57
punpckhwd m5, m0, m4 ;04 44 05 45 06 46 07 47
punpcklwd m0, m4 ;00 40 01 41 02 42 03 43
punpckhwd m4, m3, m7 ;34 74 35 75 36 76 37 77
punpcklwd m3, m7 ;30 70 31 71 32 72 33 73
punpckhwd m7, m1, m4 ;16 36 56 76 17 37 57 77
punpcklwd m1, m4 ;14 34 54 74 15 35 55 75
punpckhwd m4, m6, m3 ;12 32 52 72 13 33 53 73
punpcklwd m6, m3 ;10 30 50 70 11 31 51 71
mova [rsp+gprsize+16*2], m6
mova m6, [rsp+gprsize+16*1]
punpckhwd m3, m2, m6 ;24 64 25 65 26 66 27 67
punpcklwd m2, m6 ;20 60 21 61 22 62 23 63
punpckhwd m6, m5, m3 ;06 26 46 66 07 27 47 67
punpcklwd m5, m3 ;04 24 44 64 05 25 45 65
punpckhwd m3, m0, m2 ;02 22 42 62 03 23 43 63
punpcklwd m0, m2 ;00 20 40 60 01 21 41 61
punpckhwd m2, m6, m7 ;07 17 27 37 47 57 67 77
punpcklwd m6, m7 ;06 16 26 36 46 56 66 76
mova [rsp+gprsize+16*0], m2
punpcklwd m2, m3, m4 ;02 12 22 32 42 52 62 72
punpckhwd m3, m4 ;03 13 23 33 43 53 63 73
punpcklwd m4, m5, m1 ;04 14 24 34 44 54 64 74
punpckhwd m5, m1 ;05 15 25 35 45 55 65 75
mova m7, [rsp+gprsize+16*2]
punpckhwd m1, m0, m7 ;01 11 21 31 41 51 61 71
punpcklwd m0, m7 ;00 10 20 30 40 50 60 70
mova m7, [rsp+gprsize+16*0]
jmp tx2q
.pass2:
lea tx2q, [o(m(idct_8x8_internal).end4)]
.pass2_main:
call .main
.end:
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [coeffq+16*6], m6
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [rsp+gprsize+16*1], m6
.end2:
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*5], m5
mova [coeffq+16*7], m7
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, [rsp+gprsize+16*0]
mova [rsp+gprsize+16*2], m5
mova [rsp+gprsize+16*0], m7
.end3:
WRITE_8X4 0, 1, 2, 3, 5, 6, 7
lea dstq, [dstq+strideq*2]
WRITE_8X4 4, [coeffq+16*5], [coeffq+16*6], [coeffq+16*7], 5, 6, 7
WRITE_8X4 0, 1, 2, 3, 5, 6, 7
lea dstq, [dstq+strideq*2]
WRITE_8X4 4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
jmp tx2q
pxor m7, m7
REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
.end4:
pxor m7, m7
REPX {mova [coeffq+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
ALIGN function_align
.main:
mova [coeffq+16*6], m3
mova [coeffq+16*5], m1
mova m7, [o(pd_2048)]
IDCT4_1D 0, 2, 4, 6, 1, 3, 7
mova m3, [coeffq+16*5]
mova [coeffq+16*5], m2
mova m2, [coeffq+16*6]
mova [coeffq+16*6], m4
mova m4, [coeffq+16*7]
mova [coeffq+16*7], m6
IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
mova m6, [coeffq+16*7]
psubsw m7, m0, m4 ;out7
paddsw m0, m4 ;out0
mova [coeffq+16*7], m7
mova m1, [coeffq+16*5]
psubsw m4, m6, m3 ;out4
paddsw m3, m6 ;out3
mova m7, [coeffq+16*6]
psubsw m6, m1, m5 ;out6
paddsw m1, m5 ;out1
psubsw m5, m7, m2 ;out5
paddsw m2, m7 ;out2
mova [rsp+gprsize*2+16*0], m7
mova [rsp+gprsize*2+16*1], m3
mova [rsp+gprsize*2+16*2], m1
mova m7, [o(pd_2048)]
IDCT4_1D 0, 2, 4, 6, 1, 3, 7
mova m3, [rsp+gprsize*2+16*2]
mova [rsp+gprsize*2+16*2], m2
mova m2, [rsp+gprsize*2+16*1]
mova [rsp+gprsize*2+16*1], m4
mova m4, [rsp+gprsize*2+16*0]
mova [rsp+gprsize*2+16*0], m6
IDCT8_1D_ODDHALF 3, 2, 5, 4, 1, 6, 7
mova m6, [rsp+gprsize*2+16*0]
psubsw m7, m0, m4 ;out7
paddsw m0, m4 ;out0
mova [rsp+gprsize*2+16*0], m7
mova m1, [rsp+gprsize*2+16*2]
psubsw m4, m6, m3 ;out4
paddsw m3, m6 ;out3
mova m7, [rsp+gprsize*2+16*1]
psubsw m6, m1, m5 ;out6
paddsw m1, m5 ;out1
psubsw m5, m7, m2 ;out5
paddsw m2, m7 ;out2
ret
@@ -1343,75 +1356,85 @@ INV_TXFM_8X8_FN adst, identity
cglobal iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_8X8_LOAD_COEFS
.pass1:
call .main
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [coeffq+16*6], m6
pxor m6, m6
psubw m6, m7
mova m7, m6
.pass1_end:
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [rsp+gprsize+16*1], m6
pxor m6, m6
psubw m6, m7
mova m7, m6
jmp m(idct_8x8_internal).pass1_end2
ALIGN function_align
.pass2:
lea tx2q, [o(m(idct_8x8_internal).end4)]
.pass2_main:
call .main
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [coeffq+16*6], m6
pxor m6, m6
psubw m6, m7
mova m7, m6
.end:
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [rsp+gprsize+16*1], m6
pxor m6, m6
psubw m6, m7
mova m7, m6
jmp m(idct_8x8_internal).end2
ALIGN function_align
.main:
mova [coeffq+16*6], m3
mova [coeffq+16*5], m4
mova m7, [o(pd_2048)]
ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
paddsw m3, m2, m6 ;t2
psubsw m2, m6 ;t6
paddsw m4, m5, m1 ;t3
psubsw m5, m1 ;t7
ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
mova m6, [coeffq+16*5]
mova [coeffq+16*5], m5
mova m1, [coeffq+16*6]
mova [coeffq+16*6], m2
mova m5, [coeffq+16*7]
mova [coeffq+16*7], m3
ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
psubsw m2, m0, m6 ;t4
paddsw m0, m6 ;t0
paddsw m3, m5, m1 ;t1
psubsw m5, m1 ;t5
ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
mova m7, [coeffq+16*7]
paddsw m1, m3, m4 ;-out7
psubsw m3, m4 ;t3
mova [coeffq+16*7], m1
psubsw m4, m0, m7 ;t2
paddsw m0, m7 ;out0
mova m6, [coeffq+16*5]
mova m7, [coeffq+16*6]
paddsw m1, m5, m6 ;-out1
psubsw m5, m6 ;t6
paddsw m6, m2, m7 ;out6
psubsw m2, m7 ;t7
paddw m7, m4, m3 ;t2 + t3
psubw m4, m3 ;t2 - t3
paddw m3, m5, m2 ;t6 + t7
psubw m5, m2 ;t6 - t7
mova m2, [o(pw_2896x8)]
pmulhrsw m4, m2 ;out4
pmulhrsw m5, m2 ;-out5
pmulhrsw m7, m2 ;-out3
pmulhrsw m2, m3 ;out2
mova m3, m7
mova [rsp+gprsize*2+16*0], m7
mova [rsp+gprsize*2+16*1], m3
mova [rsp+gprsize*2+16*2], m4
mova m7, [o(pd_2048)]
ITX_MULSUB_2W 5, 2, 3, 4, 7, 1931, 3612 ;t3a, t2a
ITX_MULSUB_2W 1, 6, 3, 4, 7, 3920, 1189 ;t7a, t6a
paddsw m3, m2, m6 ;t2
psubsw m2, m6 ;t6
paddsw m4, m5, m1 ;t3
psubsw m5, m1 ;t7
ITX_MULSUB_2W 5, 2, 1, 6, 7, 3784, 1567 ;t6a, t7a
mova m6, [rsp+gprsize*2+16*2]
mova [rsp+gprsize*2+16*2], m5
mova m1, [rsp+gprsize*2+16*1]
mova [rsp+gprsize*2+16*1], m2
mova m5, [rsp+gprsize*2+16*0]
mova [rsp+gprsize*2+16*0], m3
ITX_MULSUB_2W 5, 0, 2, 3, 7, 401, 4076 ;t1a, t0a
ITX_MULSUB_2W 1, 6, 2, 3, 7, 3166, 2598 ;t5a, t4a
psubsw m2, m0, m6 ;t4
paddsw m0, m6 ;t0
paddsw m3, m5, m1 ;t1
psubsw m5, m1 ;t5
ITX_MULSUB_2W 2, 5, 1, 6, 7, 1567, 3784 ;t5a, t4a
mova m7, [rsp+gprsize*2+16*0]
paddsw m1, m3, m4 ;-out7
psubsw m3, m4 ;t3
mova [rsp+gprsize*2+16*0], m1
psubsw m4, m0, m7 ;t2
paddsw m0, m7 ;out0
mova m6, [rsp+gprsize*2+16*2]
mova m7, [rsp+gprsize*2+16*1]
paddsw m1, m5, m6 ;-out1
psubsw m5, m6 ;t6
paddsw m6, m2, m7 ;out6
psubsw m2, m7 ;t7
paddw m7, m4, m3 ;t2 + t3
psubw m4, m3 ;t2 - t3
paddw m3, m5, m2 ;t6 + t7
psubw m5, m2 ;t6 - t7
mova m2, [o(pw_2896x8)]
pmulhrsw m4, m2 ;out4
pmulhrsw m5, m2 ;-out5
pmulhrsw m7, m2 ;-out3
pmulhrsw m2, m3 ;out2
mova m3, m7
ret
INV_TXFM_8X8_FN flipadst, dct
@@ -1421,45 +1444,54 @@ INV_TXFM_8X8_FN flipadst, identity
cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_8X8_LOAD_COEFS
.pass1:
call m(iadst_8x8_internal).main
mova m7, [o(pw_m16384)]
pmulhrsw m1, m7
mova [coeffq+16*6], m1
mova m1, m6
mova m6, m2
pmulhrsw m2, m5, m7
mova m5, m6
mova m6, m4
pmulhrsw m4, m3, m7
mova m3, m6
mova m6, m0
mova m0, m7
pxor m7, m7
psubw m7, m0
pmulhrsw m0, [coeffq+16*7]
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, m6
.pass1_end:
mova m7, [o(pw_m16384)]
pmulhrsw m1, m7
mova [rsp+gprsize+16*1], m1
mova m1, m6
mova m6, m2
pmulhrsw m2, m5, m7
mova m5, m6
mova m6, m4
pmulhrsw m4, m3, m7
mova m3, m6
mova m6, m0
mova m0, m7
pxor m7, m7
psubw m7, m0
pmulhrsw m0, [rsp+gprsize+16*0]
REPX {pmulhrsw x, m7}, m1, m3, m5
pmulhrsw m7, m6
jmp m(idct_8x8_internal).pass1_end3
ALIGN function_align
.pass2:
lea tx2q, [o(m(idct_8x8_internal).end4)]
.pass2_main:
call m(iadst_8x8_internal).main
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [coeffq+16*5], m2
mova m2, m0
pxor m0, m0
psubw m0, m7
mova m7, m2
pmulhrsw m1, m0
pmulhrsw m2, m5, m0
mova [coeffq+16*6], m1
mova m5, m4
mova m1, m6
pmulhrsw m4, m3, m0
pmulhrsw m0, [coeffq+16*7]
mova m3, m5
mova [coeffq+16*7], m7
.end:
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m2, m4, m6
mova [rsp+gprsize+16*2], m2
mova m2, m0
pxor m0, m0
psubw m0, m7
mova m7, m2
pmulhrsw m1, m0
pmulhrsw m2, m5, m0
mova [rsp+gprsize+16*1], m1
mova m5, m4
mova m1, m6
pmulhrsw m4, m3, m0
pmulhrsw m0, [rsp+gprsize+16*0]
mova m3, m5
mova [rsp+gprsize+16*0], m7
jmp m(idct_8x8_internal).end3
INV_TXFM_8X8_FN identity, dct, 7
@@ -1468,23 +1500,21 @@ INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
cglobal iidentity_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
mova m4, [coeffq+16*4]
mova m5, [coeffq+16*5]
mova m7, [coeffq+16*7]
jmp m(idct_8x8_internal).pass1_end3
ITX_8X8_LOAD_COEFS
mova [rsp+gprsize+16*1], m6
jmp m(idct_8x8_internal).pass1_end3
ALIGN function_align
.pass2:
mova m7, [o(pw_4096)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
mova [coeffq+16*7], m7
lea tx2q, [o(m(idct_8x8_internal).end4)]
.end:
pmulhrsw m7, [o(pw_4096)]
mova [rsp+gprsize+16*0], m7
mova m7, [o(pw_4096)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
mova [rsp+gprsize+16*2], m5
mova [rsp+gprsize+16*1], m6
jmp m(idct_8x8_internal).end3
@@ -1829,6 +1859,7 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
movd m2, [o(pw_16384)]
mov [coeffq], eobd
mov r2d, 2
lea tx2q, [o(m(inv_txfm_add_dct_dct_16x4).end)]
.dconly:
pmulhrsw m0, m2
movd m2, [o(pw_2048)] ;intentionally rip-relative
@@ -1855,6 +1886,8 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea dstq, [dstq+strideq*2]
dec r2d
jg .dconly_loop
jmp tx2q
.end:
RET
%else ; adst / flipadst
movd m2, [o(pw_16384)]
@@ -1889,7 +1922,13 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%endmacro
%macro ITX_16X4_LOAD_COEFS 0
ITX_8X8_LOAD_COEFS
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
mova m4, [coeffq+16*4]
mova m5, [coeffq+16*5]
mova m6, [coeffq+16*6]
%endmacro
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
@@ -2229,3 +2268,916 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
.pass2:
lea tx2q, [o(m(iidentity_8x4_internal).pass2)]
jmp m(idct_16x4_internal).pass2_end
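; the 8x16 helpers below split the coefficient rows by parity: the EVEN macros
; address rows 0, 2, 4, ... (32-byte stride; each row is 8 words = 16 bytes)
; and the ODD macros rows 1, 3, 5, ..., which lines up with the even/odd split
; of the 16-point transform in the second pass. The RECT2 variants additionally
; pre-scale every load by pw_2896x8 (2896/4096, roughly 1/sqrt(2)), the
; rectangular-transform scale factor. (Layout as implied by the offsets below.)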
%macro ITX_8X16_LOAD_EVEN_COEFS 0
mova m0, [coeffq+32*0]
mova m1, [coeffq+32*1]
mova m2, [coeffq+32*2]
mova m3, [coeffq+32*3]
mova m4, [coeffq+32*4]
mova m5, [coeffq+32*5]
mova m6, [coeffq+32*6]
mova m7, [coeffq+32*7]
%endmacro
%macro ITX_8X16_RECT2_LOAD_EVEN_COEFS 0
mova m7, [o(pw_2896x8)]
pmulhrsw m0, m7, [coeffq+32*0]
pmulhrsw m1, m7, [coeffq+32*1]
pmulhrsw m2, m7, [coeffq+32*2]
pmulhrsw m3, m7, [coeffq+32*3]
pmulhrsw m4, m7, [coeffq+32*4]
pmulhrsw m5, m7, [coeffq+32*5]
pmulhrsw m6, m7, [coeffq+32*6]
pmulhrsw m7, [coeffq+32*7]
%endmacro
%macro ITX_8X16_LOAD_ODD_COEFS 0
mova m0, [coeffq+16*1 ]
mova m1, [coeffq+16*3 ]
mova m2, [coeffq+16*5 ]
mova m3, [coeffq+16*7 ]
mova m4, [coeffq+16*9 ]
mova m5, [coeffq+16*11]
mova m6, [coeffq+16*13]
mova m7, [coeffq+16*15]
%endmacro
%macro ITX_8X16_RECT2_LOAD_ODD_COEFS 0
mova m7, [o(pw_2896x8)]
pmulhrsw m0, m7, [coeffq+16*1 ]
pmulhrsw m1, m7, [coeffq+16*3 ]
pmulhrsw m2, m7, [coeffq+16*5 ]
pmulhrsw m3, m7, [coeffq+16*7 ]
pmulhrsw m4, m7, [coeffq+16*9 ]
pmulhrsw m5, m7, [coeffq+16*11]
pmulhrsw m6, m7, [coeffq+16*13]
pmulhrsw m7, [coeffq+16*15]
%endmacro
%macro ITX_8X16_SAVE_EVEN_COEFS 0
mova [coeffq+32*0], m0
mova [coeffq+32*1], m1
mova [coeffq+32*2], m2
mova [coeffq+32*3], m3
mova [coeffq+32*4], m4
mova [coeffq+32*5], m5
mova [coeffq+32*6], m6
mova [coeffq+32*7], m7
%endmacro
%macro ITX_8X16_SAVE_ODD_COEFS 0
mova [coeffq+16*1 ], m0
mova [coeffq+16*3 ], m1
mova [coeffq+16*5 ], m2
mova [coeffq+16*7 ], m3
mova [coeffq+16*9 ], m4
mova [coeffq+16*11], m5
mova [coeffq+16*13], m6
mova [coeffq+16*15], m7
%endmacro
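; ITX_8X16_LOAD_STACK_COEFS reloads eight rows from the stack scratch area;
; note that 32*5 is the same byte offset as 16*10, i.e. the slot after 16*9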
%macro ITX_8X16_LOAD_STACK_COEFS 0
mova m0, [rsp+gprsize+16*3]
mova m1, [rsp+gprsize+16*4]
mova m2, [rsp+gprsize+16*5]
mova m3, [rsp+gprsize+16*6]
mova m4, [rsp+gprsize+16*7]
mova m5, [rsp+gprsize+16*8]
mova m6, [rsp+gprsize+16*9]
mova m7, [rsp+gprsize+32*5]
%endmacro
%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12
%ifidn %1_%2, dct_dct
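; dc-only fast path: broadcast the DC coefficient, fold the rect2 and per-pass
; scaling into a chain of pmulhrsw, then reuse the 8x8 dconly store loop
; (r2d = 4 iterations, presumably four rows each, to cover the 16 rows)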
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
mova m2, [o(pw_16384)]
mov [coeffq], eobd
pmulhrsw m0, m1
pmulhrsw m0, m2
psrlw m2, 3 ; pw_2048
pmulhrsw m0, m1
pmulhrsw m0, m2
mov r2d, 4
lea tx2q, [o(m(inv_txfm_add_dct_dct_8x16).end)]
jmp m(inv_txfm_add_dct_dct_8x8).end2
.end:
RET
%elifidn %1_%2, dct_identity
mov r3d, 2
.loop:
mova m0, [o(pw_2896x8)]
pmulhrsw m7, m0, [coeffq]
mova m1, [o(pw_16384)]
pxor m2, m2
mova [coeffq], m2
pmulhrsw m7, m0
pmulhrsw m7, m1
psrlw m1, 3 ; pw_2048
psllw m7, 2
pmulhrsw m7, [o(pw_5793x4)]
pmulhrsw m7, m1
punpcklwd m0, m7, m7
punpckhwd m7, m7
pshufd m3, m0, q3333
pshufd m2, m0, q2222
pshufd m1, m0, q1111
pshufd m0, m0, q0000
call m(iadst_8x4_internal).end3
pshufd m3, m7, q3333
pshufd m2, m7, q2222
pshufd m1, m7, q1111
pshufd m0, m7, q0000
lea dstq, [dstq+strideq*2]
call m(iadst_8x4_internal).end3
add coeffq, 16
lea dstq, [dstq+strideq*2]
dec r3d
jg .loop
RET
%elifidn %1_%2, identity_dct
movd m0, [coeffq+32*0]
punpcklwd m0, [coeffq+32*1]
movd m2, [coeffq+32*2]
punpcklwd m2, [coeffq+32*3]
add coeffq, 32*4
movd m1, [coeffq+32*0]
punpcklwd m1, [coeffq+32*1]
movd m3, [coeffq+32*2]
punpcklwd m3, [coeffq+32*3]
mova m4, [o(pw_2896x8)]
xor eobd, eobd
mov [coeffq-32*4], eobd
mov [coeffq-32*3], eobd
mov [coeffq-32*2], eobd
mov [coeffq-32*1], eobd
punpckldq m0, m2
punpckldq m1, m3
punpcklqdq m0, m1
pmulhrsw m0, m4
pmulhrsw m0, m4