Commit a532e5ae, authored by Liwei Wang, committed by Henrik Gramner

Add SSSE3 implementation for the 8x16 and 16x8 blocks in itx

Cycle times:
inv_txfm_add_8x16_adst_adst_0_8bpc_c: 5063.0
inv_txfm_add_8x16_adst_adst_0_8bpc_ssse3: 406.8
inv_txfm_add_8x16_adst_adst_1_8bpc_c: 5051.2
inv_txfm_add_8x16_adst_adst_1_8bpc_ssse3: 407.3
inv_txfm_add_8x16_adst_adst_2_8bpc_c: 5065.4
inv_txfm_add_8x16_adst_adst_2_8bpc_ssse3: 407.9
inv_txfm_add_8x16_adst_dct_0_8bpc_c: 5201.1
inv_txfm_add_8x16_adst_dct_0_8bpc_ssse3: 354.8
inv_txfm_add_8x16_adst_dct_1_8bpc_c: 5214.8
inv_txfm_add_8x16_adst_dct_1_8bpc_ssse3: 354.8
inv_txfm_add_8x16_adst_dct_2_8bpc_c: 5225.0
inv_txfm_add_8x16_adst_dct_2_8bpc_ssse3: 355.1
inv_txfm_add_8x16_adst_flipadst_0_8bpc_c: 7135.9
inv_txfm_add_8x16_adst_flipadst_0_8bpc_ssse3: 409.7
inv_txfm_add_8x16_adst_flipadst_1_8bpc_c: 8354.4
inv_txfm_add_8x16_adst_flipadst_1_8bpc_ssse3: 409.2
inv_txfm_add_8x16_adst_flipadst_2_8bpc_c: 7198.7
inv_txfm_add_8x16_adst_flipadst_2_8bpc_ssse3: 409.7
inv_txfm_add_8x16_adst_identity_0_8bpc_c: 3936.5
inv_txfm_add_8x16_adst_identity_0_8bpc_ssse3: 262.0
inv_txfm_add_8x16_adst_identity_1_8bpc_c: 4617.8
inv_txfm_add_8x16_adst_identity_1_8bpc_ssse3: 261.4
inv_txfm_add_8x16_adst_identity_2_8bpc_c: 3895.1
inv_txfm_add_8x16_adst_identity_2_8bpc_ssse3: 262.1
inv_txfm_add_8x16_dct_adst_0_8bpc_c: 5203.9
inv_txfm_add_8x16_dct_adst_0_8bpc_ssse3: 355.1
inv_txfm_add_8x16_dct_adst_1_8bpc_c: 5200.8
inv_txfm_add_8x16_dct_adst_1_8bpc_ssse3: 355.4
inv_txfm_add_8x16_dct_adst_2_8bpc_c: 5208.2
inv_txfm_add_8x16_dct_adst_2_8bpc_ssse3: 355.1
inv_txfm_add_8x16_dct_dct_0_8bpc_c: 5270.8
inv_txfm_add_8x16_dct_dct_0_8bpc_ssse3: 57.0
inv_txfm_add_8x16_dct_dct_1_8bpc_c: 5280.9
inv_txfm_add_8x16_dct_dct_1_8bpc_ssse3: 303.2
inv_txfm_add_8x16_dct_dct_2_8bpc_c: 5275.9
inv_txfm_add_8x16_dct_dct_2_8bpc_ssse3: 302.4
inv_txfm_add_8x16_dct_flipadst_0_8bpc_c: 5374.4
inv_txfm_add_8x16_dct_flipadst_0_8bpc_ssse3: 356.5
inv_txfm_add_8x16_dct_flipadst_1_8bpc_c: 5449.9
inv_txfm_add_8x16_dct_flipadst_1_8bpc_ssse3: 356.8
inv_txfm_add_8x16_dct_flipadst_2_8bpc_c: 5446.9
inv_txfm_add_8x16_dct_flipadst_2_8bpc_ssse3: 356.7
inv_txfm_add_8x16_dct_identity_0_8bpc_c: 3883.4
inv_txfm_add_8x16_dct_identity_0_8bpc_ssse3: 76.1
inv_txfm_add_8x16_dct_identity_1_8bpc_c: 3892.3
inv_txfm_add_8x16_dct_identity_1_8bpc_ssse3: 76.1
inv_txfm_add_8x16_dct_identity_2_8bpc_c: 4027.1
inv_txfm_add_8x16_dct_identity_2_8bpc_ssse3: 209.9
inv_txfm_add_8x16_flipadst_adst_0_8bpc_c: 7387.5
inv_txfm_add_8x16_flipadst_adst_0_8bpc_ssse3: 408.9
inv_txfm_add_8x16_flipadst_adst_1_8bpc_c: 7298.8
inv_txfm_add_8x16_flipadst_adst_1_8bpc_ssse3: 408.8
inv_txfm_add_8x16_flipadst_adst_2_8bpc_c: 7397.2
inv_txfm_add_8x16_flipadst_adst_2_8bpc_ssse3: 408.9
inv_txfm_add_8x16_flipadst_dct_0_8bpc_c: 5250.4
inv_txfm_add_8x16_flipadst_dct_0_8bpc_ssse3: 355.3
inv_txfm_add_8x16_flipadst_dct_1_8bpc_c: 5263.9
inv_txfm_add_8x16_flipadst_dct_1_8bpc_ssse3: 355.4
inv_txfm_add_8x16_flipadst_dct_2_8bpc_c: 5259.0
inv_txfm_add_8x16_flipadst_dct_2_8bpc_ssse3: 356.3
inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_c: 5448.4
inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_ssse3: 410.2
inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_c: 5402.6
inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_ssse3: 410.8
inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_c: 6479.7
inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_ssse3: 409.8
inv_txfm_add_8x16_flipadst_identity_0_8bpc_c: 3828.9
inv_txfm_add_8x16_flipadst_identity_0_8bpc_ssse3: 262.7
inv_txfm_add_8x16_flipadst_identity_1_8bpc_c: 3884.5
inv_txfm_add_8x16_flipadst_identity_1_8bpc_ssse3: 262.0
inv_txfm_add_8x16_flipadst_identity_2_8bpc_c: 3809.2
inv_txfm_add_8x16_flipadst_identity_2_8bpc_ssse3: 262.9
inv_txfm_add_8x16_identity_adst_0_8bpc_c: 4294.5
inv_txfm_add_8x16_identity_adst_0_8bpc_ssse3: 268.8
inv_txfm_add_8x16_identity_adst_1_8bpc_c: 4955.4
inv_txfm_add_8x16_identity_adst_1_8bpc_ssse3: 269.1
inv_txfm_add_8x16_identity_adst_2_8bpc_c: 4166.4
inv_txfm_add_8x16_identity_adst_2_8bpc_ssse3: 269.9
inv_txfm_add_8x16_identity_dct_0_8bpc_c: 4012.3
inv_txfm_add_8x16_identity_dct_0_8bpc_ssse3: 56.7
inv_txfm_add_8x16_identity_dct_1_8bpc_c: 4767.1
inv_txfm_add_8x16_identity_dct_1_8bpc_ssse3: 215.1
inv_txfm_add_8x16_identity_dct_2_8bpc_c: 4012.6
inv_txfm_add_8x16_identity_dct_2_8bpc_ssse3: 215.9
inv_txfm_add_8x16_identity_flipadst_0_8bpc_c: 4452.6
inv_txfm_add_8x16_identity_flipadst_0_8bpc_ssse3: 270.5
inv_txfm_add_8x16_identity_flipadst_1_8bpc_c: 4885.8
inv_txfm_add_8x16_identity_flipadst_1_8bpc_ssse3: 270.3
inv_txfm_add_8x16_identity_flipadst_2_8bpc_c: 4186.1
inv_txfm_add_8x16_identity_flipadst_2_8bpc_ssse3: 271.5
inv_txfm_add_8x16_identity_identity_0_8bpc_c: 2623.0
inv_txfm_add_8x16_identity_identity_0_8bpc_ssse3: 123.1
inv_txfm_add_8x16_identity_identity_1_8bpc_c: 2617.7
inv_txfm_add_8x16_identity_identity_1_8bpc_ssse3: 122.9
inv_txfm_add_8x16_identity_identity_2_8bpc_c: 2617.2
inv_txfm_add_8x16_identity_identity_2_8bpc_ssse3: 123.1
inv_txfm_add_16x8_adst_adst_0_8bpc_c: 5102.3
inv_txfm_add_16x8_adst_adst_0_8bpc_ssse3: 409.0
inv_txfm_add_16x8_adst_adst_1_8bpc_c: 5063.2
inv_txfm_add_16x8_adst_adst_1_8bpc_ssse3: 409.5
inv_txfm_add_16x8_adst_adst_2_8bpc_c: 5029.1
inv_txfm_add_16x8_adst_adst_2_8bpc_ssse3: 410.1
inv_txfm_add_16x8_adst_dct_0_8bpc_c: 5848.8
inv_txfm_add_16x8_adst_dct_0_8bpc_ssse3: 358.8
inv_txfm_add_16x8_adst_dct_1_8bpc_c: 5612.8
inv_txfm_add_16x8_adst_dct_1_8bpc_ssse3: 358.8
inv_txfm_add_16x8_adst_dct_2_8bpc_c: 5143.2
inv_txfm_add_16x8_adst_dct_2_8bpc_ssse3: 358.5
inv_txfm_add_16x8_adst_flipadst_0_8bpc_c: 5072.4
inv_txfm_add_16x8_adst_flipadst_0_8bpc_ssse3: 413.3
inv_txfm_add_16x8_adst_flipadst_1_8bpc_c: 5082.2
inv_txfm_add_16x8_adst_flipadst_1_8bpc_ssse3: 413.6
inv_txfm_add_16x8_adst_flipadst_2_8bpc_c: 5108.0
inv_txfm_add_16x8_adst_flipadst_2_8bpc_ssse3: 413.8
inv_txfm_add_16x8_adst_identity_0_8bpc_c: 3897.2
inv_txfm_add_16x8_adst_identity_0_8bpc_ssse3: 283.6
inv_txfm_add_16x8_adst_identity_1_8bpc_c: 3947.2
inv_txfm_add_16x8_adst_identity_1_8bpc_ssse3: 283.1
inv_txfm_add_16x8_adst_identity_2_8bpc_c: 3881.7
inv_txfm_add_16x8_adst_identity_2_8bpc_ssse3: 283.6
inv_txfm_add_16x8_dct_adst_0_8bpc_c: 5200.7
inv_txfm_add_16x8_dct_adst_0_8bpc_ssse3: 355.0
inv_txfm_add_16x8_dct_adst_1_8bpc_c: 5261.0
inv_txfm_add_16x8_dct_adst_1_8bpc_ssse3: 355.1
inv_txfm_add_16x8_dct_adst_2_8bpc_c: 5212.5
inv_txfm_add_16x8_dct_adst_2_8bpc_ssse3: 354.5
inv_txfm_add_16x8_dct_dct_0_8bpc_c: 5252.9
inv_txfm_add_16x8_dct_dct_0_8bpc_ssse3: 43.6
inv_txfm_add_16x8_dct_dct_1_8bpc_c: 5260.0
inv_txfm_add_16x8_dct_dct_1_8bpc_ssse3: 302.1
inv_txfm_add_16x8_dct_dct_2_8bpc_c: 5250.4
inv_txfm_add_16x8_dct_dct_2_8bpc_ssse3: 302.0
inv_txfm_add_16x8_dct_flipadst_0_8bpc_c: 5216.6
inv_txfm_add_16x8_dct_flipadst_0_8bpc_ssse3: 359.3
inv_txfm_add_16x8_dct_flipadst_1_8bpc_c: 5229.9
inv_txfm_add_16x8_dct_flipadst_1_8bpc_ssse3: 357.6
inv_txfm_add_16x8_dct_flipadst_2_8bpc_c: 5261.4
inv_txfm_add_16x8_dct_flipadst_2_8bpc_ssse3: 357.4
inv_txfm_add_16x8_dct_identity_0_8bpc_c: 3999.2
inv_txfm_add_16x8_dct_identity_0_8bpc_ssse3: 63.8
inv_txfm_add_16x8_dct_identity_1_8bpc_c: 4018.1
inv_txfm_add_16x8_dct_identity_1_8bpc_ssse3: 227.1
inv_txfm_add_16x8_dct_identity_2_8bpc_c: 3998.7
inv_txfm_add_16x8_dct_identity_2_8bpc_ssse3: 226.2
inv_txfm_add_16x8_flipadst_adst_0_8bpc_c: 5124.9
inv_txfm_add_16x8_flipadst_adst_0_8bpc_ssse3: 419.7
inv_txfm_add_16x8_flipadst_adst_1_8bpc_c: 5100.7
inv_txfm_add_16x8_flipadst_adst_1_8bpc_ssse3: 420.5
inv_txfm_add_16x8_flipadst_adst_2_8bpc_c: 5087.1
inv_txfm_add_16x8_flipadst_adst_2_8bpc_ssse3: 419.9
inv_txfm_add_16x8_flipadst_dct_0_8bpc_c: 5183.2
inv_txfm_add_16x8_flipadst_dct_0_8bpc_ssse3: 367.1
inv_txfm_add_16x8_flipadst_dct_1_8bpc_c: 5193.7
inv_txfm_add_16x8_flipadst_dct_1_8bpc_ssse3: 368.6
inv_txfm_add_16x8_flipadst_dct_2_8bpc_c: 5186.8
inv_txfm_add_16x8_flipadst_dct_2_8bpc_ssse3: 368.4
inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_c: 5091.3
inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_ssse3: 421.2
inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_c: 5118.5
inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_ssse3: 421.4
inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_c: 5119.0
inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_ssse3: 421.2
inv_txfm_add_16x8_flipadst_identity_0_8bpc_c: 3909.3
inv_txfm_add_16x8_flipadst_identity_0_8bpc_ssse3: 289.9
inv_txfm_add_16x8_flipadst_identity_1_8bpc_c: 3920.7
inv_txfm_add_16x8_flipadst_identity_1_8bpc_ssse3: 290.4
inv_txfm_add_16x8_flipadst_identity_2_8bpc_c: 3936.7
inv_txfm_add_16x8_flipadst_identity_2_8bpc_ssse3: 290.6
inv_txfm_add_16x8_identity_adst_0_8bpc_c: 3869.3
inv_txfm_add_16x8_identity_adst_0_8bpc_ssse3: 280.0
inv_txfm_add_16x8_identity_adst_1_8bpc_c: 3832.2
inv_txfm_add_16x8_identity_adst_1_8bpc_ssse3: 281.4
inv_txfm_add_16x8_identity_adst_2_8bpc_c: 3820.8
inv_txfm_add_16x8_identity_adst_2_8bpc_ssse3: 281.5
inv_txfm_add_16x8_identity_dct_0_8bpc_c: 3878.6
inv_txfm_add_16x8_identity_dct_0_8bpc_ssse3: 76.7
inv_txfm_add_16x8_identity_dct_1_8bpc_c: 3883.3
inv_txfm_add_16x8_identity_dct_1_8bpc_ssse3: 76.3
inv_txfm_add_16x8_identity_dct_2_8bpc_c: 3900.6
inv_txfm_add_16x8_identity_dct_2_8bpc_ssse3: 220.1
inv_txfm_add_16x8_identity_flipadst_0_8bpc_c: 3840.9
inv_txfm_add_16x8_identity_flipadst_0_8bpc_ssse3: 277.1
inv_txfm_add_16x8_identity_flipadst_1_8bpc_c: 3860.6
inv_txfm_add_16x8_identity_flipadst_1_8bpc_ssse3: 277.0
inv_txfm_add_16x8_identity_flipadst_2_8bpc_c: 3849.4
inv_txfm_add_16x8_identity_flipadst_2_8bpc_ssse3: 277.2
inv_txfm_add_16x8_identity_identity_0_8bpc_c: 2610.9
inv_txfm_add_16x8_identity_identity_0_8bpc_ssse3: 159.8
inv_txfm_add_16x8_identity_identity_1_8bpc_c: 2597.1
inv_txfm_add_16x8_identity_identity_1_8bpc_ssse3: 159.8
inv_txfm_add_16x8_identity_identity_2_8bpc_c: 2607.9
inv_txfm_add_16x8_identity_identity_2_8bpc_ssse3: 159.9
parent e811c476
......@@ -83,6 +83,8 @@ decl_itx16_fns( 8, 4, ssse3);
decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
decl_itx16_fns( 8, 16, ssse3);
decl_itx16_fns(16, 8, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
......@@ -132,6 +134,8 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx16_fn(, 8, 8, ssse3);
assign_itx16_fn(R, 4, 16, ssse3);
assign_itx16_fn(R, 16, 4, ssse3);
assign_itx16_fn(R, 8, 16, ssse3);
assign_itx16_fn(R, 16, 8, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment