Commit 589e96a1 authored by Liwei Wang

Add SSSE3 implementation for the {16, 32, 64}x64 and 64x{16, 32} blocks in itx

Cycle times:
inv_txfm_add_16x64_dct_dct_0_8bpc_c: 3973.5
inv_txfm_add_16x64_dct_dct_0_8bpc_ssse3: 185.7
inv_txfm_add_16x64_dct_dct_1_8bpc_c: 37869.1
inv_txfm_add_16x64_dct_dct_1_8bpc_ssse3: 2103.1
inv_txfm_add_16x64_dct_dct_2_8bpc_c: 37822.9
inv_txfm_add_16x64_dct_dct_2_8bpc_ssse3: 2099.1
inv_txfm_add_16x64_dct_dct_3_8bpc_c: 37871.7
inv_txfm_add_16x64_dct_dct_3_8bpc_ssse3: 2663.5
inv_txfm_add_16x64_dct_dct_4_8bpc_c: 38002.9
inv_txfm_add_16x64_dct_dct_4_8bpc_ssse3: 2589.7
inv_txfm_add_32x64_dct_dct_0_8bpc_c: 8319.2
inv_txfm_add_32x64_dct_dct_0_8bpc_ssse3: 376.9
inv_txfm_add_32x64_dct_dct_1_8bpc_c: 85956.8
inv_txfm_add_32x64_dct_dct_1_8bpc_ssse3: 4298.1
inv_txfm_add_32x64_dct_dct_2_8bpc_c: 89906.2
inv_txfm_add_32x64_dct_dct_2_8bpc_ssse3: 4291.3
inv_txfm_add_32x64_dct_dct_3_8bpc_c: 83710.9
inv_txfm_add_32x64_dct_dct_3_8bpc_ssse3: 5589.5
inv_txfm_add_32x64_dct_dct_4_8bpc_c: 87733.5
inv_txfm_add_32x64_dct_dct_4_8bpc_ssse3: 5658.4
inv_txfm_add_64x16_dct_dct_0_8bpc_c: 3895.9
inv_txfm_add_64x16_dct_dct_0_8bpc_ssse3: 179.5
inv_txfm_add_64x16_dct_dct_1_8bpc_c: 51375.2
inv_txfm_add_64x16_dct_dct_1_8bpc_ssse3: 3859.2
inv_txfm_add_64x16_dct_dct_2_8bpc_c: 52562.9
inv_txfm_add_64x16_dct_dct_2_8bpc_ssse3: 4044.1
inv_txfm_add_64x16_dct_dct_3_8bpc_c: 51347.0
inv_txfm_add_64x16_dct_dct_3_8bpc_ssse3: 5259.5
inv_txfm_add_64x16_dct_dct_4_8bpc_c: 49642.2
inv_txfm_add_64x16_dct_dct_4_8bpc_ssse3: 4008.4
inv_txfm_add_64x32_dct_dct_0_8bpc_c: 7196.4
inv_txfm_add_64x32_dct_dct_0_8bpc_ssse3: 355.8
inv_txfm_add_64x32_dct_dct_1_8bpc_c: 106588.4
inv_txfm_add_64x32_dct_dct_1_8bpc_ssse3: 4965.3
inv_txfm_add_64x32_dct_dct_2_8bpc_c: 106230.7
inv_txfm_add_64x32_dct_dct_2_8bpc_ssse3: 4772.0
inv_txfm_add_64x32_dct_dct_3_8bpc_c: 107427.0
inv_txfm_add_64x32_dct_dct_3_8bpc_ssse3: 7146.9
inv_txfm_add_64x32_dct_dct_4_8bpc_c: 111785.7
inv_txfm_add_64x32_dct_dct_4_8bpc_ssse3: 7156.2
inv_txfm_add_64x64_dct_dct_0_8bpc_c: 14512.4
inv_txfm_add_64x64_dct_dct_0_8bpc_ssse3: 674.2
inv_txfm_add_64x64_dct_dct_1_8bpc_c: 173246.3
inv_txfm_add_64x64_dct_dct_1_8bpc_ssse3: 8790.8
inv_txfm_add_64x64_dct_dct_2_8bpc_c: 174264.6
inv_txfm_add_64x64_dct_dct_2_8bpc_ssse3: 8767.6
inv_txfm_add_64x64_dct_dct_3_8bpc_c: 170047.3
inv_txfm_add_64x64_dct_dct_3_8bpc_ssse3: 10784.9
inv_txfm_add_64x64_dct_dct_4_8bpc_c: 170182.2
inv_txfm_add_64x64_dct_dct_4_8bpc_ssse3: 10795.6
parent 36e1490b
Pipeline #6432 passed with stages
in 5 minutes and 54 seconds
......@@ -92,6 +92,12 @@ decl_itx2_fns (16, 32, ssse3);
decl_itx2_fns (32, 16, ssse3);
decl_itx2_fns (32, 32, ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_ssse3);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
......@@ -148,6 +154,11 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
assign_itx2_fn (R, 16, 32, ssse3);
assign_itx2_fn (R, 32, 16, ssse3);
assign_itx2_fn (, 32, 32, ssse3);
assign_itx1_fn (R, 16, 64, ssse3);
assign_itx1_fn (R, 32, 64, ssse3);
assign_itx1_fn (R, 64, 16, ssse3);
assign_itx1_fn (R, 64, 32, ssse3);
assign_itx1_fn ( , 64, 64, ssse3);
#endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment