1. 22 Dec, 2018 1 commit
  2. 21 Dec, 2018 1 commit
    • Liwei Wang's avatar
      Add SSSE3 implementation for the 4x8 and 8x4 blocks in itx · 1703f21f
      Liwei Wang authored
      Cycle times:
      inv_txfm_add_4x8_adst_adst_0_8bpc_c: 1167.6
      inv_txfm_add_4x8_adst_adst_0_8bpc_ssse3: 114.6
      inv_txfm_add_4x8_adst_adst_1_8bpc_c: 1167.2
      inv_txfm_add_4x8_adst_adst_1_8bpc_ssse3: 114.1
      inv_txfm_add_4x8_adst_dct_0_8bpc_c: 1174.7
      inv_txfm_add_4x8_adst_dct_0_8bpc_ssse3: 34.8
      inv_txfm_add_4x8_adst_dct_1_8bpc_c: 1158.0
      inv_txfm_add_4x8_adst_dct_1_8bpc_ssse3: 101.0
      inv_txfm_add_4x8_adst_flipadst_0_8bpc_c: 1150.9
      inv_txfm_add_4x8_adst_flipadst_0_8bpc_ssse3: 115.8
      inv_txfm_add_4x8_adst_flipadst_1_8bpc_c: 1157.6
      inv_txfm_add_4x8_adst_flipadst_1_8bpc_ssse3: 115.8
      inv_txfm_add_4x8_adst_identity_0_8bpc_c: 848.4
      inv_txfm_add_4x8_adst_identity_0_8bpc_ssse3: 59.1
      inv_txfm_add_4x8_adst_identity_1_8bpc_c: 850.1
      inv_txfm_add_4x8_adst_identity_1_8bpc_ssse3: 59.1
      inv_txfm_add_4x8_dct_adst_0_8bpc_c: 1205.6
      inv_txfm_add_4x8_dct_adst_0_8bpc_ssse3: 107.0
      inv_txfm_add_4x8_dct_adst_1_8bpc_c: 1183.7
      inv_txfm_add_4x8_dct_adst_1_8bpc_ssse3: 107.0
      inv_txfm_add_4x8_dct_dct_0_8bpc_c: 1227.0
      inv_txfm_add_4x8_dct_dct_0_8bpc_ssse3: 34.6
      inv_txfm_add_4x8_dct_dct_1_8bpc_c: 1229.7
      inv_txfm_add_4x8_dct_dct_1_8bpc_ssse3: 96.1
      inv_txfm_add_4x8_dct_flipadst_0_8bpc_c: 1188.2
      inv_txfm_add_4x8_dct_flipadst_0_8bpc_ssse3: 109.3
      inv_txfm_add_4x8_dct_flipadst_1_8bpc_c: 1192.7
      inv_txfm_add_4x8_dct_flipadst_1_8bpc_ssse3: 109.9
      inv_txfm_add_4x8_dct_identity_0_8bpc_c: 878.4
      inv_txfm_add_4x8_dct_identity_0_8bpc_ssse3: 31.9
      inv_txfm_add_4x8_dct_identity_1_8bpc_c: 879.0
      inv_txfm_add_4x8_dct_identity_1_8bpc_ssse3: 54.8
      inv_txfm_add_4x8_flipadst_adst_0_8bpc_c: 1181.8
      inv_txfm_add_4x8_flipadst_adst_0_8bpc_ssse3: 114.7
      inv_txfm_add_4x8_flipadst_adst_1_8bpc_c: 1203.0
      inv_txfm_add_4x8_flipadst_adst_1_8bpc_ssse3: 114.5
      inv_txfm_add_4x8_flipadst_dct_0_8bpc_c: 1203.6
      inv_txfm_add_4x8_flipadst_dct_0_8bpc_ssse3: 34.1
      inv_txfm_add_4x8_flipadst_dct_1_8bpc_c: 1204.4
      inv_txfm_add_4x8_flipadst_dct_1_8bpc_ssse3: 100.2
      inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_c: 1180.6
      inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_ssse3: 117.1
      inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_c: 1178.7
      inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_ssse3: 116.8
      inv_txfm_add_4x8_flipadst_identity_0_8bpc_c: 871.3
      inv_txfm_add_4x8_flipadst_identity_0_8bpc_ssse3: 69.0
      inv_txfm_add_4x8_flipadst_identity_1_8bpc_c: 872.3
      inv_txfm_add_4x8_flipadst_identity_1_8bpc_ssse3: 70.0
      inv_txfm_add_4x8_identity_adst_0_8bpc_c: 1125.2
      inv_txfm_add_4x8_identity_adst_0_8bpc_ssse3: 98.7
      inv_txfm_add_4x8_identity_adst_1_8bpc_c: 1092.6
      inv_txfm_add_4x8_identity_adst_1_8bpc_ssse3: 99.6
      inv_txfm_add_4x8_identity_dct_0_8bpc_c: 1139.4
      inv_txfm_add_4x8_identity_dct_0_8bpc_ssse3: 38.8
      inv_txfm_add_4x8_identity_dct_1_8bpc_c: 1111.0
      inv_txfm_add_4x8_identity_dct_1_8bpc_ssse3: 84.1
      inv_txfm_add_4x8_identity_flipadst_0_8bpc_c: 1112.4
      inv_txfm_add_4x8_identity_flipadst_0_8bpc_ssse3: 100.7
      inv_txfm_add_4x8_identity_flipadst_1_8bpc_c: 1098.7
      inv_txfm_add_4x8_identity_flipadst_1_8bpc_ssse3: 100.8
      inv_txfm_add_4x8_identity_identity_0_8bpc_c: 791.6
      inv_txfm_add_4x8_identity_identity_0_8bpc_ssse3: 43.9
      inv_txfm_add_4x8_identity_identity_1_8bpc_c: 797.0
      inv_txfm_add_4x8_identity_identity_1_8bpc_ssse3: 43.8
      inv_txfm_add_8x4_adst_adst_0_8bpc_c: 1102.8
      inv_txfm_add_8x4_adst_adst_0_8bpc_ssse3: 108.7
      inv_txfm_add_8x4_adst_adst_1_8bpc_c: 1101.8
      inv_txfm_add_8x4_adst_adst_1_8bpc_ssse3: 108.9
      inv_txfm_add_8x4_adst_dct_0_8bpc_c: 1146.9
      inv_txfm_add_8x4_adst_dct_0_8bpc_ssse3: 98.7
      inv_txfm_add_8x4_adst_dct_1_8bpc_c: 1157.9
      inv_txfm_add_8x4_adst_dct_1_8bpc_ssse3: 98.9
      inv_txfm_add_8x4_adst_flipadst_0_8bpc_c: 1144.6
      inv_txfm_add_8x4_adst_flipadst_0_8bpc_ssse3: 111.4
      inv_txfm_add_8x4_adst_flipadst_1_8bpc_c: 1128.2
      inv_txfm_add_8x4_adst_flipadst_1_8bpc_ssse3: 112.4
      inv_txfm_add_8x4_adst_identity_0_8bpc_c: 1051.1
      inv_txfm_add_8x4_adst_identity_0_8bpc_ssse3: 87.1
      inv_txfm_add_8x4_adst_identity_1_8bpc_c: 1059.2
      inv_txfm_add_8x4_adst_identity_1_8bpc_ssse3: 87.7
      inv_txfm_add_8x4_dct_adst_0_8bpc_c: 1130.2
      inv_txfm_add_8x4_dct_adst_0_8bpc_ssse3: 29.0
      inv_txfm_add_8x4_dct_adst_1_8bpc_c: 1130.1
      inv_txfm_add_8x4_dct_adst_1_8bpc_ssse3: 89.2
      inv_txfm_add_8x4_dct_dct_0_8bpc_c: 1186.0
      inv_txfm_add_8x4_dct_dct_0_8bpc_ssse3: 26.3
      inv_txfm_add_8x4_dct_dct_1_8bpc_c: 1172.2
      inv_txfm_add_8x4_dct_dct_1_8bpc_ssse3: 78.8
      inv_txfm_add_8x4_dct_flipadst_0_8bpc_c: 1154.7
      inv_txfm_add_8x4_dct_flipadst_0_8bpc_ssse3: 29.1
      inv_txfm_add_8x4_dct_flipadst_1_8bpc_c: 1150.2
      inv_txfm_add_8x4_dct_flipadst_1_8bpc_ssse3: 92.2
      inv_txfm_add_8x4_dct_identity_0_8bpc_c: 1078.7
      inv_txfm_add_8x4_dct_identity_0_8bpc_ssse3: 29.2
      inv_txfm_add_8x4_dct_identity_1_8bpc_c: 1090.1
      inv_txfm_add_8x4_dct_identity_1_8bpc_ssse3: 72.2
      inv_txfm_add_8x4_flipadst_adst_0_8bpc_c: 1111.6
      inv_txfm_add_8x4_flipadst_adst_0_8bpc_ssse3: 108.6
      inv_txfm_add_8x4_flipadst_adst_1_8bpc_c: 1112.1
      inv_txfm_add_8x4_flipadst_adst_1_8bpc_ssse3: 107.6
      inv_txfm_add_8x4_flipadst_dct_0_8bpc_c: 1163.0
      inv_txfm_add_8x4_flipadst_dct_0_8bpc_ssse3: 98.3
      inv_txfm_add_8x4_flipadst_dct_1_8bpc_c: 1160.0
      inv_txfm_add_8x4_flipadst_dct_1_8bpc_ssse3: 99.6
      inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_c: 1137.9
      inv_txfm_add_8x4_flipadst_flipadst_0_8bpc_ssse3: 112.0
      inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_c: 1140.0
      inv_txfm_add_8x4_flipadst_flipadst_1_8bpc_ssse3: 112.0
      inv_txfm_add_8x4_flipadst_identity_0_8bpc_c: 1057.2
      inv_txfm_add_8x4_flipadst_identity_0_8bpc_ssse3: 88.1
      inv_txfm_add_8x4_flipadst_identity_1_8bpc_c: 1058.3
      inv_txfm_add_8x4_flipadst_identity_1_8bpc_ssse3: 87.1
      inv_txfm_add_8x4_identity_adst_0_8bpc_c: 794.0
      inv_txfm_add_8x4_identity_adst_0_8bpc_ssse3: 60.6
      inv_txfm_add_8x4_identity_adst_1_8bpc_c: 793.4
      inv_txfm_add_8x4_identity_adst_1_8bpc_ssse3: 60.6
      inv_txfm_add_8x4_identity_dct_0_8bpc_c: 838.4
      inv_txfm_add_8x4_identity_dct_0_8bpc_ssse3: 27.4
      inv_txfm_add_8x4_identity_dct_1_8bpc_c: 838.5
      inv_txfm_add_8x4_identity_dct_1_8bpc_ssse3: 52.0
      inv_txfm_add_8x4_identity_flipadst_0_8bpc_c: 825.3
      inv_txfm_add_8x4_identity_flipadst_0_8bpc_ssse3: 66.7
      inv_txfm_add_8x4_identity_flipadst_1_8bpc_c: 831.7
      inv_txfm_add_8x4_identity_flipadst_1_8bpc_ssse3: 66.7
      inv_txfm_add_8x4_identity_identity_0_8bpc_c: 768.6
      inv_txfm_add_8x4_identity_identity_0_8bpc_ssse3: 40.0
      inv_txfm_add_8x4_identity_identity_1_8bpc_c: 743.3
      inv_txfm_add_8x4_identity_identity_1_8bpc_ssse3: 39.9
      1703f21f
  3. 18 Dec, 2018 3 commits
  4. 17 Dec, 2018 1 commit
  5. 06 Dec, 2018 1 commit
    • Liwei Wang's avatar
      Add SSSE3 implementation for the 4x4 blocks in itx · 87a377e9
      Liwei Wang authored
      Cycle times:
      inv_txfm_add_4x4_adst_adst_0_8bpc_c: 445.9
      inv_txfm_add_4x4_adst_adst_0_8bpc_ssse3: 23.7
      inv_txfm_add_4x4_adst_adst_1_8bpc_c: 443.7
      inv_txfm_add_4x4_adst_adst_1_8bpc_ssse3: 52.6
      inv_txfm_add_4x4_adst_dct_0_8bpc_c: 474.5
      inv_txfm_add_4x4_adst_dct_0_8bpc_ssse3: 23.9
      inv_txfm_add_4x4_adst_dct_1_8bpc_c: 482.0
      inv_txfm_add_4x4_adst_dct_1_8bpc_ssse3: 51.1
      inv_txfm_add_4x4_adst_flipadst_0_8bpc_c: 587.2
      inv_txfm_add_4x4_adst_flipadst_0_8bpc_ssse3: 24.0
      inv_txfm_add_4x4_adst_flipadst_1_8bpc_c: 457.2
      inv_txfm_add_4x4_adst_flipadst_1_8bpc_ssse3: 52.8
      inv_txfm_add_4x4_adst_identity_0_8bpc_c: 412.4
      inv_txfm_add_4x4_adst_identity_0_8bpc_ssse3: 43.3
      inv_txfm_add_4x4_adst_identity_1_8bpc_c: 412.0
      inv_txfm_add_4x4_adst_identity_1_8bpc_ssse3: 43.3
      inv_txfm_add_4x4_dct_adst_0_8bpc_c: 467.4
      inv_txfm_add_4x4_dct_adst_0_8bpc_ssse3: 23.2
      inv_txfm_add_4x4_dct_adst_1_8bpc_c: 588.3
      inv_txfm_add_4x4_dct_adst_1_8bpc_ssse3: 48.6
      inv_txfm_add_4x4_dct_dct_0_8bpc_c: 611.5
      inv_txfm_add_4x4_dct_dct_0_8bpc_ssse3: 23.1
      inv_txfm_add_4x4_dct_dct_1_8bpc_c: 576.2
      inv_txfm_add_4x4_dct_dct_1_8bpc_ssse3: 47.6
      inv_txfm_add_4x4_dct_flipadst_0_8bpc_c: 479.5
      inv_txfm_add_4x4_dct_flipadst_0_8bpc_ssse3: 23.4
      inv_txfm_add_4x4_dct_flipadst_1_8bpc_c: 549.3
      inv_txfm_add_4x4_dct_flipadst_1_8bpc_ssse3: 48.3
      inv_txfm_add_4x4_dct_identity_0_8bpc_c: 576.9
      inv_txfm_add_4x4_dct_identity_0_8bpc_ssse3: 25.4
      inv_txfm_add_4x4_dct_identity_1_8bpc_c: 610.7
      inv_txfm_add_4x4_dct_identity_1_8bpc_ssse3: 25.1
      inv_txfm_add_4x4_flipadst_adst_0_8bpc_c: 532.8
      inv_txfm_add_4x4_flipadst_adst_0_8bpc_ssse3: 23.8
      inv_txfm_add_4x4_flipadst_adst_1_8bpc_c: 666.7
      inv_txfm_add_4x4_flipadst_adst_1_8bpc_ssse3: 61.0
      inv_txfm_add_4x4_flipadst_dct_0_8bpc_c: 539.6
      inv_txfm_add_4x4_flipadst_dct_0_8bpc_ssse3: 23.8
      inv_txfm_add_4x4_flipadst_dct_1_8bpc_c: 484.6
      inv_txfm_add_4x4_flipadst_dct_1_8bpc_ssse3: 51.1
      inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_c: 503.1
      inv_txfm_add_4x4_flipadst_flipadst_0_8bpc_ssse3: 23.9
      inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_c: 463.0
      inv_txfm_add_4x4_flipadst_flipadst_1_8bpc_ssse3: 54.0
      inv_txfm_add_4x4_flipadst_identity_0_8bpc_c: 719.9
      inv_txfm_add_4x4_flipadst_identity_0_8bpc_ssse3: 43.0
      inv_txfm_add_4x4_flipadst_identity_1_8bpc_c: 456.8
      inv_txfm_add_4x4_flipadst_identity_1_8bpc_ssse3: 44.1
      inv_txfm_add_4x4_identity_adst_0_8bpc_c: 422.8
      inv_txfm_add_4x4_identity_adst_0_8bpc_ssse3: 42.4
      inv_txfm_add_4x4_identity_adst_1_8bpc_c: 417.1
      inv_txfm_add_4x4_identity_adst_1_8bpc_ssse3: 42.3
      inv_txfm_add_4x4_identity_dct_0_8bpc_c: 435.4
      inv_txfm_add_4x4_identity_dct_0_8bpc_ssse3: 25.7
      inv_txfm_add_4x4_identity_dct_1_8bpc_c: 434.1
      inv_txfm_add_4x4_identity_dct_1_8bpc_ssse3: 25.3
      inv_txfm_add_4x4_identity_flipadst_0_8bpc_c: 528.1
      inv_txfm_add_4x4_identity_flipadst_0_8bpc_ssse3: 40.9
      inv_txfm_add_4x4_identity_flipadst_1_8bpc_c: 720.0
      inv_txfm_add_4x4_identity_flipadst_1_8bpc_ssse3: 41.8
      inv_txfm_add_4x4_identity_identity_0_8bpc_c: 383.2
      inv_txfm_add_4x4_identity_identity_0_8bpc_ssse3: 28.3
      inv_txfm_add_4x4_identity_identity_1_8bpc_c: 378.9
      inv_txfm_add_4x4_identity_identity_1_8bpc_ssse3: 28.2
      inv_txfm_add_4x4_wht_wht_0_8bpc_c: 271.5
      inv_txfm_add_4x4_wht_wht_0_8bpc_ssse3: 34.0
      inv_txfm_add_4x4_wht_wht_1_8bpc_c: 266.0
      inv_txfm_add_4x4_wht_wht_1_8bpc_ssse3: 33.9
      87a377e9
  6. 04 Dec, 2018 1 commit