Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • videolan/dav1d
  • ePirat/dav1d
  • magsoft/dav1d
  • chouquette/dav1d
  • shiz/dav1d
  • tdaede/dav1d
  • tmatth/dav1d
  • dwbuiten/dav1d
  • mstorsjo/dav1d
  • janne/dav1d
  • ltrudeau/dav1d
  • rzumer/dav1d
  • lu_zero/dav1d
  • rbultje/dav1d
  • tbr/dav1d
  • thresh/dav1d
  • haasn/dav1d
  • midtskogen/dav1d
  • SmilingWolf/dav1d
  • lotharkript/dav1d
  • jamrial/dav1d
  • barrbrain/dav1d
  • robUx4/dav1d
  • jbk/dav1d
  • skal65535/dav1d
  • tappara/dav1d
  • dalecurtis/dav1d
  • montytyper/dav1d
  • TLaurent/dav1d
  • liwei/dav1d
  • CounterPillow/dav1d
  • rswarbrick-argon/dav1d
  • mjbshaw/dav1d
  • fcartegnie/dav1d
  • jyavenard/dav1d
  • xuefeng/dav1d
  • licao/dav1d
  • FredB/dav1d
  • jn7163/dav1d
  • bherman.aconspart/dav1d
  • anisse/dav1d
  • koda/dav1d
  • mihulet88/dav1d
  • sabdfl/dav1d
  • brion/dav1d
  • tj_davies/dav1d
  • EwoutH/dav1d
  • KyleSiefring/dav1d
  • manass3018/dav1d
  • krish-iyer/dav1d
  • stebler/dav1d
  • hchen1506/dav1d
  • f3ndot/dav1d
  • linkmauve/dav1d
  • malvanos/dav1d
  • rcss/dav1d
  • DonDiego/dav1d
  • ledyba-z/dav1d
  • seiqan2/dav1d
  • t0934812955/dav1d
  • xclaesse/dav1d
  • lynne/dav1d
  • loveingpowellalways/dav1d
  • govind.sharma/dav1d
  • kossh1/dav1d
  • davidandsabrina4ever2014/dav1d
  • abdouseck664/dav1d
  • jennifer.derrick61583/dav1d
  • msaas01925/dav1d
  • akymaster/dav1d
  • sylvestre/dav1d
  • morgan.shenkin/dav1d
  • B3rn4arD/dav1d
  • evzien/dav1d
  • mwozniak/dav1d
  • TompSciGit/dav1d
  • namse/dav1d
  • kkourin/dav1d
  • nico/dav1d
  • galad/dav1d
  • ltnokiago/dav1d
  • mindfreeze/dav1d
  • DmitriySychov/dav1d
  • oddstone/dav1d
  • nasirhemed/dav1d
  • richselwood/dav1d
  • longervision/dav1d
  • kurosu/dav1d
  • heitbaum/dav1d
  • Opiyonag/dav1d
  • salomethirot-arm/dav1d
  • dillmo71/dav1d
  • jwright-arm/dav1d
  • stonef385/dav1d
  • y-guyon/dav1d
  • andrekempe-arm/dav-1-d-reloaded
  • joedrago/dav1d
  • Rtytry/dav1d
  • altanai/dav1d
  • beiluo97/dav1d
  • wtc/dav1d
  • Asilx21/dav1d
  • DarioSucic/dav1d
  • Siberiawind/dav1d
  • edelmirocove17/dav1d
  • Mtndude/dav1d
  • dconrad/dav1d
  • ChildSoap/dav1d
  • kalan5269/dav1d
  • Jolincai/dav1d
  • kawiddoes/dav1d
  • ledyba/dav1d
  • minhhien231186/dav1d
  • beiluo971/dav1d
  • hakantezgoren34/dav1d
  • chigita73/dav1d
  • slomo/dav1d
  • Starbuck5/dav1d
  • jbeich/dav1d
  • berrylcm/dav1d
  • philip584521/dav1d
  • IgorKey/dav1d
  • shekar007/dav1d
  • jdek/dav1d
  • oldsssteveo/dav1d
  • Jingwiw/dav1d
  • vigneshv/dav1d
  • andrey.semashev/dav1d
  • v.cvetkov/dav1d
  • kattmedhatt/dav1d
  • ccawley2011/dav1d
  • rportalez/dav1d
  • Skantes/dav1d
  • arpadpanyik-arm/dav1d
  • asenat/dav1d
  • pcc/dav1d
  • nickg/dav1d
  • BogdanW3/dav1d
  • brad/dav1d
  • MARBEAN2/dav1d
  • yintong.ustc/dav1d
  • cosmin/dav1d
  • kasper93/dav1d
  • HecaiYuan/dav1d
  • jerrytsai569/dav1d
  • ttwuandes/dav1d
  • OctopusET/dav1d
  • maryla-uc/dav1d
  • Un1q32/dav1d
  • pranavk/dav1d
  • twulz/dav1d
  • gianni-r/dav1d
152 results
Show changes
Commits on Source (25)
  • Hecai Yuan's avatar
    loongarch: Optimize one function of the itx_8bpc.add_16x32 series · 74e0eeb5
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    1. inv_txfm_add_dct_dct_16x32
    
    Relative speedup over C code:
    
    inv_txfm_add_16x32_dct_dct_0_8bpc_c: 63.4
    inv_txfm_add_16x32_dct_dct_0_8bpc_lsx: 3.3
    inv_txfm_add_16x32_dct_dct_1_8bpc_c: 687.0
    inv_txfm_add_16x32_dct_dct_1_8bpc_lsx: 55.7
    inv_txfm_add_16x32_dct_dct_2_8bpc_c: 686.4
    inv_txfm_add_16x32_dct_dct_2_8bpc_lsx: 55.6
    inv_txfm_add_16x32_dct_dct_3_8bpc_c: 686.4
    inv_txfm_add_16x32_dct_dct_3_8bpc_lsx: 55.5
    inv_txfm_add_16x32_dct_dct_4_8bpc_c: 686.4
    inv_txfm_add_16x32_dct_dct_4_8bpc_lsx: 55.6
    
    Change-Id: I9d22b8b3534b7ba17f6e85e42d08eb3165e2e8cb
    74e0eeb5
  • Hecai Yuan's avatar
    loongarch: add lsx implementation of itx_8bpc.add_4x8 series function for 8 bpc · d60d93a5
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    Relative speedup over C code:
    
    inv_txfm_add_4x8_adst_adst_0_8bpc_c: 43.8
    inv_txfm_add_4x8_adst_adst_0_8bpc_lsx: 8.6
    inv_txfm_add_4x8_adst_adst_1_8bpc_c: 43.8
    inv_txfm_add_4x8_adst_adst_1_8bpc_lsx: 8.6
    inv_txfm_add_4x8_adst_dct_0_8bpc_c: 43.0
    inv_txfm_add_4x8_adst_dct_0_8bpc_lsx: 6.5
    inv_txfm_add_4x8_adst_dct_1_8bpc_c: 43.0
    inv_txfm_add_4x8_adst_dct_1_8bpc_lsx: 6.5
    inv_txfm_add_4x8_adst_flipadst_0_8bpc_c: 44.1
    inv_txfm_add_4x8_adst_flipadst_0_8bpc_lsx: 8.8
    inv_txfm_add_4x8_adst_flipadst_1_8bpc_c: 44.1
    inv_txfm_add_4x8_adst_flipadst_1_8bpc_lsx: 8.8
    inv_txfm_add_4x8_adst_identity_0_8bpc_c: 31.3
    inv_txfm_add_4x8_adst_identity_0_8bpc_lsx: 2.9
    inv_txfm_add_4x8_adst_identity_1_8bpc_c: 31.3
    inv_txfm_add_4x8_adst_identity_1_8bpc_lsx: 2.9
    inv_txfm_add_4x8_dct_adst_0_8bpc_c: 46.3
    inv_txfm_add_4x8_dct_adst_0_8bpc_lsx: 8.8
    inv_txfm_add_4x8_dct_adst_1_8bpc_c: 46.3
    inv_txfm_add_4x8_dct_adst_1_8bpc_lsx: 8.8
    inv_txfm_add_4x8_dct_dct_0_8bpc_c: 7.3
    inv_txfm_add_4x8_dct_dct_0_8bpc_lsx: 1.5
    inv_txfm_add_4x8_dct_dct_1_8bpc_c: 45.7
    inv_txfm_add_4x8_dct_dct_1_8bpc_lsx: 6.7
    inv_txfm_add_4x8_dct_flipadst_0_8bpc_c: 46.7
    inv_txfm_add_4x8_dct_flipadst_0_8bpc_lsx: 8.8
    inv_txfm_add_4x8_dct_flipadst_1_8bpc_c: 46.7
    inv_txfm_add_4x8_dct_flipadst_1_8bpc_lsx: 8.8
    inv_txfm_add_4x8_dct_identity_0_8bpc_c: 33.8
    inv_txfm_add_4x8_dct_identity_0_8bpc_lsx: 2.9
    inv_txfm_add_4x8_dct_identity_1_8bpc_c: 33.8
    inv_txfm_add_4x8_dct_identity_1_8bpc_lsx: 2.9
    inv_txfm_add_4x8_flipadst_adst_0_8bpc_c: 44.0
    inv_txfm_add_4x8_flipadst_adst_0_8bpc_lsx: 8.6
    inv_txfm_add_4x8_flipadst_adst_1_8bpc_c: 43.9
    inv_txfm_add_4x8_flipadst_adst_1_8bpc_lsx: 8.6
    inv_txfm_add_4x8_flipadst_dct_0_8bpc_c: 43.3
    inv_txfm_add_4x8_flipadst_dct_0_8bpc_lsx: 6.5
    inv_txfm_add_4x8_flipadst_dct_1_8bpc_c: 43.4
    inv_txfm_add_4x8_flipadst_dct_1_8bpc_lsx: 6.5
    inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_c: 44.4
    inv_txfm_add_4x8_flipadst_flipadst_0_8bpc_lsx: 8.8
    inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_c: 44.4
    inv_txfm_add_4x8_flipadst_flipadst_1_8bpc_lsx: 8.8
    inv_txfm_add_4x8_flipadst_identity_0_8bpc_c: 31.5
    inv_txfm_add_4x8_flipadst_identity_0_8bpc_lsx: 2.9
    inv_txfm_add_4x8_flipadst_identity_1_8bpc_c: 31.5
    inv_txfm_add_4x8_flipadst_identity_1_8bpc_lsx: 2.9
    inv_txfm_add_4x8_identity_adst_0_8bpc_c: 38.9
    inv_txfm_add_4x8_identity_adst_0_8bpc_lsx: 8.2
    inv_txfm_add_4x8_identity_adst_1_8bpc_c: 38.9
    inv_txfm_add_4x8_identity_adst_1_8bpc_lsx: 8.2
    inv_txfm_add_4x8_identity_dct_0_8bpc_c: 38.1
    inv_txfm_add_4x8_identity_dct_0_8bpc_lsx: 6.1
    inv_txfm_add_4x8_identity_dct_1_8bpc_c: 38.1
    inv_txfm_add_4x8_identity_dct_1_8bpc_lsx: 6.1
    inv_txfm_add_4x8_identity_flipadst_0_8bpc_c: 39.2
    inv_txfm_add_4x8_identity_flipadst_0_8bpc_lsx: 8.3
    inv_txfm_add_4x8_identity_flipadst_1_8bpc_c: 39.2
    inv_txfm_add_4x8_identity_flipadst_1_8bpc_lsx: 8.3
    inv_txfm_add_4x8_identity_identity_0_8bpc_c: 26.4
    inv_txfm_add_4x8_identity_identity_0_8bpc_lsx: 2.4
    inv_txfm_add_4x8_identity_identity_1_8bpc_c: 26.4
    inv_txfm_add_4x8_identity_identity_1_8bpc_lsx: 2.4
    
    Change-Id: Ibbaeca98118774a261cf55afd581196d93ac2004
    d60d93a5
  • Hecai Yuan's avatar
    loongarch: add lsx implementation of itx_8bpc.add_4x16 series function for 8 bpc · 643ae52b
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    Relative speedup over C code:
    
    inv_txfm_add_4x16_adst_adst_0_8bpc_c: 91.1
    inv_txfm_add_4x16_adst_adst_0_8bpc_lsx: 18.2
    inv_txfm_add_4x16_adst_adst_1_8bpc_c: 91.1
    inv_txfm_add_4x16_adst_adst_1_8bpc_lsx: 18.2
    inv_txfm_add_4x16_adst_adst_2_8bpc_c: 91.1
    inv_txfm_add_4x16_adst_adst_2_8bpc_lsx: 18.2
    inv_txfm_add_4x16_adst_dct_0_8bpc_c: 89.5
    inv_txfm_add_4x16_adst_dct_0_8bpc_lsx: 14.3
    inv_txfm_add_4x16_adst_dct_1_8bpc_c: 89.5
    inv_txfm_add_4x16_adst_dct_1_8bpc_lsx: 14.3
    inv_txfm_add_4x16_adst_dct_2_8bpc_c: 89.5
    inv_txfm_add_4x16_adst_dct_2_8bpc_lsx: 14.3
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_c: 91.8
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_lsx: 18.2
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_c: 91.7
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_lsx: 18.2
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_c: 91.8
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_lsx: 18.2
    inv_txfm_add_4x16_adst_identity_0_8bpc_c: 60.5
    inv_txfm_add_4x16_adst_identity_0_8bpc_lsx: 6.3
    inv_txfm_add_4x16_adst_identity_1_8bpc_c: 60.5
    inv_txfm_add_4x16_adst_identity_1_8bpc_lsx: 6.3
    inv_txfm_add_4x16_adst_identity_2_8bpc_c: 60.5
    inv_txfm_add_4x16_adst_identity_2_8bpc_lsx: 6.3
    inv_txfm_add_4x16_dct_adst_0_8bpc_c: 92.7
    inv_txfm_add_4x16_dct_adst_0_8bpc_lsx: 18.4
    inv_txfm_add_4x16_dct_adst_1_8bpc_c: 92.7
    inv_txfm_add_4x16_dct_adst_1_8bpc_lsx: 18.4
    inv_txfm_add_4x16_dct_adst_2_8bpc_c: 92.7
    inv_txfm_add_4x16_dct_adst_2_8bpc_lsx: 18.4
    inv_txfm_add_4x16_dct_dct_0_8bpc_c: 13.7
    inv_txfm_add_4x16_dct_dct_0_8bpc_lsx: 1.9
    inv_txfm_add_4x16_dct_dct_1_8bpc_c: 90.6
    inv_txfm_add_4x16_dct_dct_1_8bpc_lsx: 14.5
    inv_txfm_add_4x16_dct_dct_2_8bpc_c: 90.6
    inv_txfm_add_4x16_dct_dct_2_8bpc_lsx: 14.5
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_c: 93.3
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_lsx: 18.6
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_c: 93.4
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_lsx: 18.6
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_c: 93.4
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_lsx: 18.6
    inv_txfm_add_4x16_dct_identity_0_8bpc_c: 62.1
    inv_txfm_add_4x16_dct_identity_0_8bpc_lsx: 6.5
    inv_txfm_add_4x16_dct_identity_1_8bpc_c: 62.1
    inv_txfm_add_4x16_dct_identity_1_8bpc_lsx: 6.5
    inv_txfm_add_4x16_dct_identity_2_8bpc_c: 62.1
    inv_txfm_add_4x16_dct_identity_2_8bpc_lsx: 6.5
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_c: 92.2
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_lsx: 18.1
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_c: 92.3
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_lsx: 18.1
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_c: 92.2
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_lsx: 18.1
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_c: 90.6
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_lsx: 14.3
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_c: 90.6
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_lsx: 14.3
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_c: 90.6
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_lsx: 14.3
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_c: 92.9
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_lsx: 18.2
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_c: 92.9
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_lsx: 18.2
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_c: 92.9
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_lsx: 18.2
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_c: 61.8
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_lsx: 6.3
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_c: 61.8
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_lsx: 6.3
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_c: 61.8
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_lsx: 6.3
    inv_txfm_add_4x16_identity_adst_0_8bpc_c: 83.1
    inv_txfm_add_4x16_identity_adst_0_8bpc_lsx: 17.8
    inv_txfm_add_4x16_identity_adst_1_8bpc_c: 83.0
    inv_txfm_add_4x16_identity_adst_1_8bpc_lsx: 17.8
    inv_txfm_add_4x16_identity_adst_2_8bpc_c: 83.0
    inv_txfm_add_4x16_identity_adst_2_8bpc_lsx: 17.8
    inv_txfm_add_4x16_identity_dct_0_8bpc_c: 81.4
    inv_txfm_add_4x16_identity_dct_0_8bpc_lsx: 13.9
    inv_txfm_add_4x16_identity_dct_1_8bpc_c: 81.4
    inv_txfm_add_4x16_identity_dct_1_8bpc_lsx: 13.9
    inv_txfm_add_4x16_identity_dct_2_8bpc_c: 81.4
    inv_txfm_add_4x16_identity_dct_2_8bpc_lsx: 13.9
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_c: 84.1
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_lsx: 17.8
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_c: 84.0
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_lsx: 17.8
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_c: 83.9
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_lsx: 17.8
    inv_txfm_add_4x16_identity_identity_0_8bpc_c: 52.4
    inv_txfm_add_4x16_identity_identity_0_8bpc_lsx: 5.5
    inv_txfm_add_4x16_identity_identity_1_8bpc_c: 52.4
    inv_txfm_add_4x16_identity_identity_1_8bpc_lsx: 5.5
    inv_txfm_add_4x16_identity_identity_2_8bpc_c: 52.4
    inv_txfm_add_4x16_identity_identity_2_8bpc_lsx: 5.5
    
    Change-Id: I36322071eeea45df9289f2b1d533ee937904aec2
    643ae52b
  • Hecai Yuan's avatar
    loongarch: add lsx implementation of itx_8bpc.add_8x16 series function for 8 bpc · 2fc65660
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    Relative speedup over C code:
    
    inv_txfm_add_8x16_adst_adst_0_8bpc_c: 208.1
    inv_txfm_add_8x16_adst_adst_0_8bpc_lsx: 31.3
    inv_txfm_add_8x16_adst_adst_1_8bpc_c: 208.4
    inv_txfm_add_8x16_adst_adst_1_8bpc_lsx: 31.3
    inv_txfm_add_8x16_adst_adst_2_8bpc_c: 208.1
    inv_txfm_add_8x16_adst_adst_2_8bpc_lsx: 31.3
    inv_txfm_add_8x16_adst_dct_0_8bpc_c: 204.0
    inv_txfm_add_8x16_adst_dct_0_8bpc_lsx: 27.2
    inv_txfm_add_8x16_adst_dct_1_8bpc_c: 204.0
    inv_txfm_add_8x16_adst_dct_1_8bpc_lsx: 27.2
    inv_txfm_add_8x16_adst_dct_2_8bpc_c: 204.0
    inv_txfm_add_8x16_adst_dct_2_8bpc_lsx: 27.2
    inv_txfm_add_8x16_adst_flipadst_0_8bpc_c: 207.9
    inv_txfm_add_8x16_adst_flipadst_0_8bpc_lsx: 31.3
    inv_txfm_add_8x16_adst_flipadst_1_8bpc_c: 208.3
    inv_txfm_add_8x16_adst_flipadst_1_8bpc_lsx: 31.3
    inv_txfm_add_8x16_adst_flipadst_2_8bpc_c: 208.6
    inv_txfm_add_8x16_adst_flipadst_2_8bpc_lsx: 31.3
    inv_txfm_add_8x16_adst_identity_0_8bpc_c: 146.6
    inv_txfm_add_8x16_adst_identity_0_8bpc_lsx: 21.8
    inv_txfm_add_8x16_adst_identity_1_8bpc_c: 146.6
    inv_txfm_add_8x16_adst_identity_1_8bpc_lsx: 21.8
    inv_txfm_add_8x16_adst_identity_2_8bpc_c: 146.6
    inv_txfm_add_8x16_adst_identity_2_8bpc_lsx: 21.8
    inv_txfm_add_8x16_dct_adst_0_8bpc_c: 204.8
    inv_txfm_add_8x16_dct_adst_0_8bpc_lsx: 26.2
    inv_txfm_add_8x16_dct_adst_1_8bpc_c: 204.8
    inv_txfm_add_8x16_dct_adst_1_8bpc_lsx: 26.1
    inv_txfm_add_8x16_dct_adst_2_8bpc_c: 204.8
    inv_txfm_add_8x16_dct_adst_2_8bpc_lsx: 26.2
    inv_txfm_add_8x16_dct_dct_0_8bpc_c: 23.1
    inv_txfm_add_8x16_dct_dct_0_8bpc_lsx: 2.3
    inv_txfm_add_8x16_dct_dct_1_8bpc_c: 200.8
    inv_txfm_add_8x16_dct_dct_1_8bpc_lsx: 21.9
    inv_txfm_add_8x16_dct_dct_2_8bpc_c: 200.7
    inv_txfm_add_8x16_dct_dct_2_8bpc_lsx: 21.9
    inv_txfm_add_8x16_dct_flipadst_0_8bpc_c: 204.6
    inv_txfm_add_8x16_dct_flipadst_0_8bpc_lsx: 26.3
    inv_txfm_add_8x16_dct_flipadst_1_8bpc_c: 204.6
    inv_txfm_add_8x16_dct_flipadst_1_8bpc_lsx: 26.3
    inv_txfm_add_8x16_dct_flipadst_2_8bpc_c: 204.6
    inv_txfm_add_8x16_dct_flipadst_2_8bpc_lsx: 26.3
    inv_txfm_add_8x16_dct_identity_0_8bpc_c: 143.2
    inv_txfm_add_8x16_dct_identity_0_8bpc_lsx: 16.7
    inv_txfm_add_8x16_dct_identity_1_8bpc_c: 142.9
    inv_txfm_add_8x16_dct_identity_1_8bpc_lsx: 16.7
    inv_txfm_add_8x16_dct_identity_2_8bpc_c: 143.5
    inv_txfm_add_8x16_dct_identity_2_8bpc_lsx: 16.7
    inv_txfm_add_8x16_flipadst_adst_0_8bpc_c: 206.5
    inv_txfm_add_8x16_flipadst_adst_0_8bpc_lsx: 31.3
    inv_txfm_add_8x16_flipadst_adst_1_8bpc_c: 206.5
    inv_txfm_add_8x16_flipadst_adst_1_8bpc_lsx: 31.3
    inv_txfm_add_8x16_flipadst_adst_2_8bpc_c: 206.5
    inv_txfm_add_8x16_flipadst_adst_2_8bpc_lsx: 31.3
    inv_txfm_add_8x16_flipadst_dct_0_8bpc_c: 202.5
    inv_txfm_add_8x16_flipadst_dct_0_8bpc_lsx: 26.8
    inv_txfm_add_8x16_flipadst_dct_1_8bpc_c: 202.3
    inv_txfm_add_8x16_flipadst_dct_1_8bpc_lsx: 26.8
    inv_txfm_add_8x16_flipadst_dct_2_8bpc_c: 202.3
    inv_txfm_add_8x16_flipadst_dct_2_8bpc_lsx: 26.8
    inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_c: 206.3
    inv_txfm_add_8x16_flipadst_flipadst_0_8bpc_lsx: 31.3
    inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_c: 206.3
    inv_txfm_add_8x16_flipadst_flipadst_1_8bpc_lsx: 31.3
    inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_c: 206.3
    inv_txfm_add_8x16_flipadst_flipadst_2_8bpc_lsx: 31.3
    inv_txfm_add_8x16_identity_adst_0_8bpc_c: 160.7
    inv_txfm_add_8x16_identity_adst_0_8bpc_lsx: 21.8
    inv_txfm_add_8x16_identity_adst_1_8bpc_c: 160.4
    inv_txfm_add_8x16_identity_adst_1_8bpc_lsx: 21.8
    inv_txfm_add_8x16_identity_adst_2_8bpc_c: 160.1
    inv_txfm_add_8x16_identity_adst_2_8bpc_lsx: 21.8
    inv_txfm_add_8x16_identity_dct_0_8bpc_c: 157.9
    inv_txfm_add_8x16_identity_dct_0_8bpc_lsx: 17.7
    inv_txfm_add_8x16_identity_dct_1_8bpc_c: 156.5
    inv_txfm_add_8x16_identity_dct_1_8bpc_lsx: 17.7
    inv_txfm_add_8x16_identity_dct_2_8bpc_c: 156.8
    inv_txfm_add_8x16_identity_dct_2_8bpc_lsx: 17.7
    inv_txfm_add_8x16_identity_flipadst_0_8bpc_c: 159.9
    inv_txfm_add_8x16_identity_flipadst_0_8bpc_lsx: 21.8
    inv_txfm_add_8x16_identity_flipadst_1_8bpc_c: 159.9
    inv_txfm_add_8x16_identity_flipadst_1_8bpc_lsx: 21.8
    inv_txfm_add_8x16_identity_flipadst_2_8bpc_c: 160.0
    inv_txfm_add_8x16_identity_flipadst_2_8bpc_lsx: 21.8
    inv_txfm_add_8x16_identity_identity_0_8bpc_c: 98.3
    inv_txfm_add_8x16_identity_identity_0_8bpc_lsx: 12.3
    inv_txfm_add_8x16_identity_identity_1_8bpc_c: 98.0
    inv_txfm_add_8x16_identity_identity_1_8bpc_lsx: 12.3
    inv_txfm_add_8x16_identity_identity_2_8bpc_c: 98.1
    inv_txfm_add_8x16_identity_identity_2_8bpc_lsx: 12.3
    
    Change-Id: Ida8d71e4eff782b9f81e0ad426eaa078b68528cf
    2fc65660
  • Hecai Yuan's avatar
    loongarch: Minor improvement on identity4*, identity8* and dct32* · 5de878a4
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    1. remove the code about identity8 in the 4x8/8x8/8x16 series
    2. modify the code of the function dct_dct_8x32/32x32/64x64
    3. modify the code about identity4 in the 4x4/4x8/8x4 series
    
    After the modification, function performance has been improved by 20%
    
    Change-Id: I1bc2e0fb25e508faf9fc220333460a99be3f5e49
    5de878a4
  • Hecai Yuan's avatar
    loongarch: opt inv_txfm_add_identity_identity_8x32_8bpc_lsx · f6ffdc90
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    Relative speedup over C code:
    
    inv_txfm_add_8x32_identity_identity_0_8bpc_c:       126.1 ( 1.00x)
    inv_txfm_add_8x32_identity_identity_0_8bpc_lsx:       1.6 (78.59x)
    inv_txfm_add_8x32_identity_identity_1_8bpc_c:       136.9 ( 1.00x)
    inv_txfm_add_8x32_identity_identity_1_8bpc_lsx:       1.6 (85.31x)
    inv_txfm_add_8x32_identity_identity_2_8bpc_c:       148.0 ( 1.00x)
    inv_txfm_add_8x32_identity_identity_2_8bpc_lsx:       3.3 (45.47x)
    inv_txfm_add_8x32_identity_identity_3_8bpc_c:       159.4 ( 1.00x)
    inv_txfm_add_8x32_identity_identity_3_8bpc_lsx:       4.9 (32.78x)
    inv_txfm_add_8x32_identity_identity_4_8bpc_c:       170.2 ( 1.00x)
    inv_txfm_add_8x32_identity_identity_4_8bpc_lsx:       6.5 (26.17x)
    
    Change-Id: Iabda6efcd8a17d26a205f90757dfea85af48848f
    f6ffdc90
  • pengxu's avatar
    Loongarch: Optimized cdef_find_dir_8bpc function by LSX · 2154425f
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    cdef_dir_8bpc_c:                 28.8 ( 1.00x)
    cdef_dir_8bpc_lsx:               19.1 ( 1.51x)
    
    Change-Id: Ic7c1f32c5b1733b011f4c448cffc93f745b564f5
    2154425f
  • guxiwei's avatar
    msac: Add msac_decode_bool_equi_lsx and msac_decode_hi_tok_lsx · 02309b9f
    guxiwei authored and Hecai Yuan's avatar Hecai Yuan committed
    The performance data is as follows:
    msac_decode_bool_equi_c:             0.4 ( 1.00x)
    msac_decode_bool_equi_lsx:           0.3 ( 1.07x)
    msac_decode_hi_tok_c:                1.8 ( 1.00x)
    msac_decode_hi_tok_lsx:              1.4 ( 1.27x)
    
    Change-Id: Ic2f2678cf699bb22c579424af71ae2603e228482
    02309b9f
  • jinbo's avatar
    Refine mc_put_8tap · fa7b72d0
    jinbo authored and Hecai Yuan's avatar Hecai Yuan committed
    Performance speedup over lsx is around 68%~156%.
    
    Change-Id: I0b39cd0e05e3cbd84fded121d29a91ea2a620f03
    fa7b72d0
  • pengxu's avatar
    Loongarch: Optimized cdef_filter_block 4x4, 4x8, 8x8 8bpc functions by LSX · 62c47f35
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    cdef_filter_4x4_01_8bpc_c:      420.8 ( 1.00x)
    cdef_filter_4x4_01_8bpc_lsx:    117.2 ( 3.59x)
    cdef_filter_4x4_10_8bpc_c:      265.8 ( 1.00x)
    cdef_filter_4x4_10_8bpc_lsx:     98.9 ( 2.69x)
    cdef_filter_4x4_11_8bpc_c:     1036.2 ( 1.00x)
    cdef_filter_4x4_11_8bpc_lsx:    169.6 ( 6.11x)
    cdef_filter_4x8_01_8bpc_c:      802.6 ( 1.00x)
    cdef_filter_4x8_01_8bpc_lsx:    206.1 ( 3.89x)
    cdef_filter_4x8_10_8bpc_c:      489.1 ( 1.00x)
    cdef_filter_4x8_10_8bpc_lsx:    167.4 ( 2.92x)
    cdef_filter_4x8_11_8bpc_c:     2028.9 ( 1.00x)
    cdef_filter_4x8_11_8bpc_lsx:    309.4 ( 6.56x)
    cdef_filter_8x8_01_8bpc_c:     1562.2 ( 1.00x)
    cdef_filter_8x8_01_8bpc_lsx:    295.3 ( 5.29x)
    cdef_filter_8x8_10_8bpc_c:      949.4 ( 1.00x)
    cdef_filter_8x8_10_8bpc_lsx:    207.6 ( 4.57x)
    cdef_filter_8x8_11_8bpc_c:     4009.6 ( 1.00x)
    cdef_filter_8x8_11_8bpc_lsx:    466.8 ( 8.59x)
    
    Change-Id: I8cd43426a27055e18c44a7701fa50f8835c712be
    62c47f35
  • pengxu's avatar
    Loongarch: Optimized ipred_dc, ipred_dc_128, ipred_dc_left and ipred_dc_top 8bpc functions by LSX · 2a9cbcc2
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    intra_pred_dc_w4_8bpc_c:              2.1 ( 1.00x)
    intra_pred_dc_w4_8bpc_lsx:            1.3 ( 1.54x)
    intra_pred_dc_w8_8bpc_c:              3.6 ( 1.00x)
    intra_pred_dc_w8_8bpc_lsx:            3.7 ( 0.97x)
    intra_pred_dc_w16_8bpc_c:             6.9 ( 1.00x)
    intra_pred_dc_w16_8bpc_lsx:           7.8 ( 0.88x)
    intra_pred_dc_w32_8bpc_c:            14.1 ( 1.00x)
    intra_pred_dc_w32_8bpc_lsx:           7.1 ( 1.97x)
    intra_pred_dc_w64_8bpc_c:            25.3 ( 1.00x)
    intra_pred_dc_w64_8bpc_lsx:           7.4 ( 3.41x)
    intra_pred_dc_128_w4_8bpc_c:          0.6 ( 1.00x)
    intra_pred_dc_128_w4_8bpc_lsx:        0.8 ( 0.76x)
    intra_pred_dc_128_w8_8bpc_c:          1.4 ( 1.00x)
    intra_pred_dc_128_w8_8bpc_lsx:        3.2 ( 0.45x)
    intra_pred_dc_128_w16_8bpc_c:         3.4 ( 1.00x)
    intra_pred_dc_128_w16_8bpc_lsx:       7.3 ( 0.47x)
    intra_pred_dc_128_w32_8bpc_c:         8.8 ( 1.00x)
    intra_pred_dc_128_w32_8bpc_lsx:       6.4 ( 1.38x)
    intra_pred_dc_128_w64_8bpc_c:        17.0 ( 1.00x)
    intra_pred_dc_128_w64_8bpc_lsx:       6.2 ( 2.74x)
    intra_pred_dc_left_w4_8bpc_c:         1.1 ( 1.00x)
    intra_pred_dc_left_w4_8bpc_lsx:       1.1 ( 1.00x)
    intra_pred_dc_left_w8_8bpc_c:         2.1 ( 1.00x)
    intra_pred_dc_left_w8_8bpc_lsx:       3.4 ( 0.64x)
    intra_pred_dc_left_w16_8bpc_c:        4.6 ( 1.00x)
    intra_pred_dc_left_w16_8bpc_lsx:      7.5 ( 0.62x)
    intra_pred_dc_left_w32_8bpc_c:       10.3 ( 1.00x)
    intra_pred_dc_left_w32_8bpc_lsx:      7.8 ( 1.32x)
    intra_pred_dc_left_w64_8bpc_c:       18.7 ( 1.00x)
    intra_pred_dc_left_w64_8bpc_lsx:      6.6 ( 2.83x)
    intra_pred_dc_top_w4_8bpc_c:          0.9 ( 1.00x)
    intra_pred_dc_top_w4_8bpc_lsx:        0.8 ( 1.10x)
    intra_pred_dc_top_w8_8bpc_c:          1.9 ( 1.00x)
    intra_pred_dc_top_w8_8bpc_lsx:        2.8 ( 0.67x)
    intra_pred_dc_top_w16_8bpc_c:         4.2 ( 1.00x)
    intra_pred_dc_top_w16_8bpc_lsx:       5.5 ( 0.77x)
    intra_pred_dc_top_w32_8bpc_c:        10.4 ( 1.00x)
    intra_pred_dc_top_w32_8bpc_lsx:       6.7 ( 1.54x)
    intra_pred_dc_top_w64_8bpc_c:        19.9 ( 1.00x)
    intra_pred_dc_top_w64_8bpc_lsx:       6.9 ( 2.87x)
    
    Change-Id: Ib5349e2430302da0424a474ce0fedc457439c761
    2a9cbcc2
  • pengxu's avatar
    Loongarch: Optimized ipred_h and ipred_v 8bpc function by LSX · 3e9d80d8
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    intra_pred_h_w4_8bpc_c:               4.3 ( 1.00x)
    intra_pred_h_w4_8bpc_lsx:             3.5 ( 1.21x)
    intra_pred_h_w8_8bpc_c:               5.7 ( 1.00x)
    intra_pred_h_w8_8bpc_lsx:             5.1 ( 1.11x)
    intra_pred_h_w16_8bpc_c:             13.2 ( 1.00x)
    intra_pred_h_w16_8bpc_lsx:            7.1 ( 1.86x)
    intra_pred_h_w32_8bpc_c:             12.4 ( 1.00x)
    intra_pred_h_w32_8bpc_lsx:            6.3 ( 1.96x)
    intra_pred_h_w64_8bpc_c:             25.9 ( 1.00x)
    intra_pred_h_w64_8bpc_lsx:            5.8 ( 4.44x)
    intra_pred_v_w4_8bpc_c:               4.6 ( 1.00x)
    intra_pred_v_w4_8bpc_lsx:             2.5 ( 1.85x)
    intra_pred_v_w8_8bpc_c:               6.9 ( 1.00x)
    intra_pred_v_w8_8bpc_lsx:             4.5 ( 1.53x)
    intra_pred_v_w16_8bpc_c:             13.3 ( 1.00x)
    intra_pred_v_w16_8bpc_lsx:            5.2 ( 2.56x)
    intra_pred_v_w32_8bpc_c:             16.1 ( 1.00x)
    intra_pred_v_w32_8bpc_lsx:            5.1 ( 3.13x)
    intra_pred_v_w64_8bpc_c:             21.7 ( 1.00x)
    intra_pred_v_w64_8bpc_lsx:            7.7 ( 2.80x)
    
    Change-Id: I51b3dd13877315b9c1c64590c19f1ad38bfc4bdf
    3e9d80d8
  • pengxu's avatar
    Loongarch: Optimized ipred_paeth 8bpc function by LSX · 7463c2af
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    intra_pred_paeth_w4_8bpc_c:          12.3 ( 1.00x)
    intra_pred_paeth_w4_8bpc_lsx:         3.9 ( 3.12x)
    intra_pred_paeth_w8_8bpc_c:          39.7 ( 1.00x)
    intra_pred_paeth_w8_8bpc_lsx:         6.4 ( 6.20x)
    intra_pred_paeth_w16_8bpc_c:        133.6 ( 1.00x)
    intra_pred_paeth_w16_8bpc_lsx:       17.0 ( 7.85x)
    intra_pred_paeth_w32_8bpc_c:        342.8 ( 1.00x)
    intra_pred_paeth_w32_8bpc_lsx:       52.7 ( 6.50x)
    intra_pred_paeth_w64_8bpc_c:        903.8 ( 1.00x)
    intra_pred_paeth_w64_8bpc_lsx:      107.3 ( 8.42x)
    
    Change-Id: I457bdb24fdd6b5400ec030bffbdd40c79d8165c1
    7463c2af
  • pengxu's avatar
    Loongarch: Optimized ipred_smooth, ipred_smooth_h and ipred_smooth_v 8bpc functions by LSX · 0b9c756f
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    intra_pred_smooth_h_w4_8bpc_c:         7.3 ( 1.00x)
    intra_pred_smooth_h_w4_8bpc_lsx:       3.1 ( 2.36x)
    intra_pred_smooth_h_w8_8bpc_c:        21.3 ( 1.00x)
    intra_pred_smooth_h_w8_8bpc_lsx:       4.5 ( 4.71x)
    intra_pred_smooth_h_w16_8bpc_c:       66.3 ( 1.00x)
    intra_pred_smooth_h_w16_8bpc_lsx:     13.4 ( 4.96x)
    intra_pred_smooth_h_w32_8bpc_c:      160.0 ( 1.00x)
    intra_pred_smooth_h_w32_8bpc_lsx:     29.3 ( 5.46x)
    intra_pred_smooth_h_w64_8bpc_c:      400.2 ( 1.00x)
    intra_pred_smooth_h_w64_8bpc_lsx:     68.3 ( 5.86x)
    intra_pred_smooth_v_w4_8bpc_c:         6.6 ( 1.00x)
    intra_pred_smooth_v_w4_8bpc_lsx:       3.1 ( 2.10x)
    intra_pred_smooth_v_w8_8bpc_c:        19.3 ( 1.00x)
    intra_pred_smooth_v_w8_8bpc_lsx:       4.9 ( 3.95x)
    intra_pred_smooth_v_w16_8bpc_c:       58.6 ( 1.00x)
    intra_pred_smooth_v_w16_8bpc_lsx:     24.0 ( 2.44x)
    intra_pred_smooth_v_w32_8bpc_c:      139.4 ( 1.00x)
    intra_pred_smooth_v_w32_8bpc_lsx:     27.0 ( 5.17x)
    intra_pred_smooth_v_w64_8bpc_c:      344.8 ( 1.00x)
    intra_pred_smooth_v_w64_8bpc_lsx:     70.8 ( 4.87x)
    intra_pred_smooth_w4_8bpc_c:          10.2 ( 1.00x)
    intra_pred_smooth_w4_8bpc_lsx:         7.9 ( 1.30x)
    intra_pred_smooth_w8_8bpc_c:          30.3 ( 1.00x)
    intra_pred_smooth_w8_8bpc_lsx:        20.0 ( 1.51x)
    intra_pred_smooth_w16_8bpc_c:         96.3 ( 1.00x)
    intra_pred_smooth_w16_8bpc_lsx:       58.3 ( 1.65x)
    intra_pred_smooth_w32_8bpc_c:        231.1 ( 1.00x)
    intra_pred_smooth_w32_8bpc_lsx:      134.3 ( 1.72x)
    intra_pred_smooth_w64_8bpc_c:        571.5 ( 1.00x)
    intra_pred_smooth_w64_8bpc_lsx:      326.5 ( 1.75x)
    
    Change-Id: I22b6c2dcf27c5393bba374b4fbe8879c0463f828
    0b9c756f
  • zhoupeng's avatar
    Loongarch: Optimized blend_c/blend_v_c functions by LSX · 5319278d
    zhoupeng authored and Hecai Yuan's avatar Hecai Yuan committed
    blend_v_w2_8bpc_c:                                   5.7 ( 1.00x)
    blend_v_w2_8bpc_lsx:                                 3.6 ( 1.60x)
    blend_v_w4_8bpc_c:                                  22.8 ( 1.00x)
    blend_v_w4_8bpc_lsx:                                 7.1 ( 3.20x)
    blend_v_w8_8bpc_c:                                  40.2 ( 1.00x)
    blend_v_w8_8bpc_lsx:                                 7.1 ( 5.63x)
    blend_v_w16_8bpc_c:                                 74.6 ( 1.00x)
    blend_v_w16_8bpc_lsx:                                8.1 ( 9.26x)
    blend_v_w32_8bpc_c:                                144.0 ( 1.00x)
    blend_v_w32_8bpc_lsx:                               13.3 (10.83x)
    blend_w4_8bpc_c:                                     4.9 ( 1.00x)
    blend_w4_8bpc_lsx:                                   1.9 ( 2.49x)
    blend_w8_8bpc_c:                                    14.1 ( 1.00x)
    blend_w8_8bpc_lsx:                                   3.2 ( 4.37x)
    blend_w16_8bpc_c:                                   51.5 ( 1.00x)
    blend_w16_8bpc_lsx:                                  7.9 ( 6.51x)
    blend_w32_8bpc_c:                                  127.5 ( 1.00x)
    blend_w32_8bpc_lsx:                                 19.6 ( 6.52x)
    
    Change-Id: I95e2dbc1f0735688f5473687f1a7e8d37ffbe417
    5319278d
  • zhoupeng's avatar
    Loongarch: Optimized blend_h_c function by LSX/LASX · ce45ebde
    zhoupeng authored and Hecai Yuan's avatar Hecai Yuan committed
    blend_h_w2_8bpc_c:                                   3.8 ( 1.00x)
    blend_h_w2_8bpc_lsx:                                 1.9 ( 1.98x)
    blend_h_w2_8bpc_lasx:                                1.9 ( 1.98x)
    blend_h_w4_8bpc_c:                                   6.4 ( 1.00x)
    blend_h_w4_8bpc_lsx:                                 1.8 ( 3.49x)
    blend_h_w4_8bpc_lasx:                                1.8 ( 3.49x)
    blend_h_w8_8bpc_c:                                  11.6 ( 1.00x)
    blend_h_w8_8bpc_lsx:                                 1.8 ( 6.45x)
    blend_h_w8_8bpc_lasx:                                1.8 ( 6.48x)
    blend_h_w16_8bpc_c:                                 21.5 ( 1.00x)
    blend_h_w16_8bpc_lsx:                                2.1 (10.47x)
    blend_h_w16_8bpc_lasx:                               2.1 (10.48x)
    blend_h_w32_8bpc_c:                                 41.9 ( 1.00x)
    blend_h_w32_8bpc_lsx:                                3.8 (11.08x)
    blend_h_w32_8bpc_lasx:                               3.9 (10.67x)
    blend_h_w64_8bpc_c:                                 82.0 ( 1.00x)
    blend_h_w64_8bpc_lsx:                                6.9 (11.89x)
    blend_h_w64_8bpc_lasx:                               4.6 (17.93x)
    blend_h_w128_8bpc_c:                               202.3 ( 1.00x)
    blend_h_w128_8bpc_lsx:                              16.4 (12.30x)
    blend_h_w128_8bpc_lasx:                             11.4 (17.77x)
    
    Change-Id: I6d6599ccbaba8a62a629c4a52254b2369dba60f6
    ce45ebde
  • jinbo's avatar
    loongarch: Add prep_8tap_8bpc_lsx · b26f315d
    jinbo authored and Hecai Yuan's avatar Hecai Yuan committed
    mct_8tap_regular_w4_0_8bpc_c:                        3.7 ( 1.00x)
    mct_8tap_regular_w4_0_8bpc_lsx:                      0.9 ( 4.21x)
    mct_8tap_regular_w4_h_8bpc_c:                       15.7 ( 1.00x)
    mct_8tap_regular_w4_h_8bpc_lsx:                      1.7 ( 9.24x)
    mct_8tap_regular_w4_hv_8bpc_c:                      44.1 ( 1.00x)
    mct_8tap_regular_w4_hv_8bpc_lsx:                     6.3 ( 6.96x)
    mct_8tap_regular_w4_v_8bpc_c:                       19.8 ( 1.00x)
    mct_8tap_regular_w4_v_8bpc_lsx:                      2.4 ( 8.21x)
    mct_8tap_regular_w8_0_8bpc_c:                       10.5 ( 1.00x)
    mct_8tap_regular_w8_0_8bpc_lsx:                      1.3 ( 8.27x)
    mct_8tap_regular_w8_h_8bpc_c:                       47.2 ( 1.00x)
    mct_8tap_regular_w8_h_8bpc_lsx:                      6.2 ( 7.61x)
    mct_8tap_regular_w8_hv_8bpc_c:                     119.5 ( 1.00x)
    mct_8tap_regular_w8_hv_8bpc_lsx:                    18.9 ( 6.32x)
    mct_8tap_regular_w8_v_8bpc_c:                       60.5 ( 1.00x)
    mct_8tap_regular_w8_v_8bpc_lsx:                      5.4 (11.12x)
    mct_8tap_regular_w16_0_8bpc_c:                      28.8 ( 1.00x)
    mct_8tap_regular_w16_0_8bpc_lsx:                     2.8 (10.32x)
    mct_8tap_regular_w16_h_8bpc_c:                     151.9 ( 1.00x)
    mct_8tap_regular_w16_h_8bpc_lsx:                    19.8 ( 7.67x)
    mct_8tap_regular_w16_hv_8bpc_c:                    357.5 ( 1.00x)
    mct_8tap_regular_w16_hv_8bpc_lsx:                   57.6 ( 6.21x)
    mct_8tap_regular_w16_v_8bpc_c:                     195.6 ( 1.00x)
    mct_8tap_regular_w16_v_8bpc_lsx:                    16.9 (11.61x)
    mct_8tap_regular_w32_0_8bpc_c:                     104.6 ( 1.00x)
    mct_8tap_regular_w32_0_8bpc_lsx:                    11.6 ( 9.03x)
    mct_8tap_regular_w32_h_8bpc_c:                     596.3 ( 1.00x)
    mct_8tap_regular_w32_h_8bpc_lsx:                    77.8 ( 7.67x)
    mct_8tap_regular_w32_hv_8bpc_c:                   1329.0 ( 1.00x)
    mct_8tap_regular_w32_hv_8bpc_lsx:                  217.9 ( 6.10x)
    mct_8tap_regular_w32_v_8bpc_c:                     771.0 ( 1.00x)
    mct_8tap_regular_w32_v_8bpc_lsx:                    65.7 (11.73x)
    mct_8tap_regular_w64_0_8bpc_c:                     242.0 ( 1.00x)
    mct_8tap_regular_w64_0_8bpc_lsx:                    27.0 ( 8.95x)
    mct_8tap_regular_w64_h_8bpc_c:                    1455.9 ( 1.00x)
    mct_8tap_regular_w64_h_8bpc_lsx:                   186.9 ( 7.79x)
    mct_8tap_regular_w64_hv_8bpc_c:                   3221.7 ( 1.00x)
    mct_8tap_regular_w64_hv_8bpc_lsx:                  521.8 ( 6.17x)
    mct_8tap_regular_w64_v_8bpc_c:                    1836.1 ( 1.00x)
    mct_8tap_regular_w64_v_8bpc_lsx:                   158.2 (11.61x)
    mct_8tap_regular_w128_0_8bpc_c:                    629.0 ( 1.00x)
    mct_8tap_regular_w128_0_8bpc_lsx:                   66.3 ( 9.49x)
    mct_8tap_regular_w128_h_8bpc_c:                   3617.5 ( 1.00x)
    mct_8tap_regular_w128_h_8bpc_lsx:                  463.6 ( 7.80x)
    mct_8tap_regular_w128_hv_8bpc_c:                  7881.7 ( 1.00x)
    mct_8tap_regular_w128_hv_8bpc_lsx:                1290.3 ( 6.11x)
    mct_8tap_regular_w128_v_8bpc_c:                   4552.9 ( 1.00x)
    mct_8tap_regular_w128_v_8bpc_lsx:                  391.1 (11.64x)
    
    Change-Id: I8c6046e4bd6c1fb19d5712234abece0355fb77fa
    b26f315d
  • pengxu's avatar
    Loongarch: Optimized pal_pred 8bpc functions by LSX · 3f6c845d
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    pal_pred_w4_8bpc_c:         3.0 ( 1.00x)
    pal_pred_w4_8bpc_lsx:       0.6 ( 5.46x)
    pal_pred_w8_8bpc_c:         8.8 ( 1.00x)
    pal_pred_w8_8bpc_lsx:       0.9 ( 9.49x)
    pal_pred_w16_8bpc_c:       26.0 ( 1.00x)
    pal_pred_w16_8bpc_lsx:      1.9 (13.70x)
    pal_pred_w32_8bpc_c:       60.6 ( 1.00x)
    pal_pred_w32_8bpc_lsx:      4.0 (15.10x)
    pal_pred_w64_8bpc_c:      146.9 ( 1.00x)
    pal_pred_w64_8bpc_lsx:      9.2 (15.97x)
    
    Change-Id: I5414f096a23b09c3a512e727b93fa22104d141f9
    3f6c845d
  • pengxu's avatar
    Loongarch: Optimized cfl_pred_cfl, cfl_pred_cfl_128, cfl_pred_cfl_top and... · 083cf424
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    Loongarch: Optimized cfl_pred_cfl, cfl_pred_cfl_128, cfl_pred_cfl_top and cfl_pred_cfl_left 8bpc functions by LSX
    
    cfl_pred_cfl_128_w4_8bpc_c:         19.4 ( 1.00x)
    cfl_pred_cfl_128_w4_8bpc_lsx:        4.2 ( 4.63x)
    cfl_pred_cfl_128_w8_8bpc_c:         66.3 ( 1.00x)
    cfl_pred_cfl_128_w8_8bpc_lsx:        7.3 ( 9.11x)
    cfl_pred_cfl_128_w16_8bpc_c:       150.1 ( 1.00x)
    cfl_pred_cfl_128_w16_8bpc_lsx:      14.4 (10.45x)
    cfl_pred_cfl_128_w32_8bpc_c:       403.6 ( 1.00x)
    cfl_pred_cfl_128_w32_8bpc_lsx:      34.7 (11.65x)
    cfl_pred_cfl_left_w4_8bpc_c:        20.5 ( 1.00x)
    cfl_pred_cfl_left_w4_8bpc_lsx:       4.4 ( 4.63x)
    cfl_pred_cfl_left_w8_8bpc_c:        67.9 ( 1.00x)
    cfl_pred_cfl_left_w8_8bpc_lsx:       7.6 ( 8.94x)
    cfl_pred_cfl_left_w16_8bpc_c:      152.0 ( 1.00x)
    cfl_pred_cfl_left_w16_8bpc_lsx:     14.6 (10.38x)
    cfl_pred_cfl_left_w32_8bpc_c:      405.8 ( 1.00x)
    cfl_pred_cfl_left_w32_8bpc_lsx:     35.0 (11.58x)
    cfl_pred_cfl_top_w4_8bpc_c:         20.0 ( 1.00x)
    cfl_pred_cfl_top_w4_8bpc_lsx:        4.4 ( 4.51x)
    cfl_pred_cfl_top_w8_8bpc_c:         67.6 ( 1.00x)
    cfl_pred_cfl_top_w8_8bpc_lsx:        7.5 ( 8.99x)
    cfl_pred_cfl_top_w16_8bpc_c:       152.5 ( 1.00x)
    cfl_pred_cfl_top_w16_8bpc_lsx:      14.6 (10.41x)
    cfl_pred_cfl_top_w32_8bpc_c:       408.0 ( 1.00x)
    cfl_pred_cfl_top_w32_8bpc_lsx:      35.2 (11.58x)
    cfl_pred_cfl_w4_8bpc_c:             21.1 ( 1.00x)
    cfl_pred_cfl_w4_8bpc_lsx:            4.8 ( 4.43x)
    cfl_pred_cfl_w8_8bpc_c:             68.6 ( 1.00x)
    cfl_pred_cfl_w8_8bpc_lsx:            7.9 ( 8.73x)
    cfl_pred_cfl_w16_8bpc_c:           154.4 ( 1.00x)
    cfl_pred_cfl_w16_8bpc_lsx:          15.0 (10.29x)
    cfl_pred_cfl_w32_8bpc_c:           410.3 ( 1.00x)
    cfl_pred_cfl_w32_8bpc_lsx:          35.6 (11.54x)
    
    Change-Id: I4ec7cc71483298d28379bfbd824e97a0d74d0c23
    083cf424
  • Hecai Yuan's avatar
    loongarch: opt inv_txfm_add_adst_dct/dct_dct/identity_identity_16x4_8bpc_lsx · 843f00e5
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    Relative speedup over C code:
    
    inv_txfm_add_16x4_adst_dct_0_8bpc_c:                 61.7 ( 1.00x)
    inv_txfm_add_16x4_adst_dct_0_8bpc_lsx:               17.8 ( 3.46x)
    inv_txfm_add_16x4_adst_dct_1_8bpc_c:                 96.2 ( 1.00x)
    inv_txfm_add_16x4_adst_dct_1_8bpc_lsx:               17.8 ( 5.39x)
    inv_txfm_add_16x4_adst_dct_2_8bpc_c:                 96.2 ( 1.00x)
    inv_txfm_add_16x4_adst_dct_2_8bpc_lsx:               17.8 ( 5.39x)
    inv_txfm_add_16x4_dct_dct_0_8bpc_c:                  10.8 ( 1.00x)
    inv_txfm_add_16x4_dct_dct_0_8bpc_lsx:                 0.9 (12.23x)
    inv_txfm_add_16x4_dct_dct_1_8bpc_c:                  94.5 ( 1.00x)
    inv_txfm_add_16x4_dct_dct_1_8bpc_lsx:                13.6 ( 6.94x)
    inv_txfm_add_16x4_dct_dct_2_8bpc_c:                  94.7 ( 1.00x)
    inv_txfm_add_16x4_dct_dct_2_8bpc_lsx:                13.6 ( 6.95x)
    inv_txfm_add_16x4_identity_identity_0_8bpc_c:        42.1 ( 1.00x)
    inv_txfm_add_16x4_identity_identity_0_8bpc_lsx:       5.1 ( 8.21x)
    inv_txfm_add_16x4_identity_identity_1_8bpc_c:        53.0 ( 1.00x)
    inv_txfm_add_16x4_identity_identity_1_8bpc_lsx:       5.1 (10.35x)
    inv_txfm_add_16x4_identity_identity_2_8bpc_c:        53.0 ( 1.00x)
    inv_txfm_add_16x4_identity_identity_2_8bpc_lsx:       5.1 (10.35x)
    
    Change-Id: I0be4f77e381da390e300070337fff404dcdcb862
    843f00e5
  • Hecai Yuan's avatar
    loongarch: add LSX implementations of the itx_8bpc.add_16x8 series of functions for 8 bpc · 13a857d0
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    Relative speedup over C code:
    
    inv_txfm_add_16x8_adst_adst_0_8bpc_c:               127.7 ( 1.00x)
    inv_txfm_add_16x8_adst_adst_0_8bpc_lsx:              29.6 ( 4.32x)
    inv_txfm_add_16x8_adst_adst_1_8bpc_c:               206.6 ( 1.00x)
    inv_txfm_add_16x8_adst_adst_1_8bpc_lsx:              29.6 ( 6.98x)
    inv_txfm_add_16x8_adst_adst_2_8bpc_c:               206.6 ( 1.00x)
    inv_txfm_add_16x8_adst_adst_2_8bpc_lsx:              29.6 ( 6.99x)
    inv_txfm_add_16x8_adst_dct_0_8bpc_c:                126.7 ( 1.00x)
    inv_txfm_add_16x8_adst_dct_0_8bpc_lsx:               25.8 ( 4.91x)
    inv_txfm_add_16x8_adst_dct_1_8bpc_c:                205.1 ( 1.00x)
    inv_txfm_add_16x8_adst_dct_1_8bpc_lsx:               25.8 ( 7.94x)
    inv_txfm_add_16x8_adst_dct_2_8bpc_c:                205.2 ( 1.00x)
    inv_txfm_add_16x8_adst_dct_2_8bpc_lsx:               25.8 ( 7.94x)
    inv_txfm_add_16x8_adst_flipadst_0_8bpc_c:           128.3 ( 1.00x)
    inv_txfm_add_16x8_adst_flipadst_0_8bpc_lsx:          29.8 ( 4.30x)
    inv_txfm_add_16x8_adst_flipadst_1_8bpc_c:           207.2 ( 1.00x)
    inv_txfm_add_16x8_adst_flipadst_1_8bpc_lsx:          29.9 ( 6.94x)
    inv_txfm_add_16x8_adst_flipadst_2_8bpc_c:           207.1 ( 1.00x)
    inv_txfm_add_16x8_adst_flipadst_2_8bpc_lsx:          29.8 ( 6.94x)
    inv_txfm_add_16x8_adst_identity_0_8bpc_c:            78.3 ( 1.00x)
    inv_txfm_add_16x8_adst_identity_0_8bpc_lsx:          18.6 ( 4.21x)
    inv_txfm_add_16x8_adst_identity_1_8bpc_c:           157.1 ( 1.00x)
    inv_txfm_add_16x8_adst_identity_1_8bpc_lsx:          18.6 ( 8.45x)
    inv_txfm_add_16x8_adst_identity_2_8bpc_c:           157.2 ( 1.00x)
    inv_txfm_add_16x8_adst_identity_2_8bpc_lsx:          18.6 ( 8.46x)
    inv_txfm_add_16x8_dct_adst_0_8bpc_c:                127.4 ( 1.00x)
    inv_txfm_add_16x8_dct_adst_0_8bpc_lsx:               25.4 ( 5.02x)
    inv_txfm_add_16x8_dct_adst_1_8bpc_c:                201.2 ( 1.00x)
    inv_txfm_add_16x8_dct_adst_1_8bpc_lsx:               25.4 ( 7.93x)
    inv_txfm_add_16x8_dct_adst_2_8bpc_c:                201.2 ( 1.00x)
    inv_txfm_add_16x8_dct_adst_2_8bpc_lsx:               25.4 ( 7.93x)
    inv_txfm_add_16x8_dct_dct_0_8bpc_c:                  21.8 ( 1.00x)
    inv_txfm_add_16x8_dct_dct_0_8bpc_lsx:                 2.1 (10.52x)
    inv_txfm_add_16x8_dct_dct_1_8bpc_c:                 200.2 ( 1.00x)
    inv_txfm_add_16x8_dct_dct_1_8bpc_lsx:                21.6 ( 9.28x)
    inv_txfm_add_16x8_dct_dct_2_8bpc_c:                 200.2 ( 1.00x)
    inv_txfm_add_16x8_dct_dct_2_8bpc_lsx:                21.6 ( 9.28x)
    inv_txfm_add_16x8_dct_flipadst_0_8bpc_c:            127.2 ( 1.00x)
    inv_txfm_add_16x8_dct_flipadst_0_8bpc_lsx:           25.6 ( 4.96x)
    inv_txfm_add_16x8_dct_flipadst_1_8bpc_c:            201.2 ( 1.00x)
    inv_txfm_add_16x8_dct_flipadst_1_8bpc_lsx:           25.7 ( 7.84x)
    inv_txfm_add_16x8_dct_flipadst_2_8bpc_c:            201.7 ( 1.00x)
    inv_txfm_add_16x8_dct_flipadst_2_8bpc_lsx:           25.7 ( 7.86x)
    inv_txfm_add_16x8_dct_identity_0_8bpc_c:             77.3 ( 1.00x)
    inv_txfm_add_16x8_dct_identity_0_8bpc_lsx:           14.5 ( 5.35x)
    inv_txfm_add_16x8_dct_identity_1_8bpc_c:            151.2 ( 1.00x)
    inv_txfm_add_16x8_dct_identity_1_8bpc_lsx:           14.5 (10.46x)
    inv_txfm_add_16x8_dct_identity_2_8bpc_c:            151.5 ( 1.00x)
    inv_txfm_add_16x8_dct_identity_2_8bpc_lsx:           14.5 (10.48x)
    inv_txfm_add_16x8_flipadst_adst_0_8bpc_c:           128.5 ( 1.00x)
    inv_txfm_add_16x8_flipadst_adst_0_8bpc_lsx:          29.7 ( 4.32x)
    inv_txfm_add_16x8_flipadst_adst_1_8bpc_c:           207.3 ( 1.00x)
    inv_txfm_add_16x8_flipadst_adst_1_8bpc_lsx:          29.7 ( 6.97x)
    inv_txfm_add_16x8_flipadst_adst_2_8bpc_c:           207.4 ( 1.00x)
    inv_txfm_add_16x8_flipadst_adst_2_8bpc_lsx:          29.7 ( 6.98x)
    inv_txfm_add_16x8_flipadst_dct_0_8bpc_c:            126.8 ( 1.00x)
    inv_txfm_add_16x8_flipadst_dct_0_8bpc_lsx:           25.9 ( 4.90x)
    inv_txfm_add_16x8_flipadst_dct_1_8bpc_c:            204.8 ( 1.00x)
    inv_txfm_add_16x8_flipadst_dct_1_8bpc_lsx:           25.9 ( 7.92x)
    inv_txfm_add_16x8_flipadst_dct_2_8bpc_c:            205.4 ( 1.00x)
    inv_txfm_add_16x8_flipadst_dct_2_8bpc_lsx:           25.9 ( 7.94x)
    inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_c:       128.6 ( 1.00x)
    inv_txfm_add_16x8_flipadst_flipadst_0_8bpc_lsx:      30.0 ( 4.29x)
    inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_c:       206.6 ( 1.00x)
    inv_txfm_add_16x8_flipadst_flipadst_1_8bpc_lsx:      29.9 ( 6.90x)
    inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_c:       206.5 ( 1.00x)
    inv_txfm_add_16x8_flipadst_flipadst_2_8bpc_lsx:      29.9 ( 6.90x)
    inv_txfm_add_16x8_flipadst_identity_0_8bpc_c:        77.8 ( 1.00x)
    inv_txfm_add_16x8_flipadst_identity_0_8bpc_lsx:      18.6 ( 4.18x)
    inv_txfm_add_16x8_flipadst_identity_1_8bpc_c:       156.3 ( 1.00x)
    inv_txfm_add_16x8_flipadst_identity_1_8bpc_lsx:      18.6 ( 8.40x)
    inv_txfm_add_16x8_flipadst_identity_2_8bpc_c:       156.6 ( 1.00x)
    inv_txfm_add_16x8_flipadst_identity_2_8bpc_lsx:      18.6 ( 8.42x)
    inv_txfm_add_16x8_identity_adst_0_8bpc_c:           120.7 ( 1.00x)
    inv_txfm_add_16x8_identity_adst_0_8bpc_lsx:          21.1 ( 5.71x)
    inv_txfm_add_16x8_identity_adst_1_8bpc_c:           120.8 ( 1.00x)
    inv_txfm_add_16x8_identity_adst_1_8bpc_lsx:          21.1 ( 5.71x)
    inv_txfm_add_16x8_identity_adst_2_8bpc_c:           145.5 ( 1.00x)
    inv_txfm_add_16x8_identity_adst_2_8bpc_lsx:          21.2 ( 6.88x)
    inv_txfm_add_16x8_identity_dct_0_8bpc_c:            119.1 ( 1.00x)
    inv_txfm_add_16x8_identity_dct_0_8bpc_lsx:           17.9 ( 6.67x)
    inv_txfm_add_16x8_identity_dct_1_8bpc_c:            119.1 ( 1.00x)
    inv_txfm_add_16x8_identity_dct_1_8bpc_lsx:           17.9 ( 6.67x)
    inv_txfm_add_16x8_identity_dct_2_8bpc_c:            143.8 ( 1.00x)
    inv_txfm_add_16x8_identity_dct_2_8bpc_lsx:           17.9 ( 8.06x)
    inv_txfm_add_16x8_identity_flipadst_0_8bpc_c:       120.7 ( 1.00x)
    inv_txfm_add_16x8_identity_flipadst_0_8bpc_lsx:      21.3 ( 5.66x)
    inv_txfm_add_16x8_identity_flipadst_1_8bpc_c:       120.4 ( 1.00x)
    inv_txfm_add_16x8_identity_flipadst_1_8bpc_lsx:      21.3 ( 5.65x)
    inv_txfm_add_16x8_identity_flipadst_2_8bpc_c:       144.9 ( 1.00x)
    inv_txfm_add_16x8_identity_flipadst_2_8bpc_lsx:      21.3 ( 6.80x)
    inv_txfm_add_16x8_identity_identity_0_8bpc_c:        70.2 ( 1.00x)
    inv_txfm_add_16x8_identity_identity_0_8bpc_lsx:       9.5 ( 7.38x)
    inv_txfm_add_16x8_identity_identity_1_8bpc_c:        95.6 ( 1.00x)
    inv_txfm_add_16x8_identity_identity_1_8bpc_lsx:       9.5 (10.06x)
    inv_txfm_add_16x8_identity_identity_2_8bpc_c:        95.6 ( 1.00x)
    inv_txfm_add_16x8_identity_identity_2_8bpc_lsx:       9.5 (10.06x)
    
    Change-Id: If1e274cab0e8441297a1eb44bd86be580f4c8f62
    13a857d0
  • Hecai Yuan's avatar
    loongarch: Add some optimized itx functions for 8bpc · f398bf96
    Hecai Yuan authored and Hecai Yuan's avatar Hecai Yuan committed
    1. inv_txfm_add_dct_dct_32x16_8bpc_lsx
    2. inv_txfm_add_dct_dct_32x8_8bpc_lsx
    3. inv_txfm_add_dct_dct_64x32_8bpc_lsx
    4. inv_txfm_add_adst_flipadst_16x16_8bpc_lsx
    5. inv_txfm_add_flipadst_adst_16x16_8bpc_lsx
    6. inv_txfm_add_adst_adst_16x16_8bpc_lasx
    
    Relative speedup over C code:
    
    inv_txfm_add_32x16_dct_dct_0_8bpc_c:                 78.4 ( 1.00x)
    inv_txfm_add_32x16_dct_dct_0_8bpc_lsx:                5.7 (13.81x)
    inv_txfm_add_32x16_dct_dct_1_8bpc_c:                710.1 ( 1.00x)
    inv_txfm_add_32x16_dct_dct_1_8bpc_lsx:              102.9 ( 6.90x)
    inv_txfm_add_32x16_dct_dct_2_8bpc_c:                918.0 ( 1.00x)
    inv_txfm_add_32x16_dct_dct_2_8bpc_lsx:              103.2 ( 8.90x)
    inv_txfm_add_32x16_dct_dct_3_8bpc_c:                914.3 ( 1.00x)
    inv_txfm_add_32x16_dct_dct_3_8bpc_lsx:              103.2 ( 8.86x)
    inv_txfm_add_32x16_dct_dct_4_8bpc_c:                929.8 ( 1.00x)
    inv_txfm_add_32x16_dct_dct_4_8bpc_lsx:              102.9 ( 9.03x)
    
    inv_txfm_add_32x8_dct_dct_0_8bpc_c:                  39.6 ( 1.00x)
    inv_txfm_add_32x8_dct_dct_0_8bpc_lsx:                 3.0 (13.10x)
    inv_txfm_add_32x8_dct_dct_1_8bpc_c:                 431.6 ( 1.00x)
    inv_txfm_add_32x8_dct_dct_1_8bpc_lsx:                42.6 (10.13x)
    inv_txfm_add_32x8_dct_dct_2_8bpc_c:                 431.5 ( 1.00x)
    inv_txfm_add_32x8_dct_dct_2_8bpc_lsx:                42.6 (10.13x)
    inv_txfm_add_32x8_dct_dct_3_8bpc_c:                 432.0 ( 1.00x)
    inv_txfm_add_32x8_dct_dct_3_8bpc_lsx:                42.6 (10.14x)
    inv_txfm_add_32x8_dct_dct_4_8bpc_c:                 431.3 ( 1.00x)
    inv_txfm_add_32x8_dct_dct_4_8bpc_lsx:                42.6 (10.13x)
    
    inv_txfm_add_64x32_dct_dct_0_8bpc_c:                304.3 ( 1.00x)
    inv_txfm_add_64x32_dct_dct_0_8bpc_lsx:               20.3 (15.01x)
    inv_txfm_add_64x32_dct_dct_1_8bpc_c:               2743.1 ( 1.00x)
    inv_txfm_add_64x32_dct_dct_1_8bpc_lsx:              270.9 (10.13x)
    inv_txfm_add_64x32_dct_dct_2_8bpc_c:               3197.1 ( 1.00x)
    inv_txfm_add_64x32_dct_dct_2_8bpc_lsx:              327.7 ( 9.76x)
    inv_txfm_add_64x32_dct_dct_3_8bpc_c:               3638.3 ( 1.00x)
    inv_txfm_add_64x32_dct_dct_3_8bpc_lsx:              383.7 ( 9.48x)
    inv_txfm_add_64x32_dct_dct_4_8bpc_c:               4084.5 ( 1.00x)
    inv_txfm_add_64x32_dct_dct_4_8bpc_lsx:              441.7 ( 9.25x)
    
    inv_txfm_add_16x16_adst_flipadst_0_8bpc_c:          277.3 ( 1.00x)
    inv_txfm_add_16x16_adst_flipadst_0_8bpc_lsx:         58.7 ( 4.72x)
    inv_txfm_add_16x16_adst_flipadst_1_8bpc_c:          358.1 ( 1.00x)
    inv_txfm_add_16x16_adst_flipadst_1_8bpc_lsx:         58.7 ( 6.10x)
    inv_txfm_add_16x16_adst_flipadst_2_8bpc_c:          449.3 ( 1.00x)
    inv_txfm_add_16x16_adst_flipadst_2_8bpc_lsx:         58.7 ( 7.65x)
    
    inv_txfm_add_16x16_flipadst_adst_0_8bpc_c:          277.2 ( 1.00x)
    inv_txfm_add_16x16_flipadst_adst_0_8bpc_lsx:         58.7 ( 4.72x)
    inv_txfm_add_16x16_flipadst_adst_1_8bpc_c:          358.7 ( 1.00x)
    inv_txfm_add_16x16_flipadst_adst_1_8bpc_lsx:         58.7 ( 6.11x)
    inv_txfm_add_16x16_flipadst_adst_2_8bpc_c:          450.4 ( 1.00x)
    inv_txfm_add_16x16_flipadst_adst_2_8bpc_lsx:         58.7 ( 7.67x)
    
    inv_txfm_add_16x16_adst_adst_0_8bpc_c:              253.4 ( 1.00x)
    inv_txfm_add_16x16_adst_adst_0_8bpc_lasx:            23.1 (10.98x)
    inv_txfm_add_16x16_adst_adst_1_8bpc_c:              325.2 ( 1.00x)
    inv_txfm_add_16x16_adst_adst_1_8bpc_lasx:            23.1 (14.08x)
    inv_txfm_add_16x16_adst_adst_2_8bpc_c:              405.9 ( 1.00x)
    inv_txfm_add_16x16_adst_adst_2_8bpc_lasx:            23.1 (17.56x)
    
    Change-Id: Iaa5419a830c3308e2c4c9ac5b3699c3a971301ed
    f398bf96
  • pengxu's avatar
    Loongarch: Optimized ipred_filter 8bpc functions by LSX · 7f891597
    pengxu authored and Hecai Yuan's avatar Hecai Yuan committed
    intra_pred_filter_w4_8bpc_c:          17.9 ( 1.00x)
    intra_pred_filter_w4_8bpc_lsx:         8.9 ( 2.00x)
    intra_pred_filter_w8_8bpc_c:          55.3 ( 1.00x)
    intra_pred_filter_w8_8bpc_lsx:        23.8 ( 2.33x)
    intra_pred_filter_w16_8bpc_c:        109.4 ( 1.00x)
    intra_pred_filter_w16_8bpc_lsx:       49.1 ( 2.23x)
    intra_pred_filter_w32_8bpc_c:        270.2 ( 1.00x)
    intra_pred_filter_w32_8bpc_lsx:      126.1 ( 2.14x)
    
    Change-Id: Ic4c23cb1d54d5f8557c31cdfbbd54f8beaaa32c2
    7f891597
  • guxiwei's avatar
    LoongArch64: Implement checked_call() · e3101ddc
    guxiwei authored and Hecai Yuan's avatar Hecai Yuan committed
    Now checkasm calls the test function 'func_new' through
    the wrapper 'checked_call' instead of calling it directly.
    The purpose of the wrapper is to check if 'func_new' correctly
    saves and restores static registers. The wrapper writes dirty
    values to the static registers, and after calling 'func_new',
    it checks if the dirty values in the static registers remain consistent.
    
    Change-Id: Ia9290b55ab0f2dd87801f6fd175813d3f717d851
    e3101ddc
  • zhoupeng's avatar
    Loongarch: Optimized emu_edge_c function by LSX · 7c63bb1b
    zhoupeng authored and Hecai Yuan's avatar Hecai Yuan committed
    emu_edge_w4_8bpc_c:        9.0 ( 1.00x)
    emu_edge_w4_8bpc_lsx:      6.7 ( 1.34x)
    emu_edge_w8_8bpc_c:       12.9 ( 1.00x)
    emu_edge_w8_8bpc_lsx:      9.2 ( 1.40x)
    emu_edge_w16_8bpc_c:       20.0 ( 1.00x)
    emu_edge_w16_8bpc_lsx:     16.3 ( 1.23x)
    emu_edge_w32_8bpc_c:       44.6 ( 1.00x)
    emu_edge_w32_8bpc_lsx:     33.3 ( 1.34x)
    emu_edge_w64_8bpc_c:       79.9 ( 1.00x)
    emu_edge_w64_8bpc_lsx:     66.2 ( 1.21x)
    emu_edge_w128_8bpc_c:      193.9 ( 1.00x)
    emu_edge_w128_8bpc_lsx:    197.8 ( 0.98x)
    
    Change-Id: I180c94d311509740b03793419d5790a931532980
    7c63bb1b
......@@ -310,6 +310,8 @@ static int cdef_find_dir_c(const pixel *img, const ptrdiff_t stride,
#include "src/ppc/cdef.h"
#elif ARCH_X86
#include "src/x86/cdef.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/cdef.h"
#endif
#endif
......@@ -326,6 +328,8 @@ COLD void bitfn(dav1d_cdef_dsp_init)(Dav1dCdefDSPContext *const c) {
cdef_dsp_init_ppc(c);
#elif ARCH_X86
cdef_dsp_init_x86(c);
#elif ARCH_LOONGARCH64
cdef_dsp_init_loongarch(c);
#endif
#endif
}
......@@ -734,6 +734,8 @@ static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
#include "src/arm/ipred.h"
#elif ARCH_X86
#include "src/x86/ipred.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/ipred.h"
#endif
#endif
......@@ -769,6 +771,8 @@ COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
intra_pred_dsp_init_arm(c);
#elif ARCH_X86
intra_pred_dsp_init_x86(c);
#elif ARCH_LOONGARCH64
intra_pred_dsp_init_loongarch(c);
#endif
#endif
}
This diff is collapsed.
/*
* Copyright © 2024, VideoLAN and dav1d authors
* Copyright © 2024, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_CDEF_H
#define DAV1D_SRC_LOONGARCH_CDEF_H
#include "config.h"
#include "src/cdef.h"
#include "src/cpu.h"
/* Declare the LSX entry points for CDEF: one direction-search routine and
 * one filter routine per block size (4x4, 4x8, 8x8). BF() forms the final
 * symbol name from the base name and the "lsx" suffix.
 * NOTE(review): the decl_cdef_* macros are presumably provided by
 * src/cdef.h (included above) -- confirm. */
decl_cdef_dir_fn(BF(dav1d_cdef_find_dir, lsx));
decl_cdef_fn(BF(dav1d_cdef_filter_block_4x4, lsx));
decl_cdef_fn(BF(dav1d_cdef_filter_block_4x8, lsx));
decl_cdef_fn(BF(dav1d_cdef_filter_block_8x8, lsx));
/* Point the CDEF DSP context `c` at the LSX routines.
 *
 * The context is left untouched unless the runtime CPU flags report LSX.
 * The overrides themselves are only compiled into the 8-bit build
 * (BITDEPTH == 8); for other bit depths this function changes nothing. */
static ALWAYS_INLINE void cdef_dsp_init_loongarch(Dav1dCdefDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();
    const int has_lsx = (cpu_flags & DAV1D_LOONGARCH_CPU_FLAG_LSX) != 0;

    if (has_lsx) {
#if BITDEPTH == 8
        /* Direction search. */
        c->dir = BF(dav1d_cdef_find_dir, lsx);

        /* Filter kernels: fb[0] = 8x8, fb[1] = 4x8, fb[2] = 4x4. */
        c->fb[0] = BF(dav1d_cdef_filter_block_8x8, lsx);
        c->fb[1] = BF(dav1d_cdef_filter_block_4x8, lsx);
        c->fb[2] = BF(dav1d_cdef_filter_block_4x4, lsx);
#endif
    }
}
#endif /* DAV1D_SRC_LOONGARCH_CDEF_H */
This diff is collapsed.
/*
* Copyright © 2024, VideoLAN and dav1d authors
* Copyright © 2024, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_LOONGARCH_IPRED_H
#define DAV1D_SRC_LOONGARCH_IPRED_H
#include "config.h"
#include "src/ipred.h"
#include "src/cpu.h"
#include "src/tables.h"
/* 16-bit fixed-point constants: 0x5556 is 2^16/3 rounded up and 0x3334 is
 * 2^16/5 rounded up; BASE_SHIFT is the matching shift amount (16).
 * NOTE(review): none of the three is referenced in this header; presumably
 * they serve the DC averaging of 1:2 and 1:4 rectangular blocks -- confirm. */
#define MULTIPLIER_1x2 0x5556
#define MULTIPLIER_1x4 0x3334
#define BASE_SHIFT 16
/* init_fn(type0, type1, name, suffix): assign BF(dav1d_<name>, <suffix>) to
 * c->type0[type1]. Expands to a bare assignment (no trailing semicolon) and
 * requires a variable named `c` to be in scope at the point of use. */
#define init_fn(type0, type1, name, suffix) \
c->type0[type1] = BF(dav1d_##name, suffix)
/* Shorthand for filling an entry of c->intra_pred[]. */
#define init_angular_ipred_fn(type, name, suffix) \
init_fn(intra_pred, type, name, suffix)
/* Shorthand for filling an entry of c->cfl_pred[]. */
#define init_cfl_pred_fn(type, name, suffix) \
init_fn(cfl_pred, type, name, suffix)
/* Declare the LSX intra-prediction entry points that
 * intra_pred_dsp_init_loongarch() installs. BF() forms the final symbol
 * name from the base name and the "lsx" suffix.
 * NOTE(review): the decl_*_fn macros are presumably provided by
 * src/ipred.h (included above) -- confirm. */

/* Predictors stored in c->intra_pred[]. */
decl_angular_ipred_fn(BF(dav1d_ipred_dc, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_h, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_v, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_paeth, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_smooth, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, lsx));
decl_angular_ipred_fn(BF(dav1d_ipred_filter, lsx));
/* Predictors stored in c->cfl_pred[]. */
decl_cfl_pred_fn(BF(dav1d_ipred_cfl, lsx));
decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, lsx));
decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, lsx));
decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, lsx));
/* Predictor stored in c->pal_pred. */
decl_pal_pred_fn(BF(dav1d_pal_pred, lsx));
/* Point the intra-prediction DSP context `c` at the LSX routines.
 *
 * The context is left untouched unless the runtime CPU flags report LSX.
 * The overrides themselves are only compiled into the 8-bit build
 * (BITDEPTH == 8); for other bit depths this function changes nothing. */
static ALWAYS_INLINE void intra_pred_dsp_init_loongarch(Dav1dIntraPredDSPContext *const c) {
    const unsigned cpu_flags = dav1d_get_cpu_flags();
    const int has_lsx = (cpu_flags & DAV1D_LOONGARCH_CPU_FLAG_LSX) != 0;

    if (has_lsx) {
#if BITDEPTH == 8
        /* c->intra_pred[]: DC variants. */
        init_angular_ipred_fn(DC_PRED, ipred_dc, lsx);
        init_angular_ipred_fn(DC_128_PRED, ipred_dc_128, lsx);
        init_angular_ipred_fn(TOP_DC_PRED, ipred_dc_top, lsx);
        init_angular_ipred_fn(LEFT_DC_PRED, ipred_dc_left, lsx);

        /* c->intra_pred[]: horizontal, vertical, paeth, smooth and filter. */
        init_angular_ipred_fn(HOR_PRED, ipred_h, lsx);
        init_angular_ipred_fn(VERT_PRED, ipred_v, lsx);
        init_angular_ipred_fn(PAETH_PRED, ipred_paeth, lsx);
        init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, lsx);
        init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, lsx);
        init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, lsx);
        init_angular_ipred_fn(FILTER_PRED, ipred_filter, lsx);

        /* c->cfl_pred[]: indexed by the same DC mode constants. */
        init_cfl_pred_fn(DC_PRED, ipred_cfl, lsx);
        init_cfl_pred_fn(DC_128_PRED, ipred_cfl_128, lsx);
        init_cfl_pred_fn(TOP_DC_PRED, ipred_cfl_top, lsx);
        init_cfl_pred_fn(LEFT_DC_PRED, ipred_cfl_left, lsx);

        /* Palette prediction. */
        c->pal_pred = BF(dav1d_pal_pred, lsx);
#endif
    }
}
#endif /* DAV1D_SRC_LOONGARCH_IPRED_H */
This diff is collapsed.
......@@ -31,67 +31,18 @@
#include "src/cpu.h"
#include "src/itx.h"
decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_4x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_4x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_8x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_8x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x8, lsx));
decl_itx17_fns( 4, 4, lsx);
decl_itx16_fns( 4, 8, lsx);
decl_itx16_fns( 4, 16, lsx);
decl_itx16_fns( 8, 4, lsx);
decl_itx16_fns( 8, 8, lsx);
decl_itx16_fns( 8, 16, lsx);
decl_itx2_fns ( 8, 32, lsx);
decl_itx16_fns(16, 8, lsx);
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_16x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x4, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lsx));
......@@ -99,14 +50,23 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_16x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_8x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x8, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x16, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, lsx));
decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_16x16, lasx));
static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c, int bpc) {
#if BITDEPTH == 8
const unsigned flags = dav1d_get_cpu_flags();
......@@ -115,67 +75,20 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c
if (BITDEPTH != 8 ) return;
c->itxfm_add[TX_4X4][WHT_WHT] = dav1d_inv_txfm_add_wht_wht_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][IDTX] = dav1d_inv_txfm_add_identity_identity_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_4x4_8bpc_lsx;
c->itxfm_add[TX_4X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_4x4_8bpc_lsx;
c->itxfm_add[RTX_4X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_4x8_8bpc_lsx;
c->itxfm_add[RTX_8X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][IDTX] = dav1d_inv_txfm_add_identity_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x4_8bpc_lsx;
c->itxfm_add[RTX_8X4][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x4_8bpc_lsx;
c->itxfm_add[TX_8X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][IDTX] = dav1d_inv_txfm_add_identity_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][FLIPADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][H_DCT] = dav1d_inv_txfm_add_dct_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][V_DCT] = dav1d_inv_txfm_add_identity_dct_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][H_FLIPADST] = dav1d_inv_txfm_add_flipadst_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][V_FLIPADST] = dav1d_inv_txfm_add_identity_flipadst_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][H_ADST] = dav1d_inv_txfm_add_adst_identity_8x8_8bpc_lsx;
c->itxfm_add[TX_8X8][V_ADST] = dav1d_inv_txfm_add_identity_adst_8x8_8bpc_lsx;
c->itxfm_add[RTX_8X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x16_8bpc_lsx;
c->itxfm_add[RTX_8X16][IDTX] = dav1d_inv_txfm_add_identity_identity_8x16_8bpc_lsx;
c->itxfm_add[RTX_8X16][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_8x16_8bpc_lsx;
c->itxfm_add[RTX_8X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_8x16_8bpc_lsx;
c->itxfm_add[RTX_16X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x8_8bpc_lsx;
c->itxfm_add[RTX_16X8][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x8_8bpc_lsx;
assign_itx17_fn( , 4, 4, lsx);
assign_itx16_fn(R, 4, 8, lsx);
assign_itx16_fn(R, 4, 16, lsx);
assign_itx16_fn(R, 8, 4, lsx);
assign_itx16_fn( , 8, 8, lsx);
assign_itx16_fn(R, 8, 16, lsx);
assign_itx2_fn (R, 8, 32, lsx);
assign_itx16_fn(R, 16, 8, lsx);
assign_itx1_fn (R, 64, 32, lsx);
assign_itx1_fn ( , 64, 64, lsx);
c->itxfm_add[RTX_16X4][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x4_8bpc_lsx;
c->itxfm_add[RTX_16X4][IDTX] = dav1d_inv_txfm_add_identity_identity_16x4_8bpc_lsx;
c->itxfm_add[RTX_16X4][DCT_ADST] = dav1d_inv_txfm_add_adst_dct_16x4_8bpc_lsx;
c->itxfm_add[TX_16X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lsx;
......@@ -183,12 +96,23 @@ static ALWAYS_INLINE void itx_dsp_init_loongarch(Dav1dInvTxfmDSPContext *const c
c->itxfm_add[TX_16X16][ADST_DCT] = dav1d_inv_txfm_add_dct_adst_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][DCT_FLIPADST] = dav1d_inv_txfm_add_flipadst_dct_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][FLIPADST_DCT] = dav1d_inv_txfm_add_dct_flipadst_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][FLIPADST_ADST] = dav1d_inv_txfm_add_adst_flipadst_16x16_8bpc_lsx;
c->itxfm_add[TX_16X16][ADST_FLIPADST] = dav1d_inv_txfm_add_flipadst_adst_16x16_8bpc_lsx;
c->itxfm_add[RTX_16X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_16x32_8bpc_lsx;
c->itxfm_add[RTX_8X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_8x32_8bpc_lsx;
c->itxfm_add[RTX_32X8][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x8_8bpc_lsx;
c->itxfm_add[RTX_32X16][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x16_8bpc_lsx;
c->itxfm_add[TX_32X32][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_32x32_8bpc_lsx;
c->itxfm_add[TX_64X64][DCT_DCT] = dav1d_inv_txfm_add_dct_dct_64x64_8bpc_lsx;
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
if (BITDEPTH != 8 ) return;
c->itxfm_add[TX_16X16][ADST_ADST] = dav1d_inv_txfm_add_adst_adst_16x16_8bpc_lasx;
#endif
}
......
This diff is collapsed.
......@@ -43,6 +43,10 @@ decl_mask_fn(BF(dav1d_mask, lsx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lsx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lsx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lsx));
decl_blend_fn(BF(dav1d_blend, lsx));
decl_blend_dir_fn(BF(dav1d_blend_v, lsx));
decl_blend_dir_fn(BF(dav1d_blend_h, lsx));
decl_emu_edge_fn(BF(dav1d_emu_edge, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular, lsx));
decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, lsx));
......@@ -60,6 +64,17 @@ decl_mask_fn(BF(dav1d_mask, lasx));
decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, lasx));
decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, lasx));
decl_w_mask_fn(BF(dav1d_w_mask_420, lasx));
decl_blend_dir_fn(BF(dav1d_blend_h, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, lsx));
decl_mct_fn(BF(dav1d_prep_8tap_regular, lasx));
decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, lasx));
......@@ -83,6 +98,10 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
c->warp8x8 = BF(dav1d_warp_affine_8x8, lsx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lsx);
c->w_mask[2] = BF(dav1d_w_mask_420, lsx);
c->blend = BF(dav1d_blend, lsx);
c->blend_v = BF(dav1d_blend_v, lsx);
c->blend_h = BF(dav1d_blend_h, lsx);
c->emu_edge = BF(dav1d_emu_edge, lsx);
init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
......@@ -94,6 +113,16 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lsx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lsx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, lsx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, lsx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, lsx);
init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, lsx);
init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, lsx);
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, lsx);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, lsx);
if (!(flags & DAV1D_LOONGARCH_CPU_FLAG_LASX)) return;
c->avg = BF(dav1d_avg, lasx);
......@@ -102,6 +131,7 @@ static ALWAYS_INLINE void mc_dsp_init_loongarch(Dav1dMCDSPContext *const c) {
c->warp8x8 = BF(dav1d_warp_affine_8x8, lasx);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, lasx);
c->w_mask[2] = BF(dav1d_w_mask_420, lasx);
c->blend_h = BF(dav1d_blend_h, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, lasx);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, lasx);
......
......@@ -31,6 +31,12 @@ const min_prob
.short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
endconst
// Eight 16-bit lanes, each 0xff00 (16 bytes in total).
// msac_decode_hi_tok_lsx loads this table into vr4, and HI_TOK ANDs it with
// the broadcast rng so that only the high byte of each rng lane is kept.
const ph_0xff00
.rept 8
.short 0xff00
.endr
endconst
.macro decode_symbol_adapt w
addi.d sp, sp, -48
addi.d a4, a0, 24
......@@ -281,6 +287,82 @@ function msac_decode_bool_lsx
move a0, t8
endfunc
// Decode one boolean with a fixed split point derived only from rng
// (no cdf argument, no adaptation).
//
// In:  a0 = context pointer. Fields accessed here, by byte offset:
//           0 buf_pos, 8 buf_end, 16 dif (64-bit), 24 rng (32-bit),
//           28 cnt (32-bit).
// Out: a0 = 1 if dif < (v << 48), otherwise 0.
// Clobbers t0-t8 and a5.
// The C-side prototype is presumably
//   unsigned dav1d_msac_decode_bool_equi_lsx(MsacContext *s);
// (assumes the function macro applies the dav1d_ prefix - TODO confirm).
function msac_decode_bool_equi_lsx
ld.w t0, a0, 24 // rng
ld.d t1, a0, 16 // dif
ld.w a5, a0, 28 // cnt
// v = ((rng >> 8) << 7) + 4
srli.w t2, t0, 8 // r >> 8
slli.w t2, t2, 7
addi.w t2, t2, 4 // v
slli.d t3, t2, 48 // vw = v << 48
sltu t4, t1, t3 // t4 = (dif < vw), unsigned compare
move t8, t4 // return value = (dif < vw)
xori t4, t4, 1 // t4 = (dif >= vw)
maskeqz t6, t3, t4 // t6 = (dif >= vw) ? vw : 0
sub.d t6, t1, t6 // dif -= t6
slli.w t5, t2, 1
sub.w t5, t0, t5 // r - 2v
maskeqz t7, t5, t4 // t7 = (dif >= vw) ? r - 2v : 0
add.w t5, t2, t7 // new rng: v when dif < vw, r - v otherwise
// renorm
clz.w t4, t5 // leading zeros of the new 32-bit rng
xori t4, t4, 16 // d = clz ^ 16
sll.d t6, t6, t4 // dif <<= d
sll.w t5, t5, t4 // rng <<= d
sub.w t7, a5, t4 // cnt-d
st.w t5, a0, 24 // store rng
bgeu a5, t4, 9f // old cnt >= d (unsigned): no refill needed
// refill
ld.d t0, a0, 0 // buf_pos
ld.d t1, a0, 8 // buf_end
addi.d t2, t0, 8
bltu t1, t2, 2f // fewer than 8 bytes left: take the tail path
// Fast path: at least 8 readable bytes at buf_pos.
ld.d t3, t0, 0 // next_bits
addi.w t1, t7, -48 // shift_bits = cnt + 16 (- 64)
nor t3, t3, t3 // next_bits = ~next_bits
sub.w t2, zero, t1 // -shift_bits
revb.d t3, t3 // next_bits = bswap(next_bits)
srli.w t2, t2, 3 // num_bytes_read
srl.d t3, t3, t1 // next_bits >>= (shift_bits & 63)
b 3f
1:
// Reached from label 2 when buf_pos >= buf_end: no input bytes are read and
// buf_pos is not advanced (the code jumps to 4, skipping the update at 3).
addi.w t3, t7, -48
srl.d t3, t3, t3 // pad with ones
b 4f
2:
// Tail path: fewer than 8 bytes remain before buf_end.
bgeu t0, t1, 1b // nothing left at all
ld.d t3, t1, -8 // next_bits: the 8 bytes that end at buf_end
sub.w t2, t2, t1 // (buf_pos + 8) - buf_end = bytes of overshoot
sub.w t1, t1, t0 // num_bytes_left
slli.w t2, t2, 3 // overshoot in bits
srl.d t3, t3, t2 // shift the overshoot bytes out of the loaded word
addi.w t2, t7, -48 // shift_bits, as in the fast path
nor t3, t3, t3
sub.w t4, zero, t2
revb.d t3, t3
srli.w t4, t4, 3 // bytes the fast path would have consumed
srl.d t3, t3, t2
sltu t2, t1, t4 // t2 = (num_bytes_left < wanted)
maskeqz t1, t1, t2 // t2 ? num_bytes_left : 0
masknez t2, t4, t2 // t2 ? 0 : wanted
or t2, t2, t1 // num_bytes_read = min(num_bytes_left, wanted)
3:
slli.w t1, t2, 3 // num_bits_read = num_bytes_read * 8
add.d t0, t0, t2 // buf_pos += num_bytes_read
add.w t7, t7, t1 // cnt += num_bits_read
st.d t0, a0, 0 // store buf_pos
4:
or t6, t6, t3 // dif |= next_bits
9:
st.w t7, a0, 28 // store cnt
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
function msac_decode_bool_adapt_lsx
ld.hu a3, a1, 0 // cdf[0] /f
ld.w t0, a0, 24 // rng
......@@ -374,3 +456,162 @@ function msac_decode_bool_adapt_lsx
st.d t6, a0, 16 // store dif
move a0, t8
endfunc
// Body of msac_decode_hi_tok_lsx, instantiated once with cdf adaptation
// (allow_update_cdf == 1) and once without (== 0). The macro argument is also
// pasted into every label so that both instantiations can live in one function.
//
// Expected on entry (all set up by msac_decode_hi_tok_lsx):
//   a0 = context pointer, a1 = cdf pointer
//   vr0 = cdf halfwords, vr2 = rng broadcast, vr3 = top 16 bits of dif broadcast
//   vr4 = 0xff00 lanes, vr5 = per-lane minimum-probability offsets
//   vr8 = all ones (only read on the adaptation path)
//   t1 = dif, t2 = cnt, t3 = loop control value
//   sp already lowered by 0x1a; sp+0 and sp+2.. are used as scratch
// On exit: a0 = (t3 + 0x1e) >> 1, dif and cnt are stored back, sp is restored.
// One pass through the body runs per decoded symbol; the pass repeats only
// while t3 is non-zero and the decoded lane index was 3.
.macro HI_TOK allow_update_cdf
.\allow_update_cdf\()_hi_tok_lsx_start:
.if \allow_update_cdf == 1
ld.hu a4, a1, 0x06 // cdf[3]
.endif
vor.v vr1, vr0, vr0 // vr1 = copy of the cdf lanes
vsrli.h vr1, vr1, 0x06 // cdf[val] >> EC_PROB_SHIFT
vstelm.h vr2, sp, 0, 0 // spill rng (lane 0 of vr2) to sp+0
vand.v vr2, vr2, vr4 // (8 x rng) & 0xff00
vslli.h vr1, vr1, 0x07
vmuh.hu vr1, vr1, vr2 // high 16 bits of the unsigned 16x16 products
vadd.h vr1, vr1, vr5 // v += EC_MIN_PROB/* 4 */ * ((unsigned)n_symbols/* 3 */ - val);
vst vr1, sp, 0x02 // spill all v lanes to sp+2 onward
vssub.hu vr1, vr1, vr3 // v - c
vseqi.h vr1, vr1, 0 // lane = all ones where the saturated v - c is 0 (v <= c)
.if \allow_update_cdf == 1
// Adaptation: a4 is the value loaded from cdf[3] above.
addi.d t4, a4, 0x50
srli.d t4, t4, 0x04 // shift amount = (cdf[3] + 80) >> 4
sltui t7, a4, 32
add.w a4, a4, t7 // cdf[3] += (cdf[3] < 32)
vreplgr2vr.h vr7, t4 // broadcast the shift amount
vavgr.hu vr9, vr8, vr1 // rounded unsigned average of all-ones (vr8) and the mask
vsub.h vr9, vr9, vr0
vsub.h vr0, vr0, vr1
vsra.h vr9, vr9, vr7 // arithmetic shift right by the amount in vr7
vadd.h vr0, vr0, vr9
vstelm.d vr0, a1, 0, 0 // write the low four halfwords back to cdf[0..3]
st.h a4, a1, 0x06 // then overwrite cdf[3] with the updated scalar value
.endif
vmsknz.b vr7, vr1 // one mask bit per non-zero byte of vr1
movfr2gr.s t4, f7
ctz.w t4, t4 // loop_times * 2
addi.d t7, t4, 2
ldx.hu t6, sp, t4 // u
ldx.hu t5, sp, t7 // v
addi.w t3, t3, 0x05
addi.w t4, t4, -0x05 // t4 - 5; the net effect on t3 after the add at norm_end is += original t4
sub.w t6, t6, t5 // u - v , rng for ctx_norm
slli.d t5, t5, 0x30 // (ec_win)v << (EC_WIN_SIZE - 16)
sub.d t1, t1, t5 // s->dif - ((ec_win)v << (EC_WIN_SIZE - 16))
// Init ctx_norm param
clz.w t7, t6
xori t7, t7, 0x1f
xori t7, t7, 0x0f // d = 15 ^ (31 ^ clz(rng));
sll.d t1, t1, t7 // dif << d
sll.d t6, t6, t7 // rng << d
// update vr2 8 x rng
vreplgr2vr.h vr2, t6
vreplvei.h vr2, vr2, 0
st.w t6, a0, 0x18 // store rng
move t0, t2 // keep the old cnt for the comparison below
sub.w t2, t2, t7 // cnt - d
bgeu t0, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end // old cnt >= d (unsigned): skip the refill; refill only when cnt < d
// Step into ctx_fill
ld.d t5, a0, 0x00 // buf_pos
ld.d t6, a0, 0x08 // end_pos
addi.d t7, t5, 0x08 // buf_pos + 8
sub.d t7, t7, t6 // (buf_pos + 8) - end_pos
blt zero, t7, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob
// (end_pos - buf_pos) >= 8
ld.d t6, t5, 0x00 // load buf_pos[0]~buf_pos[7]
addi.w t7, t2, -0x30 // cnt - 0x30
nor t6, t6, t6 // not buf data
revb.d t6, t6 // Byte reversal
srl.d t6, t6, t7 // Replace left shift with right shift
sub.w t7, zero, t7 // neg
srli.w t7, t7, 0x03 // Loop times
or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob:
bge t5, t6, .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one
// end_pos - buf_pos < 8 && buf_pos < end_pos
ld.d t0, t6, -0x08 // the 8 bytes that end at end_pos
slli.d t7, t7, 0x03 // overshoot ((buf_pos + 8) - end_pos) in bits
srl.d t6, t0, t7 // Retrieve the buf data and remove the excess data
addi.w t7, t2, -0x30 // cnt - 0x30
nor t6, t6, t6 // not
revb.d t6, t6 // Byte reversal
srl.d t6, t6, t7 // Replace left shift with right shift
sub.w t7, zero, t7 // neg
or t1, t1, t6 // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
ld.d t6, a0, 0x08 // end_pos
srli.w t7, t7, 0x03 // Loop times
sub.d t6, t6, t5 // end_pos - buf_pos
slt t0, t6, t7 // t0 = (bytes left < loop_times), signed compare
maskeqz a3, t6, t0 // min(loop_times, end_pos - buf_pos)
masknez t0, t7, t0
or t7, a3, t0 // t7 = bytes actually consumed (a3 is used as scratch)
b .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one:
// buf_pos >= end_pos
addi.w t7, t2, -0x10
andi t7, t7, 0xf
nor t0, zero, zero // all ones
srl.d t0, t0, t7
or t1, t1, t0 // dif |= ~(~(ec_win)0xff << c);
b .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
.\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end:
add.d t5, t5, t7 // buf_pos + Loop_times
st.d t5, a0, 0x00 // Store buf_pos
alsl.w t2, t7, t2, 0x03 // update cnt: cnt += bytes consumed * 8
.\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end:
srli.d t7, t1, 0x30
vreplgr2vr.h vr3, t7 // broadcast the high 16 bits of dif
add.w t3, t4, t3 // update control parameter
beqz t3, .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times.
blt zero, t4, .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3
.\allow_update_cdf\()_hi_tok_lsx_end:
addi.d t3, t3, 0x1e
st.d t1, a0, 0x10 // store dif
st.w t2, a0, 0x1c // store cnt
srli.w a0, t3, 0x01 // tok
addi.d sp, sp, 0x1a // release the scratch area
.endm
/**
 * LSX implementation of the hi_tok decoder.
 * C-side prototype (from the LoongArch msac header):
 *   unsigned dav1d_msac_decode_hi_tok_lsx(MsacContext *s, uint16_t *cdf)
 * a0 = s, a1 = cdf; the decoded token is returned in a0.
 *
 * Context fields accessed, by byte offset from a0:
 *   0x00 buf_pos, 0x08 end_pos, 0x10 dif, 0x18 rng, 0x1c cnt,
 *   0x20 allow_update_cdf
 *
 * Register allocation
 * * vr0: cdf;
 * * vr1: temp;
 * * vr2: rng;
 * * vr3: dif;
 * * vr4: const 0xff00ff00...ff00ff00;
 * * vr5: const 0x0004080c;
 * * vr6: const 0;
 * * vr7, vr9: temp (used inside HI_TOK);
 * * vr8: all ones (read by the cdf-update path of HI_TOK);
 * * t0: allow_update_cdf, tmp;
 * * t1: dif;
 * * t2: cnt;
 * * t3: 0xffffffe8, outermost control parameter;
 * * t4: loop time
 * * t5: v, buf_pos, temp;
 * * t6: u, rng, end_pos, buf, temp;
 * * t7: temp;
 *
 * NOTE(review): sp is lowered by 0x1a (26) bytes, so it is not 16-byte
 * aligned while the body runs. No call is made in between, but confirm
 * this is acceptable for the target ABI.
 */
function msac_decode_hi_tok_lsx
fld.d f0, a1, 0 // Load cdf[0]~cdf[3]
vldrepl.h vr2, a0, 0x18 // 8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid
vldrepl.h vr3, a0, 0x16 // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16)
ld.w t0, a0, 0x20 // allow_update_cdf
la.local t7, ph_0xff00
vld vr4, t7, 0x00 // 0xff00ff00...ff00ff00
la.local t7, min_prob
// NOTE(review): vld reads 16 bytes starting 24 bytes into min_prob. If the
// table is only the 16 halfwords visible above it, the upper 8 bytes come
// from whatever follows the table - confirm those lanes are never relied on.
vld vr5, t7, 12 * 2 // 0x0004080c
vxor.v vr6, vr6, vr6 // const 0
ld.d t1, a0, 0x10 // dif
ld.w t2, a0, 0x1c // cnt
// Build t3 = 0xffffffe8 without a literal: all ones, keep the low 32 bits,
// then subtract 0x17.
orn t3, t3, t3
srli.d t3, t3, 32
addi.d t3, t3, -0x17 // 0xffffffe8
vseq.h vr8, vr8, vr8 // vr8 = all ones
addi.d sp, sp, -0x1a // alloc stack
beqz t0, .hi_tok_lsx_no_update_cdf
HI_TOK 1
jirl zero, ra, 0x0 // return from the cdf-updating variant
.hi_tok_lsx_no_update_cdf:
HI_TOK 0
endfunc
......@@ -36,11 +36,15 @@ unsigned dav1d_msac_decode_symbol_adapt16_lsx(MsacContext *s, uint16_t *cdf,
size_t n_symbols);
unsigned dav1d_msac_decode_bool_adapt_lsx(MsacContext *s, uint16_t *cdf);
unsigned dav1d_msac_decode_bool_lsx(MsacContext *s, unsigned f);
unsigned dav1d_msac_decode_bool_equi_lsx(MsacContext *s);
unsigned dav1d_msac_decode_hi_tok_lsx(MsacContext *s, uint16_t *cdf);
#define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_lsx
#define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_lsx
#define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_lsx
#define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_lsx
#define dav1d_msac_decode_bool dav1d_msac_decode_bool_lsx
#define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_lsx
#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_lsx
#endif /* DAV1D_SRC_LOONGARCH_MSAC_H */
......@@ -239,6 +239,8 @@ if is_asm_enabled
)}
libdav1d_sources_asm = files(
'loongarch/cdef.S',
'loongarch/ipred.S',
'loongarch/mc.S',
'loongarch/loopfilter.S',
'loongarch/looprestoration.S',
......
......@@ -334,6 +334,17 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
checkasm_set_signal_handler_state(0)
#elif ARCH_RISCV
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, int, int,\
__VA_ARGS__, int, int, int, int, int, int, int, int,\
int, int, int, int, int, int, int) =\
(void *)checkasm_checked_call;
#define call_new(...)\
(checkasm_set_signal_handler_state(1),\
checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
checkasm_set_signal_handler_state(0)
#elif ARCH_LOONGARCH
#define declare_new(ret, ...)\
ret (*checked_call)(void *, int, int, int, int, int, int, int,\
__VA_ARGS__, int, int, int, int, int, int, int, int,\
......
/******************************************************************************
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2024, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define PRIVATE_PREFIX checkasm_
#include "src/loongarch/loongson_asm.S"
// Arbitrary 64-bit patterns used by checked_call to detect clobbered
// callee-saved registers. checked_call loads the entries at byte offsets
// 0..64 into s0-s8 and the entries at offsets 72..128 into fs0-fs7 before
// calling the function under test, and compares the registers against the
// same entries afterwards. The 18th entry (offset 136) is not referenced by
// checked_call.
const register_init, align=4
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
.quad 0x8bda43d3fd1a7e06
.quad 0xb64a9c9e5d318408
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
.quad 0x1a1b2550a612b48c
.quad 0x79445c159ce79064
.quad 0x2eed899d5a28ddcd
.quad 0x86b2536fcd8cf636
.quad 0xb0856806085e7943
.quad 0x3f2bf84fc0fcca4e
.quad 0xacbd382dcf5b8de2
.quad 0xd229e1f5b281303f
.quad 0x71aeaff20b095fd9
.quad 0xab63e2e11fa38ed9
endconst
// NUL-terminated message that checked_call passes to puts when a
// callee-saved register does not hold its register_init value after the call.
const error_message
.asciz "failed to preserve register"
endconst
// max number of args used by any asm function.
#define MAX_ARGS 15
#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15)
// Fill the CLOBBER_STACK bytes directly below the stack pointer with dirty
// data, so that a function called afterwards cannot rely on stale stack
// contents.
//
// In: a0, a1 = the two 64-bit patterns written alternately.
// Clobbers t0 and t1; sp is restored before returning.
//
// Every store targets an address strictly below the entry sp. Storing at
// sp+0 would overwrite memory that this routine did not allocate and would
// leave the lowest 8 bytes of the clobber range untouched.
function stack_clobber
    move    t0, sp                    // remember the entry sp
    addi.d  t1, zero, CLOBBER_STACK   // bytes still to fill
1:
    st.d    a0, sp, -0x10
    st.d    a1, sp, -0x08
    addi.d  sp, sp, -0x10             // step down one 16-byte pair
    addi.d  t1, t1, -0x10
    blt     zero, t1, 1b
    move    sp, t0                    // restore the entry sp
endfunc
#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15)
// Call a function under test and verify that it preserves the callee-saved
// registers s0-s8 and fs0-fs7.
//
// In:  a0 = function to call
//      a1 = saved and later used as the address of an "ok" flag that is
//           zeroed on failure
//           NOTE(review): confirm the C-side call passes a valid pointer here
//      The arguments for the function under test are read from the stack of
//      the caller: the first 8 from entry sp + 0..56 (loaded into a0-a7), the
//      remaining MAX_ARGS - 8 from entry sp + 64 onward (copied to the new
//      outgoing argument area).
// Out: whatever the function under test left in the return registers, unless
//      the failure path ran (puts overwrites a0).
//
// Frame layout, relative to the entry sp (high to low):
//   144 bytes  saved s0-s8 and fs0-fs7 (136 bytes used, 8 bytes padding)
//    16 bytes  saved a1 ("ok") and ra
//   ARG_STACK  outgoing stack arguments
// The total is a multiple of 16, so sp is 16-byte aligned when the function
// under test is called, and again when puts is called on the failure path.
function checked_call
    // Save s0 - s8, fs0 - fs7
    move    t4, sp                    // t4 = entry sp, base of the incoming args
    addi.d  sp, sp, -144              // 17 registers * 8 = 136, padded to 144
    st.d    s0, sp, 0
    st.d    s1, sp, 8
    st.d    s2, sp, 16
    st.d    s3, sp, 24
    st.d    s4, sp, 32
    st.d    s5, sp, 40
    st.d    s6, sp, 48
    st.d    s7, sp, 56
    st.d    s8, sp, 64
    fst.d   fs0, sp, 72
    fst.d   fs1, sp, 80
    fst.d   fs2, sp, 88
    fst.d   fs3, sp, 96
    fst.d   fs4, sp, 104
    fst.d   fs5, sp, 112
    fst.d   fs6, sp, 120
    fst.d   fs7, sp, 128

    // Seed every callee-saved register with a known pattern
    la.local t1, register_init
    ld.d    s0, t1, 0
    ld.d    s1, t1, 8
    ld.d    s2, t1, 16
    ld.d    s3, t1, 24
    ld.d    s4, t1, 32
    ld.d    s5, t1, 40
    ld.d    s6, t1, 48
    ld.d    s7, t1, 56
    ld.d    s8, t1, 64
    fld.d   fs0, t1, 72
    fld.d   fs1, t1, 80
    fld.d   fs2, t1, 88
    fld.d   fs3, t1, 96
    fld.d   fs4, t1, 104
    fld.d   fs5, t1, 112
    fld.d   fs6, t1, 120
    fld.d   fs7, t1, 128

    addi.d  sp, sp, -16
    st.d    a1, sp, 0                 // ok
    st.d    ra, sp, 8                 // Ret address

    // Copy the stack-passed arguments into the new outgoing area
    addi.d  sp, sp, -ARG_STACK
    addi.d  t0, zero, 8*8             // source offset from the entry sp
    xor     t1, t1, t1                // destination offset from the new sp
.rept MAX_ARGS - 8
    // Skip the first 8 args, that are loaded into registers
    ldx.d   t2, t4, t0
    stx.d   t2, sp, t1
    addi.d  t0, t0, 8
    addi.d  t1, t1, 8
.endr
    move    t3, a0                    // Func
    ld.d    a0, t4, 0
    ld.d    a1, t4, 8
    ld.d    a2, t4, 16
    ld.d    a3, t4, 24
    ld.d    a4, t4, 32
    ld.d    a5, t4, 40
    ld.d    a6, t4, 48
    ld.d    a7, t4, 56
    jirl    ra, t3, 0

    addi.d  sp, sp, ARG_STACK
    ld.d    t2, sp, 0                 // ok
    ld.d    ra, sp, 8                 // Ret address
    addi.d  sp, sp, 16

    // t3 accumulates the XOR difference of every checked register
    la.local t1, register_init
    xor     t3, t3, t3
.macro check_reg_gr reg1
    ld.d    t0, t1, 0
    xor     t0, $s\reg1, t0
    or      t3, t3, t0
    addi.d  t1, t1, 8
.endm
    check_reg_gr 0
    check_reg_gr 1
    check_reg_gr 2
    check_reg_gr 3
    check_reg_gr 4
    check_reg_gr 5
    check_reg_gr 6
    check_reg_gr 7
    check_reg_gr 8
.macro check_reg_fr reg1
    ld.d    t0, t1, 0
    movfr2gr.d t4, $fs\reg1
    xor     t0, t0, t4
    or      t3, t3, t0
    addi.d  t1, t1, 8
.endm
    check_reg_fr 0
    check_reg_fr 1
    check_reg_fr 2
    check_reg_fr 3
    check_reg_fr 4
    check_reg_fr 5
    check_reg_fr 6
    check_reg_fr 7

    beqz    t3, 0f                    // every register matched
    st.d    zero, t2, 0x00            // Set OK to 0
    la.local a0, error_message
    addi.d  sp, sp, -16               // 16 rather than 8 keeps sp aligned for puts
    st.d    ra, sp, 0
    bl      puts
    ld.d    ra, sp, 0
    addi.d  sp, sp, 16
0:
    // Restore the registers of the caller and release the frame
    ld.d    s0, sp, 0
    ld.d    s1, sp, 8
    ld.d    s2, sp, 16
    ld.d    s3, sp, 24
    ld.d    s4, sp, 32
    ld.d    s5, sp, 40
    ld.d    s6, sp, 48
    ld.d    s7, sp, 56
    ld.d    s8, sp, 64
    fld.d   fs0, sp, 72
    fld.d   fs1, sp, 80
    fld.d   fs2, sp, 88
    fld.d   fs3, sp, 96
    fld.d   fs4, sp, 104
    fld.d   fs5, sp, 112
    fld.d   fs6, sp, 120
    fld.d   fs7, sp, 128
    addi.d  sp, sp, 144
endfunc
......@@ -280,6 +280,8 @@ void checkasm_check_msac(void) {
c.decode_symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_lsx;
c.decode_bool_adapt = dav1d_msac_decode_bool_adapt_lsx;
c.decode_bool = dav1d_msac_decode_bool_lsx;
c.decode_bool_equi = dav1d_msac_decode_bool_equi_lsx;
c.decode_hi_tok = dav1d_msac_decode_hi_tok_lsx;
}
#elif ARCH_X86 && HAVE_ASM
if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
......
......@@ -73,6 +73,8 @@ if is_asm_enabled
checkasm_asm_sources += files('checkasm/riscv/checkasm_64.S')
elif host_machine.cpu_family().startswith('x86')
checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
elif host_machine.cpu_family().startswith('loongarch')
checkasm_asm_objs += files('checkasm/loongarch/checkasm.S')
endif
if use_gaspp
......