Commit e80955cc authored by Martin Storsjö, committed by Jean-Baptiste Kempf

arm64: mc: Optimize mc_8tap_regular_w4_hv_8bpc for A53

Before:                            Cortex A53   Snapdragon 835
mc_8tap_regular_w4_hv_8bpc_neon:        543.6           359.1
After:
mc_8tap_regular_w4_hv_8bpc_neon:        466.7           355.5

The same kind of change doesn't seem to give any benefit for the
8-pixel-wide hv filtering, though, potentially because that code
uses not only smull/smlal but also smull2/smlal2.
parent 72af9329
......@@ -1447,14 +1447,17 @@ L(\type\()_8tap_filter_2):
mov v18.8b, v29.8b
4:
smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
smull v3.4s, v17.4h, v1.h[0]
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v3.4s, v28.4h, v1.h[2]
smlal v2.4s, v28.4h, v1.h[3]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v28.4h, v1.h[2]
smlal v3.4s, v29.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
......@@ -1508,22 +1511,22 @@ L(\type\()_8tap_filter_2):
mov v22.8b, v29.8b
48:
smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
smull v3.4s, v17.4h, v1.h[0]
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[3]
smlal v2.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v3.4s, v28.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v19.4h, v1.h[2]
smlal v3.4s, v20.4h, v1.h[3]
smlal v3.4s, v21.4h, v1.h[4]
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment