Commit fc5a3728, authored by Martin Storsjö, committed by Jean-Baptiste Kempf

arm64: mc: Optimize the mul_mla_8_* macros for Cortex A53

Before:                      Cortex A53   Snapdragon 835
mc_8tap_regular_w2_v_8bpc_neon:   155.1   131.8
mc_8tap_regular_w4_v_8bpc_neon:   199.6   148.1
mc_8tap_regular_w8_v_8bpc_neon:   286.2   225.5
After:
mc_8tap_regular_w2_v_8bpc_neon:   134.1   129.5
mc_8tap_regular_w4_v_8bpc_neon:   157.6   146.5
mc_8tap_regular_w8_v_8bpc_neon:   208.0   225.0
parent 1407506a
...@@ -546,58 +546,61 @@ endfunc ...@@ -546,58 +546,61 @@ endfunc
mla \d\wd, \s2\wd, v0.h[2] mla \d\wd, \s2\wd, v0.h[2]
mla \d\wd, \s3\wd, v0.h[3] mla \d\wd, \s3\wd, v0.h[3]
.endm .endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
//
// mul_mla_8_1 d0, d1, s0, ..., s8
//
// Produces two 8-term multiply-accumulate sums over .8h (eight 16-bit
// lanes) vectors, with the multipliers taken from the elements
// v0.h[0] .. v0.h[7]:
//   d0 = s0*v0.h[0] + s1*v0.h[1] + ... + s7*v0.h[7]
//   d1 = s1*v0.h[0] + s2*v0.h[1] + ... + s8*v0.h[7]
// i.e. d1 uses the same multipliers on a source window shifted by one
// register relative to d0. Both d0 and d1 are overwritten (each chain
// starts with a mul); v0 is only read.
// NOTE(review): v0 presumably holds the 8-tap filter coefficients set up
// by the caller - confirm against the call sites.
//
// Reading this side-by-side diff rendering: a line that carries the same
// instruction twice is identical on both sides of the diff; a line that
// carries it once exists on one side only. The single d1 lines sitting
// between the d0 instructions form the interleaved ordering; the run of
// d1 lines after the d0 chain forms the tightly chained ordering
// described in the comment above. Both orderings compute the same sums.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8 .macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0] mul \d0\().8h, \s0\().8h, v0.h[0]
mul \d1\().8h, \s1\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d1\().8h, \s2\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d1\().8h, \s3\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d1\().8h, \s4\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d1\().8h, \s5\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d1\().8h, \s6\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7] mla \d0\().8h, \s7\().8h, v0.h[7]
// Chained ordering: the whole d1 sum is issued only after the d0 chain
// above has finished; the last term (s8) is common to both orderings.
mul \d1\().8h, \s1\().8h, v0.h[0]
mla \d1\().8h, \s2\().8h, v0.h[1]
mla \d1\().8h, \s3\().8h, v0.h[2]
mla \d1\().8h, \s4\().8h, v0.h[3]
mla \d1\().8h, \s5\().8h, v0.h[4]
mla \d1\().8h, \s6\().8h, v0.h[5]
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7] mla \d1\().8h, \s8\().8h, v0.h[7]
.endm .endm
// mul_mla_8_2 d0, d1, s0, ..., s9
//
// Produces two 8-term multiply-accumulate sums over .8h (eight 16-bit
// lanes) vectors, with the multipliers taken from the elements
// v0.h[0] .. v0.h[7]:
//   d0 = s0*v0.h[0] + s1*v0.h[1] + ... + s7*v0.h[7]
//   d1 = s2*v0.h[0] + s3*v0.h[1] + ... + s9*v0.h[7]
// i.e. d1 uses the same multipliers on a source window shifted by two
// registers relative to d0. Both d0 and d1 are overwritten (each chain
// starts with a mul); v0 is only read.
// NOTE(review): v0 presumably holds the 8-tap filter coefficients set up
// by the caller - confirm against the call sites.
//
// Reading this side-by-side diff rendering: a line that carries the same
// instruction twice is identical on both sides of the diff; a line that
// carries it once exists on one side only. The single d1 lines sitting
// between the d0 instructions form an interleaved ordering; the run of
// d1 lines after the d0 chain forms a tightly chained ordering. Both
// orderings compute the same sums.
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9 .macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0] mul \d0\().8h, \s0\().8h, v0.h[0]
mul \d1\().8h, \s2\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d1\().8h, \s3\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d1\().8h, \s4\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d1\().8h, \s5\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d1\().8h, \s6\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d1\().8h, \s7\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7] mla \d0\().8h, \s7\().8h, v0.h[7]
// Chained ordering: the whole d1 sum is issued only after the d0 chain
// above has finished; the last term (s9) is common to both orderings.
mul \d1\().8h, \s2\().8h, v0.h[0]
mla \d1\().8h, \s3\().8h, v0.h[1]
mla \d1\().8h, \s4\().8h, v0.h[2]
mla \d1\().8h, \s5\().8h, v0.h[3]
mla \d1\().8h, \s6\().8h, v0.h[4]
mla \d1\().8h, \s7\().8h, v0.h[5]
mla \d1\().8h, \s8\().8h, v0.h[6]
mla \d1\().8h, \s9\().8h, v0.h[7] mla \d1\().8h, \s9\().8h, v0.h[7]
.endm .endm
// mul_mla_8_4 d0, d1, s0, ..., s11
//
// Produces two 8-term multiply-accumulate sums over .8h (eight 16-bit
// lanes) vectors, with the multipliers taken from the elements
// v0.h[0] .. v0.h[7]:
//   d0 = s0*v0.h[0] + s1*v0.h[1] + ... + s7*v0.h[7]
//   d1 = s4*v0.h[0] + s5*v0.h[1] + ... + s11*v0.h[7]
// i.e. d1 uses the same multipliers on a source window shifted by four
// registers relative to d0. Both d0 and d1 are overwritten (each chain
// starts with a mul); v0 is only read.
// NOTE(review): v0 presumably holds the 8-tap filter coefficients set up
// by the caller - confirm against the call sites.
//
// Reading this side-by-side diff rendering: a line that carries the same
// instruction twice is identical on both sides of the diff; a line that
// carries it once exists on one side only. The single d1 lines sitting
// between the d0 instructions form an interleaved ordering; the run of
// d1 lines after the d0 chain forms a tightly chained ordering. Both
// orderings compute the same sums.
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11 .macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
mul \d0\().8h, \s0\().8h, v0.h[0] mul \d0\().8h, \s0\().8h, v0.h[0]
mul \d1\().8h, \s4\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1] mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d1\().8h, \s5\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2] mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d1\().8h, \s6\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3] mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d1\().8h, \s7\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4] mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d1\().8h, \s8\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5] mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d1\().8h, \s9\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6] mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7] mla \d0\().8h, \s7\().8h, v0.h[7]
// Chained ordering: the whole d1 sum is issued only after the d0 chain
// above has finished; the last term (s11) is common to both orderings.
mul \d1\().8h, \s4\().8h, v0.h[0]
mla \d1\().8h, \s5\().8h, v0.h[1]
mla \d1\().8h, \s6\().8h, v0.h[2]
mla \d1\().8h, \s7\().8h, v0.h[3]
mla \d1\().8h, \s8\().8h, v0.h[4]
mla \d1\().8h, \s9\().8h, v0.h[5]
mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d1\().8h, \s11\().8h, v0.h[7] mla \d1\().8h, \s11\().8h, v0.h[7]
.endm .endm
.macro sqrshrun_b shift, r0, r1, r2, r3 .macro sqrshrun_b shift, r0, r1, r2, r3
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment