Commit bf920fba authored by Martin Storsjö's avatar Martin Storsjö
Browse files

arm: mc: Fix 8tap_v w8 with OBMC 3/4 heights

Also make sure that the w4 case can exit after processing 12 pixels,
where it is convenient.

This gives a small slowdown for in-order cores like A7, A8, A53, but
actually seems to give a small speedup for out-of-order cores like
A9, A72 and A73.

AArch64:
Before:                      Cortex A53     A72     A73
mc_8tap_regular_w8_v_8bpc_neon:   223.8   247.3   228.5
After:
mc_8tap_regular_w8_v_8bpc_neon:   232.5   243.9   223.4

AArch32:
Before:                       Cortex A7      A8      A9     A53     A72     A73
mc_8tap_regular_w8_v_8bpc_neon:   550.2   470.7   520.5   257.0   256.4   248.2
After:
mc_8tap_regular_w8_v_8bpc_neon:   554.3   474.2   511.6   267.5   252.6   246.8
parent f64fdae5
......@@ -1112,7 +1112,7 @@ L(\type\()_8tap_v_tbl):
vmovl_u8 q3, d6, q4, d8, q8, d16, q9, d18
mul_mla_8_2 q12, q13, q12, q13, q14, q15, q1, q2, q3, q4, q8, q9
shift_store_4 \type, \d_strd, q12, d24, d25, q13, d26, d27
b 48b
bgt 48b
0:
vpop {q4}
pop {r4-r11,pc}
......@@ -1145,7 +1145,7 @@ L(\type\()_8tap_v_tbl):
0:
pop {r4-r11,pc}
880: // 8x8, 8x16, 8x32 v
880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
......@@ -1178,12 +1178,17 @@ L(\type\()_8tap_v_tbl):
mul_mla_8_1 q3, q4, q3, q4, q8, q9, q10, q11, q12, q13, q14
shift_store_8 \type, \d_strd, q3, d6, q4, d8
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d30, d2, d4, d6
vmovl_u8 q15, d30, q1, d2, q2, d4, q3, d6
subs \h, \h, #2
load_reg \sr2, \src, \s_strd, d30, d2
vmovl_u8 q15, d30, q1, d2
mul_mla_8_1 q8, q9, q8, q9, q10, q11, q12, q13, q14, q15, q1
shift_store_8 \type, \d_strd, q8, d16, q9, d18
ble 9f
subs \h, \h, #2
load_reg \sr2, \src, \s_strd, d4, d6
vmovl_u8 q2, d4, q3, d6
mul_mla_8_1 q10, q11, q10, q11, q12, q13, q14, q15, q1, q2, q3
shift_store_8 \type, \d_strd, q8, d16, q9, d18, q10, d20, q11, d22
shift_store_8 \type, \d_strd, q10, d20, q11, d22
ble 9f
subs \h, \h, #4
load_reg \sr2, \src, \s_strd, d8, d16, d18, d20
......
......@@ -1119,7 +1119,7 @@ L(\type\()_8tap_v):
uxtl_b v18, v19, v20, v21
mul_mla_8_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
shift_store_4 \type, \d_strd, v1, v2
b 48b
b.gt 48b
0:
ret
......@@ -1151,7 +1151,7 @@ L(\type\()_8tap_v):
0:
ret
880: // 8x8, 8x16, 8x32 v
880: // 8x6, 8x8, 8x16, 8x32 v
1680: // 16x8, 16x16, ...
320: // 32x8, 32x16, ...
640:
......@@ -1183,12 +1183,17 @@ L(\type\()_8tap_v):
mul_mla_8_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v27, v16, v17, v18
uxtl_b v27, v16, v17, v18
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v27, v16
uxtl_b v27, v16
mul_mla_8_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
shift_store_8 \type, \d_strd, v1, v2
b.le 9f
subs \h, \h, #2
load_8b \sr2, \src, \s_strd, v17, v18
uxtl_b v17, v18
mul_mla_8_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
shift_store_8 \type, \d_strd, v1, v2, v3, v4
shift_store_8 \type, \d_strd, v3, v4
b.le 9f
subs \h, \h, #4
load_8b \sr2, \src, \s_strd, v19, v20, v21, v22
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment