Commit 6f9f3391 authored by Martin Storsjö
Browse files

arm64: itx16: Use usqadd to avoid separate clamping of negative values

Before:                                Cortex A53     A72      A73
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:       40.7    23.0     24.0
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:      116.0    71.5     78.2
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:       85.7    50.7     53.8
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:      287.0   203.5    215.2
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:    255.7   129.1    140.4
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:   1401.4  1026.7   1039.2
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:   1913.2  1407.3   1479.6
After:
inv_txfm_add_4x4_dct_dct_0_10bpc_neon:       38.7    21.5     22.2
inv_txfm_add_4x4_dct_dct_1_10bpc_neon:      116.0    71.3     77.2
inv_txfm_add_8x8_dct_dct_0_10bpc_neon:       76.7    44.7     43.5
inv_txfm_add_8x8_dct_dct_1_10bpc_neon:      278.0   203.0    203.9
inv_txfm_add_16x16_dct_dct_0_10bpc_neon:    236.9   106.2    116.2
inv_txfm_add_16x16_dct_dct_1_10bpc_neon:   1368.7   999.7   1008.4
inv_txfm_add_16x16_dct_dct_2_10bpc_neon:   1880.5  1381.2   1459.4
parent 2e73051c
Pipeline #66454 passed with stages
in 9 minutes and 37 seconds
......@@ -124,7 +124,7 @@ endconst
.endif
.endm
.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
.endif
......@@ -132,10 +132,7 @@ endconst
srshr \shift, \shift, #\shiftbits
.endif
.ifnb \addsrc
sqadd \adddst, \adddst, \addsrc
.endif
.ifnb \max
smax \max, \max, v6.8h
usqadd \adddst, \addsrc
.endif
.ifnb \min
smin \min, \min, v7.8h
......@@ -146,63 +143,57 @@ endconst
.endm
.macro load_add_store_8x16 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src
load_add_store v3.8h, v17.8h, , , , , , \dst, \src
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src
load_add_store , , , , , v31.8h, v30.8h, \dst, \src
load_add_store , , , , , , v31.8h, \dst, \src
load_add_store v2.8h, v16.8h, , , , , \dst, \src
load_add_store v3.8h, v17.8h, , , , , \dst, \src
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src
load_add_store v20.8h, v24.8h, v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src
load_add_store v21.8h, v25.8h, v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src
load_add_store v22.8h, v26.8h, v24.8h, v20.8h, v19.8h, v18.8h, \dst, \src
load_add_store v23.8h, v27.8h, v25.8h, v21.8h, v20.8h, v19.8h, \dst, \src
load_add_store v24.8h, v28.8h, v26.8h, v22.8h, v21.8h, v20.8h, \dst, \src
load_add_store v25.8h, v29.8h, v27.8h, v23.8h, v22.8h, v21.8h, \dst, \src
load_add_store v26.8h, v30.8h, v28.8h, v24.8h, v23.8h, v22.8h, \dst, \src
load_add_store v27.8h, v31.8h, v29.8h, v25.8h, v24.8h, v23.8h, \dst, \src
load_add_store , , v30.8h, v26.8h, v25.8h, v24.8h, \dst, \src
load_add_store , , v31.8h, v27.8h, v26.8h, v25.8h, \dst, \src
load_add_store , , , , v27.8h, v26.8h, \dst, \src
load_add_store , , , , , v27.8h, \dst, \src
.endm
.macro load_add_store_8x8 dst, src, shiftbits=4
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits
load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
load_add_store v16.8h, v20.8h, v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
load_add_store v17.8h, v21.8h, v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
load_add_store v18.8h, v22.8h, v20.8h, v16.8h, v5.8h, v4.8h, \dst, \src, \shiftbits
load_add_store v19.8h, v23.8h, v21.8h, v17.8h, v16.8h, v5.8h, \dst, \src, \shiftbits
load_add_store , , v22.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store , , v23.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , , , , v19.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store_8x4 dst, src, shiftbits=4
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits
load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits
load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits
load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits
load_add_store v2.8h, v16.8h, , , , , \dst, \src, \shiftbits
load_add_store v3.8h, v17.8h, , , , , \dst, \src, \shiftbits
load_add_store v4.8h, v18.8h, v16.8h, v2.8h, , , \dst, \src, \shiftbits
load_add_store v5.8h, v19.8h, v17.8h, v3.8h, v2.8h, , \dst, \src, \shiftbits
load_add_store , , v18.8h, v4.8h, v3.8h, v2.8h, \dst, \src, \shiftbits
load_add_store , , v19.8h, v5.8h, v4.8h, v3.8h, \dst, \src, \shiftbits
load_add_store , , , , v5.8h, v4.8h, \dst, \src, \shiftbits
load_add_store , , , , , v5.8h, \dst, \src, \shiftbits
.endm
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src
.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, min, store, dst, src
.ifnb \load
ld1 {\load}[0], [\src], x1
.endif
......@@ -216,14 +207,11 @@ endconst
ld1 {\load}[1], [\src], x1
.endif
.ifnb \addsrc
sqadd \adddst, \adddst, \addsrc
usqadd \adddst, \addsrc
.endif
.ifnb \store
st1 {\store}[0], [\dst], x1
.endif
.ifnb \max
smax \max, \max, v6.8h
.endif
.ifnb \min
smin \min, \min, v7.8h
.endif
......@@ -233,37 +221,33 @@ endconst
.endm
.macro load_add_store_4x16 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src
load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src
load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src
load_add_store4 , , , , , , , , v30.d, \dst, \src
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
load_add_store4 v17.d, v25, v24, v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
load_add_store4 v19.d, v27, v26, v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
load_add_store4 v21.d, v29, v28, v24.8h, v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
load_add_store4 v23.d, v31, v30, v26.8h, v24.8h, v17.8h, v3.8h, v2.d, \dst, \src
load_add_store4 , , , v28.8h, v26.8h, v19.8h, v17.8h, v3.d, \dst, \src
load_add_store4 , , , v30.8h, v28.8h, v21.8h, v19.8h, v17.d, \dst, \src
load_add_store4 , , , , v30.8h, v23.8h, v21.8h, v19.d, \dst, \src
load_add_store4 , , , , , , v23.8h, v21.d, \dst, \src
load_add_store4 , , , , , , , v23.d, \dst, \src
.endm
.macro load_add_store_4x8 dst, src
mov \src, \dst
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src
load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src
load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src
load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src
load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src
load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src
load_add_store4 , , , , , , , , v22.d, \dst, \src
load_add_store4 v0.d, v17, v16, , , , , , \dst, \src
load_add_store4 v1.d, v19, v18, , , , , , \dst, \src
load_add_store4 v2.d, v21, v20, v16.8h, , , , , \dst, \src
load_add_store4 v3.d, v23, v22, v18.8h, v16.8h, v0.8h, , , \dst, \src
load_add_store4 , , , v20.8h, v18.8h, v1.8h, v0.8h, , \dst, \src
load_add_store4 , , , v22.8h, v20.8h, v2.8h, v1.8h, v0.d, \dst, \src
load_add_store4 , , , , v22.8h, v3.8h, v2.8h, v1.d, \dst, \src
load_add_store4 , , , , , , v3.8h, v2.d, \dst, \src
load_add_store4 , , , , , , , v3.d, \dst, \src
.endm
.macro idct_dc w, h, shift
......@@ -291,7 +275,6 @@ endconst
.endm
function idct_dc_w4_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.d}[0], [x0], x1
......@@ -299,11 +282,9 @@ function idct_dc_w4_neon
ld1 {v1.d}[0], [x0], x1
subs w4, w4, #4
ld1 {v1.d}[1], [x0], x1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
sub x0, x0, x1, lsl #2
sqadd v1.8h, v1.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
usqadd v1.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
st1 {v0.d}[0], [x0], x1
smin v1.8h, v1.8h, v31.8h
......@@ -315,23 +296,18 @@ function idct_dc_w4_neon
endfunc
function idct_dc_w8_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h}, [x0], x1
subs w4, w4, #4
ld1 {v1.8h}, [x0], x1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
ld1 {v2.8h}, [x0], x1
sqadd v1.8h, v1.8h, v16.8h
usqadd v1.8h, v16.8h
ld1 {v3.8h}, [x0], x1
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
sub x0, x0, x1, lsl #2
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
st1 {v0.8h}, [x0], x1
......@@ -345,21 +321,16 @@ function idct_dc_w8_neon
endfunc
function idct_dc_w16_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h, v1.8h}, [x0], x1
subs w4, w4, #2
ld1 {v2.8h, v3.8h}, [x0], x1
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v16.8h
usqadd v0.8h, v16.8h
usqadd v1.8h, v16.8h
sub x0, x0, x1, lsl #1
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
......@@ -371,19 +342,14 @@ function idct_dc_w16_neon
endfunc
function idct_dc_w32_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
subs w4, w4, #1
sqadd v0.8h, v0.8h, v16.8h
sqadd v1.8h, v1.8h, v16.8h
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
usqadd v0.8h, v16.8h
usqadd v1.8h, v16.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
......@@ -394,30 +360,21 @@ function idct_dc_w32_neon
endfunc
function idct_dc_w64_neon
movi v30.8h, #0
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
sub x1, x1, #64
1:
ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
subs w4, w4, #1
sqadd v0.8h, v0.8h, v16.8h
usqadd v0.8h, v16.8h
ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0]
sqadd v1.8h, v1.8h, v16.8h
usqadd v1.8h, v16.8h
sub x0, x0, #64
sqadd v2.8h, v2.8h, v16.8h
sqadd v3.8h, v3.8h, v16.8h
sqadd v4.8h, v4.8h, v16.8h
sqadd v5.8h, v5.8h, v16.8h
sqadd v6.8h, v6.8h, v16.8h
sqadd v7.8h, v7.8h, v16.8h
smax v0.8h, v0.8h, v30.8h
smax v1.8h, v1.8h, v30.8h
smax v2.8h, v2.8h, v30.8h
smax v3.8h, v3.8h, v30.8h
smax v4.8h, v4.8h, v30.8h
smax v5.8h, v5.8h, v30.8h
smax v6.8h, v6.8h, v30.8h
smax v7.8h, v7.8h, v30.8h
usqadd v2.8h, v16.8h
usqadd v3.8h, v16.8h
usqadd v4.8h, v16.8h
usqadd v5.8h, v16.8h
usqadd v6.8h, v16.8h
usqadd v7.8h, v16.8h
smin v0.8h, v0.8h, v31.8h
smin v1.8h, v1.8h, v31.8h
smin v2.8h, v2.8h, v31.8h
......@@ -575,16 +532,14 @@ function inv_txfm_add_4x4_neon
L(itx_4x4_end):
mvni v31.8h, #0xfc, lsl #8 // 0x3ff
sub x0, x0, x1, lsl #2
sqadd v16.8h, v16.8h, v0.8h
sqadd v18.8h, v18.8h, v1.8h
smax v16.8h, v16.8h, v30.8h
smax v18.8h, v18.8h, v30.8h
smin v16.8h, v16.8h, v31.8h
st1 {v16.d}[0], [x0], x1
smin v18.8h, v18.8h, v31.8h
st1 {v16.d}[1], [x0], x1
st1 {v18.d}[0], [x0], x1
st1 {v18.d}[1], [x0], x1
usqadd v0.8h, v16.8h
usqadd v1.8h, v18.8h
smin v0.8h, v0.8h, v31.8h
st1 {v0.d}[0], [x0], x1
smin v1.8h, v1.8h, v31.8h
st1 {v0.d}[1], [x0], x1
st1 {v1.d}[0], [x0], x1
st1 {v1.d}[1], [x0], x1
br x15
endfunc
......@@ -2219,7 +2174,6 @@ function inv_txfm_add_vert_dct_8x32_neon
neg x9, x8
mov x10, x6
movi v0.8h, #0
mvni v1.8h, #0xfc, lsl #8 // 0x3ff
.macro combine r0, r1, r2, r3, op, stride
ld1 {v5.8h}, [x7], \stride
......@@ -2231,27 +2185,23 @@ function inv_txfm_add_vert_dct_8x32_neon
ld1 {v4.8h}, [x10], x1
srshr v5.8h, v5.8h, #4
\op v6.8h, v6.8h, \r1
sqadd v5.8h, v5.8h, v2.8h
usqadd v2.8h, v5.8h
srshr v6.8h, v6.8h, #4
\op v7.8h, v7.8h, \r2
smax v2.8h, v5.8h, v0.8h
ld1 {v5.8h}, [x7], \stride
sqadd v6.8h, v6.8h, v3.8h
usqadd v3.8h, v6.8h
smin v2.8h, v2.8h, v1.8h
srshr v7.8h, v7.8h, #4
\op v5.8h, v5.8h, \r3
st1 {v2.8h}, [x6], x1
ld1 {v2.8h}, [x10], x1
smax v3.8h, v6.8h, v0.8h
sqadd v7.8h, v7.8h, v4.8h
usqadd v4.8h, v7.8h
smin v3.8h, v3.8h, v1.8h
srshr v5.8h, v5.8h, #4
st1 {v3.8h}, [x6], x1
smax v4.8h, v7.8h, v0.8h
sqadd v5.8h, v5.8h, v2.8h
usqadd v2.8h, v5.8h
smin v4.8h, v4.8h, v1.8h
st1 {v4.8h}, [x6], x1
smax v2.8h, v5.8h, v0.8h
smin v2.8h, v2.8h, v1.8h
st1 {v2.8h}, [x6], x1
.endm
......@@ -3195,7 +3145,6 @@ function inv_txfm_add_vert_dct_8x64_neon
ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
movi v6.8h, #0
mvni v7.8h, #0xfc, lsl #8 // 0x3ff
.macro add_dest_addsub src0, src1, src2, src3
ld1 {v0.8h}, [x6], x1
......@@ -3211,18 +3160,14 @@ function inv_txfm_add_vert_dct_8x64_neon
srshr v4.8h, v4.8h, #4
srshr v5.8h, v5.8h, #4
srshr \src0, \src0, #4
sqadd v0.8h, v0.8h, v4.8h
usqadd v0.8h, v4.8h
srshr \src2, \src2, #4
sqadd v1.8h, v1.8h, \src0
sqadd v2.8h, v2.8h, v5.8h
smax v0.8h, v0.8h, v6.8h
sqadd v3.8h, v3.8h, \src2
smax v1.8h, v1.8h, v6.8h
usqadd v1.8h, \src0
usqadd v2.8h, v5.8h
smin v0.8h, v0.8h, v7.8h
smax v2.8h, v2.8h, v6.8h
usqadd v3.8h, \src2
smin v1.8h, v1.8h, v7.8h
st1 {v0.8h}, [x6], x1
smax v3.8h, v3.8h, v6.8h
smin v2.8h, v2.8h, v7.8h
st1 {v1.8h}, [x9], x10
smin v3.8h, v3.8h, v7.8h
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment