Skip to content
Snippets Groups Projects
Commit 67c60d76 authored by Nathan E. Egge
Browse files

riscv64/mc16: Branchless vsetvl in blend function

Kendryte K230              Before               After         Delta

blend_w4_16bpc_c:       208.8 ( 1.00x)      209.9 ( 1.00x)    0.53%
blend_w4_16bpc_rvv:      85.9 ( 2.43x)       88.6 ( 2.37x)    3.14%
blend_w8_16bpc_c:       613.2 ( 1.00x)      614.3 ( 1.00x)    0.18%
blend_w8_16bpc_rvv:     145.4 ( 4.22x)      143.1 ( 4.29x)   -1.58%
blend_w16_16bpc_c:     2371.9 ( 1.00x)     2373.6 ( 1.00x)    0.07%
blend_w16_16bpc_rvv:    464.0 ( 5.11x)      461.2 ( 5.15x)   -0.60%
blend_w32_16bpc_c:     6005.6 ( 1.00x)     6007.7 ( 1.00x)    0.03%
blend_w32_16bpc_rvv:    981.6 ( 6.12x)      979.4 ( 6.13x)   -0.22%

SpacemiT K1                Before               After         Delta

blend_w4_16bpc_c:       206.4 ( 1.00x)      205.7 ( 1.00x)   -0.34%
blend_w4_16bpc_rvv:      79.5 ( 2.60x)       81.0 ( 2.54x)    1.89%
blend_w8_16bpc_c:       600.7 ( 1.00x)      599.7 ( 1.00x)   -0.17%
blend_w8_16bpc_rvv:     133.3 ( 4.51x)      134.1 ( 4.47x)    0.60%
blend_w16_16bpc_c:     2315.9 ( 1.00x)     2315.2 ( 1.00x)   -0.03%
blend_w16_16bpc_rvv:    305.2 ( 7.59x)      300.7 ( 7.70x)   -1.47%
blend_w32_16bpc_c:     5861.1 ( 1.00x)     5860.2 ( 1.00x)   -0.02%
blend_w32_16bpc_rvv:    592.5 ( 9.89x)      589.5 ( 9.94x)   -0.51%
parent 3437a26b
No related branches found
No related tags found
1 merge request: !1749 riscv64/mc16: Unroll 16bpc RVV blend 2x
......@@ -27,49 +27,22 @@
#include "src/riscv/asm.S"
function blend_vl256_16bpc_rvv, export=1, ext=v
li t0, 4
beq a3, t0, 4f
li t0, 8
beq a3, t0, 8f
li t0, 16
beq a3, t0, 16f
32:
vsetvli zero, a3, e16, m2, ta, ma
j L(blend_epilog)
16:
vsetvli zero, a3, e16, m1, ta, ma
j L(blend_epilog)
8:
vsetvli zero, a3, e16, mf2, ta, ma
j L(blend_epilog)
4:
vsetvli zero, a3, e16, mf4, ta, ma
/*
 * blend_vl256_16bpc_rvv: entry point that only prepares a vtype seed in t0
 * and then jumps into the shared body at L(blend_epilog).
 *
 * In:  a3 = block width (presumably; the branchy code this replaces only
 *           distinguished 4, 8, 16 and anything else, i.e. 32).
 * Out: t0 = ctz(a3) + 0xc4, consumed at L(blend_epilog).
 *
 * At L(blend_epilog) t0 is masked with 0xc7, OR'd with 8 and handed to
 * vsetvl as the vtype operand. For a3 = 4/8/16/32 this gives
 * 0xce/0xcf/0xc8/0xc9, i.e. e16, ta, ma with LMUL = mf4/mf2/m1/m2 in the
 * RVV 1.0 vtype encoding -- the same settings the former per-width
 * vsetvli branches selected.
 *
 * NOTE(review): only ext=zbb is requested here; this function itself contains
 * no vector instructions (those live after L(blend_epilog)) -- confirm
 * that this is intentional.
 */
function blend_vl256_16bpc_rvv, export=1, ext=zbb
/* t0 = number of trailing zero bits in a3 (2, 3, 4, 5 for 4, 8, 16, 32) */
ctz t0, a3
/* 0xc0 sets vtype bits 7:6; the low 0x4 biases the ctz result so that,
 * after the 0xc7 mask in the epilog, bits 2:0 wrap to the wanted LMUL code */
addi t0, t0, 0xc4
/* continue in the common body shared with blend_16bpc_rvv */
j L(blend_epilog)
endfunc
function blend_16bpc_rvv, export=1, ext="v,zbb"
li t0, 4
beq a3, t0, 4f
li t0, 8
beq a3, t0, 8f
li t0, 16
beq a3, t0, 16f
32:
vsetvli zero, a3, e16, m4, ta, ma
j L(blend_epilog)
16:
vsetvli zero, a3, e16, m2, ta, ma
j L(blend_epilog)
8:
vsetvli zero, a3, e16, m1, ta, ma
j L(blend_epilog)
4:
vsetvli zero, a3, e16, mf2, ta, ma
ctz t0, a3
addi t0, t0, 0xc5
L(blend_epilog):
csrw vxrm, zero
andi t0, t0, 0xc7
li t1, 64
ori t0, t0, 8
add a6, a3, a3
vsetvl zero, a3, t0
1:
addi a4, a4, -1
vle8.v v8, (a5)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment