Add refmvs.save_tmvs AVX2 asm
Speed ups:
c..avx2: 2.801x (o=0.0158)
Speed diffs:
c..avx2: 35.71% (o=0.2)
save_tmvs_c: 23562.5
save_tmvs_avx2: 8119.5
Edited by Victorien Le Couviour--Tuffet
Merge request reports
Activity
Filter activity
changed milestone to %1.2.0
added performance label
requested review from @gramner
assigned to @psilokos
- Resolved by Victorien Le Couviour--Tuffet
@gramner Do you know why the last commit (nostack) makes it slower?
chimera-8bit-1080.ivf
532.33/23.98 fps (22.20x) => 541.40/23.98 fps (22.58x)
about +1.7% \o/
Edited by Victorien Le Couviour--Tuffet
- Resolved by Victorien Le Couviour--Tuffet
- Resolved by Victorien Le Couviour--Tuffet
- Automatically resolved by Victorien Le Couviour--Tuffet
- Automatically resolved by Victorien Le Couviour--Tuffet
- Resolved by Victorien Le Couviour--Tuffet
- Automatically resolved by Victorien Le Couviour--Tuffet
- Automatically resolved by Victorien Le Couviour--Tuffet
- Resolved by Victorien Le Couviour--Tuffet
Saving one register, plus keeping
xstart
in a register:diff --git a/src/x86/refmvs.asm b/src/x86/refmvs.asm index 286d2fb3..0c735a4b 100644 --- a/src/x86/refmvs.asm +++ b/src/x86/refmvs.asm @@ -45,8 +45,8 @@ splat_mv_shuf: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 db 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3 save_ref_shuf: db 0, -1, -1, -1, 1, -1, -1, -1, 8, -1, -1, -1, 9, -1, -1, -1 -save_cond0: db 0, 1, 2, 3, 8, -1, -1, -1, 0, 1, 2, 3, 8, -1, -1, -1 -save_cond1: db 4, 5, 6, 7, 9, -1, -1, -1, 4, 5, 6, 7, 9, -1, -1, -1 +save_cond0: db 0, 1, 2, 3, 8, -1, -1, -1 +save_cond1: db 4, 5, 6, 7, 9, -1, -1, -1 save_cond_bct0: times 8 db 0 times 8 db 8 save_cond_bct1: times 8 db 4 @@ -59,8 +59,8 @@ blk_dims: times 2 db 16 times 5 db 2 times 7 db 1 +ALIGN 4 JMP_TABLE save_tmvs_avx2, write, 1, 2, 4, 8, 16 - JMP_TABLE splat_mv_avx512icl, w, 1, 2, 4, 8, 16, 32 JMP_TABLE splat_mv_avx2, w, 1, 2, 4, 8, 16, 32 %endif @@ -136,10 +136,14 @@ INIT_YMM avx2 ; refmvs_temporal_block *rp, ptrdiff_t stride, ; refmvs_block **rr, uint8_t *ref_sign, ; int col_end8, int row_end8, int col_start8, int row_start8 -cglobal save_tmvs, 6, 15, 11, rp, stride, rr, ref_sign, \ +cglobal save_tmvs, 4, 14, 11, rp, stride, rr, ref_sign, \ xend, yend, xstart, ystart -%define base r11q-save_tmvs_avx2_table - lea r11q, [save_tmvs_avx2_table] +%define base r13-save_tmvs_avx2_table + lea r13, [save_tmvs_avx2_table] + movifnidn xendd, xendm + movifnidn yendd, yendm + mov xstartd, xstartm + mov ystartd, ystartm movq xm5, [ref_signq] vbroadcasti128 m4, [base+save_ref_shuf] vbroadcasti128 m6, [base+save_cond_bct0] @@ -151,57 +155,54 @@ cglobal save_tmvs, 6, 15, 11, rp, stride, rr, ref_sign, \ shl r3d, 24 movd xm5, r3d vpbroadcastd m5, xm5 - mov ystartd, ystartm - vbroadcasti128 m8, [base+save_cond0] - vbroadcasti128 m9, [base+save_cond1] + vpbroadcastq m8, [base+save_cond0] + vpbroadcastq m9, [base+save_cond1] sub yendd, ystartd add ystartd, ystartd - DEFINE_ARGS rp, stride, rr, x, xend, 
h, xstart, ystart, b, cand, bs + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b .loop_y: and ystartd, 30 + mov xd, xstartd mov bq, [rrq+ystartq*8] add ystartd, 2 - mov xstartd, xstartm - mov xd, xstartd .loop_x: - lea candd, [xstartq*3] - movzx bsd, byte [bq+candq*8+22] ; cand_b->bs - movu xm0, [bq+candq*8+12] ; cand_b - movzx r12d, byte [base+blk_dims+bsq] ; bw8 - add xstartd, r12d - tzcnt r12d, r12d - movsxd r12q, [r11q+r12q*4] - add r12q, r11q - cmp xstartd, xendd - jge .calc - lea candd, [xstartq*3] - movzx bsd, byte [bq+candq*8+22] - movu xm1, [bq+candq*8+12] - movzx r13d, byte [base+blk_dims+bsq] - add xstartd, r13d - tzcnt r13d, r13d - movsxd r13q, [r11q+r13q*4] - add r13q, r11q - cmp xstartd, xendd - jge .calc - lea candd, [xstartq*3] - movzx bsd, byte [bq+candq*8+22] - vinserti128 m0, [bq+candq*8+12], 1 - movzx r9d, byte [base+blk_dims+bsq] - add xstartd, r9d + lea r9d, [xq*3] + movu xm0, [bq+r9*8+12] ; cand_b + movzx r9d, byte [bq+r9*8+22] ; cand_b->bs + movzx r9d, byte [base+blk_dims+r9] ; bw8 + lea r12d, [xq+r9] tzcnt r9d, r9d - movsxd r9q, [r11q+r9q*4] - add r9q, r11q - cmp xstartd, xendd + movsxd r9, [r13+r9*4] + add r9, r13 + cmp r12d, xendd jge .calc - lea r14d, [xstartq*3] - movzx bsd, byte [bq+r14q*8+22] - vinserti128 m1, [bq+r14q*8+12], 1 - movzx r10d, byte [base+blk_dims+bsq] - add xstartd, r10d + lea r10d, [r12*3] + movu xm1, [bq+r10*8+12] + movzx r10d, byte [bq+r10*8+22] + movzx r10d, byte [base+blk_dims+r10] + add r12d, r10d tzcnt r10d, r10d - movsxd r10q, [r11q+r10q*4] - add r10q, r11q + movsxd r10, [r13+r10*4] + add r10, r13 + cmp r12d, xendd + jge .calc + lea r11d, [r12*3] + vinserti128 m0, [bq+r11*8+12], 1 + movzx r11d, byte [bq+r11*8+22] + movzx r11d, byte [base+blk_dims+r11] + add r12d, r11d + tzcnt r11d, r11d + movsxd r11, [r13+r11*4] + add r11, r13 + cmp r12d, xendd + jge .calc + lea r12d, [r12*3] + vinserti128 m1, [bq+r12*8+12], 1 + movzx r12d, byte [bq+r12*8+22] + movzx r12d, byte [base+blk_dims+r12] + tzcnt r12d, r12d + 
movsxd r12, [r13+r12*4] + add r12, r13 .calc: ; mv check punpcklqdq m2, m0, m1 ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ... @@ -225,19 +226,19 @@ cglobal save_tmvs, 6, 15, 11, rp, stride, rr, ref_sign, \ pshufb m0, m3 pshufb m1, m3 vpbroadcastq m2, xm0 - call r12q + call r9 cmp xd, xendd jge .next_line vpermq m2, m1, q1111 - call r13q + call r10 cmp xd, xendd jge .next_line vpermq m2, m0, q2222 - call r9q + call r11 cmp xd, xendd jge .next_line vpermq m2, m1, q3333 - call r10q + call r12 cmp xd, xendd jl .loop_x .next_line:
Speedup by commit:
- 9e64a170 (original): 1.395x
- 1e144a0d (no stack): 1.443x
- patch 0 (-1 reg): 2.275x
- patch 1 (reduce scalar): 2.643x
Edited by Victorien Le Couviour--Tuffet
Please register or sign in to reply