Skip to content
Snippets Groups Projects

Add refmvs.save_tmvs AVX2 asm

Merged Victorien Le Couviour--Tuffet requested to merge psilokos/dav1d:refmvs_simd into master
Speed-ups:
c..avx2: 2.801x (σ=0.0158)
Speed diffs:
c..avx2: 35.71% (σ=0.2)
save_tmvs_c: 23562.5
save_tmvs_avx2: 8119.5
Edited by Victorien Le Couviour--Tuffet

Merge request reports

Loading
Loading

Activity

Filter activity
  • Approvals
  • Assignees & reviewers
  • Comments (from bots)
  • Comments (from users)
  • Commits & branches
  • Edits
  • Labels
  • Lock status
  • Mentions
  • Merge request status
  • Tracking
  • Henrik Gramner
  • Henrik Gramner
  • Henrik Gramner
  • Henrik Gramner
  • added 3 commits

    Compare with previous version

    • Resolved by Victorien Le Couviour--Tuffet

      The patch below saves one register (15 → 14 GPRs) and keeps xstart in a register instead of reloading it from xstartm at the start of every row:

      diff --git a/src/x86/refmvs.asm b/src/x86/refmvs.asm
      index 286d2fb3..0c735a4b 100644
      --- a/src/x86/refmvs.asm
      +++ b/src/x86/refmvs.asm
      @@ -45,8 +45,8 @@ splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
                      db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
                      db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
       save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
      -save_cond0:    db  0,  1,  2,  3,  8, -1, -1, -1,  0,  1,  2,  3,  8, -1, -1, -1
      -save_cond1:    db  4,  5,  6,  7,  9, -1, -1, -1,  4,  5,  6,  7,  9, -1, -1, -1
      +save_cond0:    db  0,  1,  2,  3,  8, -1, -1, -1
      +save_cond1:    db  4,  5,  6,  7,  9, -1, -1, -1
       save_cond_bct0: times 8 db 0
                       times 8 db 8
       save_cond_bct1: times 8 db 4
      @@ -59,8 +59,8 @@ blk_dims: times 2 db 16
                 times 5 db 2
                 times 7 db 1
       
      +ALIGN 4
       JMP_TABLE save_tmvs_avx2, write, 1, 2, 4, 8, 16
      -
       JMP_TABLE splat_mv_avx512icl, w, 1, 2, 4, 8, 16, 32
       JMP_TABLE splat_mv_avx2,      w, 1, 2, 4, 8, 16, 32
       %endif
      @@ -136,10 +136,14 @@ INIT_YMM avx2
       ; refmvs_temporal_block *rp, ptrdiff_t stride,
       ; refmvs_block **rr, uint8_t *ref_sign,
       ; int col_end8, int row_end8, int col_start8, int row_start8
      -cglobal save_tmvs, 6, 15, 11, rp, stride, rr, ref_sign, \
      +cglobal save_tmvs, 4, 14, 11, rp, stride, rr, ref_sign, \
                                     xend, yend, xstart, ystart
      -%define base r11q-save_tmvs_avx2_table
      -    lea           r11q, [save_tmvs_avx2_table]
      +%define base r13-save_tmvs_avx2_table
      +    lea            r13, [save_tmvs_avx2_table]
      +    movifnidn    xendd, xendm
      +    movifnidn    yendd, yendm
      +    mov        xstartd, xstartm
      +    mov        ystartd, ystartm
           movq           xm5, [ref_signq]
           vbroadcasti128  m4, [base+save_ref_shuf]
           vbroadcasti128  m6, [base+save_cond_bct0]
      @@ -151,57 +155,54 @@ cglobal save_tmvs, 6, 15, 11, rp, stride, rr, ref_sign, \
           shl            r3d, 24
           movd           xm5, r3d
           vpbroadcastd    m5, xm5
      -    mov        ystartd, ystartm
      -    vbroadcasti128  m8, [base+save_cond0]
      -    vbroadcasti128  m9, [base+save_cond1]
      +    vpbroadcastq    m8, [base+save_cond0]
      +    vpbroadcastq    m9, [base+save_cond1]
           sub          yendd, ystartd
           add        ystartd, ystartd
      - DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand, bs
      + DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b
       .loop_y:
           and        ystartd, 30
      +    mov             xd, xstartd
           mov             bq, [rrq+ystartq*8]
           add        ystartd, 2
      -    mov        xstartd, xstartm
      -    mov             xd, xstartd
       .loop_x:
      -    lea          candd, [xstartq*3]
      -    movzx          bsd, byte [bq+candq*8+22]        ; cand_b->bs
      -    movu           xm0, [bq+candq*8+12]             ; cand_b
      -    movzx         r12d, byte [base+blk_dims+bsq]    ; bw8
      -    add        xstartd, r12d
      -    tzcnt         r12d, r12d
      -    movsxd        r12q, [r11q+r12q*4]
      -    add           r12q, r11q
      -    cmp        xstartd, xendd
      -    jge .calc
      -    lea          candd, [xstartq*3]
      -    movzx          bsd, byte [bq+candq*8+22]
      -    movu           xm1, [bq+candq*8+12]
      -    movzx         r13d, byte [base+blk_dims+bsq]
      -    add        xstartd, r13d
      -    tzcnt         r13d, r13d
      -    movsxd        r13q, [r11q+r13q*4]
      -    add           r13q, r11q
      -    cmp        xstartd, xendd
      -    jge .calc
      -    lea          candd, [xstartq*3]
      -    movzx          bsd, byte [bq+candq*8+22]
      -    vinserti128     m0, [bq+candq*8+12], 1
      -    movzx          r9d, byte [base+blk_dims+bsq]
      -    add        xstartd, r9d
      +    lea            r9d, [xq*3]
      +    movu           xm0, [bq+r9*8+12]             ; cand_b
      +    movzx          r9d, byte [bq+r9*8+22]        ; cand_b->bs
      +    movzx          r9d, byte [base+blk_dims+r9]  ; bw8
      +    lea           r12d, [xq+r9]
           tzcnt          r9d, r9d
      -    movsxd         r9q, [r11q+r9q*4]
      -    add            r9q, r11q
      -    cmp        xstartd, xendd
      +    movsxd          r9, [r13+r9*4]
      +    add             r9, r13
      +    cmp           r12d, xendd
           jge .calc
      -    lea           r14d, [xstartq*3]
      -    movzx          bsd, byte [bq+r14q*8+22]
      -    vinserti128     m1, [bq+r14q*8+12], 1
      -    movzx         r10d, byte [base+blk_dims+bsq]
      -    add        xstartd, r10d
      +    lea           r10d, [r12*3]
      +    movu           xm1, [bq+r10*8+12]
      +    movzx         r10d, byte [bq+r10*8+22]
      +    movzx         r10d, byte [base+blk_dims+r10]
      +    add           r12d, r10d
           tzcnt         r10d, r10d
      -    movsxd        r10q, [r11q+r10q*4]
      -    add           r10q, r11q
      +    movsxd         r10, [r13+r10*4]
      +    add            r10, r13
      +    cmp           r12d, xendd
      +    jge .calc
      +    lea           r11d, [r12*3]
      +    vinserti128     m0, [bq+r11*8+12], 1
      +    movzx         r11d, byte [bq+r11*8+22]
      +    movzx         r11d, byte [base+blk_dims+r11]
      +    add           r12d, r11d
      +    tzcnt         r11d, r11d
      +    movsxd         r11, [r13+r11*4]
      +    add            r11, r13
      +    cmp           r12d, xendd
      +    jge .calc
      +    lea           r12d, [r12*3]
      +    vinserti128     m1, [bq+r12*8+12], 1
      +    movzx         r12d, byte [bq+r12*8+22]
      +    movzx         r12d, byte [base+blk_dims+r12]
      +    tzcnt         r12d, r12d
      +    movsxd         r12, [r13+r12*4]
      +    add            r12, r13
       .calc:
           ; mv check
           punpcklqdq      m2, m0, m1  ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
      @@ -225,19 +226,19 @@ cglobal save_tmvs, 6, 15, 11, rp, stride, rr, ref_sign, \
           pshufb          m0, m3
           pshufb          m1, m3
           vpbroadcastq    m2, xm0
      -    call          r12q
      +    call            r9
           cmp             xd, xendd
           jge .next_line
           vpermq          m2, m1, q1111
      -    call          r13q
      +    call           r10
           cmp             xd, xendd
           jge .next_line
           vpermq          m2, m0, q2222
      -    call           r9q
      +    call           r11
           cmp             xd, xendd
           jge .next_line
           vpermq          m2, m1, q3333
      -    call          r10q
      +    call           r12
           cmp             xd, xendd
           jl .loop_x
       .next_line:
  • added 1 commit

    • fa20a8d1 - Add refmvs.save_tmvs AVX2 asm

    Compare with previous version

  • | commit                  | speed-up |
    |-------------------------|----------|
    | 9e64a170 (original)     | 1.395x   |
    | 1e144a0d (no stack)     | 1.443x   |
    | patch 0 (-1 reg)        | 2.275x   |
    | patch 1 (reduce scalar) | 2.643x   |
    Edited by Victorien Le Couviour--Tuffet
  • Henrik Gramner approved this merge request

    approved this merge request

  • Loading
  • Loading
  • Loading
  • Loading
  • Loading
  • Loading
  • Loading
  • Loading
  • Loading
  • Loading
  • Please register or sign in to reply
    Loading