...
 
Commits (29)
  • Jean-Baptiste Kempf's avatar
    On the road to 0.2.2 · dc769f5d
    Jean-Baptiste Kempf authored
    dc769f5d
  • Janne Grunau's avatar
    tools/dav1d: make the md5 muxer endian-aware · d9a911a4
    Janne Grunau authored
    Fixes tests on big endian architectures.
    d9a911a4
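    A minimal sketch of what "endian-aware" means here, assuming the muxer keeps the digest as 32-bit words internally: emit each word byte by byte in a fixed order instead of dumping its in-memory representation, so the output is identical on big- and little-endian hosts. The function name and layout are illustrative, not the actual tools/dav1d code.

        #include <stdint.h>
        #include <stdio.h>

        /* Illustrative only: serialize a 32-bit digest word in a fixed
         * (little-endian) byte order, independent of host endianness. */
        static void put_le32(FILE *const f, const uint32_t v) {
            fputc(v & 0xff, f);
            fputc((v >> 8) & 0xff, f);
            fputc((v >> 16) & 0xff, f);
            fputc((v >> 24) & 0xff, f);
        }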
  • James Almer's avatar
    decode: don't realloc the tile data buffer when it needs to be enlarged · f821d9ad
    James Almer authored
    Its previous contents don't need to be preserved.
    f821d9ad
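    A sketch of the pattern this commit describes, assuming a scratch buffer whose old contents are disposable: freeing and allocating fresh avoids the copy that realloc() may perform when growing. The names are illustrative, not the actual decode.c code.

        #include <stdint.h>
        #include <stdlib.h>

        /* Illustrative only: grow a buffer whose previous contents don't need
         * to be preserved. free()+malloc() skips the copy realloc() might do. */
        static int grow_scratch(uint8_t **const buf, size_t *const cap,
                                const size_t needed) {
            if (needed <= *cap) return 0;
            free(*buf);                /* old contents are disposable */
            *buf = malloc(needed);     /* fresh allocation, no copy */
            if (!*buf) { *cap = 0; return -1; }
            *cap = needed;
            return 0;
        }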
  • James Almer's avatar
    decode: add a frame tile data buffer size check · 7350c59e
    James Almer authored
    This check was already done in dav1d_parse_obus(), so it's added as an assert
    here as an extra precaution.
    7350c59e
  • Xuefeng Jiang's avatar
    Add SSSE3 implementation for ipred_cfl_ac_420 and ipred_cfl_ac_422 · 5d944dc6
    Xuefeng Jiang authored
    cfl_ac_420_w4_8bpc_c: 1621.0
    cfl_ac_420_w4_8bpc_ssse3: 92.5
    cfl_ac_420_w8_8bpc_c: 3344.1
    cfl_ac_420_w8_8bpc_ssse3: 115.4
    cfl_ac_420_w16_8bpc_c: 6024.9
    cfl_ac_420_w16_8bpc_ssse3: 187.8
    cfl_ac_422_w4_8bpc_c: 1762.5
    cfl_ac_422_w4_8bpc_ssse3: 81.4
    cfl_ac_422_w8_8bpc_c: 4941.2
    cfl_ac_422_w8_8bpc_ssse3: 166.5
    cfl_ac_422_w16_8bpc_c: 8261.8
    cfl_ac_422_w16_8bpc_ssse3: 272.3
    5d944dc6
  • Liwei Wang's avatar
    Add SSSE3 implementation for the 8x32 and 32x8 blocks in itx · 585ac462
    Liwei Wang authored
    Cycle times:
    inv_txfm_add_8x32_dct_dct_0_8bpc_c: 1164.7
    inv_txfm_add_8x32_dct_dct_0_8bpc_ssse3: 79.5
    inv_txfm_add_8x32_dct_dct_1_8bpc_c: 11291.6
    inv_txfm_add_8x32_dct_dct_1_8bpc_ssse3: 508.5
    inv_txfm_add_8x32_dct_dct_2_8bpc_c: 10720.4
    inv_txfm_add_8x32_dct_dct_2_8bpc_ssse3: 507.9
    inv_txfm_add_8x32_dct_dct_3_8bpc_c: 12351.5
    inv_txfm_add_8x32_dct_dct_3_8bpc_ssse3: 687.2
    inv_txfm_add_8x32_dct_dct_4_8bpc_c: 10402.3
    inv_txfm_add_8x32_dct_dct_4_8bpc_ssse3: 687.9
    inv_txfm_add_8x32_identity_identity_0_8bpc_c: 3485.0
    inv_txfm_add_8x32_identity_identity_0_8bpc_ssse3: 97.7
    inv_txfm_add_8x32_identity_identity_1_8bpc_c: 3495.7
    inv_txfm_add_8x32_identity_identity_1_8bpc_ssse3: 97.7
    inv_txfm_add_8x32_identity_identity_2_8bpc_c: 3503.7
    inv_txfm_add_8x32_identity_identity_2_8bpc_ssse3: 97.8
    inv_txfm_add_8x32_identity_identity_3_8bpc_c: 3489.5
    inv_txfm_add_8x32_identity_identity_3_8bpc_ssse3: 184.4
    inv_txfm_add_8x32_identity_identity_4_8bpc_c: 3498.1
    inv_txfm_add_8x32_identity_identity_4_8bpc_ssse3: 182.8
    inv_txfm_add_32x8_dct_dct_0_8bpc_c: 1220.4
    inv_txfm_add_32x8_dct_dct_0_8bpc_ssse3: 65.6
    inv_txfm_add_32x8_dct_dct_1_8bpc_c: 11120.7
    inv_txfm_add_32x8_dct_dct_1_8bpc_ssse3: 623.8
    inv_txfm_add_32x8_dct_dct_2_8bpc_c: 12236.3
    inv_txfm_add_32x8_dct_dct_2_8bpc_ssse3: 624.7
    inv_txfm_add_32x8_dct_dct_3_8bpc_c: 10866.3
    inv_txfm_add_32x8_dct_dct_3_8bpc_ssse3: 694.1
    inv_txfm_add_32x8_dct_dct_4_8bpc_c: 10322.8
    inv_txfm_add_32x8_dct_dct_4_8bpc_ssse3: 692.5
    inv_txfm_add_32x8_identity_identity_0_8bpc_c: 3368.1
    inv_txfm_add_32x8_identity_identity_0_8bpc_ssse3: 98.6
    inv_txfm_add_32x8_identity_identity_1_8bpc_c: 3381.1
    inv_txfm_add_32x8_identity_identity_1_8bpc_ssse3: 98.3
    inv_txfm_add_32x8_identity_identity_2_8bpc_c: 3376.6
    inv_txfm_add_32x8_identity_identity_2_8bpc_ssse3: 98.3
    inv_txfm_add_32x8_identity_identity_3_8bpc_c: 3364.3
    inv_txfm_add_32x8_identity_identity_3_8bpc_ssse3: 182.2
    inv_txfm_add_32x8_identity_identity_4_8bpc_c: 3390.0
    inv_txfm_add_32x8_identity_identity_4_8bpc_ssse3: 182.2
    585ac462
  • Henrik Gramner's avatar
    x86: Add minor CDEF AVX2 optimizations · 08020016
    Henrik Gramner authored
    08020016
  • Henrik Gramner's avatar
    Simplify C for inverse transforms · ad8d2174
    Henrik Gramner authored
    The second shift is constant.
    ad8d2174
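    As a rough illustration of the simplification (not the actual dav1d transform code): once the second, final rounding shift is known to be a constant such as 4, the rounding bias can be written directly instead of being derived from a variable shift amount at runtime.

        /* Illustrative only: variable shift needs the bias computed at runtime... */
        static inline int round_shift_var(const int c, const int shift) {
            return (c + (1 << (shift - 1))) >> shift;
        }

        /* ...whereas a constant second shift lets it be folded directly. */
        static inline int round_shift4(const int c) {
            return (c + 8) >> 4;
        }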
  • Martin Storsjö's avatar
    Only define DAV1D_API to dllexport when building dav1d itself · 3f2bb0d9
    Martin Storsjö authored
    As meson still doesn't allow specifying different cflags between
    static and dynamic libraries, this still includes the dllexport
    in the static library when built with default_library=both, but
    it at least is avoided in static-only builds, and avoids
    defining these symbols as dllexport in the callers' translation
    units.
    3f2bb0d9
  • Henrik Gramner's avatar
    build: Split x86 asm files per bitdepth · 72f8cc62
    Henrik Gramner authored
    72f8cc62
  • Liwei Wang's avatar
    Add SSSE3 implementation for the 16x32, 32x16 and 32x32 blocks in itx · bd12b1ec
    Liwei Wang authored
    Cycle times:
    inv_txfm_add_16x32_dct_dct_0_8bpc_c: 2464.6
    inv_txfm_add_16x32_dct_dct_0_8bpc_ssse3: 121.6
    inv_txfm_add_16x32_dct_dct_1_8bpc_c: 24751.6
    inv_txfm_add_16x32_dct_dct_1_8bpc_ssse3: 1101.9
    inv_txfm_add_16x32_dct_dct_2_8bpc_c: 24377.0
    inv_txfm_add_16x32_dct_dct_2_8bpc_ssse3: 1117.2
    inv_txfm_add_16x32_dct_dct_3_8bpc_c: 24155.6
    inv_txfm_add_16x32_dct_dct_3_8bpc_ssse3: 2349.3
    inv_txfm_add_16x32_dct_dct_4_8bpc_c: 24175.6
    inv_txfm_add_16x32_dct_dct_4_8bpc_ssse3: 1642.0
    inv_txfm_add_16x32_identity_identity_0_8bpc_c: 10304.7
    inv_txfm_add_16x32_identity_identity_0_8bpc_ssse3: 137.7
    inv_txfm_add_16x32_identity_identity_1_8bpc_c: 10341.6
    inv_txfm_add_16x32_identity_identity_1_8bpc_ssse3: 137.9
    inv_txfm_add_16x32_identity_identity_2_8bpc_c: 10299.9
    inv_txfm_add_16x32_identity_identity_2_8bpc_ssse3: 253.9
    inv_txfm_add_16x32_identity_identity_3_8bpc_c: 10331.4
    inv_txfm_add_16x32_identity_identity_3_8bpc_ssse3: 369.7
    inv_txfm_add_16x32_identity_identity_4_8bpc_c: 10360.4
    inv_txfm_add_16x32_identity_identity_4_8bpc_ssse3: 484.0
    inv_txfm_add_32x16_dct_dct_0_8bpc_c: 2288.4
    inv_txfm_add_32x16_dct_dct_0_8bpc_ssse3: 142.3
    inv_txfm_add_32x16_dct_dct_1_8bpc_c: 23819.9
    inv_txfm_add_32x16_dct_dct_1_8bpc_ssse3: 1740.1
    inv_txfm_add_32x16_dct_dct_2_8bpc_c: 23755.8
    inv_txfm_add_32x16_dct_dct_2_8bpc_ssse3: 1641.4
    inv_txfm_add_32x16_dct_dct_3_8bpc_c: 23839.9
    inv_txfm_add_32x16_dct_dct_3_8bpc_ssse3: 1559.0
    inv_txfm_add_32x16_dct_dct_4_8bpc_c: 23757.7
    inv_txfm_add_32x16_dct_dct_4_8bpc_ssse3: 1579.0
    inv_txfm_add_32x16_identity_identity_0_8bpc_c: 10381.7
    inv_txfm_add_32x16_identity_identity_0_8bpc_ssse3: 126.3
    inv_txfm_add_32x16_identity_identity_1_8bpc_c: 10402.5
    inv_txfm_add_32x16_identity_identity_1_8bpc_ssse3: 126.5
    inv_txfm_add_32x16_identity_identity_2_8bpc_c: 10429.2
    inv_txfm_add_32x16_identity_identity_2_8bpc_ssse3: 244.9
    inv_txfm_add_32x16_identity_identity_3_8bpc_c: 10382.0
    inv_txfm_add_32x16_identity_identity_3_8bpc_ssse3: 491.0
    inv_txfm_add_32x16_identity_identity_4_8bpc_c: 10381.0
    inv_txfm_add_32x16_identity_identity_4_8bpc_ssse3: 468.0
    inv_txfm_add_32x32_dct_dct_0_8bpc_c: 4168.2
    inv_txfm_add_32x32_dct_dct_0_8bpc_ssse3: 204.0
    inv_txfm_add_32x32_dct_dct_1_8bpc_c: 46306.2
    inv_txfm_add_32x32_dct_dct_1_8bpc_ssse3: 2216.0
    inv_txfm_add_32x32_dct_dct_2_8bpc_c: 46300.2
    inv_txfm_add_32x32_dct_dct_2_8bpc_ssse3: 2194.2
    inv_txfm_add_32x32_dct_dct_3_8bpc_c: 46350.1
    inv_txfm_add_32x32_dct_dct_3_8bpc_ssse3: 3484.4
    inv_txfm_add_32x32_dct_dct_4_8bpc_c: 46318.1
    inv_txfm_add_32x32_dct_dct_4_8bpc_ssse3: 3440.9
    inv_txfm_add_32x32_identity_identity_0_8bpc_c: 14663.1
    inv_txfm_add_32x32_identity_identity_0_8bpc_ssse3: 179.0
    inv_txfm_add_32x32_identity_identity_1_8bpc_c: 14737.0
    inv_txfm_add_32x32_identity_identity_1_8bpc_ssse3: 179.2
    inv_txfm_add_32x32_identity_identity_2_8bpc_c: 14640.4
    inv_txfm_add_32x32_identity_identity_2_8bpc_ssse3: 179.1
    inv_txfm_add_32x32_identity_identity_3_8bpc_c: 14638.5
    inv_txfm_add_32x32_identity_identity_3_8bpc_ssse3: 663.8
    inv_txfm_add_32x32_identity_identity_4_8bpc_c: 14635.6
    inv_txfm_add_32x32_identity_identity_4_8bpc_ssse3: 663.9
    bd12b1ec
  • Victorien Le Couviour--Tuffet's avatar
    x86: cdef_filter: use a better constant for SSE4 · 22c3594d
    Victorien Le Couviour--Tuffet authored
    Port of dc2ae517 for AVX-2
    from Kyle Siefring.
    
    ---------------------
    x86_64:
    ------------------------------------------
    cdef_filter_4x4_8bpc_ssse3: 141.7
    cdef_filter_4x4_8bpc_sse4: 128.3
    ------------------------------------------
    cdef_filter_4x8_8bpc_ssse3: 253.4
    cdef_filter_4x8_8bpc_sse4: 228.5
    ------------------------------------------
    cdef_filter_8x8_8bpc_ssse3: 429.6
    cdef_filter_8x8_8bpc_sse4: 379.9
    ------------------------------------------
    
    ---------------------
    x86_32:
    ------------------------------------------
    cdef_filter_4x4_8bpc_ssse3: 184.3
    cdef_filter_4x4_8bpc_sse4: 168.9
    ------------------------------------------
    cdef_filter_4x8_8bpc_ssse3: 335.3
    cdef_filter_4x8_8bpc_sse4: 305.1
    ------------------------------------------
    cdef_filter_8x8_8bpc_ssse3: 579.1
    cdef_filter_8x8_8bpc_sse4: 517.0
    ------------------------------------------
    22c3594d
  • Victorien Le Couviour--Tuffet's avatar
    x86: cdef_filter: use 8-bit arithmetic for SSE · 75e88fab
    Victorien Le Couviour--Tuffet authored
    Port of c204da0f for AVX-2
    from Kyle Siefring.
    
    ---------------------
    x86_64:
    ------------------------------------------
    before: cdef_filter_4x4_8bpc_ssse3: 141.7
     after: cdef_filter_4x4_8bpc_ssse3: 131.6
    before: cdef_filter_4x4_8bpc_sse4: 128.3
     after: cdef_filter_4x4_8bpc_sse4: 119.0
    ------------------------------------------
    before: cdef_filter_4x8_8bpc_ssse3: 253.4
     after: cdef_filter_4x8_8bpc_ssse3: 236.1
    before: cdef_filter_4x8_8bpc_sse4: 228.5
     after: cdef_filter_4x8_8bpc_sse4: 213.2
    ------------------------------------------
    before: cdef_filter_8x8_8bpc_ssse3: 429.6
     after: cdef_filter_8x8_8bpc_ssse3: 386.9
    before: cdef_filter_8x8_8bpc_sse4: 379.9
     after: cdef_filter_8x8_8bpc_sse4: 335.9
    ------------------------------------------
    
    ---------------------
    x86_32:
    ------------------------------------------
    before: cdef_filter_4x4_8bpc_ssse3: 184.3
     after: cdef_filter_4x4_8bpc_ssse3: 163.3
    before: cdef_filter_4x4_8bpc_sse4: 168.9
     after: cdef_filter_4x4_8bpc_sse4: 146.1
    ------------------------------------------
    before: cdef_filter_4x8_8bpc_ssse3: 335.3
     after: cdef_filter_4x8_8bpc_ssse3: 280.7
    before: cdef_filter_4x8_8bpc_sse4: 305.1
     after: cdef_filter_4x8_8bpc_sse4: 257.9
    ------------------------------------------
    before: cdef_filter_8x8_8bpc_ssse3: 579.1
     after: cdef_filter_8x8_8bpc_ssse3: 500.5
    before: cdef_filter_8x8_8bpc_sse4: 517.0
     after: cdef_filter_8x8_8bpc_sse4: 455.8
    ------------------------------------------
    75e88fab
  • Victorien Le Couviour--Tuffet's avatar
    x86: cdef_dir: optimize best cost finding for SSE · 91568b2a
    Victorien Le Couviour--Tuffet authored
    Port of 65ee1233 for AVX-2
    from Kyle Siefring to SSE41, and optimize SSSE3.
    
    ---------------------
    x86_64:
    ------------------------------------------
    before: cdef_dir_8bpc_ssse3: 110.3
     after: cdef_dir_8bpc_ssse3: 105.9
       new: cdef_dir_8bpc_sse4:   96.4
    ------------------------------------------
    
    ---------------------
    x86_32:
    ------------------------------------------
    before: cdef_dir_8bpc_ssse3: 120.6
     after: cdef_dir_8bpc_ssse3: 110.7
       new: cdef_dir_8bpc_sse4:  106.5
    ------------------------------------------
    91568b2a
  • Henrik Gramner's avatar
    CI: Check for newline at end of file · abb972a5
    Henrik Gramner authored
    abb972a5
  • Xuefeng Jiang's avatar
    Add SSSE3 implementation for ipred_cfl_ac_444 · 0d936a1a
    Xuefeng Jiang authored
    cfl_ac_444_w4_8bpc_c: 978.2
    cfl_ac_444_w4_8bpc_ssse3: 110.4
    cfl_ac_444_w8_8bpc_c: 2312.3
    cfl_ac_444_w8_8bpc_ssse3: 197.5
    cfl_ac_444_w16_8bpc_c: 4081.1
    cfl_ac_444_w16_8bpc_ssse3: 274.1
    cfl_ac_444_w32_8bpc_c: 9544.3
    cfl_ac_444_w32_8bpc_ssse3: 617.1
    0d936a1a
  • Martin Storsjö's avatar
    arm: Consistently use 8/24 columns indentation for assembly · 5d888dde
    Martin Storsjö authored
    For cases with indented, nested .if/.macro in asm.S, indent those
    by 4 chars.
    
    Some initial assembly files were indented to 4/16 columns, while all
    the actual implementation files, starting with src/arm/64/mc.S, have
    used 8/24 for indentation.
    5d888dde
  • Martin Storsjö's avatar
    arm: Fix typos in comments · 556780b7
    Martin Storsjö authored
    The width register has been set to clz(w)-24, not the other way
    around. And the 32 bit prep function has got the h parameter in
    r4, not in r5.
    556780b7
  • Martin Storsjö's avatar
    arm: Add a _neon suffix to all internal functions · 8bbcd3f7
    Martin Storsjö authored
    This makes it easier to disambiguate these functions when looking at perf
    profiles.
    8bbcd3f7
  • Xuefeng Jiang's avatar
    Add SSSE3 implementation for ipred_paeth · 44d0de41
    Xuefeng Jiang authored
    intra_pred_paeth_w4_8bpc_c: 561.6
    intra_pred_paeth_w4_8bpc_ssse3: 49.2
    intra_pred_paeth_w8_8bpc_c: 1475.8
    intra_pred_paeth_w8_8bpc_ssse3: 103.0
    intra_pred_paeth_w16_8bpc_c: 4697.8
    intra_pred_paeth_w16_8bpc_ssse3: 279.0
    intra_pred_paeth_w32_8bpc_c: 13245.1
    intra_pred_paeth_w32_8bpc_ssse3: 614.7
    intra_pred_paeth_w64_8bpc_c: 32638.9
    intra_pred_paeth_w64_8bpc_ssse3: 1477.6
    44d0de41
  • Henrik Gramner's avatar
    x86-64: Add msac_decode_symbol_adapt SSE2 asm · fa1b2651
    Henrik Gramner authored
    Also make various minor optimizations/style fixes to the MSAC C functions.
    fa1b2651
  • Martin Storsjö's avatar
    msac: Add a cast to indicate intended narrowing from size_t to unsigned · 003fa104
    Martin Storsjö authored
    This fixes this compiler warning with MSVC:
    ../src/msac.c(148): warning C4267: '+=': conversion from 'size_t' to 'unsigned int', possible loss of data
    003fa104
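    For illustration, a cast of the kind the commit describes (hypothetical variable names, not the actual msac.c line): the value is known to fit, and the explicit cast records that the size_t-to-unsigned narrowing is intentional, which silences MSVC's C4267.

        #include <stddef.h>

        /* Illustrative only: the explicit cast marks the narrowing as intended. */
        static unsigned advance(unsigned pos, const size_t len) {
            pos += (unsigned) len;   /* len is known to fit in an unsigned here */
            return pos;
        }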
  • Martin Storsjö's avatar
    arm64: looprestoration: Add a NEON implementation of SGR · 204bf211
    Martin Storsjö authored
    Relative speedup vs (autovectorized) C code:
                          Cortex A53    A72    A73
    selfguided_3x3_8bpc_neon:   2.91   2.12   2.68
    selfguided_5x5_8bpc_neon:   3.18   2.65   3.39
    selfguided_mix_8bpc_neon:   3.04   2.29   2.98
    
    The relative speedup vs non-vectorized C code is around 2.6-4.6x.
    204bf211
  • Martin Storsjö's avatar
    arm64: loopfilter: Implement NEON loop filters · 0282f6f3
    Martin Storsjö authored
    The exact relative speedup compared to C code is a bit vague and hard
    to measure, depending on exactly how many filtered blocks are skipped,
    as the NEON version always filters 16 pixels at a time, while the
    C code can skip processing individual 4 pixel blocks.
    
    Additionally, the checkasm benchmarking code runs the same function
    repeatedly on the same buffer, which can make the filter take
    different codepaths on each run, as the function updates the buffer
    which will be used as input for the next run.
    
    If the checkasm test data is tweaked to avoid skipped blocks,
    the relative speedup compared to C is between 2x and 5x, while
    it is around 1x to 4x with the current checkasm test as such.
    
    Benchmark numbers from a tweaked checkasm that avoids skipped
    blocks:
    
                            Cortex A53     A72     A73
    lpf_h_sb_uv_w4_8bpc_c:      2954.7  1399.3  1655.3
    lpf_h_sb_uv_w4_8bpc_neon:    895.5   650.8   692.0
    lpf_h_sb_uv_w6_8bpc_c:      3879.2  1917.2  2257.7
    lpf_h_sb_uv_w6_8bpc_neon:   1125.6   759.5   838.4
    lpf_h_sb_y_w4_8bpc_c:       6711.0  3275.5  3913.7
    lpf_h_sb_y_w4_8bpc_neon:    1744.0  1342.1  1351.5
    lpf_h_sb_y_w8_8bpc_c:      10695.7  6155.8  6638.9
    lpf_h_sb_y_w8_8bpc_neon:    2146.5  1560.4  1609.1
    lpf_h_sb_y_w16_8bpc_c:     11355.8  6292.0  6995.9
    lpf_h_sb_y_w16_8bpc_neon:   2475.4  1949.6  1968.4
    lpf_v_sb_uv_w4_8bpc_c:      2639.7  1204.8  1425.9
    lpf_v_sb_uv_w4_8bpc_neon:    510.7   351.4   334.7
    lpf_v_sb_uv_w6_8bpc_c:      3468.3  1757.1  2021.5
    lpf_v_sb_uv_w6_8bpc_neon:    625.0   415.0   397.8
    lpf_v_sb_y_w4_8bpc_c:       5428.7  2731.7  3068.5
    lpf_v_sb_y_w4_8bpc_neon:    1172.6   792.1   768.0
    lpf_v_sb_y_w8_8bpc_c:       8946.1  4412.8  5121.0
    lpf_v_sb_y_w8_8bpc_neon:    1565.5  1063.6  1062.7
    lpf_v_sb_y_w16_8bpc_c:      8978.9  4411.7  5112.0
    lpf_v_sb_y_w16_8bpc_neon:   1775.0  1288.1  1236.7
    0282f6f3
  • Ronald S. Bultje's avatar
    Over-allocate the level array by 3 bytes · 36e1490b
    Ronald S. Bultje authored
    This is a workaround so that the AVX2 implementation of deblock can
    index the levels array starting from the level type, which causes it
    to over-read by up to 3 bytes. This is intended to fix #269.
    36e1490b
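    A sketch of the workaround, with hypothetical names: the allocation is padded by 3 bytes so that SIMD code indexing the array from an offset can load a full vector without reading past the end of the allocation.

        #include <stdint.h>
        #include <stdlib.h>

        /* Illustrative only: pad the level array so vector loads that start a
         * few entries before the end stay inside the allocation. */
        static uint8_t *alloc_lvl(const size_t n_entries) {
            return malloc(n_entries + 3 /* up to 3 bytes of harmless over-read */);
        }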
  • Liwei Wang's avatar
    Add SSSE3 implementation for the {16, 32, 64}x64 and 64x{16, 32} blocks in itx · 589e96a1
    Liwei Wang authored
    Cycle times:
    inv_txfm_add_16x64_dct_dct_0_8bpc_c: 3973.5
    inv_txfm_add_16x64_dct_dct_0_8bpc_ssse3: 185.7
    inv_txfm_add_16x64_dct_dct_1_8bpc_c: 37869.1
    inv_txfm_add_16x64_dct_dct_1_8bpc_ssse3: 2103.1
    inv_txfm_add_16x64_dct_dct_2_8bpc_c: 37822.9
    inv_txfm_add_16x64_dct_dct_2_8bpc_ssse3: 2099.1
    inv_txfm_add_16x64_dct_dct_3_8bpc_c: 37871.7
    inv_txfm_add_16x64_dct_dct_3_8bpc_ssse3: 2663.5
    inv_txfm_add_16x64_dct_dct_4_8bpc_c: 38002.9
    inv_txfm_add_16x64_dct_dct_4_8bpc_ssse3: 2589.7
    inv_txfm_add_32x64_dct_dct_0_8bpc_c: 8319.2
    inv_txfm_add_32x64_dct_dct_0_8bpc_ssse3: 376.9
    inv_txfm_add_32x64_dct_dct_1_8bpc_c: 85956.8
    inv_txfm_add_32x64_dct_dct_1_8bpc_ssse3: 4298.1
    inv_txfm_add_32x64_dct_dct_2_8bpc_c: 89906.2
    inv_txfm_add_32x64_dct_dct_2_8bpc_ssse3: 4291.3
    inv_txfm_add_32x64_dct_dct_3_8bpc_c: 83710.9
    inv_txfm_add_32x64_dct_dct_3_8bpc_ssse3: 5589.5
    inv_txfm_add_32x64_dct_dct_4_8bpc_c: 87733.5
    inv_txfm_add_32x64_dct_dct_4_8bpc_ssse3: 5658.4
    inv_txfm_add_64x16_dct_dct_0_8bpc_c: 3895.9
    inv_txfm_add_64x16_dct_dct_0_8bpc_ssse3: 179.5
    inv_txfm_add_64x16_dct_dct_1_8bpc_c: 51375.2
    inv_txfm_add_64x16_dct_dct_1_8bpc_ssse3: 3859.2
    inv_txfm_add_64x16_dct_dct_2_8bpc_c: 52562.9
    inv_txfm_add_64x16_dct_dct_2_8bpc_ssse3: 4044.1
    inv_txfm_add_64x16_dct_dct_3_8bpc_c: 51347.0
    inv_txfm_add_64x16_dct_dct_3_8bpc_ssse3: 5259.5
    inv_txfm_add_64x16_dct_dct_4_8bpc_c: 49642.2
    inv_txfm_add_64x16_dct_dct_4_8bpc_ssse3: 4008.4
    inv_txfm_add_64x32_dct_dct_0_8bpc_c: 7196.4
    inv_txfm_add_64x32_dct_dct_0_8bpc_ssse3: 355.8
    inv_txfm_add_64x32_dct_dct_1_8bpc_c: 106588.4
    inv_txfm_add_64x32_dct_dct_1_8bpc_ssse3: 4965.3
    inv_txfm_add_64x32_dct_dct_2_8bpc_c: 106230.7
    inv_txfm_add_64x32_dct_dct_2_8bpc_ssse3: 4772.0
    inv_txfm_add_64x32_dct_dct_3_8bpc_c: 107427.0
    inv_txfm_add_64x32_dct_dct_3_8bpc_ssse3: 7146.9
    inv_txfm_add_64x32_dct_dct_4_8bpc_c: 111785.7
    inv_txfm_add_64x32_dct_dct_4_8bpc_ssse3: 7156.2
    inv_txfm_add_64x64_dct_dct_0_8bpc_c: 14512.4
    inv_txfm_add_64x64_dct_dct_0_8bpc_ssse3: 674.2
    inv_txfm_add_64x64_dct_dct_1_8bpc_c: 173246.3
    inv_txfm_add_64x64_dct_dct_1_8bpc_ssse3: 8790.8
    inv_txfm_add_64x64_dct_dct_2_8bpc_c: 174264.6
    inv_txfm_add_64x64_dct_dct_2_8bpc_ssse3: 8767.6
    inv_txfm_add_64x64_dct_dct_3_8bpc_c: 170047.3
    inv_txfm_add_64x64_dct_dct_3_8bpc_ssse3: 10784.9
    inv_txfm_add_64x64_dct_dct_4_8bpc_c: 170182.2
    inv_txfm_add_64x64_dct_dct_4_8bpc_ssse3: 10795.6
    589e96a1
  • Jean-Baptiste Kempf's avatar
    Update NEWS for 0.2.2 · 1f7a7e8a
    Jean-Baptiste Kempf authored
    1f7a7e8a
@@ -12,6 +12,12 @@ style-check:
     script:
         - git grep -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && exit 1
         - git grep -n -i "david" -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && exit 1
+        - for i in $(git ls-files -- . ':(exclude)*/compat/*'); do
+              if [ -n "$(tail -c 1 "$i")" ]; then
+                  echo "No newline at end of $i";
+                  exit 1;
+              fi;
+          done
         - git remote rm upstream 2> /dev/null || true
         - git remote add upstream https://code.videolan.org/videolan/dav1d.git
         - git fetch -q upstream master
......
Changes for 0.2.2 'Antelope':
----------------------------

 - Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
   The impact is important on SSSE3, SSE4 and AVX-2 CPUs
 - SSSE3 optimizations for all block sizes in itx
 - SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
 - Speed improvements on CDEF for SSE4 CPUs
 - NEON optimizations for SGR and loop filter
 - Minor crash fixes, improvements and build changes


Changes for 0.2.1 'Antelope':
----------------------------
......
@@ -33,7 +33,11 @@
 #ifndef DAV1D_API
     #if defined _WIN32
-        #define DAV1D_API __declspec(dllexport)
+        #if defined DAV1D_BUILDING_DLL
+            #define DAV1D_API __declspec(dllexport)
+        #else
+            #define DAV1D_API
+        #endif
     #else
         #if __GNUC__ >= 4
             #define DAV1D_API __attribute__ ((visibility ("default")))
......
@@ -23,7 +23,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 project('dav1d', ['c'],
-    version: '0.2.1',
+    version: '0.2.2',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
@@ -214,6 +214,8 @@ endif
 stackalign_flag = []
 stackrealign_flag = []

+cdata.set10('ENDIANNESS_BIG', host_machine.endian() == 'big')
+
 if host_machine.cpu_family().startswith('x86')
     if get_option('stack_alignment') > 0
         stack_alignment = get_option('stack_alignment')
......
@@ -217,8 +217,8 @@ bidir_fn mask
 // This has got the same signature as the put_8tap functions,
 // assumes that the caller has loaded the h argument into r5,
-// and assumes that r8 is set to (24-clz(w)).
-function put
+// and assumes that r8 is set to (clz(w)-24).
+function put_neon
         adr             r9,  L(put_tbl)
         ldr             r8,  [r9, r8, lsl #2]
         add             r9,  r9,  r8
@@ -307,9 +307,9 @@ endfunc
 // This has got the same signature as the put_8tap functions,
-// assumes that the caller has loaded the h argument into r5,
-// and assumes that r8 is set to (24-clz(w)), and r7 to w*2.
-function prep
+// assumes that the caller has loaded the h argument into r4,
+// and assumes that r8 is set to (clz(w)-24), and r7 to w*2.
+function prep_neon
         adr             r9,  L(prep_tbl)
         ldr             r8,  [r9, r8, lsl #2]
         add             r9,  r9,  r8
@@ -660,7 +660,7 @@ function \op\()_8tap_\type\()_8bpc_neon, export=1
         push            {r4-r11,lr}
         movw            r8,  \type_h
         movw            r9,  \type_v
-        b               \op\()_8tap
+        b               \op\()_8tap_neon
 endfunc
 .endm
@@ -680,7 +680,7 @@ make_8tap_fn \type, sharp, SHARP, SHARP
 make_8tap_fn \type, sharp_regular, SHARP, REGULAR
 make_8tap_fn \type, sharp_smooth, SHARP, SMOOTH

-function \type\()_8tap
+function \type\()_8tap_neon
         ldrd            r4,  r5,  [sp, #36]
         ldrd            r6,  r7,  [sp, #44]
         movw            r10, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0)
@@ -699,7 +699,7 @@ function \type\()_8tap
         bne             L(\type\()_8tap_h)
         tst             \my, #(0x7f << 14)
         bne             L(\type\()_8tap_v)
-        b               \type
+        b               \type\()_neon

 L(\type\()_8tap_h):
         cmp             \w,  #4
@@ -1831,7 +1831,7 @@ function \type\()_bilin_8bpc_neon, export=1
         bne             L(\type\()_bilin_h)
         cmp             \my, #0
         bne             L(\type\()_bilin_v)
-        b               \type
+        b               \type\()_neon

 L(\type\()_bilin_h):
         cmp             \my, #0
......
@@ -34,32 +34,32 @@
 .macro movrel rd, val, offset=0
 #if defined(PIC) && defined(__APPLE__)
-    ldr         \rd,  1f
-    b           2f
+        ldr             \rd,  1f
+        b               2f
 1:
-    .word       3f - (2f + 8 - 4 * CONFIG_THUMB)
+        .word           3f - (2f + 8 - 4 * CONFIG_THUMB)
 2:
-    ldr         \rd,  [pc, \rd]
+        ldr             \rd,  [pc, \rd]
 .if \offset < 0
-    sub         \rd,  \rd,  #-(\offset)
+        sub             \rd,  \rd,  #-(\offset)
 .elseif \offset > 0
-    add         \rd,  \rd,  #\offset
+        add             \rd,  \rd,  #\offset
 .endif
-    .non_lazy_symbol_pointer
+        .non_lazy_symbol_pointer
 3:
-    .indirect_symbol \val
-    .word       0
-    .text
+        .indirect_symbol \val
+        .word           0
+        .text
 #elif defined(PIC)
-    ldr         \rd,  1f
-    b           2f
+        ldr             \rd,  1f
+        b               2f
 1:
-    .word       \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
+        .word           \val + \offset - (2f + 8 - 4 * CONFIG_THUMB)
 2:
-    add         \rd,  \rd,  pc
+        add             \rd,  \rd,  pc
 #else
-    movw        \rd, #:lower16:\val+\offset
-    movt        \rd, #:upper16:\val+\offset
+        movw            \rd, #:lower16:\val+\offset
+        movt            \rd, #:upper16:\val+\offset
 #endif
 .endm
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
uabd v0.16b, v22.16b, v23.16b // abs(p1 - p0)
uabd v1.16b, v25.16b, v24.16b // abs(q1 - q0)
uabd v2.16b, v23.16b, v24.16b // abs(p0 - q0)
uabd v3.16b, v22.16b, v25.16b // abs(p1 - q1)
.if \wd >= 6
uabd v4.16b, v21.16b, v22.16b // abs(p2 - p1)
uabd v5.16b, v26.16b, v25.16b // abs(q2 - q1)
.if \wd >= 8
uabd v6.16b, v20.16b, v21.16b // abs(p3 - p2)
uabd v7.16b, v27.16b, v26.16b // abs(q3 - q2)
.endif
.endif
.if \wd >= 6
umax v4.16b, v4.16b, v5.16b
.endif
uqadd v2.16b, v2.16b, v2.16b // abs(p0 - q0) * 2
.if \wd >= 8
umax v6.16b, v6.16b, v7.16b
.endif
ushr v3.16b, v3.16b, #1
.if \wd >= 8
umax v4.16b, v4.16b, v6.16b
.endif
.if \wd >= 6
and v4.16b, v4.16b, v14.16b
.endif
umax v0.16b, v0.16b, v1.16b // max(abs(p1 - p0), abs(q1 - q0))
uqadd v2.16b, v2.16b, v3.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
umax v4.16b, v0.16b, v4.16b
cmhs v1.16b, v11.16b, v4.16b // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
cmhs v1.16b, v11.16b, v0.16b // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
cmhs v2.16b, v10.16b, v2.16b // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
and v1.16b, v1.16b, v2.16b // fm
and v1.16b, v1.16b, v13.16b // fm && wd >= 4
.if \wd >= 6
and v14.16b, v14.16b, v1.16b // fm && wd > 4
.endif
.if \wd >= 6
and v15.16b, v15.16b, v1.16b // fm && wd == 16
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
adds x16, x16, x17
b.eq 9f // if (!fm || wd < 4) return;
.if \wd >= 6
movi v10.16b, #1
uabd v2.16b, v21.16b, v23.16b // abs(p2 - p0)
uabd v3.16b, v22.16b, v23.16b // abs(p1 - p0)
uabd v4.16b, v25.16b, v24.16b // abs(q1 - q0)
uabd v5.16b, v26.16b, v24.16b // abs(q2 - q0)
.if \wd >= 8
uabd v6.16b, v20.16b, v23.16b // abs(p3 - p0)
uabd v7.16b, v27.16b, v24.16b // abs(q3 - q0)
.endif
umax v2.16b, v2.16b, v3.16b
umax v4.16b, v4.16b, v5.16b
.if \wd >= 8
umax v6.16b, v6.16b, v7.16b
.endif
umax v2.16b, v2.16b, v4.16b
.if \wd >= 8
umax v2.16b, v2.16b, v6.16b
.endif
.if \wd == 16
uabd v3.16b, v17.16b, v23.16b // abs(p6 - p0)
uabd v4.16b, v18.16b, v23.16b // abs(p5 - p0)
uabd v5.16b, v19.16b, v23.16b // abs(p4 - p0)
.endif
cmhs v2.16b, v10.16b, v2.16b // flat8in
.if \wd == 16
uabd v6.16b, v28.16b, v24.16b // abs(q4 - q0)
uabd v7.16b, v29.16b, v24.16b // abs(q5 - q0)
uabd v8.16b, v30.16b, v24.16b // abs(q6 - q0)
.endif
and v14.16b, v2.16b, v14.16b // flat8in && fm && wd > 4
bic v1.16b, v1.16b, v14.16b // fm && wd >= 4 && !flat8in
.if \wd == 16
umax v3.16b, v3.16b, v4.16b
umax v5.16b, v5.16b, v6.16b
.endif
mov x16, v1.d[0]
mov x17, v1.d[1]
.if \wd == 16
umax v7.16b, v7.16b, v8.16b
umax v3.16b, v3.16b, v5.16b
umax v3.16b, v3.16b, v7.16b
cmhs v3.16b, v10.16b, v3.16b // flat8out
.endif
adds x16, x16, x17
.if \wd == 16
and v15.16b, v15.16b, v3.16b // flat8out && fm && wd == 16
and v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
bic v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
.endif
b.eq 1f // skip wd == 4 case
.endif
usubl v2.8h, v22.8b, v25.8b // p1 - q1
usubl2 v3.8h, v22.16b, v25.16b
cmhi v0.16b, v0.16b, v12.16b // hev
sqxtn v2.8b, v2.8h // iclip_diff(p1 - q1)
sqxtn2 v2.16b, v3.8h
and v4.16b, v2.16b, v0.16b // if (hev) iclip_diff(p1 - q1)
bic v0.16b, v1.16b, v0.16b // (fm && wd >= 4 && !hev)
usubl v2.8h, v24.8b, v23.8b
movi v5.8h, #3
usubl2 v3.8h, v24.16b, v23.16b
mul v2.8h, v2.8h, v5.8h
mul v3.8h, v3.8h, v5.8h
movi v6.16b, #4
saddw v2.8h, v2.8h, v4.8b
saddw2 v3.8h, v3.8h, v4.16b
movi v7.16b, #3
sqxtn v2.8b, v2.8h // f
sqxtn2 v2.16b, v3.8h
sqadd v4.16b, v6.16b, v2.16b // imin(f + 4, 128)
sqadd v5.16b, v7.16b, v2.16b // imin(f + 3, 128)
sshr v4.16b, v4.16b, #3 // f1
sshr v5.16b, v5.16b, #3 // f2
uxtl v2.8h, v23.8b // p0
uxtl2 v3.8h, v23.16b
uxtl v6.8h, v24.8b // q0
uxtl2 v7.8h, v24.16b
saddw v2.8h, v2.8h, v5.8b
saddw2 v3.8h, v3.8h, v5.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
srshr v4.16b, v4.16b, #1 // (f1 + 1) >> 1
sqxtun v2.8b, v2.8h // out p0
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q0
sqxtun2 v6.16b, v7.8h
bit v23.16b, v2.16b, v1.16b // if (fm && wd >= 4)
uxtl v2.8h, v22.8b // p1
uxtl2 v3.8h, v22.16b
bit v24.16b, v6.16b, v1.16b // if (fm && wd >= 4)
uxtl v6.8h, v25.8b // q1
uxtl2 v7.8h, v25.16b
saddw v2.8h, v2.8h, v4.8b
saddw2 v3.8h, v3.8h, v4.16b
ssubw v6.8h, v6.8h, v4.8b
ssubw2 v7.8h, v7.8h, v4.16b
sqxtun v2.8b, v2.8h // out p1
sqxtun2 v2.16b, v3.8h
sqxtun v6.8b, v6.8h // out q1
sqxtun2 v6.16b, v7.8h
bit v22.16b, v2.16b, v0.16b // if (fm && wd >= 4 && !hev)
bit v25.16b, v6.16b, v0.16b // if (fm && wd >= 4 && !hev)
1:
.if \wd == 6
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 2f // skip if there's no flat8in
uaddl v0.8h, v21.8b, v21.8b // p2 * 2
uaddl2 v1.8h, v21.16b, v21.16b
uaddl v2.8h, v21.8b, v22.8b // p2 + p1
uaddl2 v3.8h, v21.16b, v22.16b
uaddl v4.8h, v22.8b, v23.8b // p1 + p0
uaddl2 v5.8h, v22.16b, v23.16b
uaddl v6.8h, v23.8b, v24.8b // p0 + q0
uaddl2 v7.8h, v23.16b, v24.16b
add v8.8h, v0.8h, v2.8h
add v9.8h, v1.8h, v3.8h
add v10.8h, v4.8h, v6.8h
add v11.8h, v5.8h, v7.8h
uaddl v12.8h, v24.8b, v25.8b // q0 + q1
uaddl2 v13.8h, v24.16b, v25.16b
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
sub v12.8h, v12.8h, v0.8h
sub v13.8h, v13.8h, v1.8h
uaddl v10.8h, v25.8b, v26.8b // q1 + q2
uaddl2 v11.8h, v25.16b, v26.16b
rshrn v0.8b, v8.8h, #3 // out p1
rshrn2 v0.16b, v9.8h, #3
add v8.8h, v8.8h, v12.8h
add v9.8h, v9.8h, v13.8h
sub v10.8h, v10.8h, v2.8h
sub v11.8h, v11.8h, v3.8h
uaddl v12.8h, v26.8b, v26.8b // q2 + q2
uaddl2 v13.8h, v26.16b, v26.16b
rshrn v1.8b, v8.8h, #3 // out p0
rshrn2 v1.16b, v9.8h, #3
add v8.8h, v8.8h, v10.8h
add v9.8h, v9.8h, v11.8h
sub v12.8h, v12.8h, v4.8h
sub v13.8h, v13.8h, v5.8h
rshrn v2.8b, v8.8h, #3 // out q0
rshrn2 v2.16b, v9.8h, #3
bit v22.16b, v0.16b, v14.16b // p1 if (flat8in)
add v8.8h, v8.8h, v12.8h
add v9.8h, v9.8h, v13.8h
bit v23.16b, v1.16b, v14.16b // p0 if (flat8in)
rshrn v3.8b, v8.8h, #3 // out q1
rshrn2 v3.16b, v9.8h, #3
bit v24.16b, v2.16b, v14.16b // q0 if (flat8in)
bit v25.16b, v3.16b, v14.16b // q1 if (flat8in)
.elseif \wd >= 8
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
.if \wd == 8
b.eq 8f // skip if there's no flat8in
.else
b.eq 2f // skip if there's no flat8in
.endif
uaddl v0.8h, v20.8b, v21.8b // p3 + p2
uaddl2 v1.8h, v20.16b, v21.16b
uaddl v2.8h, v22.8b, v25.8b // p1 + q1
uaddl2 v3.8h, v22.16b, v25.16b
uaddl v4.8h, v20.8b, v22.8b // p3 + p1
uaddl2 v5.8h, v20.16b, v22.16b
uaddl v6.8h, v23.8b, v26.8b // p0 + q2
uaddl2 v7.8h, v23.16b, v26.16b
add v8.8h, v0.8h, v0.8h // 2 * (p3 + p2)
add v9.8h, v1.8h, v1.8h
uaddw v8.8h, v8.8h, v23.8b // + p0
uaddw2 v9.8h, v9.8h, v23.16b
uaddw v8.8h, v8.8h, v24.8b // + q0
uaddw2 v9.8h, v9.8h, v24.16b
add v8.8h, v8.8h, v4.8h
add v9.8h, v9.8h, v5.8h // + p3 + p1
sub v2.8h, v2.8h, v0.8h // p1 + q1 - p3 - p2
sub v3.8h, v3.8h, v1.8h
sub v6.8h, v6.8h, v4.8h // p0 + q2 - p3 - p1
sub v7.8h, v7.8h, v5.8h
rshrn v10.8b, v8.8h, #3 // out p2
rshrn2 v10.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h, v9.8h, v3.8h
uaddl v0.8h, v20.8b, v23.8b // p3 + p0
uaddl2 v1.8h, v20.16b, v23.16b
uaddl v2.8h, v24.8b, v27.8b // q0 + q3
uaddl2 v3.8h, v24.16b, v27.16b
rshrn v11.8b, v8.8h, #3 // out p1
rshrn2 v11.16b, v9.8h, #3
add v8.8h, v8.8h, v6.8h
add v9.8h, v9.8h, v7.8h
sub v2.8h, v2.8h, v0.8h // q0 + q3 - p3 - p0
sub v3.8h, v3.8h, v1.8h
uaddl v4.8h, v21.8b, v24.8b // p2 + q0
uaddl2 v5.8h, v21.16b, v24.16b
uaddl v6.8h, v25.8b, v27.8b // q1 + q3
uaddl2 v7.8h, v25.16b, v27.16b
rshrn v12.8b, v8.8h, #3 // out p0
rshrn2 v12.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h, v9.8h, v3.8h
sub v6.8h, v6.8h, v4.8h // q1 + q3 - p2 - q0
sub v7.8h, v7.8h, v5.8h
uaddl v0.8h, v22.8b, v25.8b // p1 + q1
uaddl2 v1.8h, v22.16b, v25.16b
uaddl v2.8h, v26.8b, v27.8b // q2 + q3
uaddl2 v3.8h, v26.16b, v27.16b
rshrn v13.8b, v8.8h, #3 // out q0
rshrn2 v13.16b, v9.8h, #3
add v8.8h, v8.8h, v6.8h
add v9.8h, v9.8h, v7.8h
sub v2.8h, v2.8h, v0.8h // q2 + q3 - p1 - q1
sub v3.8h, v3.8h, v1.8h
rshrn v0.8b, v8.8h, #3 // out q1
rshrn2 v0.16b, v9.8h, #3
add v8.8h, v8.8h, v2.8h
add v9.8h , v9.8h, v3.8h
bit v21.16b, v10.16b, v14.16b
bit v22.16b, v11.16b, v14.16b
bit v23.16b, v12.16b, v14.16b
rshrn v1.8b, v8.8h, #3 // out q2
rshrn2 v1.16b, v9.8h, #3
bit v24.16b, v13.16b, v14.16b
bit v25.16b, v0.16b, v14.16b
bit v26.16b, v1.16b, v14.16b
.endif
2:
.if \wd == 16
mov x16, v15.d[0]
mov x17, v15.d[1]
adds x16, x16, x17
b.ne 1f // check if flat8out is needed
mov x16, v14.d[0]
mov x17, v14.d[1]
adds x16, x16, x17
b.eq 8f // if there was no flat8in, just write the inner 4 pixels
b 7f // if flat8in was used, write the inner 6 pixels
1:
uaddl v2.8h, v17.8b, v17.8b // p6 + p6
uaddl2 v3.8h, v17.16b, v17.16b
uaddl v4.8h, v17.8b, v18.8b // p6 + p5
uaddl2 v5.8h, v17.16b, v18.16b
uaddl v6.8h, v17.8b, v19.8b // p6 + p4
uaddl2 v7.8h, v17.16b, v19.16b
uaddl v8.8h, v17.8b, v20.8b // p6 + p3
uaddl2 v9.8h, v17.16b, v20.16b
add v12.8h, v2.8h, v4.8h
add v13.8h, v3.8h, v5.8h
add v10.8h, v6.8h, v8.8h
add v11.8h, v7.8h, v9.8h
uaddl v6.8h, v17.8b, v21.8b // p6 + p2
uaddl2 v7.8h, v17.16b, v21.16b
add v12.8h, v12.8h, v10.8h
add v13.8h, v13.8h, v11.8h
uaddl v8.8h, v17.8b, v22.8b // p6 + p1
uaddl2 v9.8h, v17.16b, v22.16b
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
add v6.8h, v6.8h, v8.8h
add v7.8h, v7.8h, v9.8h
uaddl v8.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v9.8h, v19.16b, v24.16b
add v12.8h, v12.8h, v6.8h
add v13.8h, v13.8h, v7.8h
add v10.8h, v10.8h, v8.8h
add v11.8h, v11.8h, v9.8h
uaddl v6.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v7.8h, v20.16b, v25.16b
add v12.8h, v12.8h, v10.8h
add v13.8h, v13.8h, v11.8h
sub v6.8h, v6.8h, v2.8h
sub v7.8h, v7.8h, v3.8h
uaddl v2.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v3.8h, v21.16b, v26.16b
rshrn v0.8b, v12.8h, #4 // out p5
rshrn2 v0.16b, v13.8h, #4
add v12.8h, v12.8h, v6.8h // - (p6 + p6) + (p3 + q1)
add v13.8h, v13.8h, v7.8h
sub v2.8h, v2.8h, v4.8h
sub v3.8h, v3.8h, v5.8h
uaddl v4.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v5.8h, v22.16b, v27.16b
uaddl v6.8h, v17.8b, v19.8b // p6 + p4
uaddl2 v7.8h, v17.16b, v19.16b
rshrn v1.8b, v12.8h, #4 // out p4
rshrn2 v1.16b, v13.8h, #4
add v12.8h, v12.8h, v2.8h // - (p6 + p5) + (p2 + q2)
add v13.8h, v13.8h, v3.8h
sub v4.8h, v4.8h, v6.8h
sub v5.8h, v5.8h, v7.8h
uaddl v6.8h, v23.8b, v28.8b // p0 + q4
uaddl2 v7.8h, v23.16b, v28.16b
uaddl v8.8h, v17.8b, v20.8b // p6 + p3
uaddl2 v9.8h, v17.16b, v20.16b
rshrn v2.8b, v12.8h, #4 // out p3
rshrn2 v2.16b, v13.8h, #4
add v12.8h, v12.8h, v4.8h // - (p6 + p4) + (p1 + q3)
add v13.8h, v13.8h, v5.8h
sub v6.8h, v6.8h, v8.8h
sub v7.8h, v7.8h, v9.8h
uaddl v8.8h, v24.8b, v29.8b // q0 + q5
uaddl2 v9.8h, v24.16b, v29.16b
uaddl v4.8h, v17.8b, v21.8b // p6 + p2
uaddl2 v5.8h, v17.16b, v21.16b
rshrn v3.8b, v12.8h, #4 // out p2
rshrn2 v3.16b, v13.8h, #4
add v12.8h, v12.8h, v6.8h // - (p6 + p3) + (p0 + q4)
add v13.8h, v13.8h, v7.8h
sub v8.8h, v8.8h, v4.8h
sub v9.8h, v9.8h, v5.8h
uaddl v6.8h, v25.8b, v30.8b // q1 + q6
uaddl2 v7.8h, v25.16b, v30.16b
uaddl v10.8h, v17.8b, v22.8b // p6 + p1
uaddl2 v11.8h, v17.16b, v22.16b
rshrn v4.8b, v12.8h, #4 // out p1
rshrn2 v4.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p6 + p2) + (q0 + q5)
add v13.8h, v13.8h, v9.8h
sub v6.8h, v6.8h, v10.8h
sub v7.8h, v7.8h, v11.8h
uaddl v8.8h, v26.8b, v30.8b // q2 + q6
uaddl2 v9.8h, v26.16b, v30.16b
uaddl v10.8h, v18.8b, v23.8b // p5 + p0
uaddl2 v11.8h, v18.16b, v23.16b
rshrn v5.8b, v12.8h, #4 // out p0
rshrn2 v5.16b, v13.8h, #4
add v12.8h, v12.8h, v6.8h // - (p6 + p1) + (q1 + q6)
add v13.8h, v13.8h, v7.8h
sub v8.8h, v8.8h, v10.8h
sub v9.8h, v9.8h, v11.8h
uaddl v10.8h, v27.8b, v30.8b // q3 + q6
uaddl2 v11.8h, v27.16b, v30.16b
bif v0.16b, v18.16b, v15.16b // out p5
uaddl v14.8h, v19.8b, v24.8b // p4 + q0
uaddl2 v18.8h, v19.16b, v24.16b
rshrn v6.8b, v12.8h, #4 // out q0
rshrn2 v6.16b, v13.8h, #4
add v12.8h, v12.8h, v8.8h // - (p5 + p0) + (q2 + q6)
add v13.8h, v13.8h, v9.8h
sub v10.8h, v10.8h, v14.8h
sub v11.8h, v11.8h, v18.8h
uaddl v14.8h, v28.8b, v30.8b // q4 + q6
uaddl2 v18.8h, v28.16b, v30.16b
bif v1.16b, v19.16b, v15.16b // out p4
uaddl v8.8h, v20.8b, v25.8b // p3 + q1
uaddl2 v9.8h, v20.16b, v25.16b
rshrn v7.8b, v12.8h, #4 // out q1
rshrn2 v7.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p4 + q0) + (q3 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v8.8h
sub v18.8h, v18.8h, v9.8h
uaddl v10.8h, v29.8b, v30.8b // q5 + q6
uaddl2 v11.8h, v29.16b, v30.16b
bif v2.16b, v20.16b, v15.16b // out p3
uaddl v19.8h, v21.8b, v26.8b // p2 + q2
uaddl2 v20.8h, v21.16b, v26.16b
rshrn v8.8b, v12.8h, #4 // out q2
rshrn2 v8.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p3 + q1) + (q4 + q6)
add v13.8h, v13.8h, v18.8h
sub v10.8h, v10.8h, v19.8h
sub v11.8h, v11.8h, v20.8h
uaddl v14.8h, v30.8b, v30.8b // q6 + q6
uaddl2 v18.8h, v30.16b, v30.16b
bif v3.16b, v21.16b, v15.16b // out p2
uaddl v19.8h, v22.8b, v27.8b // p1 + q3
uaddl2 v20.8h, v22.16b, v27.16b
rshrn v9.8b, v12.8h, #4 // out q3
rshrn2 v9.16b, v13.8h, #4
add v12.8h, v12.8h, v10.8h // - (p2 + q2) + (q5 + q6)
add v13.8h, v13.8h, v11.8h
sub v14.8h, v14.8h, v19.8h
sub v18.8h, v18.8h, v20.8h
bif v4.16b, v22.16b, v15.16b // out p1
rshrn v10.8b, v12.8h, #4 // out q4
rshrn2 v10.16b, v13.8h, #4
add v12.8h, v12.8h, v14.8h // - (p1 + q3) + (q6 + q6)
add v13.8h, v13.8h, v18.8h
rshrn v11.8b, v12.8h, #4 // out q5
rshrn2 v11.16b, v13.8h, #4
bif v5.16b, v23.16b, v15.16b // out p0
bif v6.16b, v24.16b, v15.16b // out q0
bif v7.16b, v25.16b, v15.16b // out q1
bif v8.16b, v26.16b, v15.16b // out q2
bif v9.16b, v27.16b, v15.16b // out q3
bif v10.16b, v28.16b, v15.16b // out q4
bif v11.16b, v29.16b, v15.16b // out q5
.endif
ret
.if \wd == 16
7:
// Return to a shorter epilogue, writing only the inner 6 pixels
br x13
.endif
.if \wd >= 8
8:
// Return to a shorter epilogue, writing only the inner 4 pixels
br x14
.endif
9:
// Return directly without writing back any pixels
br x15
endfunc
.endm
loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
.macro lpf_16_wd16
adr x13, 7f
adr x14, 8f
bl lpf_16_wd16_neon
.endm
.macro lpf_16_wd8
adr x14, 8f
bl lpf_16_wd8_neon
.endm
.macro lpf_16_wd6
bl lpf_16_wd6_neon
.endm
.macro lpf_16_wd4
bl lpf_16_wd4_neon
.endm
function lpf_v_4_16_neon
mov x15, x30
sub x16, x0, x1, lsl #1
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
lpf_16_wd4
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_4_16_neon
mov x15, x30
sub x16, x0, #2
add x0, x16, x1, lsl #3
ld1 {v22.s}[0], [x16], x1
ld1 {v22.s}[2], [x0], x1
ld1 {v23.s}[0], [x16], x1
ld1 {v23.s}[2], [x0], x1
ld1 {v24.s}[0], [x16], x1
ld1 {v24.s}[2], [x0], x1
ld1 {v25.s}[0], [x16], x1
ld1 {v25.s}[2], [x0], x1
ld1 {v22.s}[1], [x16], x1
ld1 {v22.s}[3], [x0], x1
ld1 {v23.s}[1], [x16], x1
ld1 {v23.s}[3], [x0], x1
ld1 {v24.s}[1], [x16], x1
ld1 {v24.s}[3], [x0], x1
ld1 {v25.s}[1], [x16], x1
ld1 {v25.s}[3], [x0], x1
add x0, x0, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
lpf_16_wd4
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1
st1 {v22.s}[1], [x16], x1
st1 {v22.s}[3], [x0], x1
st1 {v23.s}[1], [x16], x1
st1 {v23.s}[3], [x0], x1
st1 {v24.s}[1], [x16], x1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x16], x1
st1 {v25.s}[3], [x0], x1
add x0, x0, #2
br x15
endfunc
function lpf_v_6_16_neon
mov x15, x30
sub x16, x0, x1, lsl #1
sub x16, x16, x1
ld1 {v21.16b}, [x16], x1 // p2
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v25.16b}, [x0], x1 // q1
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v26.16b}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
lpf_16_wd6
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_6_16_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #3
ld1 {v20.d}[0], [x16], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v21.d}[0], [x16], x1
ld1 {v21.d}[1], [x0], x1
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
ld1 {v26.d}[0], [x16], x1
ld1 {v26.d}[1], [x0], x1
ld1 {v27.d}[0], [x16], x1
ld1 {v27.d}[1], [x0], x1
add x0, x0, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_16_wd6
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1
st1 {v22.s}[1], [x16], x1
st1 {v22.s}[3], [x0], x1
st1 {v23.s}[1], [x16], x1
st1 {v23.s}[3], [x0], x1
st1 {v24.s}[1], [x16], x1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x16], x1
st1 {v25.s}[3], [x0], x1
add x0, x0, #2
br x15
endfunc
function lpf_v_8_16_neon
mov x15, x30
sub x16, x0, x1, lsl #2
ld1 {v20.16b}, [x16], x1 // p3
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v21.16b}, [x16], x1 // p2
ld1 {v25.16b}, [x0], x1 // q1
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v26.16b}, [x0], x1 // q2
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v27.16b}, [x0], x1 // q3
sub x0, x0, x1, lsl #2
lpf_16_wd8
sub x16, x0, x1, lsl #1
sub x16, x16, x1
st1 {v21.16b}, [x16], x1 // p2
st1 {v24.16b}, [x0], x1 // q0
st1 {v22.16b}, [x16], x1 // p1
st1 {v25.16b}, [x0], x1 // q1
st1 {v23.16b}, [x16], x1 // p0
st1 {v26.16b}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_8_16_neon
mov x15, x30
sub x16, x0, #4
add x0, x16, x1, lsl #3
ld1 {v20.d}[0], [x16], x1
ld1 {v20.d}[1], [x0], x1
ld1 {v21.d}[0], [x16], x1
ld1 {v21.d}[1], [x0], x1
ld1 {v22.d}[0], [x16], x1
ld1 {v22.d}[1], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v23.d}[1], [x0], x1
ld1 {v24.d}[0], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v25.d}[0], [x16], x1
ld1 {v25.d}[1], [x0], x1
ld1 {v26.d}[0], [x16], x1
ld1 {v26.d}[1], [x0], x1
ld1 {v27.d}[0], [x16], x1
ld1 {v27.d}[1], [x0], x1
add x0, x0, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
lpf_16_wd8
sub x16, x0, x1, lsl #4
sub x16, x16, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v20.d}[0], [x16], x1
st1 {v20.d}[1], [x0], x1
st1 {v21.d}[0], [x16], x1
st1 {v21.d}[1], [x0], x1
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
st1 {v26.d}[0], [x16], x1
st1 {v26.d}[1], [x0], x1
st1 {v27.d}[0], [x16], x1
st1 {v27.d}[1], [x0], x1
add x0, x0, #4
br x15
8:
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1
st1 {v22.s}[1], [x16], x1
st1 {v22.s}[3], [x0], x1
st1 {v23.s}[1], [x16], x1
st1 {v23.s}[3], [x0], x1
st1 {v24.s}[1], [x16], x1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x16], x1
st1 {v25.s}[3], [x0], x1
add x0, x0, #2
br x15
endfunc
function lpf_v_16_16_neon
mov x15, x30
sub x16, x0, x1, lsl #3
add x16, x16, x1
ld1 {v17.16b}, [x16], x1 // p6
ld1 {v24.16b}, [x0], x1 // q0
ld1 {v18.16b}, [x16], x1 // p5
ld1 {v25.16b}, [x0], x1 // q1
ld1 {v19.16b}, [x16], x1 // p4
ld1 {v26.16b}, [x0], x1 // q2
ld1 {v20.16b}, [x16], x1 // p3
ld1 {v27.16b}, [x0], x1 // q3
ld1 {v21.16b}, [x16], x1 // p2
ld1 {v28.16b}, [x0], x1 // q4
ld1 {v22.16b}, [x16], x1 // p1
ld1 {v29.16b}, [x0], x1 // q5
ld1 {v23.16b}, [x16], x1 // p0
ld1 {v30.16b}, [x0], x1 // q6
sub x0, x0, x1, lsl #3
add x0, x0, x1
lpf_16_wd16
sub x16, x0, x1, lsl #2
sub x16, x16, x1, lsl #1
st1 {v0.16b}, [x16], x1 // p5
st1 {v6.16b}, [x0], x1 // q0
st1 {v1.16b}, [x16], x1 // p4
st1 {v7.16b}, [x0], x1 // q1
st1 {v2.16b}, [x16], x1 // p3
st1 {v8.16b}, [x0], x1 // q2
st1 {v3.16b}, [x16], x1 // p2
st1 {v9.16b}, [x0], x1 // q3
st1 {v4.16b}, [x16], x1 // p1
st1 {v10.16b}, [x0], x1 // q4
st1 {v5.16b}, [x16], x1 // p0
st1 {v11.16b}, [x0], x1 // q5
sub x0, x0, x1, lsl #2
sub x0, x0, x1, lsl #1
br x15
7:
sub x16, x0, x1
sub x16, x16, x1, lsl #1
st1 {v21.16b}, [x16], x1 // p2
st1 {v24.16b}, [x0], x1 // q0
st1 {v22.16b}, [x16], x1 // p1
st1 {v25.16b}, [x0], x1 // q1
st1 {v23.16b}, [x16], x1 // p0
st1 {v26.16b}, [x0], x1 // q2
sub x0, x0, x1, lsl #1
sub x0, x0, x1
br x15
8:
sub x16, x0, x1, lsl #1
st1 {v22.16b}, [x16], x1 // p1
st1 {v24.16b}, [x0], x1 // q0
st1 {v23.16b}, [x16], x1 // p0
st1 {v25.16b}, [x0], x1 // q1
sub x0, x0, x1, lsl #1
br x15
endfunc
function lpf_h_16_16_neon
mov x15, x30
sub x16, x0, #8
ld1 {v16.d}[0], [x16], x1
ld1 {v24.d}[0], [x0], x1
ld1 {v17.d}[0], [x16], x1
ld1 {v25.d}[0], [x0], x1
ld1 {v18.d}[0], [x16], x1
ld1 {v26.d}[0], [x0], x1
ld1 {v19.d}[0], [x16], x1
ld1 {v27.d}[0], [x0], x1
ld1 {v20.d}[0], [x16], x1
ld1 {v28.d}[0], [x0], x1
ld1 {v21.d}[0], [x16], x1
ld1 {v29.d}[0], [x0], x1
ld1 {v22.d}[0], [x16], x1
ld1 {v30.d}[0], [x0], x1
ld1 {v23.d}[0], [x16], x1
ld1 {v31.d}[0], [x0], x1
ld1 {v16.d}[1], [x16], x1
ld1 {v24.d}[1], [x0], x1
ld1 {v17.d}[1], [x16], x1
ld1 {v25.d}[1], [x0], x1
ld1 {v18.d}[1], [x16], x1
ld1 {v26.d}[1], [x0], x1
ld1 {v19.d}[1], [x16], x1
ld1 {v27.d}[1], [x0], x1
ld1 {v20.d}[1], [x16], x1
ld1 {v28.d}[1], [x0], x1
ld1 {v21.d}[1], [x16], x1
ld1 {v29.d}[1], [x0], x1
ld1 {v22.d}[1], [x16], x1
ld1 {v30.d}[1], [x0], x1
ld1 {v23.d}[1], [x16], x1
ld1 {v31.d}[1], [x0], x1
transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
lpf_16_wd16
sub x0, x0, x1, lsl #4
sub x16, x0, #8
transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19
st1 {v16.d}[0], [x16], x1
st1 {v6.d}[0], [x0], x1
st1 {v17.d}[0], [x16], x1
st1 {v7.d}[0], [x0], x1
st1 {v0.d}[0], [x16], x1
st1 {v8.d}[0], [x0], x1
st1 {v1.d}[0], [x16], x1
st1 {v9.d}[0], [x0], x1
st1 {v2.d}[0], [x16], x1
st1 {v10.d}[0], [x0], x1
st1 {v3.d}[0], [x16], x1
st1 {v11.d}[0], [x0], x1
st1 {v4.d}[0], [x16], x1
st1 {v30.d}[0], [x0], x1
st1 {v5.d}[0], [x16], x1
st1 {v31.d}[0], [x0], x1
st1 {v16.d}[1], [x16], x1
st1 {v6.d}[1], [x0], x1
st1 {v17.d}[1], [x16], x1
st1 {v7.d}[1], [x0], x1
st1 {v0.d}[1], [x16], x1
st1 {v8.d}[1], [x0], x1
st1 {v1.d}[1], [x16], x1
st1 {v9.d}[1], [x0], x1
st1 {v2.d}[1], [x16], x1
st1 {v10.d}[1], [x0], x1
st1 {v3.d}[1], [x16], x1
st1 {v11.d}[1], [x0], x1
st1 {v4.d}[1], [x16], x1
st1 {v30.d}[1], [x0], x1
st1 {v5.d}[1], [x16], x1
st1 {v31.d}[1], [x0], x1
br x15
7:
sub x16, x0, x1, lsl #4
sub x16, x16, #4
transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v20.d}[0], [x16], x1
st1 {v20.d}[1], [x0], x1
st1 {v21.d}[0], [x16], x1
st1 {v21.d}[1], [x0], x1
st1 {v22.d}[0], [x16], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x16], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x16], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x16], x1
st1 {v25.d}[1], [x0], x1
st1 {v26.d}[0], [x16], x1
st1 {v26.d}[1], [x0], x1
st1 {v27.d}[0], [x16], x1
st1 {v27.d}[1], [x0], x1
add x0, x0, #4
br x15
8:
sub x16, x0, x1, lsl #4
sub x16, x16, #2
transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
add x0, x16, x1, lsl #3
st1 {v22.s}[0], [x16], x1
st1 {v22.s}[2], [x0], x1
st1 {v23.s}[0], [x16], x1
st1 {v23.s}[2], [x0], x1
st1 {v24.s}[0], [x16], x1
st1 {v24.s}[2], [x0], x1
st1 {v25.s}[0], [x16], x1
st1 {v25.s}[2], [x0], x1
st1 {v22.s}[1], [x16], x1
st1 {v22.s}[3], [x0], x1
st1 {v23.s}[1], [x16], x1
st1 {v23.s}[3], [x0], x1
st1 {v24.s}[1], [x16], x1
st1 {v24.s}[3], [x0], x1
st1 {v25.s}[1], [x16], x1
st1 {v25.s}[3], [x0], x1
add x0, x0, #2
br x15
endfunc
// void dav1d_lpf_v_sb_y_neon(pixel *dst, const ptrdiff_t stride,
// const uint32_t *const vmask,
// const uint8_t (*l)[4], ptrdiff_t b4_stride,
// const Av1FilterLUT *lut, const int w)
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_neon, export=1
mov x11, x30
stp d8, d9, [sp, #-0x40]!
stp d10, d11, [sp, #0x10]
stp d12, d13, [sp, #0x20]
stp d14, d15, [sp, #0x30]
ldp w6, w7, [x2] // vmask[0], vmask[1]
.ifc \type, y
ldr w2, [x2, #8] // vmask[2]
.endif
add x5, x5, #128 // Move to sharp part of lut
.ifc \type, y
orr w7, w7, w2 // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
sub x4, x3, x4, lsl #2
.else
sub x3, x3, #4
lsl x4, x4, #2
.endif
orr w6, w6, w7 // vmask[0] |= vmask[1]
1:
tst w6, #0x0f
.ifc \dir, v
ld1 {v0.16b}, [x4], #16
ld1 {v1.16b}, [x3], #16
.else
ld2 {v0.s,v1.s}[0], [x3], x4
ld2 {v0.s,v1.s}[1], [x3], x4
ld2 {v0.s,v1.s}[2