...
 
Commits (92)
  • Michail Alvanos's avatar
  • James Almer's avatar
    Export frame ITU-T T.35 Metadata · 9a9c0c7e
    James Almer authored
    Based on a patch by Renato Cassaca.
    9a9c0c7e
  • Martin Storsjö's avatar
  • B Krishnan Iyer's avatar
    arm: mc: Speed up due to memory alignment in ldr/str instructions · b0d00020
    B Krishnan Iyer authored
    blend/blend_h/blend_v:
    
    Before:               Cortex A7      A8      A9     A53     A72     A73
    blend_h_w2_8bpc_neon:     169.5   194.2   153.1   134.0    63.0    72.6
    blend_h_w4_8bpc_neon:     164.4   171.8   142.2   137.8    60.5    60.2
    blend_h_w8_8bpc_neon:     184.8   121.0   146.5   123.4    55.9    63.1
    blend_h_w16_8bpc_neon:    291.0   178.6   237.3   181.0    88.6    83.9
    blend_h_w32_8bpc_neon:    531.9   321.5   432.2   358.3   155.6   156.2
    blend_h_w64_8bpc_neon:    957.6   600.3   827.4   631.2   279.7   268.4
    blend_h_w128_8bpc_neon:  2161.5  1398.4  1931.8  1403.4   607.0   597.9
    blend_v_w2_8bpc_neon:     249.3   373.4   269.2   195.6   107.9   117.6
    blend_v_w4_8bpc_neon:     451.7   676.1   555.3   376.1   198.6   266.9
    blend_v_w8_8bpc_neon:     561.0   475.2   607.6   357.0   213.9   204.1
    blend_v_w16_8bpc_neon:    928.4   626.8   823.8   592.3   269.9   245.3
    blend_v_w32_8bpc_neon:   1477.6  1024.8  1186.6   994.5   346.6   370.0
    blend_w4_8bpc_neon:       103.3   113.0    86.2    91.5    38.6    35.2
    blend_w8_8bpc_neon:       174.9   116.6   137.1   123.1    50.8    55.0
    blend_w16_8bpc_neon:      533.0   334.3   446.6   348.6   150.7   155.4
    blend_w32_8bpc_neon:     1299.2   836.8  1170.7   909.9   370.5   386.3
    
    After:
    blend_h_w2_8bpc_neon:     169.6   169.8   140.9   134.0    62.3    72.5
    blend_h_w4_8bpc_neon:     164.5   149.1   127.6   137.7    59.1    60.1
    blend_h_w8_8bpc_neon:     184.9   102.7   126.3   123.4    54.9    63.2
    blend_h_w16_8bpc_neon:    291.0   163.8   232.1   180.9    88.4    83.9
    blend_h_w32_8bpc_neon:    531.2   285.6   422.6   358.4   155.5   155.9
    blend_h_w64_8bpc_neon:    956.0   541.9   809.9   631.6   280.0   270.6
    blend_h_w128_8bpc_neon:  2159.0  1253.6  1889.0  1404.8   606.2   600.5
    blend_v_w2_8bpc_neon:     249.9   362.0   269.4   195.6   107.8   117.6
    blend_v_w4_8bpc_neon:     452.6   541.6   538.2   376.1   199.5   266.9
    blend_v_w8_8bpc_neon:     561.0   348.9   551.3   357.7   214.3   204.4
    blend_v_w16_8bpc_neon:    926.8   510.9   785.0   592.1   270.7   245.8
    blend_v_w32_8bpc_neon:   1474.4   913.3  1151.4   995.7   347.5   371.2
    blend_w4_8bpc_neon:       103.3    96.6    76.9    91.5    33.7    35.3
    blend_w8_8bpc_neon:       174.9    88.2   114.8   123.1    51.5    55.0
    blend_w16_8bpc_neon:      532.8   282.2   445.3   348.5   149.8   155.7
    blend_w32_8bpc_neon:     1295.1   735.2  1122.8   908.4   372.0   386.5
    
    w_mask_444/422/420:
    
    Before:                    Cortex A7        A8        A9       A53       A72      A73
    w_mask_420_w4_8bpc_neon:       218.1     144.4     187.3     152.7      86.9     89.0
    w_mask_420_w8_8bpc_neon:       544.0     393.7     437.0     372.5     211.1    230.9
    w_mask_420_w16_8bpc_neon:     1537.2    1063.5    1182.3    1024.3     566.4    667.7
    w_mask_420_w32_8bpc_neon:     5734.7    4207.2    4716.8    3822.8    2340.5   2521.3
    w_mask_420_w64_8bpc_neon:    14317.6   10165.0   13220.2    9578.5    5578.9   5989.9
    w_mask_420_w128_8bpc_neon:   37932.8   25299.1   39562.9   25203.8   14916.4  15465.1
    w_mask_422_w4_8bpc_neon:       206.8     141.4     177.9     143.4      82.1     84.8
    w_mask_422_w8_8bpc_neon:       511.8     380.8     416.7     342.5     198.5    221.7
    w_mask_422_w16_8bpc_neon:     1632.8    1154.4    1282.9    1061.2     595.3    684.9
    w_mask_422_w32_8bpc_neon:     6087.8    4560.3    5173.3    3945.8    2319.1   2608.7
    w_mask_422_w64_8bpc_neon:    15183.7   11013.9   14435.6    9904.6    5449.9   6100.9
    w_mask_422_w128_8bpc_neon:   39951.2   27441.0   42398.2   25995.1   14624.9  15529.2
    w_mask_444_w4_8bpc_neon:       193.4     127.0     170.0     135.4      76.8     81.4
    w_mask_444_w8_8bpc_neon:       477.8     340.0     427.9     319.3     187.2    214.7
    w_mask_444_w16_8bpc_neon:     1529.0    1058.8    1209.4     987.0     571.7    677.3
    w_mask_444_w32_8bpc_neon:     5687.9    4166.9    4882.4    3667.0    2286.8   2518.7
    w_mask_444_w64_8bpc_neon:    14394.7   10055.1   14057.9    9372.0    5369.3   5898.7
    w_mask_444_w128_8bpc_neon:   37952.0   25008.8   42169.9   24988.8   22973.7  15241.1
    
    After:
    w_mask_420_w4_8bpc_neon:       219.7     120.7     178.0     152.7      87.2     89.0
    w_mask_420_w8_8bpc_neon:       547.5     355.2     404.4     372.4     211.4    231.0
    w_mask_420_w16_8bpc_neon:     1540.9     987.1    1113.0    1024.9     567.4    669.5
    w_mask_420_w32_8bpc_neon:     5915.4    3905.8    4516.8    3929.3    2363.7   2523.6
    w_mask_420_w64_8bpc_neon:    14860.9    9437.1   12609.7    9586.4    5627.3   6005.8
    w_mask_420_w128_8bpc_neon:   38799.1   23536.1   38598.3   24787.7   14595.7  15474.9
    w_mask_422_w4_8bpc_neon:       208.3     115.4     168.6     143.4      82.4     84.8
    w_mask_422_w8_8bpc_neon:       515.2     335.7     383.2     342.5     198.9    221.8
    w_mask_422_w16_8bpc_neon:     1643.2    1053.6    1199.3    1062.2     595.6    685.7
    w_mask_422_w32_8bpc_neon:     6335.1    4161.0    4959.3    4088.5    2353.0   2606.4
    w_mask_422_w64_8bpc_neon:    15689.4   10039.8   13806.1    9937.7    5535.3   6099.8
    w_mask_422_w128_8bpc_neon:   40754.4   25033.3   41390.5   25683.7   14668.8  15537.1
    w_mask_444_w4_8bpc_neon:       194.9     107.4     162.0     135.4      77.1     81.4
    w_mask_444_w8_8bpc_neon:       481.1     300.2     422.0     319.1     187.6    214.6
    w_mask_444_w16_8bpc_neon:     1542.6     956.1    1137.7     988.4     572.4    677.5
    w_mask_444_w32_8bpc_neon:     5896.1    3766.1    4731.9    3801.2    2322.9   2521.8
    w_mask_444_w64_8bpc_neon:    14814.0    9084.7   13515.4    9311.0    5497.3   5896.3
    w_mask_444_w128_8bpc_neon:   38587.7   22615.2   41389.9   24639.4   17705.8  15244.3
    b0d00020
  • Henrik Gramner's avatar
    Set thread names on Windows 10 · 6c3e85de
    Henrik Gramner authored
    6c3e85de
  • Henrik Gramner's avatar
    Set thread names on MacOS · fa32f2de
    Henrik Gramner authored
    fa32f2de
  • Henrik Gramner's avatar
    Avoid CDF overreads in gather_top_partition_prob() · d8799d94
    Henrik Gramner authored
    Explicitly take advantage of the fact that certain probabilities are zero
    instead of loading zeros from the CDF padding.
    
    The current code works just fine, but only because those values happen to
    be zero due to what is essentially an implementation detail.
    d8799d94
  • Luc Trudeau's avatar
    5a4ae342
  • Luc Trudeau's avatar
    Unroll hi_token loop in decode_coeff · ad0c0412
    Luc Trudeau authored
    ad0c0412
  • Luc Trudeau's avatar
    decode_coefs reuse lossless variable · 42ea146f
    Luc Trudeau authored
    42ea146f
  • James Almer's avatar
    meson: move dav1dplay to a new examples section · 3a77c57b
    James Almer authored
    dav1dplay shouldn't be built by default. And it's an example more than a tool.
    3a77c57b
  • James Almer's avatar
    dff0a08c
  • Henrik Gramner's avatar
    Remove unused CDF:s · a819653e
    Henrik Gramner authored
    a819653e
  • Henrik Gramner's avatar
    Add msac optimizations · e29fd5c0
    Henrik Gramner authored
     * Eliminate the trailing zero after the CDF probabilities. We can
       reuse the count value as a terminator instead. This reduces the
       size of the CDF context by around 8%.
    
     * Align the CDF arrays.
    
     * Various other minor optimizations.
    e29fd5c0
  • Henrik Gramner's avatar
    x86: Add an msac function for coefficient hi_tok decoding · 61dcd11b
    Henrik Gramner authored
    This particular sequence is executed often enough to justify having
    a separate slightly more optimized code path instead of just chaining
    multiple generic symbol decoding function calls together.
    61dcd11b
  • Henrik Gramner's avatar
    Cosmetics: CDF tables · 0f4edbff
    Henrik Gramner authored
    0f4edbff
  • Michael Bradshaw's avatar
    d20d70e8
  • B Krishnan Iyer's avatar
    arm64: mc: NEON implementation of blend, blend_h and blend_v function · 1dc2dc7d
    B Krishnan Iyer authored
                       	A73	A53
    blend_h_w2_8bpc_c:	184.7	301.5
    blend_h_w2_8bpc_neon:	58.8	104.1
    blend_h_w4_8bpc_c:	291.4	507.3
    blend_h_w4_8bpc_neon:	48.7	108.9
    blend_h_w8_8bpc_c:	510.1	992.7
    blend_h_w8_8bpc_neon:	66.5	99.3
    blend_h_w16_8bpc_c:	972	1835.3
    blend_h_w16_8bpc_neon:	82.7	145.2
    blend_h_w32_8bpc_c:	776.7	912.9
    blend_h_w32_8bpc_neon:	155.1	266.9
    blend_h_w64_8bpc_c:	1424.3	1635.4
    blend_h_w64_8bpc_neon:	273.4	480.9
    blend_h_w128_8bpc_c:	3318.1	3774
    blend_h_w128_8bpc_neon:	614.1	1097.9
    blend_v_w2_8bpc_c:	278.8	427.5
    blend_v_w2_8bpc_neon:	113.7	170.4
    blend_v_w4_8bpc_c:	960.2	1597.7
    blend_v_w4_8bpc_neon:	222.9	351.4
    blend_v_w8_8bpc_c:	1694.2	3333.5
    blend_v_w8_8bpc_neon:	200.9	333.6
    blend_v_w16_8bpc_c:	3115.2	5971.6
    blend_v_w16_8bpc_neon:	233.2	494.8
    blend_v_w32_8bpc_c:	3949.7	6070.6
    blend_v_w32_8bpc_neon:	460.4	841.6
    blend_w4_8bpc_c:	244.2	388.3
    blend_w4_8bpc_neon:	25.5	66.7
    blend_w8_8bpc_c:	616.3	1120.8
    blend_w8_8bpc_neon:	46	110.7
    blend_w16_8bpc_c:	2193.1	4056.4
    blend_w16_8bpc_neon:	140.7	299.3
    blend_w32_8bpc_c:	2502.8	2998.5
    blend_w32_8bpc_neon:	381.4	725.3
    1dc2dc7d
  • B Krishnan Iyer's avatar
    arm64: mc: NEON implementation of w_mask_444/422/420 function · 3d94fb9a
    B Krishnan Iyer authored
    	                        A73	        A53
    
    w_mask_420_w4_8bpc_c:	        818	        1082.9
    w_mask_420_w4_8bpc_neon:	79	        126.6
    w_mask_420_w8_8bpc_c:	        2486	        3399.8
    w_mask_420_w8_8bpc_neon:	200.2	        343.7
    w_mask_420_w16_8bpc_c:	        8022.3	        10989.6
    w_mask_420_w16_8bpc_neon:	528.1   	889
    w_mask_420_w32_8bpc_c:	        31851.8	        42808.6
    w_mask_420_w32_8bpc_neon:	2062.5	        3380.8
    w_mask_420_w64_8bpc_c:	        79268.5	        102683.9
    w_mask_420_w64_8bpc_neon:	5252.9	        8575.4
    w_mask_420_w128_8bpc_c:	        193704.1	255586.5
    w_mask_420_w128_8bpc_neon:	14602.3	        22167.7
    
    w_mask_422_w4_8bpc_c:	        777.3	        1038.5
    w_mask_422_w4_8bpc_neon:	72.1	        112.9
    w_mask_422_w8_8bpc_c:	        2405.7	        3168
    w_mask_422_w8_8bpc_neon:	191.9	        314.1
    w_mask_422_w16_8bpc_c:	        7783.7	        10543.9
    w_mask_422_w16_8bpc_neon:	559.8	        835.5
    w_mask_422_w32_8bpc_c:	        30895.7	        41141.2
    w_mask_422_w32_8bpc_neon:	2089.7	        3187.2
    w_mask_422_w64_8bpc_c:	        75500.2	        98766.3
    w_mask_422_w64_8bpc_neon:	5379	        8208.2
    w_mask_422_w128_8bpc_c:	        186967.1	245809.1
    w_mask_422_w128_8bpc_neon:	15159.9	        21474.5
    
    w_mask_444_w4_8bpc_c:	        850.1	        1136.6
    w_mask_444_w4_8bpc_neon:	66.5	        104.7
    w_mask_444_w8_8bpc_c:	        2373.5	        3262.9
    w_mask_444_w8_8bpc_neon:	180.5	        290.2
    w_mask_444_w16_8bpc_c:	        7291.6	        10590.7
    w_mask_444_w16_8bpc_neon:	550.9	        809.7
    w_mask_444_w32_8bpc_c:	        8048.3	        10140.8
    w_mask_444_w32_8bpc_neon:	2136.2	        3095
    w_mask_444_w64_8bpc_c:	        18055.3	        23060
    w_mask_444_w64_8bpc_neon:	5522.5	        8124.8
    w_mask_444_w128_8bpc_c:	        42754.3	        56072
    w_mask_444_w128_8bpc_neon:	15569.5	        21531.5
    3d94fb9a
  • Henrik Gramner's avatar
    Utilize the constraints in assertions to improve code generation · 6751c980
    Henrik Gramner authored
    When compiling in release mode, instead of just deleting assertions,
    use them to give hints to the compiler. This allows for slightly
    better code generation in some cases.
    6751c980
  • Henrik Gramner's avatar
    Change scan tables from int16_t to uint16_t · 6757cab9
    Henrik Gramner authored
    Eliminates some sign extensions.
    6757cab9
  • Henrik Gramner's avatar
    Consolidate horizontal scan tables · a62c445d
    Henrik Gramner authored
    a62c445d
  • Henrik Gramner's avatar
    Optimize coef ctx calculations · 70b66ff1
    Henrik Gramner authored
    70b66ff1
  • Henrik Gramner's avatar
    x86: Fix inverse ADST transform overflows · eeca6f25
    Henrik Gramner authored
    eeca6f25
  • Henrik Gramner's avatar
    Use 64-bit integers for warp_affine mvx/mvy calculations · 2c1467b4
    Henrik Gramner authored
    Fixes integer overflows with very large frame sizes.
    
    Credit to OSS-Fuzz.
    2c1467b4
  • Ronald S. Bultje's avatar
    Check absolute tile positions in sb-to-tile_idx table generation · 37a03fc7
    Ronald S. Bultje authored
    Otherwise the table can get out of sync when the frame size and tile
    count stays the same, but the tile coordinates change. Fixes #266.
    37a03fc7
  • Martin Storsjö's avatar
    c3e5ad04
  • Martin Storsjö's avatar
    arm: mc: Push fewer registers in w_mask · f01bbbdd
    Martin Storsjö authored
    Use the so far unused lr register instead of r10.
    f01bbbdd
  • B Krishnan Iyer's avatar
    arm: mc: Making code style consistent · cfd6fe6d
    B Krishnan Iyer authored
    cfd6fe6d
  • Ronald S. Bultje's avatar
    Fix bugs in film grain generation · c09f1072
    Ronald S. Bultje authored
    - calculate chroma grain based on src (not dst) luma pixels;
    - division should precede multiplication in delta calculation.
    
    Together, these fix differences in film grain reconstruction between
    libaom and dav1d for various generated samples.
    c09f1072
  • Ronald S. Bultje's avatar
  • Ronald S. Bultje's avatar
    Apply high-bitdepth adjustment of pixel index after delta calculation · 91b0af2f
    Ronald S. Bultje authored
    Fixes libaom/dav1d mismatch in av1-1-b10-23-film_grain-50.ivf.
    91b0af2f
  • Janne Grunau's avatar
  • Janne Grunau's avatar
    TileContext: reorder scratch buffer to avoid conflicts · 863c3731
    Janne Grunau authored
    The chroma part of pal_idx potentially conflicts during intra
    reconstruction with edge_{8,16}bpc. Fixes out of range pixel values
    caused by invalid palette indices in
    clusterfuzz-testcase-minimized-dav1d_fuzzer_mt-5076736684851200.
    Fixes #294. Reported as integer overflows in boxsum5sqr with undefined
    behavior sanitizer. Credits to oss-fuzz.
    863c3731
  • Janne Grunau's avatar
    arm: Fix assembling with older binutils · e65abadf
    Janne Grunau authored
    This large constant needs a movw instruction, which newer binutils can
    figure out, but older versions need stated explicitly.
    
    This fixes #296.
    e65abadf
  • Henrik Gramner's avatar
    Fix clang-cl assertion warning · 666c71a0
    Henrik Gramner authored
    clang-cl doesn't like function calls in __assume statements, even
    trivial inline ones.
    666c71a0
  • Henrik Gramner's avatar
    Prefer __builtin_unreachable() over __assume() on clang-cl · c0e1988b
    Henrik Gramner authored
    __assume() doesn't work correctly in clang-cl versions prior to 7.0.0
    which causes bogus warnings regarding use of uninitialized variables
    to be printed. Avoid that by using __builtin_unreachable() instead.
    c0e1988b
  • Martin Storsjö's avatar
    arm64: itx: Do the final calculation of adst4/adst8/adst16 in 32 bit to avoid too narrow clipping · e2702eaf
    Martin Storsjö authored
    See issue #295, this fixes it for arm64.
    
    Before:                                 Cortex A53      A72      A73
    inv_txfm_add_4x4_adst_adst_1_8bpc_neon:      103.0     63.2     65.2
    inv_txfm_add_4x8_adst_adst_1_8bpc_neon:      197.0    145.0    134.2
    inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      332.0    248.0    247.1
    inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1676.8   1197.0   1186.8
    After:
    inv_txfm_add_4x4_adst_adst_1_8bpc_neon:      103.0     76.4     67.0
    inv_txfm_add_4x8_adst_adst_1_8bpc_neon:      205.0    155.0    143.8
    inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      358.0    269.0    276.2
    inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1785.2   1347.8   1312.1
    
    This would probably only be needed for adst in the first pass, but
    the additional code complexity from splitting the implementations
    (as we currently don't have transforms differentiated between first
    and second pass) isn't necessarily worth it (the speedup over C code
    is still 8-10x).
    e2702eaf
  • Henrik Gramner's avatar
    x86: Increase precision of the final inverse ADST transform stages · a9315f5f
    Henrik Gramner authored
    16-bit precision is sufficient for the second pass, but the first pass
    requires 32-bit precision to correctly handle some esoteric edge cases.
    a9315f5f
  • Henrik Gramner's avatar
    x86: Fix buffer overread in mc put · 69dae683
    Henrik Gramner authored
    For w <= 32 we can't process more than two rows per loop iteration.
    
    Credit to OSS-Fuzz.
    69dae683
  • Henrik Gramner's avatar
    Silence some clang-cl warnings · acad1a99
    Henrik Gramner authored
    For some reason the MSVC CRT _wassert() function is not flagged as
     __declspec(noreturn), so when using those headers the compiler will
    expect execution to continue after an assertion has been triggered
    and will therefore complain about the use of uninitialized variables
    when compiled in debug mode in certain code paths.
    
    Reorder some case statements as a workaround.
    acad1a99
  • James Almer's avatar
    obu: fix deriving render_width and render_height from reference frames · 79c4aa95
    James Almer authored
    Both values can be independently coded in the bitstream, and are not
    always equal to frame_width and frame_height.
    79c4aa95
  • Ronald S. Bultje's avatar
  • Ronald S. Bultje's avatar
    Add film grain checkasm tests · 04ca7112
    Ronald S. Bultje authored
    04ca7112
  • Ronald S. Bultje's avatar
    Y grain AVX2 implementations · 99307bf3
    Ronald S. Bultje authored
    fgy_32x32xn_8bpc_c: 16181.8
    fgy_32x32xn_8bpc_avx2: 3231.4
    gen_grain_y_ar0_8bpc_c: 108857.6
    gen_grain_y_ar0_8bpc_avx2: 22826.7
    gen_grain_y_ar1_8bpc_c: 168239.8
    gen_grain_y_ar1_8bpc_avx2: 72117.2
    gen_grain_y_ar2_8bpc_c: 266165.9
    gen_grain_y_ar2_8bpc_avx2: 126281.8
    gen_grain_y_ar3_8bpc_c: 448139.4
    gen_grain_y_ar3_8bpc_avx2: 137047.1
    99307bf3
  • Ronald S. Bultje's avatar
    Remove luma width check in fguv_32x32xn · 6d363223
    Ronald S. Bultje authored
    This would affect the output in samples with an odd width and horizontal
    chroma subsampling. The check does not exist in libaom, and might cause
    mismatches.
    
    This causes issues in the sample from #210, which uses super-resolution
    and has odd width. To work around this, make super-resolution's resize()
    always write an even number of pixels. This should not interfere with
    SIMD in the future.
    6d363223
  • Ronald S. Bultje's avatar
    AVX2 for chroma 4:2:0 film grain reconstruction · 556890be
    Ronald S. Bultje authored
    fguv_32x32xn_8bpc_420_csfl0_c: 8945.4
    fguv_32x32xn_8bpc_420_csfl0_avx2: 1001.6
    fguv_32x32xn_8bpc_420_csfl1_c: 6363.4
    fguv_32x32xn_8bpc_420_csfl1_avx2: 1299.5
    556890be
  • Ronald S. Bultje's avatar
    x86: add deblocking loopfilters SSSE3 asm (64-bit) · 1e4e6c7a
    Ronald S. Bultje authored
    ```
    ------------------
    x86_64:
    ```
    
    ---------------------------------------
    lpf_h_sb_uv_w4_8bpc_c: 430.6
    lpf_h_sb_uv_w4_8bpc_ssse3: 322.0
    lpf_h_sb_uv_w4_8bpc_avx2: 200.4
    ---------------------
    lpf_h_sb_uv_w6_8bpc_c: 981.9
    lpf_h_sb_uv_w6_8bpc_ssse3: 421.5
    lpf_h_sb_uv_w6_8bpc_avx2: 270.0
    ---------------------
    lpf_h_sb_y_w4_8bpc_c: 3001.7
    lpf_h_sb_y_w4_8bpc_ssse3: 466.3
    lpf_h_sb_y_w4_8bpc_avx2: 383.1
    ---------------------
    lpf_h_sb_y_w8_8bpc_c: 4457.7
    lpf_h_sb_y_w8_8bpc_ssse3: 818.9
    lpf_h_sb_y_w8_8bpc_avx2: 537.0
    ---------------------
    lpf_h_sb_y_w16_8bpc_c: 1967.9
    lpf_h_sb_y_w16_8bpc_ssse3: 1836.7
    lpf_h_sb_y_w16_8bpc_avx2: 1078.2
    ---------------------
    lpf_v_sb_uv_w4_8bpc_c: 369.4
    lpf_v_sb_uv_w4_8bpc_ssse3: 110.9
    lpf_v_sb_uv_w4_8bpc_avx2: 58.1
    ---------------------
    lpf_v_sb_uv_w6_8bpc_c: 769.6
    lpf_v_sb_uv_w6_8bpc_ssse3: 222.2
    lpf_v_sb_uv_w6_8bpc_avx2: 117.8
    ---------------------
    lpf_v_sb_y_w4_8bpc_c: 772.4
    lpf_v_sb_y_w4_8bpc_ssse3: 179.8
    lpf_v_sb_y_w4_8bpc_avx2: 173.6
    ---------------------
    lpf_v_sb_y_w8_8bpc_c: 1660.2
    lpf_v_sb_y_w8_8bpc_ssse3: 468.3
    lpf_v_sb_y_w8_8bpc_avx2: 345.8
    ---------------------
    lpf_v_sb_y_w16_8bpc_c: 1889.6
    lpf_v_sb_y_w16_8bpc_ssse3: 1142.0
    lpf_v_sb_y_w16_8bpc_avx2: 568.1
    ------------------------------------------
    1e4e6c7a
  • Victorien Le Couviour--Tuffet's avatar
    x86: add 32-bit support to SSSE3 deblock lpf · c0865f35
    Victorien Le Couviour--Tuffet authored
    ```
    ---------------------------------------
    x86_64: lpf_h_sb_uv_w4_8bpc_c: 430.6
    x86_32: lpf_h_sb_uv_w4_8bpc_c: 788.6
    x86_64: lpf_h_sb_uv_w4_8bpc_ssse3: 322.0
    x86_32: lpf_h_sb_uv_w4_8bpc_ssse3: 302.4
    ```
    
    ------------------
    x86_64: lpf_h_sb_uv_w6_8bpc_c: 981.9
    x86_32: lpf_h_sb_uv_w6_8bpc_c: 1579.6
    x86_64: lpf_h_sb_uv_w6_8bpc_ssse3: 421.5
    x86_32: lpf_h_sb_uv_w6_8bpc_ssse3: 431.6
    ---------------------
    x86_64: lpf_h_sb_y_w4_8bpc_c: 3001.7
    x86_32: lpf_h_sb_y_w4_8bpc_c: 7021.3
    x86_64: lpf_h_sb_y_w4_8bpc_ssse3: 466.3
    x86_32: lpf_h_sb_y_w4_8bpc_ssse3: 564.7
    ---------------------
    x86_64: lpf_h_sb_y_w8_8bpc_c: 4457.7
    x86_32: lpf_h_sb_y_w8_8bpc_c: 3657.8
    x86_64: lpf_h_sb_y_w8_8bpc_ssse3: 818.9
    x86_32: lpf_h_sb_y_w8_8bpc_ssse3: 927.9
    ---------------------
    x86_64: lpf_h_sb_y_w16_8bpc_c: 1967.9
    x86_32: lpf_h_sb_y_w16_8bpc_c: 3343.5
    x86_64: lpf_h_sb_y_w16_8bpc_ssse3: 1836.7
    x86_32: lpf_h_sb_y_w16_8bpc_ssse3: 1975.0
    ---------------------
    x86_64: lpf_v_sb_uv_w4_8bpc_c: 369.4
    x86_32: lpf_v_sb_uv_w4_8bpc_c: 793.6
    x86_64: lpf_v_sb_uv_w4_8bpc_ssse3: 110.9
    x86_32: lpf_v_sb_uv_w4_8bpc_ssse3: 133.0
    ---------------------
    x86_64: lpf_v_sb_uv_w6_8bpc_c: 769.6
    x86_32: lpf_v_sb_uv_w6_8bpc_c: 1576.7
    x86_64: lpf_v_sb_uv_w6_8bpc_ssse3: 222.2
    x86_32: lpf_v_sb_uv_w6_8bpc_ssse3: 232.2
    ---------------------
    x86_64: lpf_v_sb_y_w4_8bpc_c: 772.4
    x86_32: lpf_v_sb_y_w4_8bpc_c: 2596.5
    x86_64: lpf_v_sb_y_w4_8bpc_ssse3: 179.8
    x86_32: lpf_v_sb_y_w4_8bpc_ssse3: 234.7
    ---------------------
    x86_64: lpf_v_sb_y_w8_8bpc_c: 1660.2
    x86_32: lpf_v_sb_y_w8_8bpc_c: 3979.9
    x86_64: lpf_v_sb_y_w8_8bpc_ssse3: 468.3
    x86_32: lpf_v_sb_y_w8_8bpc_ssse3: 580.9
    ---------------------
    x86_64: lpf_v_sb_y_w16_8bpc_c: 1889.6
    x86_32: lpf_v_sb_y_w16_8bpc_c: 4728.7
    x86_64: lpf_v_sb_y_w16_8bpc_ssse3: 1142.0
    x86_32: lpf_v_sb_y_w16_8bpc_ssse3: 1174.8
    ------------------------------------------
    c0865f35
  • Niklas Haas's avatar
    dav1dplay: add --highquality to toggle render quality · f6ae8c9c
    Niklas Haas authored
    Only meaningful with libplacebo. The defaults are higher quality than
    SDL so it's an unfair comparison and definitely too much for slow iGPUs
    at 4K res. Make the defaults fast/dumb processing only, and guard the
    debanding/dithering/upscaling/etc. behind a new --highquality flag.
    f6ae8c9c
  • Niklas Haas's avatar
    dav1dplay: add --untimed for benchmarking purposes · 3f35ef1f
    Niklas Haas authored
    Useful to test the effects of performance changes to the
    decoding/rendering loop as a whole.
    3f35ef1f
  • Niklas Haas's avatar
    dav1dplay: initial support for --zerocopy · 490a1420
    Niklas Haas authored
    Right now this just allocates a new buffer for every frame, uses it,
    then discards it immediately. This is not optimal, either dav1d should
    start reusing buffers internally or we need to pool them in dav1dplay.
    
    As it stands, this is not really a performance gain. I'll have to
    investigate why, but my suspicion is that seeing any gains might require
    reusing buffers somewhere.
    
    Note: Thrashing buffers is not as bad as it seems, initially. Not only
    does libplacebo pool and reuse GPU memory and buffer state objects
    internally, but this also absolves us from having to do any manual
    polling to figure out when the buffer is reusable again. Creating, using
    and immediately destroying buffers actually isn't as bad an approach as
    it might otherwise seem.
    
    It's entirely possible that this is only bad because of lock contention.
    As said, I'll have to investigate further...
    490a1420
  • Martin Storsjö's avatar
    arm64: itx: Use smull+smlal instead of addl+mul · a4950bce
    Martin Storsjö authored
    Even though smull+smlal does two multiplications instead of one,
    the combination seems to be better handled by actual cores.
    
    Before:                                 Cortex A53      A72      A73
    inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      356.0    279.2    278.0
    inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1785.0   1329.5   1308.8
    After:
    inv_txfm_add_8x8_adst_adst_1_8bpc_neon:      360.0    253.2    269.3
    inv_txfm_add_16x16_adst_adst_2_8bpc_neon:   1793.1   1300.9   1254.0
    
    (In this particular cases, it seems like it is a minor regression
    on A53, which is probably more due to having to change the ordering
    of some instructions, due to how smull+smlal+smull2+smlal2 overwrites
    the second output register sooner than an addl+addl2 would have, but
    in general, smull+smlal seems to be equally good or better than
    addl+mul on A53 as well.)
    a4950bce
  • Martin Storsjö's avatar
    arm64: itx: Consistently use the factor 2896 in adst · 0ed3ad19
    Martin Storsjö authored
    The scaled form 2896>>4 shouldn't be necessary with valid bitstreams.
    0ed3ad19
  • Martin Storsjö's avatar
    arm64: itx: Fix overflows in idct · 713aa34c
    Martin Storsjö authored
    Don't add two 16 bit coefficients in 16 bit, if the result isn't supposed
    to be clipped.
    
    This fixes mismatches for some samples, see issue #299.
    
    Before:                                Cortex A53       A72       A73
    inv_txfm_add_4x4_dct_dct_1_8bpc_neon:        93.0      52.8      49.5
    inv_txfm_add_8x8_dct_dct_1_8bpc_neon:       260.0     186.0     196.4
    inv_txfm_add_16x16_dct_dct_2_8bpc_neon:    1371.0     953.4    1028.6
    inv_txfm_add_32x32_dct_dct_4_8bpc_neon:    7363.2    4887.5    5135.8
    inv_txfm_add_64x64_dct_dct_4_8bpc_neon:   25029.0   17492.3   18404.5
    After:
    inv_txfm_add_4x4_dct_dct_1_8bpc_neon:       105.0      58.7      55.2
    inv_txfm_add_8x8_dct_dct_1_8bpc_neon:       294.0     211.5     209.9
    inv_txfm_add_16x16_dct_dct_2_8bpc_neon:    1495.8    1050.4    1070.6
    inv_txfm_add_32x32_dct_dct_4_8bpc_neon:    7866.7    5197.8    5321.4
    inv_txfm_add_64x64_dct_dct_4_8bpc_neon:   25807.2   18619.3   18526.9
    713aa34c
  • Victorien Le Couviour--Tuffet's avatar
    x86: add warp_affine SSE4 and SSSE3 asm · a91a03b0
    Victorien Le Couviour--Tuffet authored
    ```
    ---------------------------------------
    x86_64: warp_8x8_8bpc_c: 1773.4
    x86_32: warp_8x8_8bpc_c: 1740.4
    ```
    
    -------
    x86_64: warp_8x8_8bpc_ssse3: 317.5
    x86_32: warp_8x8_8bpc_ssse3: 378.4
    ----------
    x86_64: warp_8x8_8bpc_sse4: 303.7
    x86_32: warp_8x8_8bpc_sse4: 367.7
    ----------
    x86_64: warp_8x8_8bpc_avx2: 224.9
    ---------------------
    ---------------------
    x86_64: warp_8x8t_8bpc_c: 1664.6
    x86_32: warp_8x8t_8bpc_c: 1674.0
    ----------
    x86_64: warp_8x8t_8bpc_ssse3: 320.7
    x86_32: warp_8x8t_8bpc_ssse3: 379.5
    ----------
    x86_64: warp_8x8t_8bpc_sse4: 304.8
    x86_32: warp_8x8t_8bpc_sse4: 369.8
    ----------
    x86_64: warp_8x8t_8bpc_avx2: 228.5
    ------------------------------------------
    a91a03b0
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of dc/h/v prediction modes · f7743da1
    Martin Storsjö authored
    Relative speedups over the C code:
                                  Cortex A53    A72    A73
    intra_pred_dc_128_w4_8bpc_neon:     2.08   1.47   2.17
    intra_pred_dc_128_w8_8bpc_neon:     3.33   2.49   4.03
    intra_pred_dc_128_w16_8bpc_neon:    3.93   3.86   3.75
    intra_pred_dc_128_w32_8bpc_neon:    3.14   3.79   2.90
    intra_pred_dc_128_w64_8bpc_neon:    3.68   1.97   2.42
    intra_pred_dc_left_w4_8bpc_neon:    2.41   1.70   2.23
    intra_pred_dc_left_w8_8bpc_neon:    3.53   2.41   3.32
    intra_pred_dc_left_w16_8bpc_neon:   3.87   3.54   3.34
    intra_pred_dc_left_w32_8bpc_neon:   4.10   3.60   2.76
    intra_pred_dc_left_w64_8bpc_neon:   3.72   2.00   2.39
    intra_pred_dc_top_w4_8bpc_neon:     2.27   1.66   2.07
    intra_pred_dc_top_w8_8bpc_neon:     3.83   2.69   3.43
    intra_pred_dc_top_w16_8bpc_neon:    3.66   3.60   3.20
    intra_pred_dc_top_w32_8bpc_neon:    3.92   3.54   2.66
    intra_pred_dc_top_w64_8bpc_neon:    3.60   1.98   2.30
    intra_pred_dc_w4_8bpc_neon:         2.29   1.42   2.16
    intra_pred_dc_w8_8bpc_neon:         3.56   2.83   3.05
    intra_pred_dc_w16_8bpc_neon:        3.46   3.37   3.15
    intra_pred_dc_w32_8bpc_neon:        3.79   3.41   2.74
    intra_pred_dc_w64_8bpc_neon:        3.52   2.01   2.41
    intra_pred_h_w4_8bpc_neon:         10.34   5.74   5.94
    intra_pred_h_w8_8bpc_neon:         12.13   6.33   6.43
    intra_pred_h_w16_8bpc_neon:        10.66   7.31   5.85
    intra_pred_h_w32_8bpc_neon:         6.28   4.18   2.88
    intra_pred_h_w64_8bpc_neon:         3.96   1.85   1.75
    intra_pred_v_w4_8bpc_neon:         11.44   6.12   7.57
    intra_pred_v_w8_8bpc_neon:         14.76   7.58   7.95
    intra_pred_v_w16_8bpc_neon:        11.34   6.28   5.88
    intra_pred_v_w32_8bpc_neon:         6.56   3.33   3.34
    intra_pred_v_w64_8bpc_neon:         4.57   1.24   1.97
    f7743da1
  • Ronald S. Bultje's avatar
    Minor cleanup · f6a8cc0c
    Ronald S. Bultje authored
    f6a8cc0c
  • Henrik Gramner's avatar
    Simplify README build instructions · 16e0741a
    Henrik Gramner authored
    16e0741a
  • Henrik Gramner's avatar
    checkasm: Add a function listing feature · f404c722
    Henrik Gramner authored
    --list-functions now prints a list of all function names. Uses stdout
    for easy grepping/piping. Can be combined with the --test option to
    only list functions within a specific test.
    
    Also rename --list to --list-tests and make it print to stdout as well
    for consistency.
    f404c722
  • Henrik Gramner's avatar
    x86: Increase precision of AVX2 IDCT intermediates · de561b3b
    Henrik Gramner authored
    The existing code was using 16-bit intermediate precision for certain
    calculations which is insufficient for some esoteric edge cases.
    de561b3b
  • Henrik Gramner's avatar
    d4dfa85c
  • Martin Storsjö's avatar
  • Luc Trudeau's avatar
    Check for RESTORATION_NONE once per frame · e570088d
    Luc Trudeau authored
    Prior checks were done at the sbrow level. This now allows to call
    dav1d_lr_sbrow and dav1d_lr_copy_lpf only when there's something
    for them to do.
    e570088d
  • Martin Storsjö's avatar
    arm64: mc: Schedule instructions better in the warp8x8 functions · ff41197b
    Martin Storsjö authored
    Before:           Cortex A53     A72     A73
    warp_8x8_8bpc_neon:   1997.3  1170.1  1199.9
    warp_8x8t_8bpc_neon:  1982.4  1171.5  1192.6
    After:
    warp_8x8_8bpc_neon:   1954.6  1159.2  1153.3
    warp_8x8t_8bpc_neon:  1938.5  1146.2  1136.7
    ff41197b
  • Ronald S. Bultje's avatar
    Add AVX2 version of generate_grain_uv (4:2:0) · 4e22ef3a
    Ronald S. Bultje authored
    gen_grain_uv_ar0_8bpc_420_c: 30131.8
    gen_grain_uv_ar0_8bpc_420_avx2: 6600.4
    gen_grain_uv_ar1_8bpc_420_c: 46110.5
    gen_grain_uv_ar1_8bpc_420_avx2: 17887.2
    gen_grain_uv_ar2_8bpc_420_c: 73593.2
    gen_grain_uv_ar2_8bpc_420_avx2: 26918.6
    gen_grain_uv_ar3_8bpc_420_c: 114499.3
    gen_grain_uv_ar3_8bpc_420_avx2: 29804.6
    4e22ef3a
  • Luc Trudeau's avatar
    d2c94ee1
  • Martin Storsjö's avatar
    arm64: cdef: Rewrite an expression slightly · bc26e300
    Martin Storsjö authored
    Instead of apply_sign(imin(abs(diff), clip), diff), do
    imax(imin(diff, clip), -clip).
    
    Before:                  Cortex A53     A72     A73
    cdef_filter_4x4_8bpc_neon:    592.7   374.5   384.5
    cdef_filter_4x8_8bpc_neon:   1093.0   704.4   706.6
    cdef_filter_8x8_8bpc_neon:   1962.6  1239.4  1252.1
    After:
    cdef_filter_4x4_8bpc_neon:    593.7   355.5   373.2
    cdef_filter_4x8_8bpc_neon:   1091.6   663.2   685.3
    cdef_filter_8x8_8bpc_neon:   1964.2  1182.5  1210.8
    bc26e300
  • Martin Storsjö's avatar
  • Martin Storsjö's avatar
    arm64: cdef: Calculate two initial parameters in the same vector · fa6a0924
    Martin Storsjö authored
    As there's only two individual parameters, we can insert them into
    the same vector, reducing the number of actual calculation instructions,
    but adding a few more instructions to dup the results to the final
    vectors instead.
    fa6a0924
  • Martin Storsjö's avatar
    arm64: cdef: Improve find_dir · dfaa2a10
    Martin Storsjö authored
    Only add .4h elements to the upper half of sum_alt, as only 11
    elements are needed, and .8h + .4h gives 12 in total.
    
    Fuse two consecutive ext #8 + ext #2 into ext #10.
    
    Move a few stores further away from where they are calculated.
    
    Before:         Cortex A53     A72     A73
    cdef_dir_8bpc_neon:  404.0   278.2   302.4
    After:
    cdef_dir_8bpc_neon:  400.0   269.3   282.5
    dfaa2a10
  • Luc Trudeau's avatar
    7bbc5e3d
  • Martin Storsjö's avatar
  • Martin Storsjö's avatar
    arm: cdef: Port the ARM64 CDEF NEON assembly to 32 bit arm · 3489a9c1
    Martin Storsjö authored
    The relative speedup ranges from 2.5 to 3.8x for find_dir and
    around 5 to 10x for filter.
    
    The find_dir function is a bit restricted by barely having enough
    registers, leaving very few ones for temporaries, so less things can
    be done in parallel and many instructions end up depending on the
    result of the preceding instruction.
    
    The ported functions end up slightly slower than the corresponding
    ARM64 ones, but only marginally:
    
    ARM64:                   Cortex A53     A72     A73
    cdef_dir_8bpc_neon:           400.0   268.8   282.2
    cdef_filter_4x4_8bpc_neon:    596.3   359.9   379.7
    cdef_filter_4x8_8bpc_neon:   1091.0   670.4   698.5
    cdef_filter_8x8_8bpc_neon:   1998.7  1207.2  1218.4
    ARM32:
    cdef_dir_8bpc_neon:           528.5   329.1   337.4
    cdef_filter_4x4_8bpc_neon:    632.5   482.5   432.2
    cdef_filter_4x8_8bpc_neon:   1107.2   854.8   782.3
    cdef_filter_8x8_8bpc_neon:   1984.8  1381.0  1414.4
    
    Relative speedup over C code:
                            Cortex A7     A8     A9    A53    A72    A73
    cdef_dir_8bpc_neon:          2.92   2.54   2.67   3.87   3.37   3.83
    cdef_filter_4x4_8bpc_neon:   5.09   7.61   6.10   6.85   4.94   7.41
    cdef_filter_4x8_8bpc_neon:   5.53   8.23   6.77   7.67   5.60   8.01
    cdef_filter_8x8_8bpc_neon:   6.26  10.14   8.49   8.54   6.94   4.27
    3489a9c1
  • Martin Storsjö's avatar
    arm64: mc: Use addp instead of addv+trn1 in warp · 5647a57e
    Martin Storsjö authored
    Before:           Cortex A53     A72     A73
    warp_8x8_8bpc_neon:   1952.8  1161.3  1151.1
    warp_8x8t_8bpc_neon:  1937.1  1147.5  1139.0
    After:
    warp_8x8_8bpc_neon:   1860.8  1068.6  1105.8
    warp_8x8t_8bpc_neon:  1846.9  1056.4  1099.8
    5647a57e
  • Martin Storsjö's avatar
    arm: mc: Port the ARM64 warp filter to arm32 · 61442bee
    Martin Storsjö authored
    Relative speedup over C code:
                      Cortex A7     A8     A9    A53    A72    A73
    warp_8x8_8bpc_neon:    2.79   5.45   4.18   3.96   4.16   4.51
    warp_8x8t_8bpc_neon:   2.79   5.33   4.18   3.98   4.22   4.25
    
    Comparison to original ARM64 assembly:
    
    ARM64:            Cortex A53     A72     A73
    warp_8x8_8bpc_neon:   1854.6  1072.5  1102.5
    warp_8x8t_8bpc_neon:  1839.6  1069.4  1089.5
    ARM32:
    warp_8x8_8bpc_neon:   2132.5  1160.3  1218.0
    warp_8x8t_8bpc_neon:  2113.7  1148.0  1209.1
    61442bee
  • Jean-Baptiste Kempf's avatar
    Move snap to package/ subfolder · 3e0f1508
    Jean-Baptiste Kempf authored
    3e0f1508
  • Michail Alvanos's avatar
    Add VSX wiener filter implementation · be60b142
    Michail Alvanos authored
    be60b142
  • Jean-Baptiste Kempf's avatar
    Update NEWS for 0.5.0 · c688d5b2
    Jean-Baptiste Kempf authored
    c688d5b2
  • James Almer's avatar
    x86: fix generate_grain_uv checkasm crashes on Windows x64 · a7c024ce
    James Almer authored
    The uv argument is normally in a gpr, but in checkasm it's forcefully
    loaded from stack.
    a7c024ce
  • Henrik Gramner's avatar
    checkasm: Improve ipred_z tests · dfadb6df
    Henrik Gramner authored
    dfadb6df
  • Henrik Gramner's avatar
    Simplify ipred_z C code · afe901a6
    Henrik Gramner authored
    afe901a6
  • Henrik Gramner's avatar
    x86: Add ipred_z2 AVX2 asm · ea9fc9d9
    Henrik Gramner authored
    ea9fc9d9
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of paeth prediction · 8ab69afb
    Martin Storsjö authored
    Relative speedups over the C code:
                                Cortex A53    A72    A73
    intra_pred_paeth_w4_8bpc_neon:    8.36   6.55   7.27
    intra_pred_paeth_w8_8bpc_neon:   15.24  11.36  11.34
    intra_pred_paeth_w16_8bpc_neon:  16.63  13.20  14.17
    intra_pred_paeth_w32_8bpc_neon:  10.83   9.21   9.87
    intra_pred_paeth_w64_8bpc_neon:   8.37   7.07   7.45
    8ab69afb
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of smooth prediction · 4318600e
    Martin Storsjö authored
    Relative speedups over the C code:
                                   Cortex A53    A72    A73
    intra_pred_smooth_h_w4_8bpc_neon:    8.02   4.53   7.09
    intra_pred_smooth_h_w8_8bpc_neon:   16.59   5.91   9.32
    intra_pred_smooth_h_w16_8bpc_neon:  18.80   5.54  10.10
    intra_pred_smooth_h_w32_8bpc_neon:   5.07   4.43   4.60
    intra_pred_smooth_h_w64_8bpc_neon:   5.03   4.26   4.34
    intra_pred_smooth_v_w4_8bpc_neon:    9.11   5.51   7.75
    intra_pred_smooth_v_w8_8bpc_neon:   17.07   6.86  10.55
    intra_pred_smooth_v_w16_8bpc_neon:  17.98   6.38  11.52
    intra_pred_smooth_v_w32_8bpc_neon:  11.69   5.66   8.09
    intra_pred_smooth_v_w64_8bpc_neon:   8.44   4.34   5.72
    intra_pred_smooth_w4_8bpc_neon:      9.81   4.85   6.93
    intra_pred_smooth_w8_8bpc_neon:     16.05   5.60   9.26
    intra_pred_smooth_w16_8bpc_neon:    14.01   5.02   8.96
    intra_pred_smooth_w32_8bpc_neon:     9.29   5.02   7.25
    intra_pred_smooth_w64_8bpc_neon:     6.53   3.94   5.26
    4318600e
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of palette prediction · 4f14573c
    Martin Storsjö authored
    Relative speedups over the C code:
                        Cortex A53    A72    A73
    pal_pred_w4_8bpc_neon:    8.75   6.15   7.60
    pal_pred_w8_8bpc_neon:   19.93  11.79  10.98
    pal_pred_w16_8bpc_neon:  24.68  13.28  16.06
    pal_pred_w32_8bpc_neon:  23.56  11.81  16.74
    pal_pred_w64_8bpc_neon:  23.16  12.19  17.60
    4f14573c
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of the filter function · d322d451
    Martin Storsjö authored
    Use a different layout of the filter_intra_taps depending on
    architecture; the current one is optimized for the x86 SIMD
    implementation.
    
    Relative speedups over the C code:
                                 Cortex A53    A72    A73
    intra_pred_filter_w4_8bpc_neon:    6.38   2.81   4.43
    intra_pred_filter_w8_8bpc_neon:    9.30   3.62   5.71
    intra_pred_filter_w16_8bpc_neon:   9.85   3.98   6.42
    intra_pred_filter_w32_8bpc_neon:  10.77   4.08   7.09
    d322d451
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of the cfl_pred functions · c7693386
    Martin Storsjö authored
    Relative speedup over the C code:
                                 Cortex A53    A72    A73
    cfl_pred_cfl_128_w4_8bpc_neon:    10.81   7.90   9.80
    cfl_pred_cfl_128_w8_8bpc_neon:    18.38  11.15  13.24
    cfl_pred_cfl_128_w16_8bpc_neon:   16.52  10.83  16.00
    cfl_pred_cfl_128_w32_8bpc_neon:    3.27   3.60   3.70
    cfl_pred_cfl_left_w4_8bpc_neon:    9.82   7.38   8.76
    cfl_pred_cfl_left_w8_8bpc_neon:   17.22  10.63  11.97
    cfl_pred_cfl_left_w16_8bpc_neon:  16.03  10.49  15.66
    cfl_pred_cfl_left_w32_8bpc_neon:   3.28   3.61   3.72
    cfl_pred_cfl_top_w4_8bpc_neon:     9.74   7.39   9.29
    cfl_pred_cfl_top_w8_8bpc_neon:    17.48  10.89  12.58
    cfl_pred_cfl_top_w16_8bpc_neon:   16.01  10.62  15.31
    cfl_pred_cfl_top_w32_8bpc_neon:    3.25   3.62   3.75
    cfl_pred_cfl_w4_8bpc_neon:         8.39   6.34   8.04
    cfl_pred_cfl_w8_8bpc_neon:        15.99  10.12  12.42
    cfl_pred_cfl_w16_8bpc_neon:       15.25  10.40  15.12
    cfl_pred_cfl_w32_8bpc_neon:        3.23   3.58   3.71
    
    The C code gets autovectorized for w >= 32, which is why the
    relative speedup looks strange (but the performance of the NEON
    functions is completely as expected).
    c7693386
  • Martin Storsjö's avatar
    arm64: ipred: NEON implementation of the cfl_ac functions · 57dd0aae
    Martin Storsjö authored
    Relative speedup over the C code:
                          Cortex A53    A72    A73
    cfl_ac_420_w4_8bpc_neon:    7.73   6.48   9.22
    cfl_ac_420_w8_8bpc_neon:    6.70   5.56   6.95
    cfl_ac_420_w16_8bpc_neon:   6.51   6.93   6.67
    cfl_ac_422_w4_8bpc_neon:    9.25   7.70   9.75
    cfl_ac_422_w8_8bpc_neon:    8.53   5.95   7.13
    cfl_ac_422_w16_8bpc_neon:   7.08   6.87   6.06
    57dd0aae
  • Luc Trudeau's avatar
    b7d7c8ce
  • Martin Storsjö's avatar
    5d014b41
  • Jean-Baptiste Kempf's avatar
......@@ -241,7 +241,7 @@ build-ubuntu-snap:
- debian
- amd64
script:
- snapcraft snap
- cd package/snap && snapcraft snap
- |
if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then
echo $SNAP_LOGIN | base64 --decode | snapcraft login --with -
......@@ -251,7 +251,7 @@ build-ubuntu-snap:
artifacts:
name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
paths:
- dav1d_*.snap
- package/snap/dav1d_*.snap
expire_in: 1 week
allow_failure: true
......@@ -269,6 +269,7 @@ build-debian-ppc64le:
test-debian:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
......@@ -289,6 +290,7 @@ test-debian:
test-debian-unaligned-stack:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
......@@ -309,6 +311,7 @@ test-debian-unaligned-stack:
test-debian-asan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
......@@ -331,6 +334,7 @@ test-debian-asan:
test-debian-msan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
......@@ -353,6 +357,7 @@ test-debian-msan:
test-debian-ubsan:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-debian"]
tags:
- debian
- amd64
......@@ -375,6 +380,7 @@ test-debian-ubsan:
test-win64:
image: registry.videolan.org:5000/dav1d-debian-unstable:20190215130514
stage: test
needs: ["build-win64"]
tags:
- debian
- amd64
......@@ -399,6 +405,7 @@ test-win64:
test-debian-aarch64:
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
stage: test
needs: ["build-debian-aarch64"]
tags:
- aarch64
- debian
......@@ -421,6 +428,7 @@ test-debian-aarch64:
test-debian-ppc64le:
image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
stage: test
needs: ["build-debian-ppc64le"]
tags:
- ppc64le
- docker
......@@ -443,6 +451,7 @@ test-debian-ppc64le:
test-debian-armv7-clang-5:
stage: test
image: registry.videolan.org:5000/dav1d-debian-unstable-armv7:20190202101732
needs: ["build-debian-armv7-clang-5"]
tags:
- armv7
- debian
......
Changes for 0.5.0 'Asiatic Cheetah':
----------------------------
0.5.0 is a medium release fixing regressions and minor issues,
and improving speed significantly:
- Export ITU T.35 metadata
- Speed improvements on blend_ on ARM
- Speed improvements on decode_coef and MSAC
- NEON optimizations for blend*, w_mask_, ipred functions for ARM64
- NEON optimizations for CDEF and warp on ARM32
- SSE2 optimizations for MSAC hi_tok decoding
- SSSE3 optimizations for deblocking loopfilters and warp_affine
- AVX-2 optimizations for film grain and ipred_z2
- SSE4 optimizations for warp_affine
- VSX optimizations for wiener
- Fix inverse transform overflows in x86 and NEON asm
- Fix integer overflows with large frames
- Improve film grain generation to match reference code
- Improve compatibility with older binutils for ARM
- More advanced Player example in tools
Changes for 0.4.0 'Cheetah':
----------------------------
......@@ -11,6 +33,7 @@ Changes for 0.4.0 'Cheetah':
- NEON optimizations for blend functions on ARM
- NEON optimizations for w_mask functions on ARM
- NEON optimizations for inverse transforms on ARM64
- VSX optimizations for CDEF filter
- Improve handling of malloc failures
- Simple Player example in tools
......@@ -38,7 +61,7 @@ Changes for 0.2.2 (0.3.0-rc) 'Antelope':
- Large improvement on MSAC decoding with SSE, bringing 4-6% speed increase
The impact is important on SSSE3, SSE4 and AVX-2 cpus
- SSSE3 optimizations for all blocks size in itx
- SSSE3 optimizations for ipred_paeth and ipref_cfl (420, 422 and 444)
- SSSE3 optimizations for ipred_paeth and ipred_cfl (420, 422 and 444)
- Speed improvements on CDEF for SSE4 CPUs
- NEON optimizations for SGR and loop filter
- Minor crashes, improvements and build changes
......
......@@ -73,28 +73,15 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86\* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
2. Run `meson build --buildtype release`
3. Build with `ninja -C build`
2. Run `mkdir build && cd build` to create a build directory and enter it
3. Run `meson ..` to configure meson, add `--default-library=static` if static linking is desired
4. Run `ninja` to compile
# Run tests
1. During initial build dir setup or `meson configure` specify `-Denable_tests=true`
2. In the build directory run `meson test` optionally with `-v` for more verbose output, especially useful
for checkasm
# Run testdata based tests
1. Checkout the test data repository
```
git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data
```
2. During initial build dir setup or `meson configure` specify `-Denable_tests=true` and `-Dtestdata_tests=true`
```
meson .test -Denable_tests=true -Dtestdata_tests=true
```
3. In the build directory run `meson test` optionally with `-v` for more verbose output
1. In the root directory, run `git clone https://code.videolan.org/videolan/dav1d-test-data.git tests/dav1d-test-data` to fetch the test data repository
2. During meson configuration, specify `-Dtestdata_tests=true`
3. Run `meson test -v` after compiling
# Support
......
......@@ -27,19 +27,19 @@
#include "config.h"
#include "vcs_version.h"
#include <stdio.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <getopt.h>
#include <SDL.h>
#include "common/attributes.h"
#include "dav1d/dav1d.h"
#include "input/input.h"
#include "dav1d_cli_parse.h"
#include "tools/input/input.h"
/**
* Settings structure
......@@ -49,6 +49,9 @@
*/
typedef struct {
const char *inputfile;
int highquality;
int untimed;
int zerocopy;
} Dav1dPlaySettings;
#define WINDOW_WIDTH 910
......@@ -157,9 +160,13 @@ typedef struct rdr_info
// Callback to destroy the renderer
void (*destroy_renderer)(void *cookie);
// Callback to the render function that renders a previously sent frame
void (*render)(void *cookie);
void (*render)(void *cookie, const Dav1dPlaySettings *settings);
// Callback to the send frame function
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic);
int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings);
// Callback for alloc/release pictures (optional)
int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
void (*release_pic)(Dav1dPicture *pic, void *cookie);
} Dav1dPlayRenderInfo;
#ifdef HAVE_PLACEBO_VULKAN
......@@ -326,7 +333,7 @@ static void placebo_renderer_destroy(void *cookie)
pl_context_destroy(&(rd_priv_ctx->ctx));
}
static void placebo_render(void *cookie)
static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
......@@ -359,8 +366,9 @@ static void placebo_render(void *cookie)
.height = img->params.h,
};
struct pl_render_params render_params = pl_render_default_params;
//render_params.upscaler = &pl_filter_ewa_lanczos;
struct pl_render_params render_params = {0};
if (settings->highquality)
render_params = pl_render_default_params;
struct pl_render_target target;
pl_render_target_from_swapchain(&target, &frame);
......@@ -386,7 +394,8 @@ static void placebo_render(void *cookie)
SDL_UnlockMutex(rd_priv_ctx->lock);
}
static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
......@@ -414,7 +423,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
.height = height,
.pixel_stride = 1,
.row_stride = dav1d_pic->stride[0],
.pixels = dav1d_pic->data[0],
.component_size = {8},
.component_map = {0},
};
......@@ -425,7 +433,6 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
.height = height/2,
.pixel_stride = 1,
.row_stride = dav1d_pic->stride[1],
.pixels = dav1d_pic->data[1],
.component_size = {8},
.component_map = {1},
};
......@@ -436,11 +443,23 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
.height = height/2,
.pixel_stride = 1,
.row_stride = dav1d_pic->stride[1],
.pixels = dav1d_pic->data[2],
.component_size = {8},
.component_map = {2},
};
if (settings->zerocopy) {
const struct pl_buf *buf = dav1d_pic->allocator_data;
assert(buf);
data_y.buf = data_u.buf = data_v.buf = buf;
data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
} else {
data_y.pixels = dav1d_pic->data[0];
data_u.pixels = dav1d_pic->data[1];
data_v.pixels = dav1d_pic->data[2];
}
bool ok = true;
ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
......@@ -457,11 +476,106 @@ static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic)
return !ok;
}
// Round x up to the next multiple of align (align must be a power of 2).
#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
// Custom Dav1dPicAllocator callback: places all three picture planes inside
// a single host-mapped GPU transfer buffer, so that the zero-copy upload path
// in placebo_upload_planes can reference the data directly via buf_offset
// instead of copying pixels. Returns 0 on success, DAV1D_ERR(ENOMEM) on
// allocation failure or when the request exceeds GPU transfer limits.
static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
// The GPU handle is shared with the render thread, so guard its use.
SDL_LockMutex(rd_priv_ctx->lock);
const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
int ret = DAV1D_ERR(ENOMEM);
// Copied from dav1d_default_picture_alloc
const int hbd = p->p.bpc > 8;
const int aligned_w = ALIGN2(p->p.w, 128);
const int aligned_h = ALIGN2(p->p.h, 128);
const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
p->stride[0] = aligned_w << hbd;
p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
// Align strides up to multiples of the GPU performance hints
p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
// Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
// The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
// even in the case that the driver gives us insane alignments
const size_t pic_size = y_sz + 2 * uv_sz;
const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
// Validate size limitations
if (total_size > gpu->limits.max_xfer_size) {
printf("alloc of %zu bytes exceeds limits\n", total_size);
goto err;
}
// Host-mapped so the CPU decoder can write into it directly; the same
// buffer is later handed to pl_upload_plane by the zero-copy path.
const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
.type = PL_BUF_TEX_TRANSFER,
.host_mapped = true,
.size = total_size,
.memory_type = PL_BUF_MEM_HOST,
.user_data = p,
});
if (!buf) {
printf("alloc of GPU mapped buffer failed\n");
goto err;
}
assert(buf->data);
// Carve the Y, U and V planes out of the single mapped buffer, each
// bumped up to DAV1D_PICTURE_ALIGNMENT.
uintptr_t base = (uintptr_t) buf->data, data[3];
data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
// Sanity check offset alignment for the sake of debugging
if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
data[1] - base != ALIGN2(data[1] - base, off_align) ||
data[2] - base != ALIGN2(data[2] - base, off_align))
{
printf("GPU buffer horribly misaligned, expect slowdown!\n");
}
// Stash the buffer handle so placebo_release_pic can destroy it later.
p->allocator_data = (void *) buf;
p->data[0] = (void *) data[0];
p->data[1] = (void *) data[1];
p->data[2] = (void *) data[2];
ret = 0;
// fall through
err:
SDL_UnlockMutex(rd_priv_ctx->lock);
return ret;
}
// Picture release callback: destroys the host-mapped GPU buffer that the
// allocator attached to the picture via allocator_data.
static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
{
Dav1dPlayRendererPrivateContext *ctx = cookie;
assert(ctx != NULL);
assert(pic->allocator_data);

// Serialize GPU access with the render thread.
SDL_LockMutex(ctx->lock);
pl_buf_destroy(ctx->vk->gpu, (const struct pl_buf **) &pic->allocator_data);
SDL_UnlockMutex(ctx->lock);
}
static const Dav1dPlayRenderInfo renderer_info = {
.create_renderer = placebo_renderer_create,
.destroy_renderer = placebo_renderer_destroy,
.render = placebo_render,
.update_frame = placebo_upload_planes
.update_frame = placebo_upload_planes,
.alloc_pic = placebo_alloc_pic,
.release_pic = placebo_release_pic,
};
#else
......@@ -517,7 +631,7 @@ static void sdl_renderer_destroy(void *cookie)
free(rd_priv_ctx);
}
static void sdl_render(void *cookie)
static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
......@@ -537,7 +651,8 @@ static void sdl_render(void *cookie)
SDL_UnlockMutex(rd_priv_ctx->lock);
}
static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic)
static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
const Dav1dPlaySettings *settings)
{
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
assert(rd_priv_ctx != NULL);
......@@ -648,8 +763,11 @@ static void dp_settings_print_usage(const char *const app,
fprintf(stderr, "Usage: %s [options]\n\n", app);
fprintf(stderr, "Supported options:\n"
" --input/-i $file: input file\n"
" --untimed/-u: ignore PTS, render as fast as possible\n"
" --framethreads $num: number of frame threads (default: 1)\n"
" --tilethreads $num: number of tile threads (default: 1)\n"
" --highquality: enable high quality rendering\n"
" --zerocopy/-z: enable zero copy upload path\n"
" --version/-v: print version and exit\n");
exit(1);
}
......@@ -673,19 +791,23 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
// Short options
static const char short_opts[] = "i:v";
static const char short_opts[] = "i:vuz";
enum {
ARG_FRAME_THREADS = 256,
ARG_TILE_THREADS,
ARG_HIGH_QUALITY,
};
// Long options
static const struct option long_opts[] = {
{ "input", 1, NULL, 'i' },
{ "version", 0, NULL, 'v' },
{ "untimed", 0, NULL, 'u' },
{ "framethreads", 1, NULL, ARG_FRAME_THREADS },
{ "tilethreads", 1, NULL, ARG_TILE_THREADS },
{ "highquality", 0, NULL, ARG_HIGH_QUALITY },
{ "zerocopy", 0, NULL, 'z' },
{ NULL, 0, NULL, 0 },
};
......@@ -697,6 +819,21 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
case 'v':
fprintf(stderr, "%s\n", dav1d_version());
exit(0);
case 'u':
settings->untimed = true;
break;
case ARG_HIGH_QUALITY:
settings->highquality = true;
#ifndef HAVE_PLACEBO_VULKAN
fprintf(stderr, "warning: --highquality requires libplacebo\n");
#endif
break;
case 'z':
settings->zerocopy = true;
#ifndef HAVE_PLACEBO_VULKAN
fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
#endif
break;
case ARG_FRAME_THREADS:
lib_settings->n_frame_threads =
parse_unsigned(optarg, ARG_FRAME_THREADS, argv[0]);
......@@ -713,6 +850,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
if (optind < argc)
dp_settings_print_usage(argv[0],
"Extra/unused arguments found, e.g. '%s'\n", argv[optind]);
if (!settings->inputfile)
dp_settings_print_usage(argv[0], "Input file (-i/--input) is required");
}
/**
......@@ -777,6 +916,7 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
}
dav1d_default_settings(&rd_ctx->lib_settings);
memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
rd_ctx->last_pts = 0;
rd_ctx->last_ticks = 0;
......@@ -809,7 +949,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
Dav1dPicture *dav1d_pic)
{
renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic);
renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
rd_ctx->current_pts = dav1d_pic->m.timestamp;
}
......@@ -851,16 +991,20 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
int32_t wait_time = (pts_diff * rd_ctx->timebase) * 1000 - ticks_diff;
rd_ctx->last_pts = rd_ctx->current_pts;
// In untimed mode, simply don't wait
if (rd_ctx->settings.untimed)
wait_time = 0;
// This way of timing the playback is not accurate, as there is no guarantee
// that SDL_Delay will wait for exactly the requested amount of time so in an
// accurate player this would need to be done in a better way.
if (wait_time >= 0) {
if (wait_time > 0) {
SDL_Delay(wait_time);
} else if (wait_time < -10) { // Do not warn for minor time drifts
fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
}
renderer_info.render(rd_ctx->rd_priv);
renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings);
rd_ctx->last_ticks = SDL_GetTicks();
}
......@@ -1044,6 +1188,18 @@ int main(int argc, char **argv)
// Parse and validate arguments
dp_rd_ctx_parse_args(rd_ctx, argc, argv);
if (rd_ctx->settings.zerocopy) {
if (renderer_info.alloc_pic) {
rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
.cookie = rd_ctx->rd_priv,
.alloc_picture_callback = renderer_info.alloc_pic,
.release_picture_callback = renderer_info.release_pic,
};
} else {
fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
}
}
// Start decoder thread
decoder_thread = SDL_CreateThread(decoder_thread_main, "Decoder thread", rd_ctx);
......
# Copyright © 2018, VideoLAN and dav1d authors
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Build definition for the dav1d examples
#
# Leave subdir if examples are disabled
if not get_option('enable_examples')
subdir_done()
endif
# dav1d player sources
dav1dplay_sources = files(
'dav1dplay.c',
)
sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: false)
if sdl2_dependency.found()
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
vulkan_dependency = dependency('vulkan', required: false)
sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency])
cflag_placebo = []
deps_placebo = []
if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan
cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1'
deps_placebo = [vulkan_dependency, placebo_dependency]
endif
dav1dplay = executable('dav1dplay',
dav1dplay_sources,
rev_target,
link_with : [libdav1d, dav1d_input_objs],
include_directories : [dav1d_inc_dirs],
dependencies : [getopt_dependency, sdl2_dependency, deps_placebo],
install : true,
c_args : cflag_placebo,
)
endif
......@@ -92,6 +92,14 @@
#define NOINLINE __attribute__((noinline))
#endif /* !_MSC_VER */
// In release builds (NDEBUG) turn assert() into an optimizer hint that the
// condition always holds, so the compiler can prune impossible branches;
// debug builds fall back to the standard <assert.h> definition.
#if defined(NDEBUG) && (defined(__GNUC__) || defined(__clang__))
#define assert(x) do { if (!(x)) __builtin_unreachable(); } while (0)
#elif defined(NDEBUG) && defined(_MSC_VER)
// MSVC spelling of the same "assume this is true" hint
#define assert __assume
#else
#include <assert.h>
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
# define dav1d_uninit(x) x=x
#else
......
......@@ -40,6 +40,14 @@ static inline int imin(const int a, const int b) {
return a < b ? a : b;
}
// Return the larger of two unsigned values.
static inline unsigned umax(const unsigned a, const unsigned b) {
    if (b > a)
        return b;
    return a;
}
// Return the smaller of two unsigned values.
static inline unsigned umin(const unsigned a, const unsigned b) {
    if (b < a)
        return b;
    return a;
}
// Clamp v to the inclusive range [min, max].
static inline int iclip(const int v, const int min, const int max) {
    if (v < min)
        return min;
    if (v > max)
        return max;
    return v;
}
......
......@@ -28,13 +28,14 @@
#ifndef DAV1D_COMMON_MEM_H
#define DAV1D_COMMON_MEM_H
#include <assert.h>
#include <stdlib.h>
#if defined(HAVE_ALIGNED_MALLOC) || defined(HAVE_MEMALIGN)
#include <malloc.h>
#endif
#include "common/attributes.h"
/*
* Allocate 32-byte aligned memory. The return value can be released
* by calling the standard free() function.
......
......@@ -28,6 +28,8 @@
#ifndef DAV1D_HEADERS_H
#define DAV1D_HEADERS_H
#include <stddef.h>
// Constants from Section 3. "Symbols and abbreviated terms"
#define DAV1D_MAX_CDEF_STRENGTHS 8
#define DAV1D_MAX_OPERATING_POINTS 32
......@@ -176,6 +178,13 @@ typedef struct Dav1dMasteringDisplay {
uint32_t min_luminance;
} Dav1dMasteringDisplay;
typedef struct Dav1dITUTT35 {
uint8_t country_code;
uint8_t country_code_extension_byte;
size_t payload_size;
uint8_t *payload;
} Dav1dITUTT35;
typedef struct Dav1dSequenceHeader {
/**
* Stream profile, 0 for 8-10 bits/component 4:2:0 or monochrome;
......@@ -289,7 +298,7 @@ typedef struct Dav1dLoopfilterModeRefDeltas {
} Dav1dLoopfilterModeRefDeltas;
typedef struct Dav1dFilmGrainData {
uint16_t seed;
unsigned seed;
int num_y_points;
uint8_t y_points[14][2 /* value, scaling */];
int chroma_scaling_from_luma;
......
......@@ -77,9 +77,16 @@ typedef struct Dav1dPicture {
* this picture, as defined in section 5.8.4 and 6.7.4
*/
Dav1dMasteringDisplay *mastering_display;
/**
* ITU-T T.35 metadata as defined in section 5.8.2 and 6.7.2
*/
Dav1dITUTT35 *itut_t35;
uintptr_t reserved[4]; ///< reserved for future use
struct Dav1dRef *frame_hdr_ref, *seq_hdr_ref; ///< Frame parameter allocation origins
struct Dav1dRef *content_light_ref, *mastering_display_ref; ///< Metadata allocation origins
struct Dav1dRef *content_light_ref, *mastering_display_ref, *itut_t35_ref; ///< Metadata allocation origins
uintptr_t reserved_ref[4]; ///< reserved for future use
struct Dav1dRef *ref; ///< Frame data allocation origin
void *allocator_data; ///< pointer managed by the allocator
......
......@@ -23,14 +23,14 @@
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
project('dav1d', ['c'],
version: '0.4.0',
version: '0.5.0',
default_options: ['c_std=c99',
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
dav1d_soname_version = '2.0.0'
dav1d_soname_version = '3.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
......@@ -85,9 +85,14 @@ test_args = []
optional_arguments = []
# Define _POSIX_C_SOURCE to POSIX.1–2001 (IEEE Std 1003.1-2001)
test_args += '-D_POSIX_C_SOURCE=200112L'
test_args += '-D_POSIX_C_SOURCE=200112L'
add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
if host_machine.system() == 'darwin'
test_args += '-D_DARWIN_C_SOURCE'
add_project_arguments('-D_DARWIN_C_SOURCE', language: 'c')
endif
if host_machine.system() == 'windows'
cdata.set('_WIN32_WINNT', '0x0601')
cdata.set('UNICODE', 1) # Define to 1 for Unicode (Wide Chars) APIs
......@@ -389,4 +394,6 @@ subdir('src')
subdir('tools')
subdir('examples')
subdir('tests')
......@@ -15,6 +15,11 @@ option('enable_tools',
value: true,
description: 'Build dav1d cli tools')
option('enable_examples',
type: 'boolean',
value: false,
description: 'Build dav1d examples')
option('enable_tests',
type: 'boolean',
value: true,
......
......@@ -17,7 +17,7 @@ apps:
parts:
dav1d:
plugin: meson
source: .
source: ../../
build-packages: [ 'nasm' ]
meson-parameters:
- --prefix=/usr
......
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
// Pad the top or bottom two rows of the CDEF temporary buffer:
// widen one 8-bit pixel row each from \s1 and \s2 to 16 bits and
// store them at r0, also synthesizing the two pixels to the left and
// right of each row. Edge pixels that are unavailable (per the
// CDEF_HAVE_* flags in r6) are filled with the "invalid pixel"
// sentinel 0x8000 held in q3 (s12 is the lowest 32-bit lane of q3,
// i.e. two sentinel pixels at once).
//
// Parameters:
//   \s1, \s2     source row pointers (8 bpc pixels)
//   \w           block width in pixels (4 or 8)
//   \stride      tmp buffer stride, in uint16_t units
//   \n1/\w1      narrow/wide register pair for row 1 (s0/d0 or d0/q0)
//   \n2/\w2      narrow/wide register pair for row 2 (s4/d2 or d2/q1)
//   \align       alignment hint (in bits) for the tmp buffer stores
//   \ret         1: return via pop {r4-r7,pc}; 0: fall through to 3f
// Clobbers r12, q0-q2; advances r0 by 2*\stride per stored row.
.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
tst r6, #1 // CDEF_HAVE_LEFT
beq 2f
// CDEF_HAVE_LEFT
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
// Gather the four edge pixels (left/right of both rows) into d4,
// load both full rows, then widen everything from u8 to u16.
ldrh r12, [\s1, #-2]
vldr \n1, [\s1]
vdup.16 d4, r12
ldrh r12, [\s1, #\w]
vmov.16 d4[1], r12
ldrh r12, [\s2, #-2]
vldr \n2, [\s2]
vmov.16 d4[2], r12
ldrh r12, [\s2, #\w]
vmovl.u8 q0, d0
vmov.16 d4[3], r12
vmovl.u8 q1, d2
vmovl.u8 q2, d4
// Row 1: left edge pair (s8), the widened row, right edge pair (s9).
vstr s8, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s9, [r0, #2*\w]
add r0, r0, #2*\stride
// Row 2: left edge pair (s10), the widened row, right edge pair (s11).
vstr s10, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s11, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
b 3f
.endif
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Only left edge pixels exist; right edges are filled with the
// sentinel from s12 (low lane of q3 == 0x8000).
ldrh r12, [\s1, #-2]
vldr \n1, [\s1]
vdup.16 d4, r12
ldrh r12, [\s2, #-2]
vldr \n2, [\s2]
vmovl.u8 q0, d0
vmov.16 d4[1], r12
vmovl.u8 q1, d2
vmovl.u8 q2, d4
vstr s8, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s9, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s12, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
b 3f
.endif
2:
// !CDEF_HAVE_LEFT
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
// Only right edge pixels exist; left edges get the sentinel.
vldr \n1, [\s1]
ldrh r12, [\s1, #\w]
vldr \n2, [\s2]
vdup.16 d4, r12
ldrh r12, [\s2, #\w]
vmovl.u8 q0, d0
vmov.16 d4[1], r12
vmovl.u8 q1, d2
vmovl.u8 q2, d4
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s8, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s12, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s9, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
b 3f
.endif
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Neither edge exists; both sides get the sentinel.
vldr \n1, [\s1]
vldr \n2, [\s2]
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
vstr s12, [r0, #-4]
vst1.16 {\w2}, [r0, :\align]
vstr s12, [r0, #2*\w]
.if \ret
pop {r4-r7,pc}
.else
add r0, r0, #2*\stride
.endif
3:
.endm
// Load \w bytes (one pixel row) from \src into \dst and post-increment
// \src by \incr; uses an aligned 32-bit lane load for w == 4 and a
// full aligned 64-bit load for w == 8.
.macro load_n_incr dst, src, incr, w
.if \w == 4
vld1.32 {\dst\()[0]}, [\src, :32], \incr
.else
vld1.8 {\dst\()}, [\src, :64], \incr
.endif
.endm
// void dav1d_cdef_paddingX_neon(uint16_t *tmp, const pixel *src,
// ptrdiff_t src_stride, const pixel (*left)[2],
// /*const*/ pixel *const top[2], int h,
// enum CdefEdgeFlags edges);
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
// Emits cdef_padding\w\()_neon: builds the padded 16-bit temporary
// buffer used by the CDEF filter, widening 8 bpc source pixels and
// writing the sentinel 0x8000 (kept in q3) into every position that
// lies outside the available edges.
//
// Register usage on entry (after the stack loads below):
//   r0 = tmp (uint16_t *, points at the block's top-left pixel)
//   r1 = src, r2 = src_stride, r3 = left (pixel (*)[2])
//   r4 = top (pixel *const[2]), r5 = h, r6 = edges (CdefEdgeFlags)
// Macro parameters are as for pad_top_bottom (\w, \stride, \n1/\w1,
// \n2/\w2, \align).
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_neon, export=1
push {r4-r7,lr}
ldrd r4, r5, [sp, #20]
ldr r6, [sp, #28]
vmov.i16 q3, #0x8000 // sentinel for unavailable pixels
tst r6, #4 // CDEF_HAVE_TOP
bne 1f
// !CDEF_HAVE_TOP
// Fill the two rows above the block (including their left/right
// padding columns) entirely with the sentinel.
sub r12, r0, #2*(2*\stride+2)
vmov.i16 q2, #0x8000
vst1.16 {q2,q3}, [r12]!
.if \w == 8
vst1.16 {q2,q3}, [r12]!
.endif
b 3f
1:
// CDEF_HAVE_TOP
// top[0]/top[1] point at the two rows above the block.
ldr r7, [r4]
ldr lr, [r4, #4]
sub r0, r0, #2*(2*\stride)
pad_top_bottom r7, lr, \w, \stride, \n1, \w1, \n2, \w2, \align, 0
// Middle section
// Loop over the h block rows proper (r5 counts rows); the left
// column comes from r3, the right column from src at offset \w.
3:
tst r6, #1 // CDEF_HAVE_LEFT
beq 2f
// CDEF_HAVE_LEFT
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldrh r12, [r3], #2
vldr \n1, [r1]
vdup.16 d2, r12
ldrh r12, [r1, #\w]
add r1, r1, r2
subs r5, r5, #1
vmov.16 d2[1], r12
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s4, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s5, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 0b
b 3f
1:
// CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Right padding column gets the sentinel from s12 (low lane of q3).
ldrh r12, [r3], #2
load_n_incr d0, r1, r2, \w
vdup.16 d2, r12
subs r5, r5, #1
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s4, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 1b
b 3f
2:
tst r6, #2 // CDEF_HAVE_RIGHT
beq 1f
// !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
ldrh r12, [r1, #\w]
load_n_incr d0, r1, r2, \w
vdup.16 d2, r12
subs r5, r5, #1
vmovl.u8 q0, d0
vmovl.u8 q1, d2
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s4, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 0b
b 3f
1:
// !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
// Both padding columns get the sentinel.
load_n_incr d0, r1, r2, \w
subs r5, r5, #1
vmovl.u8 q0, d0
vstr s12, [r0, #-4]
vst1.16 {\w1}, [r0, :\align]
vstr s12, [r0, #2*\w]
add r0, r0, #2*\stride
bgt 1b
3:
tst r6, #8 // CDEF_HAVE_BOTTOM
bne 1f
// !CDEF_HAVE_BOTTOM
// Fill the two rows below the block entirely with the sentinel.
sub r12, r0, #4
vmov.i16 q2, #0x8000
vst1.16 {q2,q3}, [r12]!
.if \w == 8
vst1.16 {q2,q3}, [r12]!
.endif
pop {r4-r7,pc}
1:
// CDEF_HAVE_BOTTOM
// r1 already points at the first row below the block.
add r7, r1, r2
pad_top_bottom r1, r7, \w, \stride, \n1, \w1, \n2, \w2, \align, 1
endfunc
.endm
// Instantiate the padding functions for 8- and 4-pixel-wide blocks
// (tmp buffer strides of 16 and 8 uint16_t respectively).
padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8, s0, d0, s4, d2, 64
// Per-direction tap offset table for a \stride-wide tmp buffer:
// each entry holds the two byte offsets (in uint16_t pixel units,
// applied as +off/-off) of the taps at distance 1 and 2 along one of
// the 8 CDEF directions. Entries 8..13 repeat entries 0..5 so that
// indexing with dir+2 / dir-2 variants needs no "& 7" masking.
.macro dir_table w, stride
const directions\w
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
.byte 1 * \stride + 0, 2 * \stride + 0
.byte 1 * \stride + 0, 2 * \stride - 1
// Repeated, to avoid & 7
.byte -1 * \stride + 1, -2 * \stride + 2
.byte 0 * \stride + 1, -1 * \stride + 2
.byte 0 * \stride + 1, 0 * \stride + 2
.byte 0 * \stride + 1, 1 * \stride + 2
.byte 1 * \stride + 1, 2 * \stride + 2
.byte 1 * \stride + 0, 2 * \stride + 1
endconst
.endm
dir_table 8, 16
dir_table 4, 8
// Primary filter tap weight pairs; the filter function selects one
// (tap0, tap1) pair by the parity of pri_strength (r3 & 1).
const pri_taps
.byte 4, 2, 3, 3
endconst
// Load the tap pair p0 (at x + off) and p1 (at x - off) from the
// padded 16-bit tmp buffer; r2 points at the current pixels, r9 holds
// the tap offset (in pixels). For w == 8 each destination register
// pair holds one 8-pixel row; for w == 4 two consecutive 4-pixel rows
// (tmp stride = 8 uint16_t) are packed into each pair.
// Clobbers r6 and r9.
.macro load_px d11, d12, d21, d22, w
.if \w == 8
add r6, r2, r9, lsl #1 // x + off
sub r9, r2, r9, lsl #1 // x - off
vld1.16 {\d11,\d12}, [r6] // p0
vld1.16 {\d21,\d22}, [r9] // p1
.else
add r6, r2, r9, lsl #1 // x + off
sub r9, r2, r9, lsl #1 // x - off
vld1.16 {\d11}, [r6] // p0
add r6, r6, #2*8 // += stride
vld1.16 {\d21}, [r9] // p1
add r9, r9, #2*8 // += stride
vld1.16 {\d12}, [r6] // p0
vld1.16 {\d22}, [r9] // p1
.endif
.endm
// Fold one tap pair (\s1, \s2) into the running CDEF state:
//   - q2/q3 track the running min/max of all sampled pixels,
//   - if \threshold != 0, compute constrain(diff, threshold, shift)
//     for both taps and accumulate \tap * constrain() into the sum
//     kept in q1.
// q0 holds the center pixels px; \thresh_vec is \threshold splatted
// across a q register; \shift holds -shift so that vshl performs the
// required right shift. Clobbers q8-q13.
.macro handle_pixel s1, s2, threshold, thresh_vec, shift, tap
cmp \threshold, #0
vmin.u16 q2, q2, \s1
vmax.s16 q3, q3, \s1
vmin.u16 q2, q2, \s2
vmax.s16 q3, q3, \s2
beq 3f // threshold == 0: min/max only
vabd.u16 q8, q0, \s1 // abs(diff)
vabd.u16 q11, q0, \s2 // abs(diff)
vshl.u16 q9, q8, \shift // abs(diff) >> shift
vshl.u16 q12, q11, \shift // abs(diff) >> shift
vqsub.u16 q9, \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
vqsub.u16 q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
vsub.i16 q10, \s1, q0 // diff = p0 - px
vsub.u16 q13, \s2, q0 // diff = p1 - px
vneg.s16 q8, q9 // -clip
vneg.s16 q11, q12 // -clip
vmin.s16 q10, q10, q9 // imin(diff, clip)
vmin.s16 q13, q13, q12 // imin(diff, clip)
vdup.16 q9, \tap // taps[k]
vmax.s16 q10, q10, q8 // constrain() = imax(imin(diff, clip), -clip)
vmax.s16 q13, q13, q11 // constrain() = imax(imin(diff, clip), -clip)
vmla.i16 q1, q10, q9 // sum += taps[k] * constrain()
vmla.i16 q1, q13, q9 // sum += taps[k] * constrain()
3:
.endm
// void dav1d_cdef_filterX_neon(pixel *dst, ptrdiff_t dst_stride,
// const uint16_t *tmp, int pri_strength,
// int sec_strength, int dir, int damping, int h);
.macro filter w
function cdef_filter\w\()_neon, export=1
push {r4-r9,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #92]
ldrd r6, r7, [sp, #100]
movrel_local r8, pri_taps
and r9, r3, #1
add r8, r8, r9, lsl #1
movrel_local r9, directions\w
add r5, r9, r5, lsl #1
vmov.u16 d17, #15
vdup.16 d16, r6 // damping
vdup.16 q5, r3 // threshold
vdup.16 q7, r4 // threshold
vmov.16 d8[0], r3
vmov.16 d8[1], r4
vclz.i16 d8, d8 // clz(threshold)
vsub.i16 d8, d17, d8 // ulog2(threshold)
vqsub.u16 d8, d16, d8 // shift = imax(0, damping - ulog2(threshold))
vneg.s16 d8, d8 // -shift
vdup.16 q6, d8[1]
vdup.16 q4, d8[0]
1:
.if \w == 8
vld1.16 {q0}, [r2, :128] // px
.else
add r12, r2, #2*8
vld1.16 {d0}, [r2, :64] // px
vld1.16 {d1}, [r12, :64] // px
.endif
vmov.u16 q1, #0 // sum
vmov.u16 q2, q0 // min
vmov.u16 q3, q0 // max