...
 
Commits (34)
  • Liwei Wang's avatar
    Add SSSE3 implementation for the 8x8 blocks in itx · 5fa6c44a
    Liwei Wang authored
    Cycle times:
    inv_txfm_add_8x8_adst_adst_0_8bpc_c: 2165.6
    inv_txfm_add_8x8_adst_adst_0_8bpc_ssse3: 194.5
    inv_txfm_add_8x8_adst_adst_1_8bpc_c: 2158.3
    inv_txfm_add_8x8_adst_adst_1_8bpc_ssse3: 194.7
    inv_txfm_add_8x8_adst_dct_0_8bpc_c: 2241.0
    inv_txfm_add_8x8_adst_dct_0_8bpc_ssse3: 165.1
    inv_txfm_add_8x8_adst_dct_1_8bpc_c: 2242.6
    inv_txfm_add_8x8_adst_dct_1_8bpc_ssse3: 164.2
    inv_txfm_add_8x8_adst_flipadst_0_8bpc_c: 2178.2
    inv_txfm_add_8x8_adst_flipadst_0_8bpc_ssse3: 194.4
    inv_txfm_add_8x8_adst_flipadst_1_8bpc_c: 2183.0
    inv_txfm_add_8x8_adst_flipadst_1_8bpc_ssse3: 194.2
    inv_txfm_add_8x8_adst_identity_0_8bpc_c: 1592.1
    inv_txfm_add_8x8_adst_identity_0_8bpc_ssse3: 125.2
    inv_txfm_add_8x8_adst_identity_1_8bpc_c: 1597.7
    inv_txfm_add_8x8_adst_identity_1_8bpc_ssse3: 126.3
    inv_txfm_add_8x8_dct_adst_0_8bpc_c: 2214.1
    inv_txfm_add_8x8_dct_adst_0_8bpc_ssse3: 162.0
    inv_txfm_add_8x8_dct_adst_1_8bpc_c: 2221.5
    inv_txfm_add_8x8_dct_adst_1_8bpc_ssse3: 161.9
    inv_txfm_add_8x8_dct_dct_0_8bpc_c: 2247.8
    inv_txfm_add_8x8_dct_dct_0_8bpc_ssse3: 34.0
    inv_txfm_add_8x8_dct_dct_1_8bpc_c: 2243.1
    inv_txfm_add_8x8_dct_dct_1_8bpc_ssse3: 133.7
    inv_txfm_add_8x8_dct_flipadst_0_8bpc_c: 2255.1
    inv_txfm_add_8x8_dct_flipadst_0_8bpc_ssse3: 161.2
    inv_txfm_add_8x8_dct_flipadst_1_8bpc_c: 2244.6
    inv_txfm_add_8x8_dct_flipadst_1_8bpc_ssse3: 161.8
    inv_txfm_add_8x8_dct_identity_0_8bpc_c: 1632.3
    inv_txfm_add_8x8_dct_identity_0_8bpc_ssse3: 41.3
    inv_txfm_add_8x8_dct_identity_1_8bpc_c: 1629.6
    inv_txfm_add_8x8_dct_identity_1_8bpc_ssse3: 97.7
    inv_txfm_add_8x8_flipadst_adst_0_8bpc_c: 2185.6
    inv_txfm_add_8x8_flipadst_adst_0_8bpc_ssse3: 191.0
    inv_txfm_add_8x8_flipadst_adst_1_8bpc_c: 2165.7
    inv_txfm_add_8x8_flipadst_adst_1_8bpc_ssse3: 191.6
    inv_txfm_add_8x8_flipadst_dct_0_8bpc_c: 2246.4
    inv_txfm_add_8x8_flipadst_dct_0_8bpc_ssse3: 162.8
    inv_txfm_add_8x8_flipadst_dct_1_8bpc_c: 2252.1
    inv_txfm_add_8x8_flipadst_dct_1_8bpc_ssse3: 163.9
    inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_c: 2180.9
    inv_txfm_add_8x8_flipadst_flipadst_0_8bpc_ssse3: 196.3
    inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_c: 2192.2
    inv_txfm_add_8x8_flipadst_flipadst_1_8bpc_ssse3: 194.5
    inv_txfm_add_8x8_flipadst_identity_0_8bpc_c: 1600.9
    inv_txfm_add_8x8_flipadst_identity_0_8bpc_ssse3: 126.6
    inv_txfm_add_8x8_flipadst_identity_1_8bpc_c: 1600.5
    inv_txfm_add_8x8_flipadst_identity_1_8bpc_ssse3: 126.4
    inv_txfm_add_8x8_identity_adst_0_8bpc_c: 1558.0
    inv_txfm_add_8x8_identity_adst_0_8bpc_ssse3: 120.7
    inv_txfm_add_8x8_identity_adst_1_8bpc_c: 1556.7
    inv_txfm_add_8x8_identity_adst_1_8bpc_ssse3: 121.0
    inv_txfm_add_8x8_identity_dct_0_8bpc_c: 1600.8
    inv_txfm_add_8x8_identity_dct_0_8bpc_ssse3: 37.9
    inv_txfm_add_8x8_identity_dct_1_8bpc_c: 1599.5
    inv_txfm_add_8x8_identity_dct_1_8bpc_ssse3: 90.3
    inv_txfm_add_8x8_identity_flipadst_0_8bpc_c: 1584.9
    inv_txfm_add_8x8_identity_flipadst_0_8bpc_ssse3: 120.2
    inv_txfm_add_8x8_identity_flipadst_1_8bpc_c: 1584.3
    inv_txfm_add_8x8_identity_flipadst_1_8bpc_ssse3: 120.5
    inv_txfm_add_8x8_identity_identity_0_8bpc_c: 975.9
    inv_txfm_add_8x8_identity_identity_0_8bpc_ssse3: 54.7
    inv_txfm_add_8x8_identity_identity_1_8bpc_c: 975.7
    inv_txfm_add_8x8_identity_identity_1_8bpc_ssse3: 54.7
    5fa6c44a
  • Xuefeng Jiang's avatar
    Add SSSE3 implementations for dav1d_ipred_top, dav1d_ipred_left and dav1d_ipred_128 · 9ea56386
    Xuefeng Jiang authored
    Cycle times:
    intra_pred_dc_128_w4_8bpc_c: 905.2
    intra_pred_dc_128_w4_8bpc_ssse3: 61.6
    intra_pred_dc_128_w8_8bpc_c: 1393.1
    intra_pred_dc_128_w8_8bpc_ssse3: 82.3
    intra_pred_dc_128_w16_8bpc_c: 2227.4
    intra_pred_dc_128_w16_8bpc_ssse3: 119.6
    intra_pred_dc_128_w32_8bpc_c: 2696.0
    intra_pred_dc_128_w32_8bpc_ssse3: 195.5
    intra_pred_dc_128_w64_8bpc_c: 4298.6
    intra_pred_dc_128_w64_8bpc_ssse3: 465.1
    intra_pred_dc_left_w4_8bpc_c: 974.2
    intra_pred_dc_left_w4_8bpc_ssse3: 80.2
    intra_pred_dc_left_w8_8bpc_c: 1478.4
    intra_pred_dc_left_w8_8bpc_ssse3: 103.7
    intra_pred_dc_left_w16_8bpc_c: 2313.0
    intra_pred_dc_left_w16_8bpc_ssse3: 159.1
    intra_pred_dc_left_w32_8bpc_c: 2835.1
    intra_pred_dc_left_w32_8bpc_ssse3: 305.3
    intra_pred_dc_left_w64_8bpc_c: 4462.2
    intra_pred_dc_left_w64_8bpc_ssse3: 525.5
    intra_pred_dc_top_w4_8bpc_c: 949.5
    intra_pred_dc_top_w4_8bpc_ssse3: 95.5
    intra_pred_dc_top_w8_8bpc_c: 1462.2
    intra_pred_dc_top_w8_8bpc_ssse3: 103.1
    intra_pred_dc_top_w16_8bpc_c: 2312.5
    intra_pred_dc_top_w16_8bpc_ssse3: 146.4
    intra_pred_dc_top_w32_8bpc_c: 2895.9
    intra_pred_dc_top_w32_8bpc_ssse3: 250.4
    intra_pred_dc_top_w64_8bpc_c: 4617.9
    intra_pred_dc_top_w64_8bpc_ssse3: 493.3
    9ea56386
  • Michael Bradshaw's avatar
    c7007c92
  • Michael Bradshaw's avatar
    8308f4b5
  • James Almer's avatar
  • James Almer's avatar
    data: add dav1d_data_ref() · aaba9f8e
    James Almer authored
    Makes both picture and data modules more consistent API wise, and
    does a few extra validation checks for the input arguments.
    aaba9f8e
  • François Cartegnie's avatar
    Add SSSE3 put_bilin · ee58d65d
    François Cartegnie authored
    ee58d65d
  • Steve Lhomme's avatar
  • James Almer's avatar
    ref: properly free the data buffer in dav1d_ref_create on failure · 60ff3002
    James Almer authored
    It was allocated with dav1d_alloc_aligned(), so a simple free() is
    not correct.
    60ff3002
  • Ronald S. Bultje's avatar
    Set subsampling to 0 for RGB. · 8f87eebe
    Ronald S. Bultje authored
    Fixes #233.
    8f87eebe
  • James Almer's avatar
    obu: set subsampling to 1 for monochrome · e19c7699
    James Almer authored
    e19c7699
  • James Almer's avatar
    obu: fix separate_uv_delta_q for RGB · 18d2d750
    James Almer authored
    18d2d750
  • Henrik Gramner's avatar
    Add ipred_z3 AVX2 asm · a440af4a
    Henrik Gramner authored
    Also backport some minor optimizations to z1.
    a440af4a
  • Henrik Gramner's avatar
    Shrink dav1d_dr_intra_derivative[] · f813285c
    Henrik Gramner authored
    f813285c
  • Henrik Gramner's avatar
    Add minor x86 bilin mc optimizations · f753caea
    Henrik Gramner authored
    f753caea
  • James Almer's avatar
    allocate Tile Group cache dynamically · 46435a53
    James Almer authored
    46435a53
  • Marvin Scholz's avatar
    33ce3829
  • Henrik Gramner's avatar
    Add SGR optimizations · 205b723e
    Henrik Gramner authored
    205b723e
  • Liwei Wang's avatar
    Add SSSE3 implementation for the 4x16 and 16x4 blocks in itx · bf659082
    Liwei Wang authored
    Cycle times:
    inv_txfm_add_4x16_adst_adst_0_8bpc_c: 2203.6
    inv_txfm_add_4x16_adst_adst_0_8bpc_ssse3: 198.7
    inv_txfm_add_4x16_adst_adst_1_8bpc_c: 2235.1
    inv_txfm_add_4x16_adst_adst_1_8bpc_ssse3: 199.7
    inv_txfm_add_4x16_adst_adst_2_8bpc_c: 2199.1
    inv_txfm_add_4x16_adst_adst_2_8bpc_ssse3: 199.9
    inv_txfm_add_4x16_adst_dct_0_8bpc_c: 2272.4
    inv_txfm_add_4x16_adst_dct_0_8bpc_ssse3: 50.0
    inv_txfm_add_4x16_adst_dct_1_8bpc_c: 2281.6
    inv_txfm_add_4x16_adst_dct_1_8bpc_ssse3: 163.7
    inv_txfm_add_4x16_adst_dct_2_8bpc_c: 2262.5
    inv_txfm_add_4x16_adst_dct_2_8bpc_ssse3: 164.7
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_c: 2456.5
    inv_txfm_add_4x16_adst_flipadst_0_8bpc_ssse3: 204.3
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_c: 2349.1
    inv_txfm_add_4x16_adst_flipadst_1_8bpc_ssse3: 198.5
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_c: 2241.5
    inv_txfm_add_4x16_adst_flipadst_2_8bpc_ssse3: 198.7
    inv_txfm_add_4x16_adst_identity_0_8bpc_c: 1574.7
    inv_txfm_add_4x16_adst_identity_0_8bpc_ssse3: 117.0
    inv_txfm_add_4x16_adst_identity_1_8bpc_c: 1576.3
    inv_txfm_add_4x16_adst_identity_1_8bpc_ssse3: 116.6
    inv_txfm_add_4x16_adst_identity_2_8bpc_c: 1572.9
    inv_txfm_add_4x16_adst_identity_2_8bpc_ssse3: 116.7
    inv_txfm_add_4x16_dct_adst_0_8bpc_c: 2162.8
    inv_txfm_add_4x16_dct_adst_0_8bpc_ssse3: 187.6
    inv_txfm_add_4x16_dct_adst_1_8bpc_c: 2180.4
    inv_txfm_add_4x16_dct_adst_1_8bpc_ssse3: 185.6
    inv_txfm_add_4x16_dct_adst_2_8bpc_c: 2165.1
    inv_txfm_add_4x16_dct_adst_2_8bpc_ssse3: 184.9
    inv_txfm_add_4x16_dct_dct_0_8bpc_c: 2233.7
    inv_txfm_add_4x16_dct_dct_0_8bpc_ssse3: 49.5
    inv_txfm_add_4x16_dct_dct_1_8bpc_c: 2770.4
    inv_txfm_add_4x16_dct_dct_1_8bpc_ssse3: 148.4
    inv_txfm_add_4x16_dct_dct_2_8bpc_c: 2288.7
    inv_txfm_add_4x16_dct_dct_2_8bpc_ssse3: 149.0
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_c: 2242.0
    inv_txfm_add_4x16_dct_flipadst_0_8bpc_ssse3: 185.8
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_c: 2249.6
    inv_txfm_add_4x16_dct_flipadst_1_8bpc_ssse3: 188.4
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_c: 2237.3
    inv_txfm_add_4x16_dct_flipadst_2_8bpc_ssse3: 185.1
    inv_txfm_add_4x16_dct_identity_0_8bpc_c: 1532.3
    inv_txfm_add_4x16_dct_identity_0_8bpc_ssse3: 63.7
    inv_txfm_add_4x16_dct_identity_1_8bpc_c: 1534.5
    inv_txfm_add_4x16_dct_identity_1_8bpc_ssse3: 63.6
    inv_txfm_add_4x16_dct_identity_2_8bpc_c: 1548.1
    inv_txfm_add_4x16_dct_identity_2_8bpc_ssse3: 101.6
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_c: 2205.2
    inv_txfm_add_4x16_flipadst_adst_0_8bpc_ssse3: 201.6
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_c: 2222.0
    inv_txfm_add_4x16_flipadst_adst_1_8bpc_ssse3: 202.6
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_c: 2205.2
    inv_txfm_add_4x16_flipadst_adst_2_8bpc_ssse3: 205.7
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_c: 2294.9
    inv_txfm_add_4x16_flipadst_dct_0_8bpc_ssse3: 50.0
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_c: 2304.2
    inv_txfm_add_4x16_flipadst_dct_1_8bpc_ssse3: 164.5
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_c: 2292.7
    inv_txfm_add_4x16_flipadst_dct_2_8bpc_ssse3: 164.5
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_c: 2281.3
    inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_ssse3: 202.9
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_c: 2258.7
    inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_ssse3: 202.4
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_c: 2261.0
    inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_ssse3: 201.3
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_c: 1580.5
    inv_txfm_add_4x16_flipadst_identity_0_8bpc_ssse3: 116.1
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_c: 1578.7
    inv_txfm_add_4x16_flipadst_identity_1_8bpc_ssse3: 116.7
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_c: 1590.8
    inv_txfm_add_4x16_flipadst_identity_2_8bpc_ssse3: 117.4
    inv_txfm_add_4x16_identity_adst_0_8bpc_c: 1949.0
    inv_txfm_add_4x16_identity_adst_0_8bpc_ssse3: 170.9
    inv_txfm_add_4x16_identity_adst_1_8bpc_c: 1947.4
    inv_txfm_add_4x16_identity_adst_1_8bpc_ssse3: 171.0
    inv_txfm_add_4x16_identity_adst_2_8bpc_c: 1948.7
    inv_txfm_add_4x16_identity_adst_2_8bpc_ssse3: 170.3
    inv_txfm_add_4x16_identity_dct_0_8bpc_c: 2022.3
    inv_txfm_add_4x16_identity_dct_0_8bpc_ssse3: 59.2
    inv_txfm_add_4x16_identity_dct_1_8bpc_c: 2020.8
    inv_txfm_add_4x16_identity_dct_1_8bpc_ssse3: 133.7
    inv_txfm_add_4x16_identity_dct_2_8bpc_c: 2020.2
    inv_txfm_add_4x16_identity_dct_2_8bpc_ssse3: 133.2
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_c: 2024.7
    inv_txfm_add_4x16_identity_flipadst_0_8bpc_ssse3: 170.3
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_c: 2021.8
    inv_txfm_add_4x16_identity_flipadst_1_8bpc_ssse3: 170.0
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_c: 2022.5
    inv_txfm_add_4x16_identity_flipadst_2_8bpc_ssse3: 169.9
    inv_txfm_add_4x16_identity_identity_0_8bpc_c: 1328.4
    inv_txfm_add_4x16_identity_identity_0_8bpc_ssse3: 87.7
    inv_txfm_add_4x16_identity_identity_1_8bpc_c: 1330.9
    inv_txfm_add_4x16_identity_identity_1_8bpc_ssse3: 87.7
    inv_txfm_add_4x16_identity_identity_2_8bpc_c: 1327.3
    inv_txfm_add_4x16_identity_identity_2_8bpc_ssse3: 87.6
    inv_txfm_add_16x4_adst_adst_0_8bpc_c: 2166.3
    inv_txfm_add_16x4_adst_adst_0_8bpc_ssse3: 186.3
    inv_txfm_add_16x4_adst_adst_1_8bpc_c: 2166.9
    inv_txfm_add_16x4_adst_adst_1_8bpc_ssse3: 184.9
    inv_txfm_add_16x4_adst_adst_2_8bpc_c: 2167.2
    inv_txfm_add_16x4_adst_adst_2_8bpc_ssse3: 185.2
    inv_txfm_add_16x4_adst_dct_0_8bpc_c: 2123.2
    inv_txfm_add_16x4_adst_dct_0_8bpc_ssse3: 172.1
    inv_txfm_add_16x4_adst_dct_1_8bpc_c: 2124.2
    inv_txfm_add_16x4_adst_dct_1_8bpc_ssse3: 171.2
    inv_txfm_add_16x4_adst_dct_2_8bpc_c: 2122.8
    inv_txfm_add_16x4_adst_dct_2_8bpc_ssse3: 171.8
    inv_txfm_add_16x4_adst_flipadst_0_8bpc_c: 2213.3
    inv_txfm_add_16x4_adst_flipadst_0_8bpc_ssse3: 189.6
    inv_txfm_add_16x4_adst_flipadst_1_8bpc_c: 2227.7
    inv_txfm_add_16x4_adst_flipadst_1_8bpc_ssse3: 188.4
    inv_txfm_add_16x4_adst_flipadst_2_8bpc_c: 2228.5
    inv_txfm_add_16x4_adst_flipadst_2_8bpc_ssse3: 188.4
    inv_txfm_add_16x4_adst_identity_0_8bpc_c: 1906.7
    inv_txfm_add_16x4_adst_identity_0_8bpc_ssse3: 154.3
    inv_txfm_add_16x4_adst_identity_1_8bpc_c: 1905.2
    inv_txfm_add_16x4_adst_identity_1_8bpc_ssse3: 155.6
    inv_txfm_add_16x4_adst_identity_2_8bpc_c: 1905.6
    inv_txfm_add_16x4_adst_identity_2_8bpc_ssse3: 156.3
    inv_txfm_add_16x4_dct_adst_0_8bpc_c: 2209.8
    inv_txfm_add_16x4_dct_adst_0_8bpc_ssse3: 37.4
    inv_txfm_add_16x4_dct_adst_1_8bpc_c: 2209.8
    inv_txfm_add_16x4_dct_adst_1_8bpc_ssse3: 157.9
    inv_txfm_add_16x4_dct_adst_2_8bpc_c: 2221.1
    inv_txfm_add_16x4_dct_adst_2_8bpc_ssse3: 158.5
    inv_txfm_add_16x4_dct_dct_0_8bpc_c: 2177.5
    inv_txfm_add_16x4_dct_dct_0_8bpc_ssse3: 29.6
    inv_txfm_add_16x4_dct_dct_1_8bpc_c: 2179.3
    inv_txfm_add_16x4_dct_dct_1_8bpc_ssse3: 144.9
    inv_txfm_add_16x4_dct_dct_2_8bpc_c: 2177.8
    inv_txfm_add_16x4_dct_dct_2_8bpc_ssse3: 143.7
    inv_txfm_add_16x4_dct_flipadst_0_8bpc_c: 2293.6
    inv_txfm_add_16x4_dct_flipadst_0_8bpc_ssse3: 38.3
    inv_txfm_add_16x4_dct_flipadst_1_8bpc_c: 2293.2
    inv_txfm_add_16x4_dct_flipadst_1_8bpc_ssse3: 163.9
    inv_txfm_add_16x4_dct_flipadst_2_8bpc_c: 2301.3
    inv_txfm_add_16x4_dct_flipadst_2_8bpc_ssse3: 163.7
    inv_txfm_add_16x4_dct_identity_0_8bpc_c: 1977.7
    inv_txfm_add_16x4_dct_identity_0_8bpc_ssse3: 39.9
    inv_txfm_add_16x4_dct_identity_1_8bpc_c: 1978.7
    inv_txfm_add_16x4_dct_identity_1_8bpc_ssse3: 126.8
    inv_txfm_add_16x4_dct_identity_2_8bpc_c: 1979.5
    inv_txfm_add_16x4_dct_identity_2_8bpc_ssse3: 128.1
    inv_txfm_add_16x4_flipadst_adst_0_8bpc_c: 2175.6
    inv_txfm_add_16x4_flipadst_adst_0_8bpc_ssse3: 185.1
    inv_txfm_add_16x4_flipadst_adst_1_8bpc_c: 2175.7
    inv_txfm_add_16x4_flipadst_adst_1_8bpc_ssse3: 185.7
    inv_txfm_add_16x4_flipadst_adst_2_8bpc_c: 2173.1
    inv_txfm_add_16x4_flipadst_adst_2_8bpc_ssse3: 185.0
    inv_txfm_add_16x4_flipadst_dct_0_8bpc_c: 2140.5
    inv_txfm_add_16x4_flipadst_dct_0_8bpc_ssse3: 172.0
    inv_txfm_add_16x4_flipadst_dct_1_8bpc_c: 2147.5
    inv_txfm_add_16x4_flipadst_dct_1_8bpc_ssse3: 171.9
    inv_txfm_add_16x4_flipadst_dct_2_8bpc_c: 2148.5
    inv_txfm_add_16x4_flipadst_dct_2_8bpc_ssse3: 172.0
    inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_c: 2240.6
    inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_ssse3: 191.3
    inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_c: 2243.5
    inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_ssse3: 193.2
    inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_c: 2242.9
    inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_ssse3: 192.0
    inv_txfm_add_16x4_flipadst_identity_0_8bpc_c: 1919.2
    inv_txfm_add_16x4_flipadst_identity_0_8bpc_ssse3: 155.1
    inv_txfm_add_16x4_flipadst_identity_1_8bpc_c: 1925.2
    inv_txfm_add_16x4_flipadst_identity_1_8bpc_ssse3: 155.2
    inv_txfm_add_16x4_flipadst_identity_2_8bpc_c: 2084.8
    inv_txfm_add_16x4_flipadst_identity_2_8bpc_ssse3: 155.0
    inv_txfm_add_16x4_identity_adst_0_8bpc_c: 1498.5
    inv_txfm_add_16x4_identity_adst_0_8bpc_ssse3: 107.6
    inv_txfm_add_16x4_identity_adst_1_8bpc_c: 1499.5
    inv_txfm_add_16x4_identity_adst_1_8bpc_ssse3: 107.0
    inv_txfm_add_16x4_identity_adst_2_8bpc_c: 1498.9
    inv_txfm_add_16x4_identity_adst_2_8bpc_ssse3: 107.9
    inv_txfm_add_16x4_identity_dct_0_8bpc_c: 1471.9
    inv_txfm_add_16x4_identity_dct_0_8bpc_ssse3: 45.4
    inv_txfm_add_16x4_identity_dct_1_8bpc_c: 1476.4
    inv_txfm_add_16x4_identity_dct_1_8bpc_ssse3: 45.5
    inv_txfm_add_16x4_identity_dct_2_8bpc_c: 1459.8
    inv_txfm_add_16x4_identity_dct_2_8bpc_ssse3: 92.3
    inv_txfm_add_16x4_identity_flipadst_0_8bpc_c: 1548.7
    inv_txfm_add_16x4_identity_flipadst_0_8bpc_ssse3: 112.1
    inv_txfm_add_16x4_identity_flipadst_1_8bpc_c: 1548.2
    inv_txfm_add_16x4_identity_flipadst_1_8bpc_ssse3: 111.7
    inv_txfm_add_16x4_identity_flipadst_2_8bpc_c: 1547.2
    inv_txfm_add_16x4_identity_flipadst_2_8bpc_ssse3: 114.1
    inv_txfm_add_16x4_identity_identity_0_8bpc_c: 1271.5
    inv_txfm_add_16x4_identity_identity_0_8bpc_ssse3: 74.5
    inv_txfm_add_16x4_identity_identity_1_8bpc_c: 1266.8
    inv_txfm_add_16x4_identity_identity_1_8bpc_ssse3: 74.5
    inv_txfm_add_16x4_identity_identity_2_8bpc_c: 1268.0
    inv_txfm_add_16x4_identity_identity_2_8bpc_ssse3: 74.6
    bf659082
  • James Almer's avatar
    337d8f94
  • Martin Storsjö's avatar
    arm: Create proper .rdata sections for COFF · 1ef86e1b
    Martin Storsjö authored
    On COFF, the default read only data section is `.rdata`, not `.rodata`.
    1ef86e1b
  • Martin Storsjö's avatar
    arm64: mc: Use ubfx instead of ubfm, for consistency with arm · 2c1eba5e
    Martin Storsjö authored
    On arm, there's no ubfm instruction, only ubfx.
    2c1eba5e
  • Martin Storsjö's avatar
    b7a5d2ff
  • Martin Storsjö's avatar
    arm64: mc: Improve a comment · 1407506a
    Martin Storsjö authored
    1407506a
  • Martin Storsjö's avatar
    arm64: mc: Optimize the mul_mla_8_* macros for Cortex A53 · fc5a3728
    Martin Storsjö authored
    Before:                      Cortex A53   Snapdragon 835
    mc_8tap_regular_w2_v_8bpc_neon:   155.1   131.8
    mc_8tap_regular_w4_v_8bpc_neon:   199.6   148.1
    mc_8tap_regular_w8_v_8bpc_neon:   286.2   225.5
    After:
    mc_8tap_regular_w2_v_8bpc_neon:   134.1   129.5
    mc_8tap_regular_w4_v_8bpc_neon:   157.6   146.5
    mc_8tap_regular_w8_v_8bpc_neon:   208.0   225.0
    fc5a3728
  • Martin Storsjö's avatar
    arm64: mc: Simplify the 8tap_2w_hv code slightly · 72af9329
    Martin Storsjö authored
    Before:                       Cortex A53   Snapdragon 835
    mc_8tap_regular_w2_hv_8bpc_neon:   415.0   286.9
    After:
    mc_8tap_regular_w2_hv_8bpc_neon:   399.1   269.9
    72af9329
  • Martin Storsjö's avatar
    arm64: mc: Optimize mc_8tap_regular_w4_hv_8bpc for A53 · e80955cc
    Martin Storsjö authored
    Before:                       Cortex A53   Snapdragon 835
    mc_8tap_regular_w4_hv_8bpc_neon:   543.6   359.1
    After:
    mc_8tap_regular_w4_hv_8bpc_neon:   466.7   355.5
    
    The same kind of change doesn't seem to give any benefits on the 8
    pixel wide hv filtering though, potentially related to the fact that
    it uses not only smull/smlal but also smull2/smlal2.
    e80955cc
  • Martin Storsjö's avatar
    CI: Add CI jobs for armv7-w64-mingw32 and aarch64-w64-mingw32 · 9a550985
    Martin Storsjö authored
    Keep artifacts from the aarch64 build job. There's less point in
    keeping artifacts from the armv7 build job, as all modern arm based
    windows desktop setups are arm64 (even though they can run these armv7
    binaries as well).
    9a550985
  • Ronald S. Bultje's avatar
    9824c5d9
  • Martin Storsjö's avatar
    588cbf94
  • Martin Storsjö's avatar
    arm: mc: Implement 8tap and bilin functions · 191f79d5
    Martin Storsjö authored
    Relative speedups measured with checkasm:
                                     Cortex A7     A8     A9    A53   Snapdragon 835
    mc_8tap_regular_w2_0_8bpc_neon:       9.63   4.05   3.82   5.41   5.68
    mc_8tap_regular_w2_h_8bpc_neon:       3.30   5.44   3.38   3.88   5.12
    mc_8tap_regular_w2_hv_8bpc_neon:      3.86   6.21   4.39   5.18   6.10
    mc_8tap_regular_w2_v_8bpc_neon:       4.69   5.43   3.56   7.27   4.86
    mc_8tap_regular_w4_0_8bpc_neon:       9.13   4.05   5.24   5.37   6.60
    mc_8tap_regular_w4_h_8bpc_neon:       4.38   7.11   4.61   6.59   7.15
    mc_8tap_regular_w4_hv_8bpc_neon:      5.11   9.77   7.37   9.21  10.29
    mc_8tap_regular_w4_v_8bpc_neon:       6.24   7.88   4.96  11.16   7.89
    mc_8tap_regular_w8_0_8bpc_neon:       9.12   4.20   5.59   5.59   9.25
    mc_8tap_regular_w8_h_8bpc_neon:       5.91   8.42   4.84   8.46   7.08
    mc_8tap_regular_w8_hv_8bpc_neon:      5.46   8.35   6.52   7.19   8.33
    mc_8tap_regular_w8_v_8bpc_neon:       7.53   8.96   6.28  16.08  10.66
    mc_8tap_regular_w16_0_8bpc_neon:      9.77   5.46   4.06   7.02   7.38
    mc_8tap_regular_w16_h_8bpc_neon:      6.33   8.87   5.03  10.30   4.29
    mc_8tap_regular_w16_hv_8bpc_neon:     5.00   7.84   6.15   6.83   7.44
    mc_8tap_regular_w16_v_8bpc_neon:      7.74   8.81   6.23  19.24  11.16
    mc_8tap_regular_w32_0_8bpc_neon:      6.11   4.63   2.44   5.92   4.70
    mc_8tap_regular_w32_h_8bpc_neon:      6.60   9.02   5.20  11.08   3.50
    mc_8tap_regular_w32_hv_8bpc_neon:     4.85   7.64   6.09   6.68   6.92
    mc_8tap_regular_w32_v_8bpc_neon:      7.61   8.36   6.13  19.94  11.17
    mc_8tap_regular_w64_0_8bpc_neon:      4.61   3.81   1.60   3.50   2.73
    mc_8tap_regular_w64_h_8bpc_neon:      6.72   9.07   5.21  11.41   3.10
    mc_8tap_regular_w64_hv_8bpc_neon:     4.67   7.43   5.92   6.43   6.59
    mc_8tap_regular_w64_v_8bpc_neon:      7.64   8.28   6.07  20.48  11.41
    mc_8tap_regular_w128_0_8bpc_neon:     2.41   3.13   1.11   2.31   1.73
    mc_8tap_regular_w128_h_8bpc_neon:     6.68   9.03   5.09  11.41   2.90
    mc_8tap_regular_w128_hv_8bpc_neon:    4.50   7.39   5.70   6.26   6.47
    mc_8tap_regular_w128_v_8bpc_neon:     7.21   8.23   5.88  19.82  11.42
    mc_bilinear_w2_0_8bpc_neon:           9.23   4.03   3.74   5.33   6.49
    mc_bilinear_w2_h_8bpc_neon:           2.07   3.52   2.71   2.35   3.40
    mc_bilinear_w2_hv_8bpc_neon:          2.60   5.24   2.73   2.74   3.89
    mc_bilinear_w2_v_8bpc_neon:           2.57   4.39   3.14   3.04   4.05
    mc_bilinear_w4_0_8bpc_neon:           8.74   4.03   5.38   5.28   6.53
    mc_bilinear_w4_h_8bpc_neon:           3.41   6.22   4.28   3.86   7.56
    mc_bilinear_w4_hv_8bpc_neon:          4.38   7.45   4.61   5.26   7.95
    mc_bilinear_w4_v_8bpc_neon:           3.65   6.57   4.51   4.45   7.62
    mc_bilinear_w8_0_8bpc_neon:           8.74   4.50   5.71   5.46   9.39
    mc_bilinear_w8_h_8bpc_neon:           6.14  10.71   6.78   6.88  14.10
    mc_bilinear_w8_hv_8bpc_neon:          7.11  12.80   8.24  11.08   7.83
    mc_bilinear_w8_v_8bpc_neon:           7.24  11.69   7.57   8.04  15.46
    mc_bilinear_w16_0_8bpc_neon:         10.01   5.47   4.07   6.97   7.64
    mc_bilinear_w16_h_8bpc_neon:          8.36  17.00   8.34  11.61   7.64
    mc_bilinear_w16_hv_8bpc_neon:         7.67  13.54   8.53  13.32   8.05
    mc_bilinear_w16_v_8bpc_neon:         10.19  22.56  10.52  15.39  10.62
    mc_bilinear_w32_0_8bpc_neon:          6.22   4.73   2.43   5.89   4.90
    mc_bilinear_w32_h_8bpc_neon:          9.47  18.96   9.34  13.10   7.24
    mc_bilinear_w32_hv_8bpc_neon:         7.95  13.15   9.49  13.78   8.71
    mc_bilinear_w32_v_8bpc_neon:         11.10  23.53  11.34  16.74   8.78
    mc_bilinear_w64_0_8bpc_neon:          4.58   3.82   1.59   3.46   2.71
    mc_bilinear_w64_h_8bpc_neon:         10.07  19.77   9.60  13.99   6.88
    mc_bilinear_w64_hv_8bpc_neon:         8.08  12.95   9.39  13.84   8.90
    mc_bilinear_w64_v_8bpc_neon:         11.49  23.85  11.12  17.13   7.90
    mc_bilinear_w128_0_8bpc_neon:         2.37   3.24   1.15   2.28   1.73
    mc_bilinear_w128_h_8bpc_neon:         9.94  18.84   8.66  13.91   6.74
    mc_bilinear_w128_hv_8bpc_neon:        7.26  12.82   8.97  12.43   8.88
    mc_bilinear_w128_v_8bpc_neon:         9.89  23.88   8.93  14.73   7.33
    mct_8tap_regular_w4_0_8bpc_neon:      2.82   4.46   2.72   3.50   5.41
    mct_8tap_regular_w4_h_8bpc_neon:      4.16   6.88   4.64   6.51   6.60
    mct_8tap_regular_w4_hv_8bpc_neon:     5.22   9.87   7.81   9.39  10.11
    mct_8tap_regular_w4_v_8bpc_neon:      5.81   7.72   4.80  10.16   6.85
    mct_8tap_regular_w8_0_8bpc_neon:      4.48   6.30   3.01   5.82   5.04
    mct_8tap_regular_w8_h_8bpc_neon:      5.59   8.04   4.18   8.68   8.30
    mct_8tap_regular_w8_hv_8bpc_neon:     5.34   8.32   6.42   7.04   7.99
    mct_8tap_regular_w8_v_8bpc_neon:      7.32   8.71   5.75  17.07   9.73
    mct_8tap_regular_w16_0_8bpc_neon:     5.05   9.60   3.64  10.06   4.29
    mct_8tap_regular_w16_h_8bpc_neon:     5.53   8.20   4.54   9.98   7.33
    mct_8tap_regular_w16_hv_8bpc_neon:    4.90   7.87   6.07   6.67   7.03
    mct_8tap_regular_w16_v_8bpc_neon:     7.39   8.55   5.72  19.64   9.98
    mct_8tap_regular_w32_0_8bpc_neon:     5.28   8.16   4.07  11.03   2.38
    mct_8tap_regular_w32_h_8bpc_neon:     5.97   8.31   4.67  10.63   6.72
    mct_8tap_regular_w32_hv_8bpc_neon:    4.73   7.65   5.98   6.51   6.31
    mct_8tap_regular_w32_v_8bpc_neon:     7.33   8.18   5.72  20.50  10.03
    mct_8tap_regular_w64_0_8bpc_neon:     5.11   9.19   4.01  10.61   1.92
    mct_8tap_regular_w64_h_8bpc_neon:     6.05   8.33   4.53  10.84   6.38
    mct_8tap_regular_w64_hv_8bpc_neon:    4.61   7.54   5.69   6.35   6.11
    mct_8tap_regular_w64_v_8bpc_neon:     7.27   8.06   5.39  20.41  10.15
    mct_8tap_regular_w128_0_8bpc_neon:    4.29   8.21   4.28   9.55   1.32
    mct_8tap_regular_w128_h_8bpc_neon:    6.01   8.26   4.43  10.78   6.20
    mct_8tap_regular_w128_hv_8bpc_neon:   4.49   7.49   5.46   6.11   5.96
    mct_8tap_regular_w128_v_8bpc_neon:    6.90   8.00   5.19  18.47  10.13
    mct_bilinear_w4_0_8bpc_neon:          2.70   4.53   2.67   3.32   5.11
    mct_bilinear_w4_h_8bpc_neon:          3.02   5.06   3.13   3.28   5.38
    mct_bilinear_w4_hv_8bpc_neon:         4.14   7.04   4.75   4.99   6.30
    mct_bilinear_w4_v_8bpc_neon:          3.17   5.30   3.66   3.87   5.01
    mct_bilinear_w8_0_8bpc_neon:          4.41   6.46   2.99   5.74   5.98
    mct_bilinear_w8_h_8bpc_neon:          5.36   8.27   3.62   6.39   9.06
    mct_bilinear_w8_hv_8bpc_neon:         6.65  11.82   6.79  11.47   7.07
    mct_bilinear_w8_v_8bpc_neon:          6.26   9.62   4.05   7.75  16.81
    mct_bilinear_w16_0_8bpc_neon:         4.86   9.85   3.61  10.03   4.19
    mct_bilinear_w16_h_8bpc_neon:         5.26  12.91   4.76   9.56   9.68
    mct_bilinear_w16_hv_8bpc_neon:        6.96  12.58   7.05  13.48   7.35
    mct_bilinear_w16_v_8bpc_neon:         6.46  17.94   5.72  13.70  19.20
    mct_bilinear_w32_0_8bpc_neon:         5.31   8.10   4.06  10.88   2.77
    mct_bilinear_w32_h_8bpc_neon:         6.91  14.28   5.33  11.24  10.33
    mct_bilinear_w32_hv_8bpc_neon:        7.13  12.21   7.57  13.91   7.19
    mct_bilinear_w32_v_8bpc_neon:         8.06  18.48   5.88  14.74  15.47
    mct_bilinear_w64_0_8bpc_neon:         5.08   7.29   3.83  10.44   1.71
    mct_bilinear_w64_h_8bpc_neon:         7.24  14.59   5.40  11.70  11.03
    mct_bilinear_w64_hv_8bpc_neon:        7.24  11.98   7.59  13.72   7.30
    mct_bilinear_w64_v_8bpc_neon:         8.20  18.24   5.69  14.57  15.04
    mct_bilinear_w128_0_8bpc_neon:        4.35   8.23   4.17   9.71   1.11
    mct_bilinear_w128_h_8bpc_neon:        7.02  13.80   5.63  11.11  11.26
    mct_bilinear_w128_hv_8bpc_neon:       6.31  11.89   6.75  12.12   7.24
    mct_bilinear_w128_v_8bpc_neon:        6.95  18.26   5.84  11.31  14.78
    191f79d5
  • Anisse Astier's avatar
    Fix broken link to wikis/task-list · 25020403
    Anisse Astier authored
    25020403
  • Nathan Egge's avatar
    Add msac_decode_bool_equi() function · 45d4fde6
    Nathan Egge authored
    When decoding an equi-probable bit (e.g. prob = 1/2) we can simplify the
    decode function.
    45d4fde6
  • Nathan Egge's avatar
    Internalize the EC_PROB_SHIFT precision reduction · f0b7d999
    Nathan Egge authored
    All of the msac decode functions use 15-bit CDFs with the exception of
    msac_decode_bool_prob() which takes a (15 - EC_PROB_SHIFT)-bit
    probability.
    This patch internalizes the reduction in precision from the EC_SMALL_MUL
    experiment (hiding the define) and gives msac calls a consistent API.
    f0b7d999
......@@ -88,6 +88,40 @@ build-win64:
- build/dav1d_install/
expire_in: 1 week
build-win-arm32:
image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190117110230
stage: build
tags:
- win32
script:
- meson build --buildtype release
--werror
--libdir lib
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/armv7-w64-mingw32.meson
-Ddefault_library=both
- ninja -C build
build-win-arm64:
image: registry.videolan.org:5000/vlc-debian-llvm-mingw:20190117110230
stage: build
tags:
- win64
script:
- meson build --buildtype release
--werror
--libdir lib
--prefix "$(pwd)/build/dav1d_install"
--cross-file /opt/crossfiles/aarch64-w64-mingw32.meson
-Ddefault_library=both
- ninja -C build
- ninja -C build install
artifacts:
name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
paths:
- build/dav1d_install/
expire_in: 1 week
build-debian-aarch64:
stage: build
image: registry.videolan.org:5000/dav1d-debian-unstable-aarch64:20181122182457
......
......@@ -35,7 +35,7 @@ The plan is the following:
6. Make it fast on older desktop, by writing asm for SSE chips.
### After
7. Improve C code base with [various tweaks](wiki/task-list),
7. Improve C code base with [various tweaks](https://code.videolan.org/videolan/dav1d/wikis/task-list),
8. Accelerate for less common architectures,
9. Use more GPU, when possible.
......@@ -70,7 +70,7 @@ The [VideoLAN Code of Conduct](https://wiki.videolan.org/CoC) applies to this pr
# Compile
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13 or higher)
1. Install [Meson](https://mesonbuild.com/) (0.47 or higher), [Ninja](https://ninja-build.org/), and, for x86* targets, [nasm](https://nasm.us/) (2.13.02 or higher)
2. Run `meson build --buildtype release`
3. Build with `ninja -C build`
......
......@@ -43,6 +43,14 @@
#endif
#endif
/**
* A reference-counted object wrapper for a user-configurable pointer.
*/
typedef struct Dav1dUserData {
const uint8_t *data; ///< data pointer
struct Dav1dRef *ref; ///< allocation origin
} Dav1dUserData;
/**
* Input packet metadata which are copied from the input data used to
* decode each image into the matching structure of the output image
......@@ -56,6 +64,7 @@ typedef struct Dav1dDataProps {
int64_t duration; ///< container duration of input data, 0 if unknown (default)
int64_t offset; ///< stream offset of input data, -1 if unknown (default)
size_t size; ///< packet size, default Dav1dData.sz
struct Dav1dUserData user_data; ///< user-configurable data, default NULL members
} Dav1dDataProps;
#endif // __DAV1D_COMMON_H__
......@@ -58,19 +58,50 @@ DAV1D_API uint8_t * dav1d_data_create(Dav1dData *data, size_t sz);
* @param sz Size of the data.
* @param free_callback Function to be called when we release our last
* reference to this data. In this callback, $buf will be
* the $buf argument to this function, and $user_data
* will be the $user_data input argument to this function.
* @param user_data Opaque parameter passed to free_callback().
* the $buf argument to this function, and $cookie will
* be the $cookie input argument to this function.
* @param cookie Opaque parameter passed to free_callback().
*
* @return 0 on success. A negative errno value on error.
*/
DAV1D_API int dav1d_data_wrap(Dav1dData *data, const uint8_t *buf, size_t sz,
void (*free_callback)(const uint8_t *buf, void *user_data),
void *user_data);
void (*free_callback)(const uint8_t *buf, void *cookie),
void *cookie);
/**
* Wrap a user-provided data pointer into a reference counted object.
*
* data->m.user_data field will initialized to wrap the provided $user_data
* pointer.
*
* $free_callback will be called on the same thread that released the last
* reference. If frame threading is used, make sure $free_callback is
* thread-safe.
*
* @param data Input context.
* @param user_data The user data to be wrapped.
* @param free_callback Function to be called when we release our last
* reference to this data. In this callback, $user_data
* will be the $user_data argument to this function, and
* $cookie will be the $cookie input argument to this
* function.
* @param cookie Opaque parameter passed to $free_callback.
*
* @return 0 on success. A negative errno value on error.
*/
DAV1D_API int dav1d_data_wrap_user_data(Dav1dData *data,
const uint8_t *user_data,
void (*free_callback)(const uint8_t *user_data,
void *cookie),
void *cookie);
/**
* Free the data reference.
*
* The reference count for data->m.user_data will be decremented (if it has been
* initialized with dav1d_data_wrap_user_data). The $data object will be memset
* to 0.
*
* @param data Input context.
*/
DAV1D_API void dav1d_data_unref(Dav1dData *data);
......
......@@ -311,8 +311,8 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
nasm_r = run_command(nasm, '-v')
out = nasm_r.stdout().strip().split()
if out[1].to_lower() == 'version'
if out[2].version_compare('<2.13')
error('nasm 2.13 or later is required, found nasm @0@'.format(out[2]))
if out[2].version_compare('<2.13.02')
error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
endif
else
error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
......
This diff is collapsed.
......@@ -32,18 +32,36 @@
#include "config.h"
#include "src/arm/asm.S"
.macro movrel rd, val
#if defined(PIC)
.macro movrel rd, val, offset=0
#if defined(PIC) && defined(__APPLE__)
ldr \rd, 1f
b 2f
1:
@ FIXME: thumb
.word \val - (2f + 8)
.word 3f - (2f + 8)
2:
ldr \rd, [pc, \rd]
.if \offset < 0
sub \rd, \rd, #-(\offset)
.elseif \offset > 0
add \rd, \rd, #\offset
.endif
.non_lazy_symbol_pointer
3:
.indirect_symbol \val
.word 0
.text
#elif defined(PIC)
ldr \rd, 1f
b 2f
1:
@ FIXME: thumb
.word \val + \offset - (2f + 8)
2:
add \rd, \rd, pc
#else
movw \rd, #:lower16:\val
movt \rd, #:upper16:\val
movw \rd, #:lower16:\val+\offset
movt \rd, #:upper16:\val+\offset
#endif
.endm
......
......@@ -546,58 +546,61 @@ endfunc
mla \d\wd, \s2\wd, v0.h[2]
mla \d\wd, \s3\wd, v0.h[3]
.endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
.macro mul_mla_8_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
mul \d0\().8h, \s0\().8h, v0.h[0]
mul \d1\().8h, \s1\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d1\().8h, \s2\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d1\().8h, \s3\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d1\().8h, \s4\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d1\().8h, \s5\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d1\().8h, \s6\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
mul \d1\().8h, \s1\().8h, v0.h[0]
mla \d1\().8h, \s2\().8h, v0.h[1]
mla \d1\().8h, \s3\().8h, v0.h[2]
mla \d1\().8h, \s4\().8h, v0.h[3]
mla \d1\().8h, \s5\().8h, v0.h[4]
mla \d1\().8h, \s6\().8h, v0.h[5]
mla \d1\().8h, \s7\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[7]
.endm
.macro mul_mla_8_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
mul \d0\().8h, \s0\().8h, v0.h[0]
mul \d1\().8h, \s2\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d1\().8h, \s3\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d1\().8h, \s4\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d1\().8h, \s5\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d1\().8h, \s6\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d1\().8h, \s7\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d1\().8h, \s8\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
mul \d1\().8h, \s2\().8h, v0.h[0]
mla \d1\().8h, \s3\().8h, v0.h[1]
mla \d1\().8h, \s4\().8h, v0.h[2]
mla \d1\().8h, \s5\().8h, v0.h[3]
mla \d1\().8h, \s6\().8h, v0.h[4]
mla \d1\().8h, \s7\().8h, v0.h[5]
mla \d1\().8h, \s8\().8h, v0.h[6]
mla \d1\().8h, \s9\().8h, v0.h[7]
.endm
.macro mul_mla_8_4 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11
mul \d0\().8h, \s0\().8h, v0.h[0]
mul \d1\().8h, \s4\().8h, v0.h[0]
mla \d0\().8h, \s1\().8h, v0.h[1]
mla \d1\().8h, \s5\().8h, v0.h[1]
mla \d0\().8h, \s2\().8h, v0.h[2]
mla \d1\().8h, \s6\().8h, v0.h[2]
mla \d0\().8h, \s3\().8h, v0.h[3]
mla \d1\().8h, \s7\().8h, v0.h[3]
mla \d0\().8h, \s4\().8h, v0.h[4]
mla \d1\().8h, \s8\().8h, v0.h[4]
mla \d0\().8h, \s5\().8h, v0.h[5]
mla \d1\().8h, \s9\().8h, v0.h[5]
mla \d0\().8h, \s6\().8h, v0.h[6]
mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d0\().8h, \s7\().8h, v0.h[7]
mul \d1\().8h, \s4\().8h, v0.h[0]
mla \d1\().8h, \s5\().8h, v0.h[1]
mla \d1\().8h, \s6\().8h, v0.h[2]
mla \d1\().8h, \s7\().8h, v0.h[3]
mla \d1\().8h, \s8\().8h, v0.h[4]
mla \d1\().8h, \s9\().8h, v0.h[5]
mla \d1\().8h, \s10\().8h, v0.h[6]
mla \d1\().8h, \s11\().8h, v0.h[7]
.endm
.macro sqrshrun_b shift, r0, r1, r2, r3
......@@ -628,7 +631,7 @@ endfunc
st1 {\reg\().h}[3], [x8], \strd
.endif
.endm
.macro st_s strd, r0, r1, r2, r3
.macro st_s strd, r0, r1
st1 {\r0\().s}[0], [x0], \strd
st1 {\r0\().s}[1], [x8], \strd
.ifnb \r1
......@@ -636,7 +639,7 @@ endfunc
st1 {\r1\().s}[1], [x8], \strd
.endif
.endm
.macro st_d strd, r0, r1, r2, r3
.macro st_d strd, r0, r1
st1 {\r0\().d}[0], [x0], \strd
st1 {\r0\().d}[1], [x8], \strd
.ifnb \r1
......@@ -644,13 +647,13 @@ endfunc
st1 {\r1\().d}[1], [x8], \strd
.endif
.endm
.macro shift_store_4 type, strd, r0, r1, r2, r3
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
sqrshrun_b 6, \r0, \r1, \r2, \r3
st_s \strd, \r0, \r1, \r2, \r3
sqrshrun_b 6, \r0, \r1
st_s \strd, \r0, \r1
.else
srshr_h 2, \r0, \r1, \r2, \r3
st_d \strd, \r0, \r1, \r2, \r3
srshr_h 2, \r0, \r1
st_d \strd, \r0, \r1
.endif
.endm
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
......@@ -742,7 +745,7 @@ function \type\()_8tap
L(\type\()_8tap_h):
cmp \w, #4
ubfm w9, \mx, #7, #13
ubfx w9, \mx, #7, #7
and \mx, \mx, #0x7f
b.le 4f
mov \mx, w9
......@@ -965,7 +968,7 @@ L(\type\()_8tap_h_tbl):
L(\type\()_8tap_v):
cmp \h, #4
ubfm w9, \my, #7, #13
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
b.le 4f
mov \my, w9
......@@ -1216,7 +1219,7 @@ L(\type\()_8tap_v):
160:
b.gt 1680b
// 16x4 v
// 16x2, 16x4 v
add \xmy, \xmy, #2
ld1 {v0.s}[0], [\xmy]
sub \src, \src, \s_strd
......@@ -1269,7 +1272,7 @@ L(\type\()_8tap_v_tbl):
L(\type\()_8tap_hv):
cmp \h, #4
ubfm w9, \my, #7, #13
ubfx w9, \my, #7, #7
and \my, \my, #0x7f
b.le 4f
mov \my, w9
......@@ -1304,21 +1307,19 @@ L(\type\()_8tap_hv):
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
addv h28, v28.4h
addv h29, v29.4h
trn1 v16.4h, v28.4h, v29.4h
srshr v16.4h, v16.4h, #2
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
trn1 v17.2s, v28.2s, v30.2s
mov v18.8b, v30.8b
mov v17.8b, v28.8b
2:
bl L(\type\()_8tap_filter_2)
trn1 v18.2s, v18.2s, v28.2s
trn1 v19.2s, v28.2s, v30.2s
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
......@@ -1332,7 +1333,6 @@ L(\type\()_8tap_hv):
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v19.8b
mov v18.8b, v30.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
......@@ -1352,28 +1352,24 @@ L(\type\()_8tap_hv):
ext v29.16b, v28.16b, v28.16b, #2
mul v28.4h, v28.4h, v0.4h
mul v29.4h, v29.4h, v0.4h
addv h28, v28.4h
addv h29, v29.4h
trn1 v16.4h, v28.4h, v29.4h
srshr v16.4h, v16.4h, #2
addp v28.4h, v28.4h, v29.4h
addp v16.4h, v28.4h, v28.4h
srshr v16.4h, v16.4h, #2
bl L(\type\()_8tap_filter_2)
trn1 v16.2s, v16.2s, v28.2s
trn1 v17.2s, v28.2s, v30.2s
mov v18.8b, v30.8b
mov v17.8b, v28.8b
bl L(\type\()_8tap_filter_2)
trn1 v18.2s, v18.2s, v28.2s
trn1 v19.2s, v28.2s, v30.2s
mov v20.8b, v30.8b
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
bl L(\type\()_8tap_filter_2)
trn1 v20.2s, v20.2s, v28.2s
trn1 v21.2s, v28.2s, v30.2s
mov v22.8b, v30.8b
ext v20.8b, v19.8b, v28.8b, #4
mov v21.8b, v28.8b
28:
bl L(\type\()_8tap_filter_2)
trn1 v22.2s, v22.2s, v28.2s
trn1 v23.2s, v28.2s, v30.2s
ext v22.8b, v21.8b, v28.8b, #4
mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
......@@ -1395,7 +1391,6 @@ L(\type\()_8tap_hv):
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v23.8b
mov v22.8b, v30.8b
b 28b
0:
......@@ -1417,7 +1412,6 @@ L(\type\()_8tap_filter_2):
mla v27.4h, v30.4h, v0.h[2]
mla v27.4h, v31.4h, v0.h[3]
srshr v28.4h, v27.4h, #2
trn2 v30.2s, v28.2s, v28.2s
ret
.endif
......@@ -1453,14 +1447,17 @@ L(\type\()_8tap_filter_2):
mov v18.8b, v29.8b
4:
smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
smull v3.4s, v17.4h, v1.h[0]
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v3.4s, v28.4h, v1.h[2]
smlal v2.4s, v28.4h, v1.h[3]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v28.4h, v1.h[2]
smlal v3.4s, v29.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
......@@ -1514,22 +1511,22 @@ L(\type\()_8tap_filter_2):
mov v22.8b, v29.8b
48:
smull v2.4s, v16.4h, v1.h[0]
bl L(\type\()_8tap_filter_4)
smull v3.4s, v17.4h, v1.h[0]
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[3]
smlal v2.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v3.4s, v28.4h, v1.h[6]
smlal v2.4s, v28.4h, v1.h[7]
smull v3.4s, v17.4h, v1.h[0]
smlal v3.4s, v18.4h, v1.h[1]
smlal v3.4s, v19.4h, v1.h[2]
smlal v3.4s, v20.4h, v1.h[3]
smlal v3.4s, v21.4h, v1.h[4]
smlal v3.4s, v22.4h, v1.h[5]
smlal v3.4s, v28.4h, v1.h[6]
smlal v3.4s, v29.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqrshrn v3.4h, v3.4s, #\shift_hv
......
......@@ -114,7 +114,9 @@ EXTERN\name:
#endif
.purgem endconst
.endm
#if !defined(__MACH__)
#if defined(_WIN32)
.section .rdata
#elif !defined(__MACH__)
.section .rodata
#else
.const_data
......
......@@ -66,7 +66,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8
#if ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
......@@ -88,7 +87,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon);
init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);
#endif
c->avg = dav1d_avg_8bpc_neon;
c->w_avg = dav1d_w_avg_8bpc_neon;
......
......@@ -27,6 +27,7 @@
#include "config.h"
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
......@@ -46,9 +47,7 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
if (!buf->ref) return NULL;
buf->data = buf->ref->const_data;
buf->sz = buf->m.size = sz;
buf->m.timestamp = INT64_MIN;
buf->m.duration = 0;
buf->m.offset = -1;
dav1d_data_props_set_defaults(&buf->m);
return buf->ref->data;
}
......@@ -56,24 +55,52 @@ uint8_t *dav1d_data_create_internal(Dav1dData *const buf, const size_t sz) {
int dav1d_data_wrap_internal(Dav1dData *const buf, const uint8_t *const ptr,
const size_t sz,
void (*const free_callback)(const uint8_t *data,
void *user_data),
void *const user_data)
void *cookie),
void *const cookie)
{
validate_input_or_ret(buf != NULL, -EINVAL);
validate_input_or_ret(ptr != NULL, -EINVAL);
validate_input_or_ret(free_callback != NULL, -EINVAL);
buf->ref = dav1d_ref_wrap(ptr, free_callback, user_data);
buf->ref = dav1d_ref_wrap(ptr, free_callback, cookie);
if (!buf->ref) return -ENOMEM;
buf->data = ptr;
buf->sz = buf->m.size = sz;
buf->m.timestamp = INT64_MIN;
buf->m.duration = 0;
buf->m.offset = -1;
dav1d_data_props_set_defaults(&buf->m);
return 0;
}
int dav1d_data_wrap_user_data_internal(Dav1dData *const buf,
const uint8_t *const user_data,
void (*const free_callback)(const uint8_t *user_data,
void *cookie),
void *const cookie)
{
validate_input_or_ret(buf != NULL, -EINVAL);
validate_input_or_ret(free_callback != NULL, -EINVAL);
buf->m.user_data.ref = dav1d_ref_wrap(user_data, free_callback, cookie);
if (!buf->m.user_data.ref) return -ENOMEM;
buf->m.user_data.data = user_data;
return 0;
}
void dav1d_data_ref(Dav1dData *const dst, const Dav1dData *const src) {
validate_input(dst != NULL);
validate_input(dst->data == NULL);
validate_input(src != NULL);
if (src->ref) {
validate_input(src->data != NULL);
dav1d_ref_inc(src->ref);
}
if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
*dst = *src;
}
void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
validate_input(dst != NULL);
validate_input(dst->data == NULL);
......@@ -86,12 +113,35 @@ void dav1d_data_move_ref(Dav1dData *const dst, Dav1dData *const src) {
memset(src, 0, sizeof(*src));
}
void dav1d_data_props_copy(Dav1dDataProps *const dst,
const Dav1dDataProps *const src)
{
assert(dst != NULL);
assert(src != NULL);
dav1d_ref_dec(&dst->user_data.ref);
*dst = *src;
if (dst->user_data.ref) dav1d_ref_inc(dst->user_data.ref);
}
void dav1d_data_props_set_defaults(Dav1dDataProps *const props) {
assert(props != NULL);
props->timestamp = INT64_MIN;
props->duration = 0;
props->offset = -1;
props->user_data.data = NULL;
props->user_data.ref = NULL;
}
void dav1d_data_unref_internal(Dav1dData *const buf) {
validate_input(buf != NULL);
struct Dav1dRef *user_data_ref = buf->m.user_data.ref;
if (buf->ref) {
validate_input(buf->data != NULL);
dav1d_ref_dec(&buf->ref);
}
memset(buf, 0, sizeof(*buf));
dav1d_ref_dec(&user_data_ref);
}
......@@ -30,16 +30,31 @@
#include "dav1d/data.h"
void dav1d_data_ref(Dav1dData *dst, const Dav1dData *src);
/**
* Move a data reference.
*/
void dav1d_data_move_ref(Dav1dData *dst, Dav1dData *src);
/**
* Copy the source properties to the destitionatin and increase the
* user_data's reference count (if it's not NULL).
*/
void dav1d_data_props_copy(Dav1dDataProps *dst, const Dav1dDataProps *src);
void dav1d_data_props_set_defaults(Dav1dDataProps *props);
uint8_t *dav1d_data_create_internal(Dav1dData *buf, size_t sz);
int dav1d_data_wrap_internal(Dav1dData *buf, const uint8_t *ptr, size_t sz,
void (*free_callback)(const uint8_t *data,
void *user_data),
void *user_data);
int dav1d_data_wrap_user_data_internal(Dav1dData *buf,
const uint8_t *user_data,
void (*free_callback)(const uint8_t *user_data,
void *cookie),
void *cookie);
void dav1d_data_unref_internal(Dav1dData *buf);
#endif /* __DAV1D_SRC_DATA_H__ */
......@@ -413,7 +413,7 @@ static void read_pal_plane(Dav1dTileContext *const t, Av1Block *const b,
// find reused cache entries
int i = 0;
for (int n = 0; n < n_cache && i < pal_sz; n++)
if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
if (msac_decode_bool_equi(&ts->msac))
used_cache[i++] = cache[n];
const int n_used_cache = i;
......@@ -477,13 +477,13 @@ static void read_pal_uv(Dav1dTileContext *const t, Av1Block *const b,
uint16_t *const pal = f->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] : t->pal[2];
if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) {
if (msac_decode_bool_equi(&ts->msac)) {
const int bits = f->cur.p.bpc - 4 + msac_decode_bools(&ts->msac, 2);
int prev = pal[0] = msac_decode_bools(&ts->msac, f->cur.p.bpc);
const int max = (1 << f->cur.p.bpc) - 1;
for (int i = 1; i < b->pal_sz[1]; i++) {
int delta = msac_decode_bools(&ts->msac, bits);
if (delta && msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta = -delta;
if (delta && msac_decode_bool_equi(&ts->msac)) delta = -delta;
prev = pal[i] = (prev + delta) & max;
}
} else {
......@@ -927,7 +927,7 @@ static int decode_b(Dav1dTileContext *const t,
delta_q = msac_decode_bools(&ts->msac, n_bits) + 1 + (1 << n_bits);
}
if (delta_q) {
if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB)) delta_q = -delta_q;
if (msac_decode_bool_equi(&ts->msac)) delta_q = -delta_q;
delta_q *= 1 << f->frame_hdr->delta.q.res_log2;
}
ts->last_qidx = iclip(ts->last_qidx + delta_q, 1, 255);
......@@ -949,7 +949,7 @@ static int decode_b(Dav1dTileContext *const t,
1 + (1 << n_bits);
}
if (delta_lf) {
if (msac_decode_bool(&ts->msac, EC_BOOL_EPROB))
if (msac_decode_bool_equi(&ts->msac))
delta_lf = -delta_lf;
delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
}
......@@ -1572,7 +1572,7 @@ static int decode_b(Dav1dTileContext *const t,
} else {
b->comp_type = COMP_INTER_SEG;
}
b->mask_sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
b->mask_sign = msac_decode_bool_equi(&ts->msac);
if (DEBUG_BLOCK_INFO)
printf("Post-seg/wedge[%d,wedge_idx=%d,sign=%d]: r=%d\n",
b->comp_type == COMP_INTER_WEDGE,
......@@ -2165,7 +2165,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
is_split = b->bl != bl;
} else {
is_split = msac_decode_bool(&t->ts->msac, gather_top_partition_prob(pc, bl) >> EC_PROB_SHIFT);
is_split = msac_decode_bool(&t->ts->msac, gather_top_partition_prob(pc, bl));
if (DEBUG_BLOCK_INFO)
printf("poc=%d,y=%d,x=%d,bl=%d,ctx=%d,bp=%d: r=%d\n",
f->frame_hdr->frame_offset, t->by, t->bx, bl, ctx,
......@@ -2193,7 +2193,7 @@ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
const Av1Block *const b = &f->frame_thread.b[t->by * f->b4_stride + t->bx];
is_split = b->bl != bl;
} else {
is_split = msac_decode_bool(&t->ts->msac, gather_left_partition_prob(pc, bl) >> EC_PROB_SHIFT);
is_split = msac_decode_bool(&t->ts->msac, gather_left_partition_prob(pc, bl));
if (f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I422 && !is_split)
return 1;
if (DEBUG_BLOCK_INFO)
......@@ -3172,6 +3172,12 @@ int dav1d_submit_frame(Dav1dContext *const c) {
}
// FIXME qsort so tiles are in order (for frame threading)
if (f->n_tile_data_alloc < c->n_tile_data) {
struct Dav1dTileGroup *tile = realloc(f->tile, c->n_tile_data * sizeof(*f->tile));
if (!tile) goto error;
f->tile = tile;
f->n_tile_data_alloc = c->n_tile_data;
}
memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
f->n_tile_data = c->n_tile_data;
......@@ -3180,19 +3186,13 @@ int dav1d_submit_frame(Dav1dContext *const c) {
// allocate frame
res = dav1d_thread_picture_alloc(&f->sr_cur, f->frame_hdr->width[1],
f->frame_hdr->height,
f->seq_hdr->layout, bpc,
f->seq_hdr, f->seq_hdr_ref,
f->frame_hdr, f->frame_hdr_ref,
bpc, &f->tile[0].data.m,
c->n_fc > 1 ? &f->frame_thread.td : NULL,
f->frame_hdr->show_frame, &c->allocator);
if (res < 0) goto error;
f->sr_cur.p.m = f->tile[0].data.m;
f->sr_cur.p.frame_hdr = f->frame_hdr;
f->sr_cur.p.frame_hdr_ref = f->frame_hdr_ref;
dav1d_ref_inc(f->frame_hdr_ref);
f->sr_cur.p.seq_hdr = f->seq_hdr;
f->sr_cur.p.seq_hdr_ref = f->seq_hdr_ref;
dav1d_ref_inc(f->seq_hdr_ref);
if (f->frame_hdr->super_res.enabled) {
res = dav1d_picture_alloc_copy(&f->cur, f->frame_hdr->width[0], &f->sr_cur.p);
if (res < 0) goto error;
......
......@@ -65,16 +65,19 @@ typedef struct Dav1dDSPContext {
Dav1dLoopRestorationDSPContext lr;
} Dav1dDSPContext;
struct Dav1dTileGroup {
Dav1dData data;
int start, end;
};
struct Dav1dContext {
Dav1dFrameContext *fc;
unsigned n_fc;
// cache of OBUs that make up a single frame before we submit them
// to a frame worker to be decoded
struct {
Dav1dData data;
int start, end;
} tile[256];
struct Dav1dTileGroup *tile;
int n_tile_data_alloc;
int n_tile_data;
int n_tiles;
Dav1dRef *seq_hdr_ref;
......@@ -139,10 +142,8 @@ struct Dav1dFrameContext {
unsigned refpoc[7], refrefpoc[7][7];
uint8_t gmv_warp_allowed[7];
CdfThreadContext in_cdf, out_cdf;
struct {
Dav1dData data;
int start, end;
} tile[256];
struct Dav1dTileGroup *tile;
int n_tile_data_alloc;
int n_tile_data;
// for scalable references
......
......@@ -81,8 +81,8 @@ enum IntraPredMode
const pixel *dst, ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *angle,
int tw, int th, pixel *topleft_out
HIGHBD_DECL_SUFFIX);
int tw, int th, int filter_edge,
pixel *topleft_out HIGHBD_DECL_SUFFIX);
// These flags are OR'd with the angle argument into intra predictors.
// ANGLE_USE_EDGE_FILTER_FLAG signals that edges should be convolved
......
......@@ -82,7 +82,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
const ptrdiff_t stride,
const pixel *prefilter_toplevel_sb_edge,
enum IntraPredMode mode, int *const angle,
const int tw, const int th,
const int tw, const int th, const int filter_edge,
pixel *const topleft_out HIGHBD_DECL_SUFFIX)
{
const int bitdepth = bitdepth_from_max(bitdepth_max);
......@@ -201,7 +201,7 @@ bytefn(dav1d_prepare_intra_edges)(const int x, const int have_left,
} else {
*topleft_out = have_top ? *dst_top : (1 << bitdepth) >> 1;
}
if (mode == Z2_PRED && tw + th >= 6)
if (mode == Z2_PRED && tw + th >= 6 && filter_edge)
*topleft_out = (topleft_out[-1] * 5 + topleft_out[0] * 6 +
topleft_out[1] * 5 + 8) >> 4;
}
......
......@@ -422,7 +422,7 @@ static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle < 90);
int dx = dav1d_dr_intra_derivative[angle];
int dx = dav1d_dr_intra_derivative[angle >> 1];
pixel top_out[(64 + 64) * 2];
const pixel *top;
int max_base_x;
......@@ -476,8 +476,8 @@ static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle > 90 && angle < 180);
int dy = dav1d_dr_intra_derivative[angle - 90];
int dx = dav1d_dr_intra_derivative[180 - angle];
int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1];
int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1];
const int upsample_left = enable_intra_edge_filter ?
get_upsample(width + height, 180 - angle, is_sm) : 0;
const int upsample_above = enable_intra_edge_filter ?
......@@ -557,7 +557,7 @@ static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
const int enable_intra_edge_filter = angle >> 10;
angle &= 511;
assert(angle > 180);
int dy = dav1d_dr_intra_derivative[270 - angle];
int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1];
pixel left_out[(64 + 64) * 2];
const pixel *left;
int max_base_y;
......
......@@ -473,6 +473,7 @@ void dav1d_close(Dav1dContext **const c_out) {
dav1d_free_aligned(f->tc);
dav1d_free_aligned(f->ipred_edge[0]);
free(f->a);
free(f->tile);
free(f->lf.mask);
free(f->lf.lr_mask);
free(f->lf.level);
......@@ -491,6 +492,7 @@ void dav1d_close(Dav1dContext **const c_out) {
}
for (int n = 0; n < c->n_tile_data; n++)
dav1d_data_unref_internal(&c->tile[n].data);
free(c->tile);
for (int n = 0; n < 8; n++) {
dav1d_cdf_thread_unref(&c->cdf[n]);
if (c->refs[n].p.p.data[0])
......@@ -521,6 +523,18 @@ int dav1d_data_wrap(Dav1dData *const buf, const uint8_t *const ptr,
return dav1d_data_wrap_internal(buf, ptr, sz, free_callback, user_data);
}
int dav1d_data_wrap_user_data(Dav1dData *const buf,
const uint8_t *const user_data,
void (*const free_callback)(const uint8_t *user_data,
void *cookie),
void *const cookie)
{
return dav1d_data_wrap_user_data_internal(buf,
user_data,
free_callback,
cookie);
}
void dav1d_data_unref(Dav1dData *const buf) {
dav1d_data_unref_internal(buf);
}
......@@ -446,11 +446,11 @@ static void selfguided_filter(coef *dst, const pixel *src,
const unsigned p = imax(a * n - b * b, 0);
const unsigned z = (p * s + (1 << 19)) >> 20;
const unsigned x = dav1d_sgr_x_by_x[imin(z, 255)];
const int x = dav1d_sgr_x_by_xplus1[imin(z, 255)];
// This is where we invert A and B, so that B is of size coef.
AA[i] = (((1U << 8) - x) * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = x;
AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
BB[i] = 256 - x;
}
AA += step * REST_UNIT_STRIDE;
BB += step * REST_UNIT_STRIDE;
......
......@@ -34,6 +34,7 @@
#include "src/msac.h"
#define EC_PROB_SHIFT 6
#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
#define EC_WIN_SIZE (sizeof(ec_win) << 3)
......@@ -91,6 +92,22 @@ unsigned msac_decode_symbol(MsacContext *const s, const uint16_t *const cdf,
return ret - 1;
}
unsigned msac_decode_bool_equi(MsacContext *const s) {
ec_win v, vw, dif = s->dif;
uint16_t r = s->rng;
unsigned ret;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
// When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
// replace the multiply with a simple shift.
v = ((r >> 8) << 7) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
v += ret*(r - 2*v);
ctx_norm(s, dif, v);
return !ret;
}
/* Decode a single binary value.
* f: The probability that the bit is one
* Return: The value decoded (0 or 1). */
......@@ -99,7 +116,7 @@ unsigned msac_decode_bool(MsacContext *const s, const unsigned f) {
uint16_t r = s->rng;
unsigned ret;
assert((dif >> (EC_WIN_SIZE - 16)) < r);
v = ((r >> 8) * f >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
vw = v << (EC_WIN_SIZE - 16);
ret = dif >= vw;
dif -= ret*vw;
......@@ -111,7 +128,7 @@ unsigned msac_decode_bool(MsacContext *const s, const unsigned f) {
unsigned msac_decode_bools(MsacContext *const c, const unsigned l) {
int v = 0;
for (int n = (int) l - 1; n >= 0; n--)
v = (v << 1) | msac_decode_bool(c, EC_BOOL_EPROB);
v = (v << 1) | msac_decode_bool_equi(c);
return v;
}
......@@ -122,7 +139,7 @@ int msac_decode_subexp(MsacContext *const c, const int ref,
int a = 0;
int b = k;
while ((2 << b) < n) {
if (!msac_decode_bool(c, EC_BOOL_EPROB)) break;
if (!msac_decode_bool_equi(c)) break;
b = k + i++;
a = (1 << b);
}
......@@ -137,7 +154,7 @@ int msac_decode_uniform(MsacContext *const c, const unsigned n) {
assert(l > 1);
const unsigned m = (1 << l) - n;
const unsigned v = msac_decode_bools(c, l - 1);
return v < m ? v : (v << 1) - m + msac_decode_bool(c, EC_BOOL_EPROB);
return v < m ? v : (v << 1) - m + msac_decode_bool_equi(c);
}
static void update_cdf(uint16_t *const cdf, const unsigned val,
......@@ -163,7 +180,7 @@ unsigned msac_decode_symbol_adapt(MsacContext *const c,
}
unsigned msac_decode_bool_adapt(MsacContext *const c, uint16_t *const cdf) {
const unsigned bit = msac_decode_bool(c, *cdf >> EC_PROB_SHIFT);
const unsigned bit = msac_decode_bool(c, *cdf);
if(c->allow_update_cdf){
// update_cdf() specialized for boolean CDFs
......
......@@ -43,14 +43,12 @@ typedef struct MsacContext {
int allow_update_cdf;
} MsacContext;
#define EC_PROB_SHIFT 6
#define EC_BOOL_EPROB 256
void msac_init(MsacContext *c, const uint8_t *data, size_t sz, int disable_cdf_update_flag);
unsigned msac_decode_symbol(MsacContext *s, const uint16_t *cdf,
const unsigned n_symbols);
unsigned msac_decode_symbol_adapt(MsacContext *s, uint16_t *cdf,
const unsigned n_symbols);
unsigned msac_decode_bool_equi(MsacContext *const s);
unsigned msac_decode_bool(MsacContext *s, unsigned f);
unsigned msac_decode_bool_adapt(MsacContext *s, uint16_t *cdf);
unsigned msac_decode_bools(MsacContext *c, unsigned l);
......
......@@ -221,7 +221,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
if (hdr->monochrome) {
hdr->color_range = dav1d_get_bits(gb, 1);
hdr->layout = DAV1D_PIXEL_LAYOUT_I400;
hdr->ss_hor = hdr->ss_ver = 0;
hdr->ss_hor = hdr->ss_ver = 1;
hdr->chr = DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = 0;
} else if (hdr->pri == DAV1D_COLOR_PRI_BT709 &&
......@@ -229,7 +229,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
hdr->mtrx == DAV1D_MC_IDENTITY)
{
hdr->layout = DAV1D_PIXEL_LAYOUT_I444;
hdr->ss_hor = hdr->ss_ver = 1;
hdr->ss_hor = hdr->ss_ver = 0;
hdr->color_range = 1;
if (hdr->profile != 1 && !(hdr->profile == 2 && hdr->hbd == 2))
goto error;
......@@ -258,8 +258,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
}
hdr->chr = hdr->ss_hor == 1 && hdr->ss_ver == 1 ?
dav1d_get_bits(gb, 2) : DAV1D_CHR_UNKNOWN;
hdr->separate_uv_delta_q = dav1d_get_bits(gb, 1);
}
hdr->separate_uv_delta_q = !hdr->monochrome && dav1d_get_bits(gb, 1);
#if DEBUG_SEQ_HDR
printf("SEQHDR: post-colorinfo: off=%ld\n",
dav1d_get_bits_pos(gb) - init_bit_pos);
......@@ -1311,7 +1311,14 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
case OBU_TILE_GRP: {
if (global) break;
if (!c->frame_hdr) goto error;
if (c->n_tile_data >= 256) goto error;
if (c->n_tile_data_alloc < c->n_tile_data + 1) {
if ((c->n_tile_data + 1) > INT_MAX / (int)sizeof(*c->tile)) goto error;
struct Dav1dTileGroup *tile = realloc(c->tile, (c->n_tile_data + 1) * sizeof(*c->tile));
if (!tile) goto error;
c->tile = tile;
memset(c->tile + c->n_tile_data, 0, sizeof(*c->tile));
c->n_tile_data_alloc = c->n_tile_data + 1;
}
parse_tile_hdr(c, &gb);
// Align to the next byte boundary and check for overrun.
dav1d_bytealign_get_bits(&gb);
......@@ -1323,10 +1330,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
const unsigned bit_pos = dav1d_get_bits_pos(&gb);
assert((bit_pos & 7) == 0);
assert(pkt_bytelen >= (bit_pos >> 3));
dav1d_ref_inc(in->ref);
c->tile[c->n_tile_data].data.ref = in->ref;
c->tile[c->n_tile_data].data.m = in->m;
c->tile[c->n_tile_data].data.data = in->data + (bit_pos >> 3);
dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
c->tile[c->n_tile_data].data.data += bit_pos >> 3;
c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
// ensure tile groups are in order and sane, see 6.10.1
if (c->tile[c->n_tile_data].start > c->tile[c->n_tile_data].end ||
......@@ -1359,7 +1364,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
if (c->n_fc == 1) {
dav1d_picture_ref(&c->out,
&c->refs[c->frame_hdr->existing_frame_idx].p.p);
c->out.m = in->m;
dav1d_data_props_copy(&c->out.m, &in->m);
} else {
// need to append this to the frame output queue
const unsigned next = c->frame_thread.next++;
......@@ -1383,7 +1388,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
dav1d_thread_picture_ref(out_delayed,
&c->refs[c->frame_hdr->existing_frame_idx].p);
out_delayed->visible = 1;
out_delayed->p.m = in->m;
dav1d_data_props_copy(&out_delayed->p.m, &in->m);
pthread_mutex_unlock(&f->frame_thread.td.lock);
}
if (c->refs[c->frame_hdr->existing_frame_idx].p.p.frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY) {
......
......@@ -99,8 +99,9 @@ static void free_buffer(const uint8_t *const data, void *const user_data) {
static int picture_alloc_with_edges(Dav1dPicture *const p,
const int w, const int h,
const enum Dav1dPixelLayout layout,
const int bpc,
Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
const int bpc, const Dav1dDataProps *props,
Dav1dPicAllocator *const p_allocator,
const size_t extra, void **const extra_ptr)
{
......@@ -117,11 +118,11 @@ static int picture_alloc_with_edges(Dav1dPicture *const p,
p->p.w = w;
p->p.h = h;
p->m.timestamp = INT64_MIN;
p->m.duration = 0;
p->m.offset = -1;
p->p.layout = layout;
p->seq_hdr = seq_hdr;
p->frame_hdr = frame_hdr;
p->p.layout = seq_hdr->layout;
p->p.bpc = bpc;
dav1d_data_props_set_defaults(&p->m);
int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
if (res < 0) {
free(pic_ctx);
......@@ -138,6 +139,14 @@ static int picture_alloc_with_edges(Dav1dPicture *const p,
return -ENOMEM;
}
p->seq_hdr_ref = seq_hdr_ref;
if (seq_hdr_ref) dav1d_ref_inc(seq_hdr_ref);
p->frame_hdr_ref = frame_hdr_ref;
if (frame_hdr_ref) dav1d_ref_inc(frame_hdr_ref);
dav1d_data_props_copy(&p->m, props);
if (extra && extra_ptr)
*extra_ptr = &pic_ctx->extra_ptr;
......@@ -146,14 +155,19 @@ static int picture_alloc_with_edges(Dav1dPicture *const p,
int dav1d_thread_picture_alloc(Dav1dThreadPicture *const p,
const int w, const int h,
const enum Dav1dPixelLayout layout, const int bpc,
Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
const int bpc, const Dav1dDataProps *props,
struct thread_data *const t, const int visible,
Dav1dPicAllocator *const p_allocator)
{
p->t = t;
const int res =
picture_alloc_with_edges(&p->p, w, h, layout, bpc, p_allocator,
picture_alloc_with_edges(&p->p, w, h,
seq_hdr, seq_hdr_ref,
frame_hdr, frame_hdr_ref,
bpc, props, p_allocator,
t != NULL ? sizeof(atomic_int) * 2 : 0,
(void **) &p->progress);
if (res) return res;
......@@ -170,22 +184,11 @@ int dav1d_picture_alloc_copy(Dav1dPicture *const dst, const int w,
const Dav1dPicture *const src)
{
struct pic_ctx_context *const pic_ctx = src->ref->user_data;
const int res = picture_alloc_with_edges(dst, w, src->p.h, src->p.layout,
src->p.bpc, &pic_ctx->allocator,
const int res = picture_alloc_with_edges(dst, w, src->p.h,
src->seq_hdr, src->seq_hdr_ref,
src->frame_hdr, src->frame_hdr_ref,
src->p.bpc, &src->m, &pic_ctx->allocator,
0, NULL);
if (!res) {
dst->p = src->p;
dst->m = src->m;
dst->p.w = w;
dst->frame_hdr = src->frame_hdr;
dst->frame_hdr_ref = src->frame_hdr_ref;
if (dst->frame_hdr_ref) dav1d_ref_inc(dst->frame_hdr_ref);
dst->seq_hdr = src->seq_hdr;
dst->seq_hdr_ref = src->seq_hdr_ref;
if (dst->seq_hdr_ref) dav1d_ref_inc(dst->seq_hdr_ref);
}
return res;
}
......@@ -199,6 +202,7 @@ void dav1d_picture_ref(Dav1dPicture *const dst, const Dav1dPicture *const src) {
dav1d_ref_inc(src->ref);
if (src->frame_hdr_ref) dav1d_ref_inc(src->frame_hdr_ref);
if (src->seq_hdr_ref) dav1d_ref_inc(src->seq_hdr_ref);
if (src->m.user_data.ref) dav1d_ref_inc(src->m.user_data.ref);
}
*dst = *src;
}
......@@ -232,6 +236,7 @@ void dav1d_picture_unref_internal(Dav1dPicture *const p) {
dav1d_ref_dec(&p->ref);
dav1d_ref_dec(&p->seq_hdr_ref);
dav1d_ref_dec(&p->frame_hdr_ref);
dav1d_ref_dec(&p->m.user_data.ref);
}
memset(p, 0, sizeof(*p));
}
......
......@@ -34,6 +34,7 @@
#include "dav1d/picture.h"
#include "src/thread_data.h"
#include "src/ref.h"
enum PlaneType {
PLANE_TYPE_Y,
......@@ -55,7 +56,9 @@ typedef struct Dav1dThreadPicture {
* Allocate a picture with custom border size.
*/
int dav1d_thread_picture_alloc(Dav1dThreadPicture *p, int w, int h,
enum Dav1dPixelLayout layout, int bpc,
Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref,
int bpc, const Dav1dDataProps *props,
struct thread_data *t, int visible,
Dav1dPicAllocator *);
......
......@@ -50,8 +50,8 @@ static unsigned read_golomb(MsacContext *const msac) {
int len = 0;
unsigned val = 1;
while (!msac_decode_bool(msac, EC_BOOL_EPROB) && len < 32) len++;
while (len--) val = (val << 1) | msac_decode_bool(msac, EC_BOOL_EPROB);
while (!msac_decode_bool_equi(msac) && len < 32) len++;
while (len--) val = (val << 1) | msac_decode_bool_equi(msac);
return val - 1;
}
......@@ -152,7 +152,7 @@ static int decode_coefs(Dav1dTileContext *const t,
unsigned mask = eob >> 1;
if (eob_hi_bit) eob |= mask;
for (mask >>= 1; mask; mask >>= 1) {
const int eob_bit = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
const int eob_bit = msac_decode_bool_equi(&ts->msac);
if (eob_bit) eob |= mask;
}
if (dbg)
......@@ -231,7 +231,7 @@ static int decode_coefs(Dav1dTileContext *const t,
dc_sign = sign ? 0 : 2;
dq = (dq_tbl[0] * qm_tbl[0] + 16) >> 5;
} else {
sign = msac_decode_bool(&ts->msac, EC_BOOL_EPROB);
sign = msac_decode_bool_equi(&ts->msac);
if (dbg)
printf("Post-sign[%d=%d=%d]: r=%d\n", i, rc, sign, ts->msac.rng);
dq = (dq_tbl[1] * qm_tbl[rc] + 16) >> 5;
......@@ -833,8 +833,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst,
f->cur.stride[0], top_sb_edge,
b->y_mode, &angle,
t_dim->w, t_dim->h, edge
HIGHBD_CALL_SUFFIX);
t_dim->w, t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
t_dim->w * 4, t_dim->h * 4,
angle | intra_flags,
......@@ -951,9 +952,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uv_dst[pl], stride,
top_sb_edge, DC_PRED, &angle,
uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->w, uv_t_dim->h, 0,
edge HIGHBD_CALL_SUFFIX);
dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
uv_t_dim->w * 4,
uv_t_dim->h * 4,
......@@ -1053,8 +1053,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
edge_flags, dst, stride,
top_sb_edge, uv_mode,
&angle, uv_t_dim->w,
uv_t_dim->h, edge
HIGHBD_CALL_SUFFIX);
uv_t_dim->h,
f->seq_hdr->intra_edge_filter,
edge HIGHBD_CALL_SUFFIX);
angle |= intra_edge_filter_flag;
dsp->ipred.intra_pred[m](dst, stride, edge,
uv_t_dim->w * 4,
......@@ -1216,7 +1217,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
t->by, t->by > ts->tiling.row_start,
ts->tiling.col_end, ts->tiling.row_end,
0, dst, f->cur.stride[0], top_sb_edge,
m, &angle, bw4, bh4, tl_edge
m, &angle, bw4, bh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
......@@ -1358,7 +1359,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize
ts->tiling.row_end >> ss_ver,
0, uvdst, f->cur.stride[1],
top_sb_edge, m,
&angle, cbw4, cbh4, tl_edge
&angle, cbw4, cbh4, 0, tl_edge
HIGHBD_CALL_SUFFIX);
dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
......
......@@ -45,7 +45,9 @@ Dav1dRef *dav1d_ref_create(const size_t size) {
res = dav1d_ref_wrap(data, default_free_callback, data);
if (!res) {
free(data);
dav1d_free_aligned(data);
} else {
res->data = data;
}
return res;
......@@ -58,8 +60,7 @@ Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
Dav1dRef *res = malloc(sizeof(Dav1dRef));
if (!res) return NULL;
if (ptr == user_data)
res->data = user_data;
res->data = NULL;
res->const_data = ptr;
atomic_init(&res->ref_cnt, 1);
res->free_callback = free_callback;
......@@ -86,5 +87,5 @@ void dav1d_ref_dec(Dav1dRef **const pref) {
}
int dav1d_ref_is_writable(Dav1dRef *const ref) {
return atomic_load(&ref->ref_cnt) == 1;
return atomic_load(&ref->ref_cnt) == 1 && ref->data;
}
......@@ -502,25 +502,25 @@ const int16_t dav1d_sgr_params[16][4] = { // r0, r1, e0, e1
{ 2, 0, 22, -1 },
};
const int dav1d_sgr_x_by_xplus1[256] = {
1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,