mc: Add 6-tap put/prep implementation to C code
This commit adds C implementations of 6-tap motion compensation filters
for the 4 of the 9 separable configurations that do not contain SHARP,
i.e. REGULAR_REGULAR, REGULAR_SMOOTH, SMOOTH_REGULAR and SMOOTH_SMOOTH.
Ideally this would have no impact on binary size, as by default meson
builds with -Dtrim_dsp=true, which uses dead code elimination to remove
the C code.
However, the allowed CPU instructions are SSE2 for x86 and NEON for arm;
the SSE2 mc routines were removed in 85c1639, while arm has never had
SIMD for any of the scaled functions, thus the C routines are always
called.
Also, on riscv and ppc architectures, there are *only* C implementations
and the 6-tap routines provide a meaningful speed-up when decoding the
600 frame Bosphorus_1080p_8bit.ivf file described in e51f437:
Before After Delta
SpacemiT K1 (riscv64) 137.534 114.116 -17.03%
Grouping routines by type and bitdepth, this commit gives the following
average C function improvements across all supported architectures:
x86_64 aarch64 riscv64
put_8bpc -19.47% -13.02% -17.10%
put_16bpc -17.67% -15.40% -18.24%
prep_8bpc -20.49% -14.29% -17.06%
prep_16bpc -20.39% -16.28% -16.34%
put_scaled_8bpc -23.17% -9.89% -16.85%
put_scaled_16bpc -14.00% -9.20% -22.03%
prep_scaled_8bpc -21.96% -26.17% -21.29%
prep_scaled_16bpc -17.63% -24.34% -21.27%
i7-1370P (x86_64) Old New Delta
mc_8tap_regular_w2_0_8bpc_c 148.8 73.3 -50.74%
mc_8tap_regular_w2_h_8bpc_c 234.9 138.8 -40.91%
mc_8tap_regular_w2_hv_8bpc_c 357.9 296.5 -17.16%
mc_8tap_regular_w2_v_8bpc_c 201.2 139.9 -30.47%
mc_8tap_regular_w4_0_8bpc_c 60.2 60.8 1.00%
mc_8tap_regular_w4_h_8bpc_c 280.8 233.6 -16.81%
mc_8tap_regular_w4_hv_8bpc_c 643.9 496.3 -22.92%
mc_8tap_regular_w4_v_8bpc_c 279.3 250.7 -10.24%
mc_8tap_regular_w8_0_8bpc_c 57.2 60.0 4.90%
mc_8tap_regular_w8_h_8bpc_c 633.1 483.3 -23.66%
mc_8tap_regular_w8_hv_8bpc_c 1229.1 981.6 -20.14%
mc_8tap_regular_w8_v_8bpc_c 680.8 502.3 -26.22%
mc_8tap_regular_w16_0_8bpc_c 73.4 74.9 2.04%
mc_8tap_regular_w16_h_8bpc_c 847.3 674.9 -20.35%
mc_8tap_regular_w16_hv_8bpc_c 1542.1 1128.0 -26.85%
mc_8tap_regular_w16_v_8bpc_c 864.5 693.7 -19.76%
mc_8tap_regular_w32_0_8bpc_c 98.4 92.7 -5.79%
mc_8tap_regular_w32_h_8bpc_c 2236.2 1717.5 -23.20%
mc_8tap_regular_w32_hv_8bpc_c 4150.4 3029.4 -27.01%
mc_8tap_regular_w32_v_8bpc_c 2265.0 1770.7 -21.82%
mc_8tap_regular_w64_0_8bpc_c 164.0 156.2 -4.76%
mc_8tap_regular_w64_h_8bpc_c 7175.5 5533.5 -22.88%
mc_8tap_regular_w64_hv_8bpc_c 13422.8 9825.5 -26.80%
mc_8tap_regular_w64_v_8bpc_c 7211.2 5612.9 -22.16%
mc_8tap_regular_w128_0_8bpc_c 308.7 309.7 0.32%
mc_8tap_regular_w128_h_8bpc_c 19191.0 14826.7 -22.74%
mc_8tap_regular_w128_hv_8bpc_c 36391.9 26199.9 -28.01%
mc_8tap_regular_w128_v_8bpc_c 19192.8 14976.0 -21.97%
mc_8tap_regular_w2_0_16bpc_c 94.0 76.6 -18.51%
mc_8tap_regular_w2_h_16bpc_c 156.0 125.6 -19.49%
mc_8tap_regular_w2_hv_16bpc_c 343.7 306.2 -10.91%
mc_8tap_regular_w2_v_16bpc_c 184.6 129.9 -29.63%
mc_8tap_regular_w4_0_16bpc_c 64.7 62.2 -3.86%
mc_8tap_regular_w4_h_16bpc_c 282.9 211.1 -25.38%
mc_8tap_regular_w4_hv_16bpc_c 669.1 506.3 -24.33%
mc_8tap_regular_w4_v_16bpc_c 280.8 212.8 -24.22%
mc_8tap_regular_w8_0_16bpc_c 62.0 61.0 -1.61%
mc_8tap_regular_w8_h_16bpc_c 534.0 393.1 -26.39%
mc_8tap_regular_w8_hv_16bpc_c 1071.7 814.9 -23.96%
mc_8tap_regular_w8_v_16bpc_c 527.8 390.9 -25.94%
mc_8tap_regular_w16_0_16bpc_c 73.5 74.6 1.50%
mc_8tap_regular_w16_h_16bpc_c 1707.6 1361.5 -20.27%
mc_8tap_regular_w16_hv_16bpc_c 2518.9 1852.3 -26.46%
mc_8tap_regular_w16_v_16bpc_c 1751.4 1391.7 -20.54%
mc_8tap_regular_w32_0_16bpc_c 101.3 102.2 0.89%
mc_8tap_regular_w32_h_16bpc_c 5128.2 3983.2 -22.33%
mc_8tap_regular_w32_hv_16bpc_c 6448.2 5113.1 -20.71%
mc_8tap_regular_w32_v_16bpc_c 5143.6 4046.7 -21.33%
mc_8tap_regular_w64_0_16bpc_c 232.8 231.0 -0.77%
mc_8tap_regular_w64_h_16bpc_c 17493.3 13606.5 -22.22%
mc_8tap_regular_w64_hv_16bpc_c 20834.8 16578.6 -20.43%
mc_8tap_regular_w64_v_16bpc_c 17452.5 13747.8 -21.23%
mc_8tap_regular_w128_0_16bpc_c 826.0 824.5 -0.18%
mc_8tap_regular_w128_h_16bpc_c 48273.4 37986.2 -21.31%
mc_8tap_regular_w128_hv_16bpc_c 58284.0 44935.2 -22.90%
mc_8tap_regular_w128_v_16bpc_c 48847.3 38000.0 -22.21%
Ampere Altra (aarch64) Old New Delta
mc_8tap_regular_w2_0_8bpc_c 193.3 205.2 6.16%
mc_8tap_regular_w2_h_8bpc_c 338.0 303.2 -10.30%
mc_8tap_regular_w2_hv_8bpc_c 781.8 636.3 -18.61%
mc_8tap_regular_w2_v_8bpc_c 511.3 381.9 -25.31%
mc_8tap_regular_w4_0_8bpc_c 178.9 191.1 6.82%
mc_8tap_regular_w4_h_8bpc_c 592.7 504.9 -14.81%
mc_8tap_regular_w4_hv_8bpc_c 1378.0 1088.7 -20.99%
mc_8tap_regular_w4_v_8bpc_c 745.4 573.9 -23.01%
mc_8tap_regular_w8_0_8bpc_c 166.5 178.6 7.27%
mc_8tap_regular_w8_h_8bpc_c 487.0 414.1 -14.97%
mc_8tap_regular_w8_hv_8bpc_c 1026.8 762.5 -25.74%
mc_8tap_regular_w8_v_8bpc_c 608.8 487.7 -19.89%
mc_8tap_regular_w16_0_8bpc_c 216.9 214.4 -1.15%
mc_8tap_regular_w16_h_8bpc_c 1196.5 985.3 -17.65%
mc_8tap_regular_w16_hv_8bpc_c 2332.3 1886.0 -19.14%
mc_8tap_regular_w16_v_8bpc_c 1389.5 1103.5 -20.58%
mc_8tap_regular_w32_0_8bpc_c 333.2 331.0 -0.66%
mc_8tap_regular_w32_h_8bpc_c 3440.8 2818.4 -18.09%
mc_8tap_regular_w32_hv_8bpc_c 6624.7 5315.8 -19.76%
mc_8tap_regular_w32_v_8bpc_c 3657.9 2965.0 -18.94%
mc_8tap_regular_w64_0_8bpc_c 559.5 608.2 8.70%
mc_8tap_regular_w64_h_8bpc_c 11420.5 9437.5 -17.36%
mc_8tap_regular_w64_hv_8bpc_c 21796.9 17450.6 -19.94%
mc_8tap_regular_w64_v_8bpc_c 11721.0 9673.7 -17.47%
mc_8tap_regular_w128_0_8bpc_c 891.6 927.0 3.97%
mc_8tap_regular_w128_h_8bpc_c 31319.1 26020.0 -16.92%
mc_8tap_regular_w128_hv_8bpc_c 59354.8 47634.9 -19.75%
mc_8tap_regular_w128_v_8bpc_c 31375.4 26229.2 -16.40%
mc_8tap_regular_w2_0_16bpc_c 178.3 181.3 1.68%
mc_8tap_regular_w2_h_16bpc_c 338.2 297.5 -12.03%
mc_8tap_regular_w2_hv_16bpc_c 762.5 647.7 -15.06%
mc_8tap_regular_w2_v_16bpc_c 502.3 385.2 -23.31%
mc_8tap_regular_w4_0_16bpc_c 165.9 169.2 1.99%
mc_8tap_regular_w4_h_16bpc_c 583.8 483.6 -17.16%
mc_8tap_regular_w4_hv_16bpc_c 991.5 796.2 -19.70%
mc_8tap_regular_w4_v_16bpc_c 734.4 573.1 -21.96%
mc_8tap_regular_w8_0_16bpc_c 157.3 158.8 0.95%
mc_8tap_regular_w8_h_16bpc_c 1061.9 871.9 -17.89%
mc_8tap_regular_w8_hv_16bpc_c 1137.0 876.9 -22.88%
mc_8tap_regular_w8_v_16bpc_c 1207.7 934.1 -22.65%
mc_8tap_regular_w16_0_16bpc_c 216.9 218.4 0.69%
mc_8tap_regular_w16_h_16bpc_c 2097.6 1622.1 -22.67%
mc_8tap_regular_w16_hv_16bpc_c 2824.9 2225.3 -21.23%
mc_8tap_regular_w16_v_16bpc_c 2206.7 1747.2 -20.82%
mc_8tap_regular_w32_0_16bpc_c 322.6 325.5 0.90%
mc_8tap_regular_w32_h_16bpc_c 5713.5 4460.0 -21.94%
mc_8tap_regular_w32_hv_16bpc_c 8022.0 6335.3 -21.03%
mc_8tap_regular_w32_v_16bpc_c 5801.9 4671.3 -19.49%
mc_8tap_regular_w64_0_16bpc_c 656.4 639.4 -2.59%
mc_8tap_regular_w64_h_16bpc_c 18763.1 14743.3 -21.42%
mc_8tap_regular_w64_hv_16bpc_c 26631.8 21037.7 -21.01%
mc_8tap_regular_w64_v_16bpc_c 18720.0 15117.6 -19.24%
mc_8tap_regular_w128_0_16bpc_c 2020.9 1773.6 -12.24%
mc_8tap_regular_w128_h_16bpc_c 50882.4 40092.6 -21.21%
mc_8tap_regular_w128_hv_16bpc_c 72142.1 57118.5 -20.83%
mc_8tap_regular_w128_v_16bpc_c 50191.8 40619.5 -19.07%
SpacemiT K1 (riscv64) Old New Delta
mc_8tap_regular_w2_0_8bpc_c 339.0 327.1 -3.51%
mc_8tap_regular_w2_h_8bpc_c 668.4 531.7 -20.45%
mc_8tap_regular_w2_hv_8bpc_c 1397.3 1091.4 -21.89%
mc_8tap_regular_w2_v_8bpc_c 720.2 565.5 -21.48%
mc_8tap_regular_w4_0_8bpc_c 406.1 392.6 -3.32%
mc_8tap_regular_w4_h_8bpc_c 1251.5 988.6 -21.01%
mc_8tap_regular_w4_hv_8bpc_c 2562.9 2025.1 -20.98%
mc_8tap_regular_w4_v_8bpc_c 1321.1 1038.7 -21.38%
mc_8tap_regular_w8_0_8bpc_c 536.0 524.1 -2.22%
mc_8tap_regular_w8_h_8bpc_c 2425.5 1901.5 -21.60%
mc_8tap_regular_w8_hv_8bpc_c 4890.9 3824.4 -21.81%
mc_8tap_regular_w8_v_8bpc_c 2525.3 1985.1 -21.39%
mc_8tap_regular_w16_0_8bpc_c 805.1 786.4 -2.32%
mc_8tap_regular_w16_h_8bpc_c 6838.5 5333.5 -22.01%
mc_8tap_regular_w16_hv_8bpc_c 12967.8 9720.8 -25.04%
mc_8tap_regular_w16_v_8bpc_c 7067.9 5544.0 -21.56%
mc_8tap_regular_w32_0_8bpc_c 1430.7 1408.1 -1.58%
mc_8tap_regular_w32_h_8bpc_c 21615.9 16844.3 -22.07%
mc_8tap_regular_w32_hv_8bpc_c 39207.8 29300.1 -25.27%
mc_8tap_regular_w32_v_8bpc_c 22253.5 17475.4 -21.47%
mc_8tap_regular_w64_0_8bpc_c 3228.9 3158.8 -2.17%
mc_8tap_regular_w64_h_8bpc_c 76824.3 59906.2 -22.02%
mc_8tap_regular_w64_hv_8bpc_c 133618.6 99802.2 -25.31%
mc_8tap_regular_w64_v_8bpc_c 79020.2 62041.2 -21.49%
mc_8tap_regular_w128_0_8bpc_c 5759.5 5975.1 3.74%
mc_8tap_regular_w128_h_8bpc_c 217395.3 169473.8 -22.04%
mc_8tap_regular_w128_hv_8bpc_c 371760.4 276004.4 -25.76%
mc_8tap_regular_w128_v_8bpc_c 223748.4 175879.6 -21.39%
mc_8tap_regular_w2_0_16bpc_c 421.7 395.4 -6.24%
mc_8tap_regular_w2_h_16bpc_c 665.6 535.8 -19.50%
mc_8tap_regular_w2_hv_16bpc_c 1418.2 1099.0 -22.51%
mc_8tap_regular_w2_v_16bpc_c 732.8 546.5 -25.42%
mc_8tap_regular_w4_0_16bpc_c 553.0 526.3 -4.83%
mc_8tap_regular_w4_h_16bpc_c 1236.8 976.6 -21.04%
mc_8tap_regular_w4_hv_16bpc_c 2566.5 2015.4 -21.47%
mc_8tap_regular_w4_v_16bpc_c 1319.7 1003.6 -23.95%
mc_8tap_regular_w8_0_16bpc_c 601.8 553.3 -8.06%
mc_8tap_regular_w8_h_16bpc_c 2373.3 1852.4 -21.95%
mc_8tap_regular_w8_hv_16bpc_c 4858.5 3895.7 -19.82%
mc_8tap_regular_w8_v_16bpc_c 2493.8 1917.0 -23.13%
mc_8tap_regular_w16_0_16bpc_c 931.4 873.1 -6.26%
mc_8tap_regular_w16_h_16bpc_c 6688.5 5192.5 -22.37%
mc_8tap_regular_w16_hv_16bpc_c 12856.3 9800.0 -23.77%
mc_8tap_regular_w16_v_16bpc_c 6962.1 5403.7 -22.38%
mc_8tap_regular_w32_0_16bpc_c 1857.1 1779.0 -4.21%
mc_8tap_regular_w32_h_16bpc_c 21269.9 16506.1 -22.40%
mc_8tap_regular_w32_hv_16bpc_c 38936.2 29558.7 -24.08%
mc_8tap_regular_w32_v_16bpc_c 21925.7 17107.1 -21.98%
mc_8tap_regular_w64_0_16bpc_c 4052.2 3932.4 -2.96%
mc_8tap_regular_w64_h_16bpc_c 75947.2 59017.3 -22.29%
mc_8tap_regular_w64_hv_16bpc_c 132996.0 100723.7 -24.27%
mc_8tap_regular_w64_v_16bpc_c 78595.3 61668.7 -21.54%
mc_8tap_regular_w128_0_16bpc_c 9566.2 9025.9 -5.65%
mc_8tap_regular_w128_h_16bpc_c 213355.7 165274.8 -22.54%
mc_8tap_regular_w128_hv_16bpc_c 367353.0 276373.1 -24.77%
mc_8tap_regular_w128_v_16bpc_c 221746.3 174176.8 -21.45%
Edited by Nathan E. Egge