Skip to content

x86: Add AVX-512 (Ice Lake) asm for bidirectional mc functions

Henrik Gramner requested to merge gramner/dav1d:mc_bidir_avx512icl into master
avg_w4_8bpc_c: 71.5
avg_w4_8bpc_ssse3: 6.1
avg_w4_8bpc_avx2: 5.7
avg_w4_8bpc_avx512icl: 5.7

avg_w8_8bpc_c: 179.4
avg_w8_8bpc_ssse3: 10.2
avg_w8_8bpc_avx2: 7.4
avg_w8_8bpc_avx512icl: 6.4

avg_w16_8bpc_c: 188.5
avg_w16_8bpc_ssse3: 31.9
avg_w16_8bpc_avx2: 15.8
avg_w16_8bpc_avx512icl: 12.2

avg_w32_8bpc_c: 638.0
avg_w32_8bpc_ssse3: 99.5
avg_w32_8bpc_avx2: 51.9
avg_w32_8bpc_avx512icl: 39.1

avg_w64_8bpc_c: 1522.7
avg_w64_8bpc_ssse3: 204.9
avg_w64_8bpc_avx2: 120.6
avg_w64_8bpc_avx512icl: 86.6

avg_w128_8bpc_c: 3871.8
avg_w128_8bpc_ssse3: 703.8
avg_w128_8bpc_avx2: 454.2
avg_w128_8bpc_avx512icl: 366.2


w_avg_w4_8bpc_c: 86.7
w_avg_w4_8bpc_ssse3: 8.3
w_avg_w4_8bpc_avx2: 7.6
w_avg_w4_8bpc_avx512icl: 7.3

w_avg_w8_8bpc_c: 210.2
w_avg_w8_8bpc_ssse3: 17.8
w_avg_w8_8bpc_avx2: 10.1
w_avg_w8_8bpc_avx512icl: 9.2

w_avg_w16_8bpc_c: 450.5
w_avg_w16_8bpc_ssse3: 45.6
w_avg_w16_8bpc_avx2: 24.4
w_avg_w16_8bpc_avx512icl: 17.0

w_avg_w32_8bpc_c: 1812.9
w_avg_w32_8bpc_ssse3: 144.0
w_avg_w32_8bpc_avx2: 79.9
w_avg_w32_8bpc_avx512icl: 57.5

w_avg_w64_8bpc_c: 4597.1
w_avg_w64_8bpc_ssse3: 321.4
w_avg_w64_8bpc_avx2: 190.0
w_avg_w64_8bpc_avx512icl: 130.3

w_avg_w128_8bpc_c: 11309.4
w_avg_w128_8bpc_ssse3: 831.3
w_avg_w128_8bpc_avx2: 539.1
w_avg_w128_8bpc_avx512icl: 417.0


mask_w4_8bpc_c: 96.7
mask_w4_8bpc_ssse3: 13.6
mask_w4_8bpc_avx2: 13.4
mask_w4_8bpc_avx512icl: 12.2

mask_w8_8bpc_c: 239.4
mask_w8_8bpc_ssse3: 23.7
mask_w8_8bpc_avx2: 16.5
mask_w8_8bpc_avx512icl: 15.8

mask_w16_8bpc_c: 236.4
mask_w16_8bpc_ssse3: 72.5
mask_w16_8bpc_avx2: 35.2
mask_w16_8bpc_avx512icl: 26.3

mask_w32_8bpc_c: 836.1
mask_w32_8bpc_ssse3: 243.5
mask_w32_8bpc_avx2: 136.6
mask_w32_8bpc_avx512icl: 89.6

mask_w64_8bpc_c: 2013.1
mask_w64_8bpc_ssse3: 540.8
mask_w64_8bpc_avx2: 304.0
mask_w64_8bpc_avx512icl: 211.3

mask_w128_8bpc_c: 5253.6
mask_w128_8bpc_ssse3: 1368.4
mask_w128_8bpc_avx2: 770.1
mask_w128_8bpc_avx512icl: 563.2


w_mask_420_w4_8bpc_c: 129.0
w_mask_420_w4_8bpc_ssse3: 14.4
w_mask_420_w4_8bpc_avx2: 11.4
w_mask_420_w4_8bpc_avx512icl: 9.9

w_mask_420_w8_8bpc_c: 398.2
w_mask_420_w8_8bpc_ssse3: 33.9
w_mask_420_w8_8bpc_avx2: 19.6
w_mask_420_w8_8bpc_avx512icl: 13.6

w_mask_420_w16_8bpc_c: 1273.3
w_mask_420_w16_8bpc_ssse3: 92.1
w_mask_420_w16_8bpc_avx2: 49.7
w_mask_420_w16_8bpc_avx512icl: 34.8

w_mask_420_w32_8bpc_c: 5546.7
w_mask_420_w32_8bpc_ssse3: 345.6
w_mask_420_w32_8bpc_avx2: 179.2
w_mask_420_w32_8bpc_avx512icl: 131.0

w_mask_420_w64_8bpc_c: 13304.4
w_mask_420_w64_8bpc_ssse3: 804.3
w_mask_420_w64_8bpc_avx2: 413.9
w_mask_420_w64_8bpc_avx512icl: 312.7

w_mask_420_w128_8bpc_c: 33952.3
w_mask_420_w128_8bpc_ssse3: 2055.0
w_mask_420_w128_8bpc_avx2: 1018.3
w_mask_420_w128_8bpc_avx512icl: 809.0


w_mask_422_w4_8bpc_c: 130.2
w_mask_422_w4_8bpc_avx2: 11.4
w_mask_422_w4_8bpc_avx512icl: 9.3

w_mask_422_w8_8bpc_c: 398.8
w_mask_422_w8_8bpc_avx2: 19.1
w_mask_422_w8_8bpc_avx512icl: 14.1

w_mask_422_w16_8bpc_c: 1303.5
w_mask_422_w16_8bpc_avx2: 47.7
w_mask_422_w16_8bpc_avx512icl: 35.0

w_mask_422_w32_8bpc_c: 6084.1
w_mask_422_w32_8bpc_avx2: 174.5
w_mask_422_w32_8bpc_avx512icl: 131.5

w_mask_422_w64_8bpc_c: 14259.5
w_mask_422_w64_8bpc_avx2: 413.7
w_mask_422_w64_8bpc_avx512icl: 313.6

w_mask_422_w128_8bpc_c: 34475.7
w_mask_422_w128_8bpc_avx2: 1028.1
w_mask_422_w128_8bpc_avx512icl: 789.9


w_mask_444_w4_8bpc_c: 116.4
w_mask_444_w4_8bpc_avx2: 9.3
w_mask_444_w4_8bpc_avx512icl: 9.0

w_mask_444_w8_8bpc_c: 355.2
w_mask_444_w8_8bpc_avx2: 15.9
w_mask_444_w8_8bpc_avx512icl: 12.8

w_mask_444_w16_8bpc_c: 1156.7
w_mask_444_w16_8bpc_avx2: 47.2
w_mask_444_w16_8bpc_avx512icl: 32.3

w_mask_444_w32_8bpc_c: 5026.0
w_mask_444_w32_8bpc_avx2: 170.5
w_mask_444_w32_8bpc_avx512icl: 121.0

w_mask_444_w64_8bpc_c: 12353.4
w_mask_444_w64_8bpc_avx2: 390.2
w_mask_444_w64_8bpc_avx512icl: 289.5

w_mask_444_w128_8bpc_c: 31557.9
w_mask_444_w128_8bpc_avx2: 1109.1
w_mask_444_w128_8bpc_avx512icl: 724.3

Merge request reports