Commit bf659082, authored by Liwei Wang, committed by Henrik Gramner

Add SSSE3 implementation for the 4x16 and 16x4 blocks in itx

Cycle times:
inv_txfm_add_4x16_adst_adst_0_8bpc_c: 2203.6
inv_txfm_add_4x16_adst_adst_0_8bpc_ssse3: 198.7
inv_txfm_add_4x16_adst_adst_1_8bpc_c: 2235.1
inv_txfm_add_4x16_adst_adst_1_8bpc_ssse3: 199.7
inv_txfm_add_4x16_adst_adst_2_8bpc_c: 2199.1
inv_txfm_add_4x16_adst_adst_2_8bpc_ssse3: 199.9
inv_txfm_add_4x16_adst_dct_0_8bpc_c: 2272.4
inv_txfm_add_4x16_adst_dct_0_8bpc_ssse3: 50.0
inv_txfm_add_4x16_adst_dct_1_8bpc_c: 2281.6
inv_txfm_add_4x16_adst_dct_1_8bpc_ssse3: 163.7
inv_txfm_add_4x16_adst_dct_2_8bpc_c: 2262.5
inv_txfm_add_4x16_adst_dct_2_8bpc_ssse3: 164.7
inv_txfm_add_4x16_adst_flipadst_0_8bpc_c: 2456.5
inv_txfm_add_4x16_adst_flipadst_0_8bpc_ssse3: 204.3
inv_txfm_add_4x16_adst_flipadst_1_8bpc_c: 2349.1
inv_txfm_add_4x16_adst_flipadst_1_8bpc_ssse3: 198.5
inv_txfm_add_4x16_adst_flipadst_2_8bpc_c: 2241.5
inv_txfm_add_4x16_adst_flipadst_2_8bpc_ssse3: 198.7
inv_txfm_add_4x16_adst_identity_0_8bpc_c: 1574.7
inv_txfm_add_4x16_adst_identity_0_8bpc_ssse3: 117.0
inv_txfm_add_4x16_adst_identity_1_8bpc_c: 1576.3
inv_txfm_add_4x16_adst_identity_1_8bpc_ssse3: 116.6
inv_txfm_add_4x16_adst_identity_2_8bpc_c: 1572.9
inv_txfm_add_4x16_adst_identity_2_8bpc_ssse3: 116.7
inv_txfm_add_4x16_dct_adst_0_8bpc_c: 2162.8
inv_txfm_add_4x16_dct_adst_0_8bpc_ssse3: 187.6
inv_txfm_add_4x16_dct_adst_1_8bpc_c: 2180.4
inv_txfm_add_4x16_dct_adst_1_8bpc_ssse3: 185.6
inv_txfm_add_4x16_dct_adst_2_8bpc_c: 2165.1
inv_txfm_add_4x16_dct_adst_2_8bpc_ssse3: 184.9
inv_txfm_add_4x16_dct_dct_0_8bpc_c: 2233.7
inv_txfm_add_4x16_dct_dct_0_8bpc_ssse3: 49.5
inv_txfm_add_4x16_dct_dct_1_8bpc_c: 2770.4
inv_txfm_add_4x16_dct_dct_1_8bpc_ssse3: 148.4
inv_txfm_add_4x16_dct_dct_2_8bpc_c: 2288.7
inv_txfm_add_4x16_dct_dct_2_8bpc_ssse3: 149.0
inv_txfm_add_4x16_dct_flipadst_0_8bpc_c: 2242.0
inv_txfm_add_4x16_dct_flipadst_0_8bpc_ssse3: 185.8
inv_txfm_add_4x16_dct_flipadst_1_8bpc_c: 2249.6
inv_txfm_add_4x16_dct_flipadst_1_8bpc_ssse3: 188.4
inv_txfm_add_4x16_dct_flipadst_2_8bpc_c: 2237.3
inv_txfm_add_4x16_dct_flipadst_2_8bpc_ssse3: 185.1
inv_txfm_add_4x16_dct_identity_0_8bpc_c: 1532.3
inv_txfm_add_4x16_dct_identity_0_8bpc_ssse3: 63.7
inv_txfm_add_4x16_dct_identity_1_8bpc_c: 1534.5
inv_txfm_add_4x16_dct_identity_1_8bpc_ssse3: 63.6
inv_txfm_add_4x16_dct_identity_2_8bpc_c: 1548.1
inv_txfm_add_4x16_dct_identity_2_8bpc_ssse3: 101.6
inv_txfm_add_4x16_flipadst_adst_0_8bpc_c: 2205.2
inv_txfm_add_4x16_flipadst_adst_0_8bpc_ssse3: 201.6
inv_txfm_add_4x16_flipadst_adst_1_8bpc_c: 2222.0
inv_txfm_add_4x16_flipadst_adst_1_8bpc_ssse3: 202.6
inv_txfm_add_4x16_flipadst_adst_2_8bpc_c: 2205.2
inv_txfm_add_4x16_flipadst_adst_2_8bpc_ssse3: 205.7
inv_txfm_add_4x16_flipadst_dct_0_8bpc_c: 2294.9
inv_txfm_add_4x16_flipadst_dct_0_8bpc_ssse3: 50.0
inv_txfm_add_4x16_flipadst_dct_1_8bpc_c: 2304.2
inv_txfm_add_4x16_flipadst_dct_1_8bpc_ssse3: 164.5
inv_txfm_add_4x16_flipadst_dct_2_8bpc_c: 2292.7
inv_txfm_add_4x16_flipadst_dct_2_8bpc_ssse3: 164.5
inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_c: 2281.3
inv_txfm_add_4x16_flipadst_flipadst_0_8bpc_ssse3: 202.9
inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_c: 2258.7
inv_txfm_add_4x16_flipadst_flipadst_1_8bpc_ssse3: 202.4
inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_c: 2261.0
inv_txfm_add_4x16_flipadst_flipadst_2_8bpc_ssse3: 201.3
inv_txfm_add_4x16_flipadst_identity_0_8bpc_c: 1580.5
inv_txfm_add_4x16_flipadst_identity_0_8bpc_ssse3: 116.1
inv_txfm_add_4x16_flipadst_identity_1_8bpc_c: 1578.7
inv_txfm_add_4x16_flipadst_identity_1_8bpc_ssse3: 116.7
inv_txfm_add_4x16_flipadst_identity_2_8bpc_c: 1590.8
inv_txfm_add_4x16_flipadst_identity_2_8bpc_ssse3: 117.4
inv_txfm_add_4x16_identity_adst_0_8bpc_c: 1949.0
inv_txfm_add_4x16_identity_adst_0_8bpc_ssse3: 170.9
inv_txfm_add_4x16_identity_adst_1_8bpc_c: 1947.4
inv_txfm_add_4x16_identity_adst_1_8bpc_ssse3: 171.0
inv_txfm_add_4x16_identity_adst_2_8bpc_c: 1948.7
inv_txfm_add_4x16_identity_adst_2_8bpc_ssse3: 170.3
inv_txfm_add_4x16_identity_dct_0_8bpc_c: 2022.3
inv_txfm_add_4x16_identity_dct_0_8bpc_ssse3: 59.2
inv_txfm_add_4x16_identity_dct_1_8bpc_c: 2020.8
inv_txfm_add_4x16_identity_dct_1_8bpc_ssse3: 133.7
inv_txfm_add_4x16_identity_dct_2_8bpc_c: 2020.2
inv_txfm_add_4x16_identity_dct_2_8bpc_ssse3: 133.2
inv_txfm_add_4x16_identity_flipadst_0_8bpc_c: 2024.7
inv_txfm_add_4x16_identity_flipadst_0_8bpc_ssse3: 170.3
inv_txfm_add_4x16_identity_flipadst_1_8bpc_c: 2021.8
inv_txfm_add_4x16_identity_flipadst_1_8bpc_ssse3: 170.0
inv_txfm_add_4x16_identity_flipadst_2_8bpc_c: 2022.5
inv_txfm_add_4x16_identity_flipadst_2_8bpc_ssse3: 169.9
inv_txfm_add_4x16_identity_identity_0_8bpc_c: 1328.4
inv_txfm_add_4x16_identity_identity_0_8bpc_ssse3: 87.7
inv_txfm_add_4x16_identity_identity_1_8bpc_c: 1330.9
inv_txfm_add_4x16_identity_identity_1_8bpc_ssse3: 87.7
inv_txfm_add_4x16_identity_identity_2_8bpc_c: 1327.3
inv_txfm_add_4x16_identity_identity_2_8bpc_ssse3: 87.6
inv_txfm_add_16x4_adst_adst_0_8bpc_c: 2166.3
inv_txfm_add_16x4_adst_adst_0_8bpc_ssse3: 186.3
inv_txfm_add_16x4_adst_adst_1_8bpc_c: 2166.9
inv_txfm_add_16x4_adst_adst_1_8bpc_ssse3: 184.9
inv_txfm_add_16x4_adst_adst_2_8bpc_c: 2167.2
inv_txfm_add_16x4_adst_adst_2_8bpc_ssse3: 185.2
inv_txfm_add_16x4_adst_dct_0_8bpc_c: 2123.2
inv_txfm_add_16x4_adst_dct_0_8bpc_ssse3: 172.1
inv_txfm_add_16x4_adst_dct_1_8bpc_c: 2124.2
inv_txfm_add_16x4_adst_dct_1_8bpc_ssse3: 171.2
inv_txfm_add_16x4_adst_dct_2_8bpc_c: 2122.8
inv_txfm_add_16x4_adst_dct_2_8bpc_ssse3: 171.8
inv_txfm_add_16x4_adst_flipadst_0_8bpc_c: 2213.3
inv_txfm_add_16x4_adst_flipadst_0_8bpc_ssse3: 189.6
inv_txfm_add_16x4_adst_flipadst_1_8bpc_c: 2227.7
inv_txfm_add_16x4_adst_flipadst_1_8bpc_ssse3: 188.4
inv_txfm_add_16x4_adst_flipadst_2_8bpc_c: 2228.5
inv_txfm_add_16x4_adst_flipadst_2_8bpc_ssse3: 188.4
inv_txfm_add_16x4_adst_identity_0_8bpc_c: 1906.7
inv_txfm_add_16x4_adst_identity_0_8bpc_ssse3: 154.3
inv_txfm_add_16x4_adst_identity_1_8bpc_c: 1905.2
inv_txfm_add_16x4_adst_identity_1_8bpc_ssse3: 155.6
inv_txfm_add_16x4_adst_identity_2_8bpc_c: 1905.6
inv_txfm_add_16x4_adst_identity_2_8bpc_ssse3: 156.3
inv_txfm_add_16x4_dct_adst_0_8bpc_c: 2209.8
inv_txfm_add_16x4_dct_adst_0_8bpc_ssse3: 37.4
inv_txfm_add_16x4_dct_adst_1_8bpc_c: 2209.8
inv_txfm_add_16x4_dct_adst_1_8bpc_ssse3: 157.9
inv_txfm_add_16x4_dct_adst_2_8bpc_c: 2221.1
inv_txfm_add_16x4_dct_adst_2_8bpc_ssse3: 158.5
inv_txfm_add_16x4_dct_dct_0_8bpc_c: 2177.5
inv_txfm_add_16x4_dct_dct_0_8bpc_ssse3: 29.6
inv_txfm_add_16x4_dct_dct_1_8bpc_c: 2179.3
inv_txfm_add_16x4_dct_dct_1_8bpc_ssse3: 144.9
inv_txfm_add_16x4_dct_dct_2_8bpc_c: 2177.8
inv_txfm_add_16x4_dct_dct_2_8bpc_ssse3: 143.7
inv_txfm_add_16x4_dct_flipadst_0_8bpc_c: 2293.6
inv_txfm_add_16x4_dct_flipadst_0_8bpc_ssse3: 38.3
inv_txfm_add_16x4_dct_flipadst_1_8bpc_c: 2293.2
inv_txfm_add_16x4_dct_flipadst_1_8bpc_ssse3: 163.9
inv_txfm_add_16x4_dct_flipadst_2_8bpc_c: 2301.3
inv_txfm_add_16x4_dct_flipadst_2_8bpc_ssse3: 163.7
inv_txfm_add_16x4_dct_identity_0_8bpc_c: 1977.7
inv_txfm_add_16x4_dct_identity_0_8bpc_ssse3: 39.9
inv_txfm_add_16x4_dct_identity_1_8bpc_c: 1978.7
inv_txfm_add_16x4_dct_identity_1_8bpc_ssse3: 126.8
inv_txfm_add_16x4_dct_identity_2_8bpc_c: 1979.5
inv_txfm_add_16x4_dct_identity_2_8bpc_ssse3: 128.1
inv_txfm_add_16x4_flipadst_adst_0_8bpc_c: 2175.6
inv_txfm_add_16x4_flipadst_adst_0_8bpc_ssse3: 185.1
inv_txfm_add_16x4_flipadst_adst_1_8bpc_c: 2175.7
inv_txfm_add_16x4_flipadst_adst_1_8bpc_ssse3: 185.7
inv_txfm_add_16x4_flipadst_adst_2_8bpc_c: 2173.1
inv_txfm_add_16x4_flipadst_adst_2_8bpc_ssse3: 185.0
inv_txfm_add_16x4_flipadst_dct_0_8bpc_c: 2140.5
inv_txfm_add_16x4_flipadst_dct_0_8bpc_ssse3: 172.0
inv_txfm_add_16x4_flipadst_dct_1_8bpc_c: 2147.5
inv_txfm_add_16x4_flipadst_dct_1_8bpc_ssse3: 171.9
inv_txfm_add_16x4_flipadst_dct_2_8bpc_c: 2148.5
inv_txfm_add_16x4_flipadst_dct_2_8bpc_ssse3: 172.0
inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_c: 2240.6
inv_txfm_add_16x4_flipadst_flipadst_0_8bpc_ssse3: 191.3
inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_c: 2243.5
inv_txfm_add_16x4_flipadst_flipadst_1_8bpc_ssse3: 193.2
inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_c: 2242.9
inv_txfm_add_16x4_flipadst_flipadst_2_8bpc_ssse3: 192.0
inv_txfm_add_16x4_flipadst_identity_0_8bpc_c: 1919.2
inv_txfm_add_16x4_flipadst_identity_0_8bpc_ssse3: 155.1
inv_txfm_add_16x4_flipadst_identity_1_8bpc_c: 1925.2
inv_txfm_add_16x4_flipadst_identity_1_8bpc_ssse3: 155.2
inv_txfm_add_16x4_flipadst_identity_2_8bpc_c: 2084.8
inv_txfm_add_16x4_flipadst_identity_2_8bpc_ssse3: 155.0
inv_txfm_add_16x4_identity_adst_0_8bpc_c: 1498.5
inv_txfm_add_16x4_identity_adst_0_8bpc_ssse3: 107.6
inv_txfm_add_16x4_identity_adst_1_8bpc_c: 1499.5
inv_txfm_add_16x4_identity_adst_1_8bpc_ssse3: 107.0
inv_txfm_add_16x4_identity_adst_2_8bpc_c: 1498.9
inv_txfm_add_16x4_identity_adst_2_8bpc_ssse3: 107.9
inv_txfm_add_16x4_identity_dct_0_8bpc_c: 1471.9
inv_txfm_add_16x4_identity_dct_0_8bpc_ssse3: 45.4
inv_txfm_add_16x4_identity_dct_1_8bpc_c: 1476.4
inv_txfm_add_16x4_identity_dct_1_8bpc_ssse3: 45.5
inv_txfm_add_16x4_identity_dct_2_8bpc_c: 1459.8
inv_txfm_add_16x4_identity_dct_2_8bpc_ssse3: 92.3
inv_txfm_add_16x4_identity_flipadst_0_8bpc_c: 1548.7
inv_txfm_add_16x4_identity_flipadst_0_8bpc_ssse3: 112.1
inv_txfm_add_16x4_identity_flipadst_1_8bpc_c: 1548.2
inv_txfm_add_16x4_identity_flipadst_1_8bpc_ssse3: 111.7
inv_txfm_add_16x4_identity_flipadst_2_8bpc_c: 1547.2
inv_txfm_add_16x4_identity_flipadst_2_8bpc_ssse3: 114.1
inv_txfm_add_16x4_identity_identity_0_8bpc_c: 1271.5
inv_txfm_add_16x4_identity_identity_0_8bpc_ssse3: 74.5
inv_txfm_add_16x4_identity_identity_1_8bpc_c: 1266.8
inv_txfm_add_16x4_identity_identity_1_8bpc_ssse3: 74.5
inv_txfm_add_16x4_identity_identity_2_8bpc_c: 1268.0
inv_txfm_add_16x4_identity_identity_2_8bpc_ssse3: 74.6
parent 205b723e
...@@ -77,10 +77,12 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2); ...@@ -77,10 +77,12 @@ decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2); decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_avx2);
decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2); decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_avx2);
decl_itx17_fns(4, 4, ssse3); decl_itx17_fns( 4, 4, ssse3);
decl_itx16_fns(4, 8, ssse3); decl_itx16_fns( 4, 8, ssse3);
decl_itx16_fns(8, 4, ssse3); decl_itx16_fns( 8, 4, ssse3);
decl_itx16_fns(8, 8, ssse3); decl_itx16_fns( 8, 8, ssse3);
decl_itx16_fns( 4, 16, ssse3);
decl_itx16_fns(16, 4, ssse3);
void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
...@@ -124,10 +126,12 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) { ...@@ -124,10 +126,12 @@ void bitfn(dav1d_itx_dsp_init_x86)(Dav1dInvTxfmDSPContext *const c) {
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return; if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
#if BITDEPTH == 8 #if BITDEPTH == 8
assign_itx17_fn(, 4, 4, ssse3); assign_itx17_fn(, 4, 4, ssse3);
assign_itx16_fn(R, 4, 8, ssse3); assign_itx16_fn(R, 4, 8, ssse3);
assign_itx16_fn(R, 8, 4, ssse3); assign_itx16_fn(R, 8, 4, ssse3);
assign_itx16_fn(, 8, 8, ssse3); assign_itx16_fn(, 8, 8, ssse3);
assign_itx16_fn(R, 4, 16, ssse3);
assign_itx16_fn(R, 16, 4, ssse3);
#endif #endif
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
......
...@@ -54,9 +54,20 @@ COEF_PAIR 1931, 3612 ...@@ -54,9 +54,20 @@ COEF_PAIR 1931, 3612
COEF_PAIR 3166, 2598 COEF_PAIR 3166, 2598
COEF_PAIR 3920, 1189 COEF_PAIR 3920, 1189
COEF_PAIR 3784, 1567 COEF_PAIR 3784, 1567
COEF_PAIR 995, 3973
COEF_PAIR 1751, 3703
COEF_PAIR 3513, 2106
COEF_PAIR 3857, 1380
COEF_PAIR 4017, 799
COEF_PAIR 201, 4091
COEF_PAIR 2440, 3290
COEF_PAIR 3035, 2751
COEF_PAIR 4052, 601
COEF_PAIR 2276, 3406
pd_2048: times 4 dd 2048 pd_2048: times 4 dd 2048
pw_2048: times 8 dw 2048 pw_2048: times 8 dw 2048
pw_m2048: times 8 dw -2048
pw_4096: times 8 dw 4096 pw_4096: times 8 dw 4096
pw_16384: times 8 dw 16384 pw_16384: times 8 dw 16384
pw_m16384: times 8 dw -16384 pw_m16384: times 8 dw -16384
...@@ -112,18 +123,18 @@ SECTION .text ...@@ -112,18 +123,18 @@ SECTION .text
punpcklbw m%3, m%5 ;extend byte to word punpcklbw m%3, m%5 ;extend byte to word
punpcklbw m%4, m%5 ;extend byte to word punpcklbw m%4, m%5 ;extend byte to word
paddw m%1, m%3 ;high: dst1 + out1 ;low: dst0 + out0 paddw m%3, m%1 ;high: dst1 + out1 ;low: dst0 + out0
paddw m%2, m%4 ;high: dst3 + out3 ;low: dst2 + out2 paddw m%4, m%2 ;high: dst3 + out3 ;low: dst2 + out2
packuswb m%1, m%2 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0 packuswb m%3, m%4 ;high->low: dst3 + out3, dst2 + out2, dst1 + out1, dst0 + out0
movd [%%row_adr1], m%1 ;store dst0 + out0 movd [%%row_adr1], m%3 ;store dst0 + out0
pshuflw m%2, m%1, q1032 pshuflw m%4, m%3, q1032
movd [%%row_adr2], m%2 ;store dst1 + out1 movd [%%row_adr2], m%4 ;store dst1 + out1
punpckhqdq m%1, m%1 punpckhqdq m%3, m%3
movd [%%row_adr3], m%1 ;store dst2 + out2 movd [%%row_adr3], m%3 ;store dst2 + out2
psrlq m%1, 32 psrlq m%3, 32
movd [%%row_adr4], m%1 ;store dst3 + out3 movd [%%row_adr4], m%3 ;store dst3 + out3
%endmacro %endmacro
%macro ITX4_END 4-5 2048 ; row[1-4], rnd %macro ITX4_END 4-5 2048 ; row[1-4], rnd
...@@ -709,9 +720,9 @@ cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -709,9 +720,9 @@ cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3] pmulhrsw m3, [coeffq+16*3]
.pass1:
call m(idct_8x4_internal).main call m(idct_8x4_internal).main
call m(iadst_4x8_internal).inversion jmp m(iadst_4x8_internal).pass1_end
jmp tx2q
.pass2: .pass2:
call .main call .main
...@@ -738,8 +749,11 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -738,8 +749,11 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3] pmulhrsw m3, [coeffq+16*3]
.pass1:
call m(iadst_8x4_internal).main call m(iadst_8x4_internal).main
call .inversion
.pass1_end:
INV_4X8
jmp tx2q jmp tx2q
.pass2: .pass2:
...@@ -775,11 +789,6 @@ ALIGN function_align ...@@ -775,11 +789,6 @@ ALIGN function_align
IADST8_1D_PACKED IADST8_1D_PACKED
ret ret
ALIGN function_align
.inversion:
INV_4X8
ret
INV_TXFM_4X8_FN flipadst, dct, 0 INV_TXFM_4X8_FN flipadst, dct, 0
INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, flipadst
...@@ -792,6 +801,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -792,6 +801,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3] pmulhrsw m3, [coeffq+16*3]
.pass1:
call m(iadst_8x4_internal).main call m(iadst_8x4_internal).main
punpcklwd m4, m3, m2 punpcklwd m4, m3, m2
...@@ -832,6 +842,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -832,6 +842,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m3, [coeffq+16*2] pmulhrsw m2, m3, [coeffq+16*2]
pmulhrsw m3, [coeffq+16*3] pmulhrsw m3, [coeffq+16*3]
.pass1:
mova m5, [o(pw_5793x4)] mova m5, [o(pw_5793x4)]
paddw m0, m0 paddw m0, m0
paddw m1, m1 paddw m1, m1
...@@ -842,8 +853,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ...@@ -842,8 +853,7 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
pmulhrsw m2, m5 pmulhrsw m2, m5
pmulhrsw m3, m5 pmulhrsw m3, m5
call m(iadst_4x8_internal).inversion jmp m(iadst_4x8_internal).pass1_end
jmp tx2q
.pass2: .pass2:
mova m4, [o(pw_4096)] mova m4, [o(pw_4096)]
...@@ -1476,3 +1486,746 @@ ALIGN function_align ...@@ -1476,3 +1486,746 @@ ALIGN function_align
mova [coeffq+16*6], m6 mova [coeffq+16*6], m6
mova [coeffq+16*7], m7 mova [coeffq+16*7], m7
jmp m(idct_8x8_internal).end3 jmp m(idct_8x8_internal).end3
; INV_TXFM_4X16_FN type1, type2[, fast_thresh = -1]
; Expands INV_TXFM_FN (defined elsewhere) for the 4x16 block size. When
; fast_thresh >= 0, a shortcut path specialised on the type1_type2 pair is
; appended; it reads only a few coefficients and writes the result directly.
; NOTE(review): how INV_TXFM_FN uses fast_thresh to reach this code is not
; visible in this chunk - confirm against its definition.
%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 4x16, 8
%if %3 >= 0
%ifidn %1_%2, dct_identity
; dct_identity: only the first two 16-byte coefficient rows are read.
mova m0, [o(pw_2896x8)]
mova m1, m0
pmulhrsw m0, [coeffq+16*0]
pmulhrsw m1, [coeffq+16*1]
mova m2, [o(pw_16384)]
mova m3, [o(pw_5793x4)]
mova m4, [o(pw_2048)]
pmulhrsw m0, m2
pmulhrsw m1, m2
psllw m0, 2
psllw m1, 2
pmulhrsw m0, m3
pmulhrsw m1, m3
pmulhrsw m0, m4
; the second row's product is written into m4 (which held pw_2048)
pmulhrsw m4, m1
; replicate every word four times: m0-m3 come from the first row, m4-m7 from
; the second, each register holding two source words
punpckhwd m2, m0, m0
punpcklwd m0, m0
punpckhwd m6, m4, m4
punpcklwd m4, m4
punpckhdq m1, m0, m0
punpckldq m0, m0
punpckhdq m3, m2, m2
punpckldq m2, m2
punpckhdq m5, m4, m4
punpckldq m4, m4
punpckhdq m7, m6, m6
punpckldq m6, m6
; .end2 stores m5/m6 itself; m4 has to be spilled here first
mova [coeffq+16*4], m4
TAIL_CALL m(iadst_4x16_internal).end2
%elifidn %1_%2, identity_dct
; identity_dct: loads at 32-byte steps and keeps the first word of each of the
; four loads; they end up packed in the low qword of m0
movd m0, [coeffq+32*0]
punpcklwd m0, [coeffq+32*1]
movd m1, [coeffq+32*2]
punpcklwd m1, [coeffq+32*3]
mova m2, [o(pw_5793x4)]
mova m3, [o(pw_16384)]
mova m4, [o(pw_2896x8)]
punpckldq m0, m1
paddw m0, m0
pmulhrsw m0, m2
pmulhrsw m0, m3
psrlw m3, 3 ; pw_2048
pmulhrsw m0, m4
pmulhrsw m0, m3
; duplicate the low qword into the high qword
punpcklqdq m0, m0
; zero the coefficient rows that were read
pxor m7, m7
REPX {mova [coeffq+32*x], m7}, 0, 1, 2, 3
%elifidn %1_%2, dct_dct
; dct_dct: broadcast the first coefficient word to all eight lanes
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
mova m1, [o(pw_2896x8)]
pmulhrsw m0, m1
; overwrite the first coefficient dword with eob
; NOTE(review): presumably eob is 0 on this path so this clears it - confirm
mov [coeffq], eobd
pmulhrsw m0, [o(pw_16384)]
pmulhrsw m0, m1
pmulhrsw m0, [o(pw_2048)]
%else ; adst_dct / flipadst_dct
; broadcast the first coefficient word, then apply the per-type dc constant
pshuflw m0, [coeffq], q0000
punpcklwd m0, m0
%ifidn %1, adst
pmulhrsw m0, [o(iadst4_dconly1a)]
%else ; flipadst
pmulhrsw m0, [o(iadst4_dconly1b)]
%endif
mova m1, [o(pw_16384)]
; overwrite the first coefficient dword with eob (see note in the dct_dct case)
mov [coeffq], eobd
pmulhrsw m0, m1
psrlw m1, 3 ; pw_2048
pmulhrsw m0, [o(pw_2896x8)]
pmulhrsw m0, m1
%endif
; common tail of the three cases above that fall through (dct_identity leaves
; via TAIL_CALL): the same m0 is passed to WRITE_4X4 four times, advancing dst
; by four rows between calls, covering all 16 rows
.end:
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
lea dstq, [dstq+strideq*4]
WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3
RET
%endif
%endmacro
; Instantiate the 4x16 macro for every type pair whose first type is dct.
INV_TXFM_4X16_FN dct, dct, 0
INV_TXFM_4X16_FN dct, identity, 15
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
; idct_4x16_internal
; First pass: runs the 4x8 first-pass routine whose address is in r3 twice,
; once on the odd and once on the even 16-byte coefficient rows, then
; continues at tx2q. The .pass1 label is shared: the adst, flipadst and
; identity 4x16 functions load their own 4x8 routine into r3 and jump here.
; Second pass (.pass2): calls idct_16x4_internal.main, scales and writes the
; 4x16 result to dst, then zeroes the coefficient buffer.
cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(idct_4x8_internal).pass1)]
.pass1:
; first run: rows 1, 3, 5, 7
mova m0, [coeffq+16*1]
mova m1, [coeffq+16*3]
mova m2, [coeffq+16*5]
mova m3, [coeffq+16*7]
; tx2q is borrowed as the continuation address for the routine in r3, so the
; caller's value is saved on the stack
push tx2q
lea tx2q, [o(m(idct_4x16_internal).pass1_2)]
jmp r3
.pass1_2:
; park the first run's results in the rows they came from
mova [coeffq+16*1], m0
mova [coeffq+16*3], m1
mova [coeffq+16*5], m2
mova [coeffq+16*7], m3
; second run: rows 0, 2, 4, 6
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*2]
mova m2, [coeffq+16*4]
mova m3, [coeffq+16*6]
lea tx2q, [o(m(idct_4x16_internal).pass1_end)]
jmp r3
.pass1_end:
pop tx2q
; m0-m3 hold the second run's results; reload the first run's into m4-m6
mova m4, [coeffq+16*1]
mova m5, [coeffq+16*3]
mova m6, [coeffq+16*5]
; pmulhrsw by 16384 is a rounding shift right by one
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
; the eighth row is scaled through m7 and kept in memory
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*7], m7
jmp tx2q
.pass2:
call m(idct_16x4_internal).main
.end:
; pmulhrsw by 2048 is a rounding shift right by four
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
mova [coeffq+16*4], m4
.end1:
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
; r3 = coefficient pointer, used for the reloads and the clearing below
mov r3, coeffq
; first WRITE_4X8 call, from m0-m3
WRITE_4X8 0, 1, 3, 2
; second WRITE_4X8 call, four rows further down, from the spilled m4-m6 and m7
mova m0, [r3+16*4]
mova m1, [r3+16*5]
mova m2, [r3+16*6]
mova m3, m7
lea dstq, [dstq+strideq*4]
WRITE_4X8 0, 1, 3, 2
.end2:
; zero all eight 16-byte coefficient rows
pxor m7, m7
REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
; Instantiate the 4x16 macro for every type pair whose first type is adst.
INV_TXFM_4X16_FN adst, dct, 0
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
; iadst_4x16_internal
; First pass: reuses idct_4x16_internal.pass1 with the 4x8 adst first-pass
; routine in r3.
; Second pass (.pass2): calls iadst_16x4_internal.main, regroups the outputs,
; applies the final scaling with the required sign per output and writes the
; block. .end1 is also entered from iflipadst_4x16_internal with m7 = pw_m2048;
; .end2 is also entered from iidentity_4x16_internal and from the
; dct_identity shortcut of INV_TXFM_4X16_FN.
cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iadst_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
call m(iadst_16x4_internal).main
; regroup so that each register holds either two positive or two negated
; outputs; rows 4-7 of the coefficient buffer serve as spill space
punpcklqdq m6, m5, m4 ;low: -out5 high: -out7
punpckhqdq m4, m5 ;low: out8 high: out10
punpcklqdq m5, m7, m2 ;low: out4 high: out6
punpckhqdq m2, m7 ;low: -out9 high: -out11
mova [coeffq+16*4], m2
mova [coeffq+16*5], m6
mova m2, [coeffq+16*6]
mova m6, [coeffq+16*7]
punpckhqdq m1, m6, m0 ;low: -out13 high: -out15
punpcklqdq m0, m6 ;low: out0 high: out2
punpckhqdq m6, m3, m2 ;low: out12 high: out14
punpcklqdq m2, m3 ;low: -out1 high: -out3
mova m7, [o(pw_2048)]
.end1:
; m7 = scale for m0/m5/m4/m6; m3 = -m7 is the scale for the other group
; (m2, m1 and the two registers spilled to rows 4 and 5)
REPX {pmulhrsw x, m7}, m0, m5, m4, m6
pxor m3, m3
psubw m3, m7
mova m7, [coeffq+16*4]
REPX {pmulhrsw x, m3}, m2, m7, m1
pmulhrsw m3, [coeffq+16*5]
mova [coeffq+16*7], m5
; re-pair the outputs into consecutive order
punpckhqdq m5, m4, m7 ;low: out10 high: out11
punpcklqdq m4, m7 ;low: out8 high: out9
punpckhqdq m7, m6, m1 ;low: out14 high: out15
punpcklqdq m6, m1 ;low: out12 high: out13
punpckhqdq m1, m0, m2 ;low: out2 high: out3
punpcklqdq m0, m2 ;low: out0 high: out1
mova [coeffq+16*4], m4
mova m4, [coeffq+16*7]
punpcklqdq m2, m4, m3 ;low: out4 high: out5
punpckhqdq m4, m3 ;low: out6 high: out7
mova m3, m4
.end2:
; expects m0-m3 and m5-m7 in registers, with m4 already stored at row 4
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
; r3 = coefficient pointer, used for the reloads and the clearing below
mov r3, coeffq
; first WRITE_4X8 call, from m0-m3
WRITE_4X8 0, 1, 2, 3
; second WRITE_4X8 call, four rows further down, from the spilled rows 4-6 and m7
mova m0, [r3+16*4]
mova m1, [r3+16*5]
mova m2, [r3+16*6]
mova m3, m7
lea dstq, [dstq+strideq*4]
WRITE_4X8 0, 1, 2, 3
.end3:
; zero all eight 16-byte coefficient rows
pxor m7, m7
REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7
ret
; Instantiate the 4x16 macro for every type pair whose first type is flipadst.
INV_TXFM_4X16_FN flipadst, dct, 0
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
; iflipadst_4x16_internal
; First pass: reuses idct_4x16_internal.pass1 with the 4x8 flipadst first-pass
; routine in r3.
; Second pass (.pass2): same core as the adst version
; (iadst_16x4_internal.main), but the low/high qword picks are swapped
; relative to it and the scale constant is negated, before the shared tail
; iadst_4x16_internal.end1 finishes the work.
cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iflipadst_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
call m(iadst_16x4_internal).main
; rows 4-7 of the coefficient buffer serve as spill space
punpckhqdq m6, m5, m4 ;low: out5 high: out7
punpcklqdq m4, m5 ;low: -out8 high: -out10
punpckhqdq m5, m7, m2 ;low: -out4 high: -out6
punpcklqdq m2, m7 ;low: out9 high: out11
mova [coeffq+16*4], m2
mova [coeffq+16*5], m6
mova m2, [coeffq+16*6]
mova m6, [coeffq+16*7]
punpcklqdq m1, m6, m0 ;low: out13 high: out15
punpckhqdq m0, m6 ;low: -out0 high: -out2
punpcklqdq m6, m3, m2 ;low: -out12 high: -out14
punpckhqdq m2, m3 ;low: out1 high: out3
; negated scale: .end1 multiplies m0/m5/m4/m6 by m7 and the rest by -m7
mova m7, [o(pw_m2048)]
jmp m(iadst_4x16_internal).end1
; Instantiate the 4x16 macro for every type pair whose first type is identity.
INV_TXFM_4X16_FN identity, dct, 3
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
; iidentity_4x16_internal
; First pass: reuses idct_4x16_internal.pass1 with the 4x8 identity first-pass
; routine in r3.
; Second pass (.pass2): shifts each value left by two, multiplies by pw_5793x4
; and then by pw_2048, and finishes in the shared write/clear tail
; iadst_4x16_internal.end2.
cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
lea r3, [o(m(iidentity_4x8_internal).pass1)]
jmp m(idct_4x16_internal).pass1
.pass2:
mova m7, [o(pw_5793x4)]
REPX {psllw x, 2 }, m0, m1, m2, m3, m4, m5, m6
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
; the eighth row is kept in memory, so it is processed through m7 separately
psllw m7, [coeffq+16*7], 2
pmulhrsw m7, [o(pw_5793x4)]
mova [coeffq+16*7], m7
; pmulhrsw by 2048 is a rounding shift right by four
mova m7, [o(pw_2048)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
pmulhrsw m7, [coeffq+16*7]
; .end2 stores m5/m6 itself and expects m4 to be in row 4 already
mova [coeffq+16*4], m4
jmp m(iadst_4x16_internal).end2
; INV_TXFM_16X4_FN type1, type2[, fast_thresh = -1]
; Expands INV_TXFM_FN (defined elsewhere) for the 16x4 block size. When
; fast_thresh >= 0, a shortcut path specialised on the type1_type2 pair is
; appended. Every shortcut produces the 16-pixel-wide output either as two
; 8-wide halves (dst and dst+8) or with full 16-byte row accesses.
; NOTE(review): how INV_TXFM_FN uses fast_thresh to reach this code is not
; visible in this chunk - confirm against its definition.
%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
INV_TXFM_FN %1, %2, %3, 16x4, 8
%if %3 >= 0
%ifidn %1_%2, dct_identity
; dct_identity: only the first 16-byte coefficient row is read
mova m3, [o(pw_2896x8)]
pmulhrsw m3, [coeffq]
mova m0, [o(pw_16384)]
pmulhrsw m3, m0
psrlw m0, 3 ; pw_2048
paddw m3, m3
pmulhrsw m3, [o(pw_5793x4)]
pmulhrsw m3, m0
; broadcast words 0..3 of the result into m0..m3 respectively
punpcklwd m3, m3
pshufd m0, m3, q0000
pshufd m1, m3, q1111
pshufd m2, m3, q2222
pshufd m3, m3, q3333
; tx2q remembers the address of the right 8-pixel half
lea tx2q, [dstq+8]
call m(iadst_8x4_internal).end2
add coeffq, 16*4
mov dstq, tx2q
TAIL_CALL m(iadst_8x4_internal).end2
%elifidn %1_%2, identity_dct
; identity_dct: two loop iterations (r3d), each consuming four 16-byte
; coefficient rows and producing one 8-pixel-wide half of the output
mova m5, [o(pw_16384)]
mova m6, [o(pw_5793x4)]
mova m7, [o(pw_2896x8)]
mov r3d, 2
.main_loop:
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
; gather words 0 and 4 of each of the four rows into m0
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpcklwd m0, m4
punpcklwd m2, m1
punpcklqdq m0, m2
psllw m0, 2
pmulhrsw m0, m6
pmulhrsw m0, m5
psrlw m1, m5, 3 ; pw_2048
pmulhrsw m0, m7
pmulhrsw m0, m1
.end:
; zero the four coefficient rows just consumed and advance to the next four
pxor m3, m3
mova [coeffq+16*0], m3
mova [coeffq+16*1], m3
mova [coeffq+16*2], m3
mova [coeffq+16*3], m3
add coeffq, 16*4
; tx2q holds the destination of the next 8-pixel half across WRITE_8X4
lea tx2q, [dstq+8]
WRITE_8X4 0, 0, 0, 0, 1, 2, 3
mov dstq, tx2q
dec r3d
jg .main_loop
RET
%else
; dct_dct / dct_adst / dct_flipadst: starts from the first coefficient row
; multiplied by the low dword of pw_2896x8 (movd load)
movd m1, [o(pw_2896x8)]
pmulhrsw m0, m1, [coeffq]
%ifidn %2, dct
movd m2, [o(pw_16384)]
; overwrite the first coefficient dword with eob
; NOTE(review): presumably eob is 0 on this path so this clears it - confirm
mov [coeffq], eobd
; r2d = loop counter: two iterations of two rows each
mov r2d, 2
.dconly:
pmulhrsw m0, m2
movd m2, [o(pw_2048)] ;intentionally rip-relative
pmulhrsw m0, m1
pmulhrsw m0, m2
; broadcast the low word to all eight lanes
pshuflw m0, m0, q0000
punpcklwd m0, m0
pxor m5, m5
.dconly_loop:
; load 16 pixels from each of two rows, widen to words, add m0, pack back to
; bytes with unsigned saturation and store
mova m1, [dstq]
mova m3, [dstq+strideq]
punpckhbw m2, m1, m5
punpcklbw m1, m5
punpckhbw m4, m3, m5
punpcklbw m3, m5
paddw m2, m0
paddw m1, m0
paddw m4, m0
paddw m3, m0
packuswb m1, m2
packuswb m3, m4
mova [dstq], m1
mova [dstq+strideq], m3
lea dstq, [dstq+strideq*2]
dec r2d
jg .dconly_loop
RET
%else ; adst / flipadst
movd m2, [o(pw_16384)]
pmulhrsw m0, m2
; broadcast the low word to all eight lanes
pshuflw m0, m0, q0000
punpcklwd m0, m0
; overwrite the first coefficient dword with eob (see note in the dct case)
mov [coeffq], eobd
pmulhrsw m2, m0, [o(iadst4_dconly2b)]
pmulhrsw m0, [o(iadst4_dconly2a)]
mova m1, [o(pw_2048)]
pmulhrsw m0, m1
pmulhrsw m2, m1
; spread the four qwords of m0/m2 into m0..m3, one qword duplicated per
; register; flipadst assigns them in reverse order
%ifidn %2, adst
punpckhqdq m1, m0, m0
punpcklqdq m0, m0
punpckhqdq m3, m2, m2
punpcklqdq m2, m2
%else ; flipadst
mova m3, m0
punpckhqdq m0, m2, m2
punpcklqdq m1, m2, m2
punpckhqdq m2, m3, m3
punpcklqdq m3, m3
%endif
; same m0-m3 used for both 8-pixel halves
lea tx2q, [dstq+8]
call m(iadst_8x4_internal).end3
mov dstq, tx2q
TAIL_CALL m(iadst_8x4_internal).end3
%endif
%endif
%endif
%endmacro
; ITX_16X4_LOAD_COEFS: coefficient load for the 16x4 transforms; simply
; forwards to the 8x8 loader (ITX_8X8_LOAD_COEFS, defined elsewhere).
%macro ITX_16X4_LOAD_COEFS 0
ITX_8X8_LOAD_COEFS
%endmacro
; IDCT16_1D_PACKED_ODDHALF src1, src2, src3, src4, tmp1, tmp2, tmp3
; Odd half of a 16-point inverse DCT on packed data (two values per register,
; one in each qword). Per the inline annotations the sources carry the
; odd-indexed inputs in1..in15 and the results are:
;   m%1 = low: t8a   high: t9
;   m%2 = low: t11   high: t10a
;   m%3 = low: t12   high: t13a
;   m%4 = low: t15a  high: t14
; tmp1-tmp3 (m%5-m%7) are clobbered; m%7 is loaded with pd_2048 and later
; with pw_2896x8.
%macro IDCT16_1D_PACKED_ODDHALF 7 ;src[1-4], tmp[1-3]
punpckhwd m%5, m%4, m%1 ;packed in13 in3
punpcklwd m%1, m%4 ;packed in1 in15
punpcklwd m%6, m%3, m%2 ;packed in9 in7
punpckhwd m%2, m%3 ;packed in5 in11
mova m%7, [o(pd_2048)]
ITX_MUL2X_PACK %1, %4, %7, 401, 4076, 1 ;low: t8a high: t15a
ITX_MUL2X_PACK %6, %4, %7, 3166, 2598, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %2, %4, %7, 1931, 3612, 1 ;low: t10a high: t13a
ITX_MUL2X_PACK %5, %4, %7, 3920, 1189, 1 ;low: t11a high: t12a
psubsw m%4, m%1, m%6 ;low: t9 high: t14
paddsw m%1, m%6 ;low: t8 high: t15
psubsw m%3, m%5, m%2 ;low: t10 high: t13
paddsw m%2, m%5 ;low: t11 high: t12
punpcklqdq m%5, m%4, m%3 ;low: t9 high: t10
punpckhqdq m%4, m%3 ;low: t14 high: t13
punpcklwd m%6, m%4, m%5 ;packed t14 t9
punpckhwd m%5, m%4 ;packed t10 t13
; negate t10/t13 before the second multiply
pxor m%4, m%4
psubw m%4, m%5 ;packed -t10 -t13
ITX_MUL2X_PACK %6, %3, %7, 1567, 3784, 1 ;low: t9a high: t14a
ITX_MUL2X_PACK %4, %3, %7, 3784, 1567 ;low: t10a high: t13a
psubsw m%3, m%1, m%2 ;low: t11a high: t12a
paddsw m%1, m%2 ;low: t8a high: t15a
psubsw m%5, m%6, m%4 ;low: t10 high: t13
paddsw m%6, m%4 ;low: t9 high: t14
mova m%7, [o(pw_2896x8)]
punpckhqdq m%4, m%3, m%5 ;low: t12a high: t13
punpcklqdq m%3, m%5 ;low: t11a high: t10
; difference and sum of the two packed pairs, each multiplied by pw_2896x8
psubw m%2, m%4, m%3
paddw m%3, m%4
pmulhrsw m%2, m%7 ;low: t11 high: t10a
pmulhrsw m%3, m%7 ;low: t12 high: t13a
punpckhqdq m%4, m%1, m%6 ;low: t15a high: t14
punpcklqdq m%1, m%6 ;low: t8a high: t9
%endmacro
; Instantiate the 16x4 macro for every type pair whose first type is dct.
INV_TXFM_16X4_FN dct, dct, 0
INV_TXFM_16X4_FN dct, adst, 0
INV_TXFM_16X4_FN dct, flipadst, 0
INV_TXFM_16X4_FN dct, identity, 3
; idct_16x4_internal
; First pass: loads the coefficients, runs .main, interleaves the packed
; outputs, scales them by pw_16384 (a rounding shift right by one), transposes
; them and continues at tx2q.
; Second pass (.pass2 / .pass2_end): runs the 8x4 second-pass routine in tx2q
; twice, first on m0-m3 at dst, then on the values parked in coefficient rows
; 4-7 at dst+8.
cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
ITX_16X4_LOAD_COEFS
call .main
.pass1_end:
; coefficient rows 6 and 7 serve as spill space in this pass
punpckhwd m7, m0, m2 ;packed out1, out5
punpcklwd m0, m2 ;packed out0, out4
punpcklwd m2, m1, m3 ;packed out3, out7
punpckhwd m1, m3 ;packed out2, out6
mova [coeffq+16*6], m7
mova m7, [coeffq+16*7]
punpckhwd m3, m4, m6 ;packed out9, out13
punpcklwd m4, m6 ;packed out8, out12
punpcklwd m6, m5, m7 ;packed out11, out15
punpckhwd m5, m7 ;packed out10, out14
.pass1_end2:
mova m7, [o(pw_16384)]
REPX {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
; the register spilled to row 6 is scaled through m7 and stored back
pmulhrsw m7, [coeffq+16*6]
mova [coeffq+16*6], m7
.pass1_end3:
punpckhwd m7, m3, m6 ;packed 9, 11, 13, 15 high
punpcklwd m3, m6 ;packed 9, 11, 13, 15 low
punpckhwd m6, m4, m5 ;packed 8, 10, 12, 14 high
punpcklwd m4, m5 ;packed 8, 10, 12, 14 low
punpckhwd m5, m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(1)
punpcklwd m4, m3 ;8, 9, 10, 11, 12, 13, 14, 15(0)
punpckhwd m3, m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(3)
punpcklwd m6, m7 ;8, 9, 10, 11, 12, 13, 14, 15(2)
; line (3) of the 8..15 half is parked in row 7; .pass2_end stores the other
; three lines (m4-m6) to rows 4-6
mova [coeffq+16*7], m3
mova m3, [coeffq+16*6]
punpckhwd m7, m3, m2 ;packed 1, 3, 5, 7 high
punpcklwd m3, m2 ;packed 1, 3, 5, 7 low
punpckhwd m2, m0, m1 ;packed 0, 2, 4, 6 high
punpcklwd m0, m1 ;packed 0, 2, 4, 6 low
punpckhwd m1, m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(1)
punpcklwd m0, m3 ;0, 1, 2, 3, 4, 5, 6, 7(0)
punpckhwd m3, m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(3)
punpcklwd m2, m7 ;0, 1, 2, 3, 4, 5, 6, 7(2)
jmp tx2q
.pass2:
lea tx2q, [o(m(idct_8x4_internal).pass2)]
.pass2_end:
; park the 8..15 half (row 7 was already stored in pass 1)
mova [coeffq+16*4], m4
mova [coeffq+16*5], m5
mova [coeffq+16*6], m6
; r3 = destination of the right 8-pixel half
lea r3, [dstq+8]
; first run of the 8x4 second pass, on m0-m3
call tx2q
; second run: advance coeffq by four rows so the parked values are rows 0-3
add coeffq, 16*4
mova m0, [coeffq+16*0]
mova m1, [coeffq+16*1]
mova m2, [coeffq+16*2]
mova m3, [coeffq+16*3]
mov dstq, r3
; tail jump: the 8x4 routine returns to this function's caller
jmp tx2q
ALIGN function_align
.main:
punpckhqdq m7, m0, m1 ;low:in1 high:in3