Commit a755b6e3 authored by Henrik Gramner

Clip coefficients in SSSE3/AVX2 inverse transform asm

parent eb01bdb9
Pipeline #3601 passed with stages in 4 minutes and 56 seconds
......@@ -231,20 +231,20 @@ SECTION .text
psubw m%1, m%3
pmulhrsw m%1, m%6 ; t1
pmulhrsw m%5, m%6 ; t0
psubw m%3, m%1, m%2
paddw m%2, m%1
paddw m%1, m%5, m%4
psubw m%4, m%5, m%4
psubsw m%3, m%1, m%2
paddsw m%2, m%1
paddsw m%1, m%5, m%4
psubsw m%4, m%5, m%4
%endmacro
%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
paddw m%9, m%2, m%6 ; t4
psubw m%2, m%6 ; t5a
paddw m%10, m%8, m%4 ; t7
psubw m%8, m%4 ; t6a
paddsw m%9, m%2, m%6 ; t4
psubsw m%2, m%6 ; t5a
paddsw m%10, m%8, m%4 ; t7
psubsw m%8, m%4 ; t6a
vpbroadcastd m%4, [o(pw_2896x8)]
psubw m%6, m%1, m%5
paddw m%1, m%5
......@@ -254,18 +254,18 @@ SECTION .text
pmulhrsw m%6, m%4 ; t1
pmulhrsw m%8, m%4 ; t6
pmulhrsw m%5, m%4 ; t5
psubw m%4, m%1, m%7 ; dct4 out3
paddw m%1, m%7 ; dct4 out0
paddw m%7, m%6, m%3 ; dct4 out1
psubw m%6, m%3 ; dct4 out2
paddw m%2, m%7, m%8 ; out1
psubw m%7, m%8 ; out6
psubw m%8, m%1, m%10 ; out7
paddw m%1, m%10 ; out0
paddw m%3, m%6, m%5 ; out2
psubw m%6, m%5 ; out5
psubw m%5, m%4, m%9 ; out4
paddw m%4, m%9 ; out3
psubsw m%4, m%1, m%7 ; dct4 out3
paddsw m%1, m%7 ; dct4 out0
paddsw m%7, m%6, m%3 ; dct4 out1
psubsw m%6, m%3 ; dct4 out2
paddsw m%2, m%7, m%8 ; out1
psubsw m%7, m%8 ; out6
psubsw m%8, m%1, m%10 ; out7
paddsw m%1, m%10 ; out0
paddsw m%3, m%6, m%5 ; out2
psubsw m%6, m%5 ; out5
psubsw m%5, m%4, m%9 ; out4
paddsw m%4, m%9 ; out3
%endmacro
; in1 = %1, in3 = %2, in5 = %3, in7 = %4
......@@ -275,25 +275,25 @@ SECTION .text
ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
psubw m%9, m%2, m%6 ; t13
paddw m%6, m%2 ; t12
psubw m%2, m%8, m%4 ; t14
paddw m%8, m%4 ; t15
psubw m%4, m%7, m%3 ; t10
paddw m%3, m%7 ; t11
psubw m%7, m%1, m%5 ; t9
paddw m%1, m%5 ; t8
psubsw m%9, m%2, m%6 ; t13
paddsw m%6, m%2 ; t12
psubsw m%2, m%8, m%4 ; t14
paddsw m%8, m%4 ; t15
psubsw m%4, m%7, m%3 ; t10
paddsw m%3, m%7 ; t11
psubsw m%7, m%1, m%5 ; t9
paddsw m%1, m%5 ; t8
ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
vpbroadcastd m%10, [o(pw_2896x8)]
psubw m%5, m%2, m%9 ; t10
paddw m%2, m%9 ; t9
psubw m%9, m%1, m%3 ; t11a
paddw m%1, m%3 ; t8a
psubw m%3, m%7, m%4 ; t13
paddw m%7, m%4 ; t14
psubw m%4, m%8, m%6 ; t12a
paddw m%8, m%6 ; t15a
psubsw m%5, m%2, m%9 ; t10
paddsw m%2, m%9 ; t9
psubsw m%9, m%1, m%3 ; t11a
paddsw m%1, m%3 ; t8a
psubsw m%3, m%7, m%4 ; t13
paddsw m%7, m%4 ; t14
psubsw m%4, m%8, m%6 ; t12a
paddsw m%8, m%6 ; t15a
paddw m%6, m%3, m%5 ; t13a
psubw m%3, m%5 ; t10a
paddw m%5, m%4, m%9 ; t12
......@@ -458,8 +458,8 @@ ALIGN function_align
vpbroadcastd m4, [o(pw_2896x8)]
pmulhrsw m0, m4 ; t0 t1
%endif
psubw m1, m0, m2 ; out3 out2
paddw m0, m2 ; out0 out1
psubsw m1, m0, m2 ; out3 out2
paddsw m0, m2 ; out0 out1
%endmacro
%macro IADST4_1D_PACKED 0
......@@ -693,22 +693,22 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2
vpbroadcastd m6, [o(pw_2896x8)]
psubw m2, m5, m4 ; t4 t7
paddw m5, m4 ; t5a t6a
psubsw m2, m5, m4 ; t4 t7
paddsw m5, m4 ; t5a t6a
pshufd m4, m2, q1032
psubw m1, m2, m4
paddw m4, m2
vpblendd m4, m4, m1, 0xcc
pmulhrsw m0, m6 ; t0 t1
pmulhrsw m4, m6 ; t6 t5
psubw m1, m0, m3 ; tmp3 tmp2
paddw m0, m3 ; tmp0 tmp1
psubsw m1, m0, m3 ; tmp3 tmp2
paddsw m0, m3 ; tmp0 tmp1
shufps m2, m5, m4, q1032 ; t7 t6
vpblendd m5, m5, m4, 0xcc ; t4 t5
psubw m3, m0, m2 ; out7 out6
paddw m0, m2 ; out0 out1
psubw m2, m1, m5 ; out4 out5
paddw m1, m5 ; out3 out2
psubsw m3, m0, m2 ; out7 out6
paddsw m0, m2 ; out0 out1
psubsw m2, m1, m5 ; out4 out5
paddsw m1, m5 ; out3 out2
%endmacro
%macro IADST8_1D_PACKED 0
......@@ -721,19 +721,19 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
psubw m4, m0, m2 ; t4 t5
paddw m0, m2 ; t0 t1
psubw m5, m1, m3 ; t6 t7
paddw m1, m3 ; t2 t3
psubsw m4, m0, m2 ; t4 t5
paddsw m0, m2 ; t0 t1
psubsw m5, m1, m3 ; t6 t7
paddsw m1, m3 ; t2 t3
shufps m2, m5, m4, q1032
punpckhwd m4, m2
punpcklwd m5, m2
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
psubw m2, m0, m1 ; t2 t3
paddw m0, m1 ; out0 -out7
psubw m1, m4, m5 ; t7 t6
paddw m4, m5 ; out6 -out1
psubsw m2, m0, m1 ; t2 t3
paddsw m0, m1 ; out0 -out7
psubsw m1, m4, m5 ; t7 t6
paddsw m4, m5 ; out6 -out1
vpbroadcastd m5, [o(pw_2896x8)]
vpblendd m3, m0, m4, 0x33 ; out6 -out7
vpblendd m0, m0, m4, 0xcc ; out0 -out1
......@@ -981,10 +981,10 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
psubw m2, m8, m0 ; t9 t14
paddw m8, m0 ; t8 t15
psubw m0, m1, m5 ; t10 t13
paddw m1, m5 ; t11 t12
psubsw m2, m8, m0 ; t9 t14
paddsw m8, m0 ; t8 t15
psubsw m0, m1, m5 ; t10 t13
paddsw m1, m5 ; t11 t12
%if mmsize > 16
vbroadcasti128 m5, [o(deint_shuf)]
%else
......@@ -996,12 +996,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a
vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a
psubw m5, m7, m3 ; t5a t6a
paddw m7, m3 ; t4 t7
psubw m4, m8, m1 ; t11a t12a
paddw m8, m1 ; t8a t15a
paddw m1, m2, m0 ; t9 t14
psubw m2, m0 ; t10 t13
psubsw m5, m7, m3 ; t5a t6a
paddsw m7, m3 ; t4 t7
psubsw m4, m8, m1 ; t11a t12a
paddsw m8, m1 ; t8a t15a
paddsw m1, m2, m0 ; t9 t14
psubsw m2, m0 ; t10 t13
punpckhqdq m0, m8, m1 ; t15a t14
punpcklqdq m8, m1 ; t8a t9
pshufd m3, m5, q1032
......@@ -1019,20 +1019,20 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m5, m1 ; t12 t13a
shufps m2, m7, m3, q1032 ; t7 t6
vpblendd m7, m7, m3, 0xcc ; t4 t5
psubw m1, m9, m6 ; dct4 out3 out2
paddw m9, m6 ; dct4 out0 out1
psubw m3, m9, m2 ; dct8 out7 out6
paddw m9, m2 ; dct8 out0 out1
psubw m2, m1, m7 ; dct8 out4 out5
paddw m1, m7 ; dct8 out3 out2
psubw m7, m9, m0 ; out15 out14
paddw m0, m9 ; out0 out1
psubw m6, m1, m5 ; out12 out13
paddw m1, m5 ; out3 out2
psubw m5, m2, m4 ; out11 out10
paddw m2, m4 ; out4 out5
psubw m4, m3, m8 ; out8 out9
paddw m3, m8 ; out7 out6
psubsw m1, m9, m6 ; dct4 out3 out2
paddsw m9, m6 ; dct4 out0 out1
psubsw m3, m9, m2 ; dct8 out7 out6
paddsw m9, m2 ; dct8 out0 out1
psubsw m2, m1, m7 ; dct8 out4 out5
paddsw m1, m7 ; dct8 out3 out2
psubsw m7, m9, m0 ; out15 out14
paddsw m0, m9 ; out0 out1
psubsw m6, m1, m5 ; out12 out13
paddsw m1, m5 ; out3 out2
psubsw m5, m2, m4 ; out11 out10
paddsw m2, m4 ; out4 out5
psubsw m4, m3, m8 ; out8 out9
paddsw m3, m8 ; out7 out6
%endmacro
INV_TXFM_4X16_FN dct, dct, 0
......@@ -1153,20 +1153,20 @@ ALIGN function_align
ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
psubw m2, m0, m3 ; t9a t8a t11a t10a
paddw m0, m3 ; t1a t0a t3a t2a
psubw m3, m1, m4 ; t13a t12a t15a t14a
paddw m1, m4 ; t5a t4a t7a t6a
psubsw m2, m0, m3 ; t9a t8a t11a t10a
paddsw m0, m3 ; t1a t0a t3a t2a
psubsw m3, m1, m4 ; t13a t12a t15a t14a
paddsw m1, m4 ; t5a t4a t7a t6a
ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
psubw m6, m7, m5
ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
vpbroadcastd m6, [o(pw_m3784_1567)]
vpbroadcastd m5, [o(pw_1567_3784)]
psubw m4, m0, m1 ; t5 t4 t7 t6
paddw m0, m1 ; t1 t0 t3 t2
psubw m1, m2, m3 ; t13a t12a t15a t14a
paddw m2, m3 ; t9a t8a t11a t10a
psubw m3, m7, m6
psubsw m4, m0, m1 ; t5 t4 t7 t6
paddsw m0, m1 ; t1 t0 t3 t2
psubsw m1, m2, m3 ; t13a t12a t15a t14a
paddsw m2, m3 ; t9a t8a t11a t10a
psubw m3, m7, m6 ; pw_3784_m1567
vpblendd m6, m6, m3, 0xf0
ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
......@@ -1179,10 +1179,10 @@ ALIGN function_align
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubw m1, m0, m3 ; t3a t2a t11 t10
paddw m0, m3 ; -out15 out0 out14 -out1
paddw m3, m4, m2 ; -out3 out12 out2 -out13
psubw m4, m2 ; t6 t7 t14a t15a
psubsw m1, m0, m3 ; t3a t2a t11 t10
paddsw m0, m3 ; -out15 out0 out14 -out1
paddsw m3, m4, m2 ; -out3 out12 out2 -out13
psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
paddw m1, m2, m4
......@@ -1902,53 +1902,53 @@ ALIGN function_align
ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
psubw m4, m0, m5 ; t9a t8a
paddw m0, m5 ; t1a t0a
psubw m5, m1, m6 ; t11a t10a
paddw m1, m6 ; t3a t2a
psubw m6, m2, m7 ; t13a t12a
paddw m2, m7 ; t5a t4a
psubw m7, m3, m8 ; t15a t14a
paddw m3, m8 ; t7a t6a
psubsw m4, m0, m5 ; t9a t8a
paddsw m0, m5 ; t1a t0a
psubsw m5, m1, m6 ; t11a t10a
paddsw m1, m6 ; t3a t2a
psubsw m6, m2, m7 ; t13a t12a
paddsw m2, m7 ; t5a t4a
psubsw m7, m3, m8 ; t15a t14a
paddsw m3, m8 ; t7a t6a
vpbroadcastd m11, [o(pw_m4017_799)]
vpbroadcastd m12, [o(pw_799_4017)]
pxor m9, m9
ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
psubw m8, m9, m11
psubw m8, m9, m11 ; pw_4017_m799
ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
vpbroadcastd m11, [o(pw_m2276_3406)]
vpbroadcastd m12, [o(pw_3406_2276)]
ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
psubw m8, m9, m11
psubw m8, m9, m11 ; pw_2276_m3406
ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
psubw m8, m1, m3 ; t7 t6
paddw m1, m3 ; t3 t2
psubw m3, m0, m2 ; t5 t4
paddw m0, m2 ; t1 t0
psubw m2, m5, m7 ; t14a t15a
paddw m7, m5 ; t10a t11a
psubw m5, m4, m6 ; t12a t13a
paddw m4, m6 ; t8a t9a
psubsw m8, m1, m3 ; t7 t6
paddsw m1, m3 ; t3 t2
psubsw m3, m0, m2 ; t5 t4
paddsw m0, m2 ; t1 t0
psubsw m2, m5, m7 ; t14a t15a
paddsw m7, m5 ; t10a t11a
psubsw m5, m4, m6 ; t12a t13a
paddsw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)]
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
psubw m6, m9, m11
psubw m6, m9, m11 ; pw_3784_m1567
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)]
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
psubw m6, m9, m11
psubw m6, m9, m11 ; pw_1567_m3784
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
vbroadcasti128 m11, [o(deint_shuf)]
vpbroadcastd m12, [o(pw_2896x8)]
psubw m6, m0, m1 ; t3a t2a
paddw m0, m1 ; -out15 out0
paddw m1, m2, m5 ; -out13 out2
psubw m5, m2 ; t15a t14a
paddw m2, m4, m7 ; -out1 out14
psubw m4, m7 ; t10 t11
psubw m7, m3, m8 ; t6 t7
paddw m8, m3 ; -out3 out12
psubsw m6, m0, m1 ; t3a t2a
paddsw m0, m1 ; -out15 out0
paddsw m1, m2, m5 ; -out13 out2
psubsw m5, m2 ; t15a t14a
paddsw m2, m4, m7 ; -out1 out14
psubsw m4, m7 ; t10 t11
psubsw m7, m3, m8 ; t6 t7
paddsw m8, m3 ; -out3 out12
REPX {pshufb x, m11}, m6, m4, m0, m2
vpblendd m3, m6, m4, 0xcc ; t3a t11
shufps m6, m6, m4, q1032 ; t2a t10
......@@ -2580,25 +2580,25 @@ ALIGN function_align
ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
psubw m8, m2, m6 ; t6
paddw m2, m6 ; t2
psubw m6, m0, m4 ; t4
paddw m0, m4 ; t0
psubw m4, m5, m1 ; t7
paddw m5, m1 ; t3
psubw m1, m7, m3 ; t5
paddw m7, m3 ; t1
psubsw m8, m2, m6 ; t6
paddsw m2, m6 ; t2
psubsw m6, m0, m4 ; t4
paddsw m0, m4 ; t0
psubsw m4, m5, m1 ; t7
paddsw m5, m1 ; t3
psubsw m1, m7, m3 ; t5
paddsw m7, m3 ; t1
ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
psubw m9, m6, m8 ; t7
paddw m6, m8 ; out6
psubsw m9, m6, m8 ; t7
paddsw m6, m8 ; out6
vpbroadcastd m8, [o(pw_2896x8)]
psubw m3, m7, m5 ; t3
paddw m7, m5 ; -out7
psubw m5, m0, m2 ; t2
paddw m0, m2 ; out0
psubw m2, m1, m4 ; t6
paddw m1, m4 ; -out1
psubsw m3, m7, m5 ; t3
paddsw m7, m5 ; -out7
psubsw m5, m0, m2 ; t2
paddsw m0, m2 ; out0
psubsw m2, m1, m4 ; t6
paddsw m1, m4 ; -out1
psubw m4, m5, m3
paddw m3, m5
psubw m5, m2, m9
......@@ -2959,25 +2959,25 @@ ALIGN function_align
mova [rsp+gprsize+32*0], m6 ; tmp3
IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
mova m6, [rsp+gprsize+32*1] ; tmp5
psubw m15, m0, m14 ; out15
paddw m0, m14 ; out0
psubw m14, m2, m13 ; out14
paddw m2, m13 ; out1
psubsw m15, m0, m14 ; out15
paddsw m0, m14 ; out0
psubsw m14, m2, m13 ; out14
paddsw m2, m13 ; out1
mova [rsp+gprsize+32*1], m2
psubw m13, m4, m11 ; out13
paddw m2, m4, m11 ; out2
psubw m11, m8, m7 ; out11
paddw m4, m8, m7 ; out4
psubsw m13, m4, m11 ; out13
paddsw m2, m4, m11 ; out2
psubsw m11, m8, m7 ; out11
paddsw m4, m8, m7 ; out4
mova m7, [rsp+gprsize+32*2] ; tmp7
psubw m10, m6, m5 ; out10
paddw m5, m6 ; out5
psubw m8, m7, m9 ; out8
paddw m7, m9 ; out7
psubw m9, m12, m3 ; out9
paddw m6, m12, m3 ; out6
psubsw m10, m6, m5 ; out10
paddsw m5, m6 ; out5
psubsw m8, m7, m9 ; out8
paddsw m7, m9 ; out7
psubsw m9, m12, m3 ; out9
paddsw m6, m12, m3 ; out6
mova m3, [rsp+gprsize+32*0] ; tmp3
psubw m12, m3, m1 ; out12
paddw m3, m1 ; out3
psubsw m12, m3, m1 ; out12
paddsw m3, m1 ; out3
ret
INV_TXFM_16X16_FN adst, dct
......@@ -3012,24 +3012,24 @@ ALIGN function_align
ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
psubw m0, m2, m10 ; t10a
paddw m2, m10 ; t2a
psubw m10, m13, m5 ; t11a
paddw m13, m5 ; t3a
psubw m5, m6, m14 ; t14a
paddw m6, m14 ; t6a
psubw m14, m9, m1 ; t15a
paddw m9, m1 ; t7a
psubsw m0, m2, m10 ; t10a
paddsw m2, m10 ; t2a
psubsw m10, m13, m5 ; t11a
paddsw m13, m5 ; t3a
psubsw m5, m6, m14 ; t14a
paddsw m6, m14 ; t6a
psubsw m14, m9, m1 ; t15a
paddsw m9, m1 ; t7a
ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
psubw m1, m10, m14 ; t14a
paddw m10, m14 ; t10a
psubw m14, m0, m5 ; t15a
paddw m0, m5 ; t11a
psubw m5, m2, m6 ; t6
paddw m2, m6 ; t2
psubw m6, m13, m9 ; t7
paddw m13, m9 ; t3
psubsw m1, m10, m14 ; t14a
paddsw m10, m14 ; t10a
psubsw m14, m0, m5 ; t15a
paddsw m0, m5 ; t11a
psubsw m5, m2, m6 ; t6
paddsw m2, m6 ; t2
psubsw m6, m13, m9 ; t7
paddsw m13, m9 ; t3
ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
mova m9, [rsp+gprsize+32*0] ; in15
......@@ -3042,46 +3042,46 @@ ALIGN function_align
ITX_MULSUB_2W 11, 6, 2, 10, 15, 1751, 3703 ; t5, t4
ITX_MULSUB_2W 7, 8, 2, 10, 15, 3035, 2751 ; t9, t8
ITX_MULSUB_2W 3, 12, 2, 10, 15, 3857, 1380 ; t13, t12
psubw m10, m4, m8 ; t8a
paddw m8, m4 ; t0a
psubw m4, m9, m7 ; t9a
paddw m9, m7 ; t1a
psubw m7, m6, m12 ; t12a
paddw m6, m12 ; t4a
psubw m12, m11, m3 ; t13a
paddw m11, m3 ; t5a
psubsw m10, m4, m8 ; t8a
paddsw m8, m4 ; t0a
psubsw m4, m9, m7 ; t9a
paddsw m9, m7 ; t1a
psubsw m7, m6, m12 ; t12a
paddsw m6, m12 ; t4a
psubsw m12, m11, m3 ; t13a
paddsw m11, m3 ; t5a
ITX_MULSUB_2W 10, 4, 2, 3, 15, 799, 4017 ; t9, t8
ITX_MULSUB_2W 12, 7, 2, 3, 15, 4017, 799 ; t12, t13
psubw m3, m9, m11 ; t5
paddw m9, m11 ; t1
psubw m11, m4, m12 ; t12a
paddw m4, m12 ; t8a
paddw m12, m8, m6 ; t0
psubw m8, m6 ; t4
paddw m6, m10, m7 ; t9a
psubw m10, m7 ; t13a
psubsw m3, m9, m11 ; t5
paddsw m9, m11 ; t1
psubsw m11, m4, m12 ; t12a
paddsw m4, m12 ; t8a
paddsw m12, m8, m6 ; t0
psubsw m8, m6 ; t4
paddsw m6, m10, m7 ; t9a
psubsw m10, m7 ; t13a
ITX_MULSUB_2W 8, 3, 2, 7, 15, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 11, 10, 2, 7, 15, 1567, 3784 ; t13, t12
mova m7, [rsp+gprsize+32*0] ; t10a
mova m2, [rsp+gprsize+32*1] ; t6a
paddw m15, m9, m13 ; -out15
psubw m9, m13 ; t3a
paddw m13, m11, m1 ; -out13
psubw m11, m1 ; t15a
psubw m1, m4, m7 ; t10
paddw m7, m4 ; -out1
psubw m4, m3, m2 ; t6
paddw m3, m2 ; -out3
paddw m2, m10, m14 ; out2
psubw m10, m14 ; t14a
paddw m14, m6, m0 ; out14
psubw m6, m0 ; t11
paddsw m15, m9, m13 ; -out15
psubsw m9, m13 ; t3a
paddsw m13, m11, m1 ; -out13
psubsw m11, m1 ; t15a
psubsw m1, m4, m7 ; t10
paddsw m7, m4 ; -out1
psubsw m4, m3, m2 ; t6
paddsw m3, m2 ; -out3
paddsw m2, m10, m14 ; out2
psubsw m10, m14 ; t14a
paddsw m14, m6, m0 ; out14
psubsw m6, m0 ; t11
mova m0, [rsp+gprsize+32*2] ; t2
mova [rsp+gprsize+32*1], m7
psubw m7, m12, m0 ; t2a
paddw m0, m12 ; out0
paddw m12, m8, m5 ; out12
psubw m8, m5 ; t7
psubsw m7, m12, m0 ; t2a
paddsw m0, m12 ; out0
paddsw m12, m8, m5 ; out12
psubsw m8, m5 ; t7
paddw m5, m10, m11 ; -out5
psubw m10, m11 ; out10
psubw m11, m4, m8 ; -out11
......@@ -3475,74 +3475,74 @@ ALIGN function_align
ITX_MUL2X_PACK 13, 6, 12, 10, 2440, 3290, 3 ; t22a, t25a
ITX_MUL2X_PACK 11, 6, 12, 10, 3035, 2751, 3 ; t17a, t30a
.main2:
psubw m6, m1, m11 ; t17 t30
paddw m1, m11 ; t16 t31
psubw m11, m9, m14 ; t18 t29
paddw m9, m14 ; t19 t28
psubw m14, m15, m0 ; t21 t26
paddw m15, m0 ; t20 t27
psubw m0, m8, m13 ; t22 t25
paddw m8, m13 ; t23 t24
psubsw m6, m1, m11 ; t17 t30
paddsw m1, m11 ; t16 t31
psubsw m11, m9, m14 ; t18 t29
paddsw m9, m14 ; t19 t28
psubsw m14, m15, m0 ; t21 t26
paddsw m15, m0 ; t20 t27
psubsw m0, m8, m13 ; t22 t25
paddsw m8, m13 ; t23 t24
ITX_MUL2X_PACK 6, 12, 13, 10, 799, 4017, 3 ; t17a t30a
ITX_MUL2X_PACK 11, 12, 13, 10, m4017, 799, 3 ; t18a t29a
ITX_MUL2X_PACK 14, 12, 13, 10, 3406, 2276, 3 ; t21a t26a
ITX_MUL2X_PACK 0, 12, 13, 10, m2276, 3406, 3 ; t22a t25a
psubw m13, m1, m9 ; t19a t28a
paddw m1, m9 ; t16a t31a
psubw m9, m8, m15 ; t20a t27a
paddw m8, m15 ; t23a t24a
psubw m15, m6, m11 ; t18 t29
paddw m6, m11 ; t17 t30
psubw m11, m0, m14 ; t21 t26
paddw m0, m14 ; t22 t25
psubsw m13, m1, m9 ; t19a t28a
paddsw m1, m9 ; t16a t31a
psubsw m9, m8, m15 ; t20a t27a
paddsw m8, m15 ; t23a t24a
psubsw m15, m6, m11 ; t18 t29
paddsw m6, m11 ; t17 t30
psubsw m11, m0, m14 ; t21 t26
paddsw m0, m14 ; t22 t25
ITX_MUL2X_PACK 15, 12, 14, 10, 1567, 3784, 1 ; t18a t29a
ITX_MUL2X_PACK 13, 12, 14, 10, 1567, 3784, 1 ; t19 t28
ITX_MUL2X_PACK 9, 12, 14, 10, m3784, 1567, 1 ; t20 t27
ITX_MUL2X_PACK 11, 12, 14, 10, m3784, 1567, 1 ; t21a t26a
vbroadcasti128 m12, [o(deint_shuf)]
REPX {pshufb x, m12}, m0, m1, m6, m8
psubw m14, m1, m8 ; t23 t24
paddw m1, m8 ; t16 t31
psubw m8, m6, m0 ; t22a t25a
paddw m6, m0 ; t17a t30a
psubw m0, m15, m11 ; t21 t26
paddw m15, m11 ; t18 t29
psubw m11, m13, m9 ; t20a t27a
paddw m13, m9 ; t19a t28a
psubsw m14, m1, m8 ; t23 t24
paddsw m1, m8 ; t16 t31
psubsw m8, m6, m0 ; t22a t25a
paddsw m6, m0 ; t17a t30a
psubsw m0, m15, m11 ; t21 t26
paddsw m15, m11 ; t18 t29
psubsw m11, m13, m9 ; t20a t27a
paddsw m13, m9 ; t19a t28a
vpbroadcastd m12, [o(pw_2896x8)]
punpcklqdq m9, m11, m0 ; t20a t21
punpckhqdq m11, m0 ; t27a t26
punpcklqdq m0, m14, m8 ; t23 t22a
punpckhqdq m14, m8 ; t24 t25a
psubw m8, m11, m9 ; t20 t21a
paddw m11, m9 ; t27 t26a
psubw m9, m14, m0 ; t23a t22
paddw m14, m0 ; t24a t25
REPX {pmulhrsw x, m12}, m8, m9, m14, m11
punpcklqdq m9, m11, m0 ; t20a t21
punpckhqdq m11, m0 ; t27a t26
punpcklqdq m0, m14, m8 ; t23 t22a
punpckhqdq m14, m8 ; t24 t25a
psubw m8, m11, m9 ; t20 t21a
paddw m11, m9 ; t27 t26a
psubw m9, m14, m0 ; t23a t22
paddw m14, m0 ; t24a t25
REPX {pmulhrsw x, m12}, m8, m9, m14, m11
punpcklqdq m0, m1, m6 ; t16 t17a
punpckhqdq m1, m6 ; t31 t30a
psubw m10, m5, m8 ; out20 out21
paddw m5, m8 ; out11 out10
psubw m6, m3, m14 ; out24 out25
paddw m3, m14 ; out7 out6
psubw m8, m7, m0 ; out16 out17
paddw m7, m0 ; out15 out14
psubsw m10, m5, m8 ; out20 out21
paddsw m5, m8 ; out11 out10
psubsw m6, m3, m14 ; out24 out25
paddsw m3, m14 ; out7 out6
psubsw m8, m7, m0 ; out16 out17
paddsw m7, m0 ; out15 out14
mova m0, [rsp+gprsize+0*32]
punpcklqdq m12, m13, m15 ; t19a t18
punpckhqdq m13, m15 ; t28a t29
psubw m15, m0, m1 ; out31 out30
paddw m0, m1 ; out0 out1
psubsw m15, m0, m1 ; out31 out30
paddsw m0, m1 ; out0 out1
mova m1, [rsp+gprsize+1*32]
mova [rsp+gprsize+0*32], m6
mova m6, [rsp+gprsize+2*32]
psubw m14, m1, m13 ; out28 out29
paddw m1, m13 ; out3 out2
psubw m13, m2, m11 ; out27 out26
paddw m2, m11 ; out4 out5
psubw m11, m4, m9 ; out23 out22
paddw m4, m9 ; out8 out9
psubw m9, m6, m12 ; out19 out18
paddw m6, m12 ; out12 out13
psubsw m14, m1, m13 ; out28 out29
paddsw m1, m13 ; out3 out2
psubsw m13, m2, m11 ; out27 out26
paddsw m2, m11 ; out4 out5
psubsw m11, m4, m9 ; out23 out22
paddsw m4