Commit a755b6e3 authored by Henrik Gramner

Clip coefficients in SSSE3/AVX2 inverse transform asm

parent eb01bdb9
Pipeline #3601 passed with stages in 4 minutes and 56 seconds
...@@ -231,20 +231,20 @@ SECTION .text ...@@ -231,20 +231,20 @@ SECTION .text
psubw m%1, m%3 psubw m%1, m%3
pmulhrsw m%1, m%6 ; t1 pmulhrsw m%1, m%6 ; t1
pmulhrsw m%5, m%6 ; t0 pmulhrsw m%5, m%6 ; t0
psubw m%3, m%1, m%2 psubsw m%3, m%1, m%2
paddw m%2, m%1 paddsw m%2, m%1
paddw m%1, m%5, m%4 paddsw m%1, m%5, m%4
psubw m%4, m%5, m%4 psubsw m%4, m%5, m%4
%endmacro %endmacro
%macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048 %macro IDCT8_1D 11 ; src[1-8], tmp[1-2], pd_2048
ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a ITX_MULSUB_2W %6, %4, %9, %10, %11, 3406, 2276 ; t5a, t6a
ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a ITX_MULSUB_2W %2, %8, %9, %10, %11, 799, 4017 ; t4a, t7a
ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3 ITX_MULSUB_2W %3, %7, %9, %10, %11, 1567, 3784 ; t2, t3
paddw m%9, m%2, m%6 ; t4 paddsw m%9, m%2, m%6 ; t4
psubw m%2, m%6 ; t5a psubsw m%2, m%6 ; t5a
paddw m%10, m%8, m%4 ; t7 paddsw m%10, m%8, m%4 ; t7
psubw m%8, m%4 ; t6a psubsw m%8, m%4 ; t6a
vpbroadcastd m%4, [o(pw_2896x8)] vpbroadcastd m%4, [o(pw_2896x8)]
psubw m%6, m%1, m%5 psubw m%6, m%1, m%5
paddw m%1, m%5 paddw m%1, m%5
...@@ -254,18 +254,18 @@ SECTION .text ...@@ -254,18 +254,18 @@ SECTION .text
pmulhrsw m%6, m%4 ; t1 pmulhrsw m%6, m%4 ; t1
pmulhrsw m%8, m%4 ; t6 pmulhrsw m%8, m%4 ; t6
pmulhrsw m%5, m%4 ; t5 pmulhrsw m%5, m%4 ; t5
psubw m%4, m%1, m%7 ; dct4 out3 psubsw m%4, m%1, m%7 ; dct4 out3
paddw m%1, m%7 ; dct4 out0 paddsw m%1, m%7 ; dct4 out0
paddw m%7, m%6, m%3 ; dct4 out1 paddsw m%7, m%6, m%3 ; dct4 out1
psubw m%6, m%3 ; dct4 out2 psubsw m%6, m%3 ; dct4 out2
paddw m%2, m%7, m%8 ; out1 paddsw m%2, m%7, m%8 ; out1
psubw m%7, m%8 ; out6 psubsw m%7, m%8 ; out6
psubw m%8, m%1, m%10 ; out7 psubsw m%8, m%1, m%10 ; out7
paddw m%1, m%10 ; out0 paddsw m%1, m%10 ; out0
paddw m%3, m%6, m%5 ; out2 paddsw m%3, m%6, m%5 ; out2
psubw m%6, m%5 ; out5 psubsw m%6, m%5 ; out5
psubw m%5, m%4, m%9 ; out4 psubsw m%5, m%4, m%9 ; out4
paddw m%4, m%9 ; out3 paddsw m%4, m%9 ; out3
%endmacro %endmacro
; in1 = %1, in3 = %2, in5 = %3, in7 = %4 ; in1 = %1, in3 = %2, in5 = %3, in7 = %4
...@@ -275,25 +275,25 @@ SECTION .text ...@@ -275,25 +275,25 @@ SECTION .text
ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a ITX_MULSUB_2W %5, %4, %9, %10, %11, 3166, 2598 ; t9a, t14a
ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a ITX_MULSUB_2W %3, %6, %9, %10, %11, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a ITX_MULSUB_2W %7, %2, %9, %10, %11, 3920, 1189 ; t11a, t12a
psubw m%9, m%2, m%6 ; t13 psubsw m%9, m%2, m%6 ; t13
paddw m%6, m%2 ; t12 paddsw m%6, m%2 ; t12
psubw m%2, m%8, m%4 ; t14 psubsw m%2, m%8, m%4 ; t14
paddw m%8, m%4 ; t15 paddsw m%8, m%4 ; t15
psubw m%4, m%7, m%3 ; t10 psubsw m%4, m%7, m%3 ; t10
paddw m%3, m%7 ; t11 paddsw m%3, m%7 ; t11
psubw m%7, m%1, m%5 ; t9 psubsw m%7, m%1, m%5 ; t9
paddw m%1, m%5 ; t8 paddsw m%1, m%5 ; t8
ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a ITX_MULSUB_2W %2, %7, %5, %10, %11, 1567, 3784 ; t9a, t14a
ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a ITX_MULSUB_2W %9, %4, %5, %10, %11, m3784, 1567 ; t10a, t13a
vpbroadcastd m%10, [o(pw_2896x8)] vpbroadcastd m%10, [o(pw_2896x8)]
psubw m%5, m%2, m%9 ; t10 psubsw m%5, m%2, m%9 ; t10
paddw m%2, m%9 ; t9 paddsw m%2, m%9 ; t9
psubw m%9, m%1, m%3 ; t11a psubsw m%9, m%1, m%3 ; t11a
paddw m%1, m%3 ; t8a paddsw m%1, m%3 ; t8a
psubw m%3, m%7, m%4 ; t13 psubsw m%3, m%7, m%4 ; t13
paddw m%7, m%4 ; t14 paddsw m%7, m%4 ; t14
psubw m%4, m%8, m%6 ; t12a psubsw m%4, m%8, m%6 ; t12a
paddw m%8, m%6 ; t15a paddsw m%8, m%6 ; t15a
paddw m%6, m%3, m%5 ; t13a paddw m%6, m%3, m%5 ; t13a
psubw m%3, m%5 ; t10a psubw m%3, m%5 ; t10a
paddw m%5, m%4, m%9 ; t12 paddw m%5, m%4, m%9 ; t12
...@@ -458,8 +458,8 @@ ALIGN function_align ...@@ -458,8 +458,8 @@ ALIGN function_align
vpbroadcastd m4, [o(pw_2896x8)] vpbroadcastd m4, [o(pw_2896x8)]
pmulhrsw m0, m4 ; t0 t1 pmulhrsw m0, m4 ; t0 t1
%endif %endif
psubw m1, m0, m2 ; out3 out2 psubsw m1, m0, m2 ; out3 out2
paddw m0, m2 ; out0 out1 paddsw m0, m2 ; out0 out1
%endmacro %endmacro
%macro IADST4_1D_PACKED 0 %macro IADST4_1D_PACKED 0
...@@ -693,22 +693,22 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 ...@@ -693,22 +693,22 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a ITX_MUL2X_PACK 4, 1, 2, 6, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2 ITX_MUL2X_PACK 3, 1, 2, 6, 1567, 3784 ; t3 t2
vpbroadcastd m6, [o(pw_2896x8)] vpbroadcastd m6, [o(pw_2896x8)]
psubw m2, m5, m4 ; t4 t7 psubsw m2, m5, m4 ; t4 t7
paddw m5, m4 ; t5a t6a paddsw m5, m4 ; t5a t6a
pshufd m4, m2, q1032 pshufd m4, m2, q1032
psubw m1, m2, m4 psubw m1, m2, m4
paddw m4, m2 paddw m4, m2
vpblendd m4, m4, m1, 0xcc vpblendd m4, m4, m1, 0xcc
pmulhrsw m0, m6 ; t0 t1 pmulhrsw m0, m6 ; t0 t1
pmulhrsw m4, m6 ; t6 t5 pmulhrsw m4, m6 ; t6 t5
psubw m1, m0, m3 ; tmp3 tmp2 psubsw m1, m0, m3 ; tmp3 tmp2
paddw m0, m3 ; tmp0 tmp1 paddsw m0, m3 ; tmp0 tmp1
shufps m2, m5, m4, q1032 ; t7 t6 shufps m2, m5, m4, q1032 ; t7 t6
vpblendd m5, m5, m4, 0xcc ; t4 t5 vpblendd m5, m5, m4, 0xcc ; t4 t5
psubw m3, m0, m2 ; out7 out6 psubsw m3, m0, m2 ; out7 out6
paddw m0, m2 ; out0 out1 paddsw m0, m2 ; out0 out1
psubw m2, m1, m5 ; out4 out5 psubsw m2, m1, m5 ; out4 out5
paddw m1, m5 ; out3 out2 paddsw m1, m5 ; out3 out2
%endmacro %endmacro
%macro IADST8_1D_PACKED 0 %macro IADST8_1D_PACKED 0
...@@ -721,19 +721,19 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 ...@@ -721,19 +721,19 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a
ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a
ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189 ; t6a t7a
psubw m4, m0, m2 ; t4 t5 psubsw m4, m0, m2 ; t4 t5
paddw m0, m2 ; t0 t1 paddsw m0, m2 ; t0 t1
psubw m5, m1, m3 ; t6 t7 psubsw m5, m1, m3 ; t6 t7
paddw m1, m3 ; t2 t3 paddsw m1, m3 ; t2 t3
shufps m2, m5, m4, q1032 shufps m2, m5, m4, q1032
punpckhwd m4, m2 punpckhwd m4, m2
punpcklwd m5, m2 punpcklwd m5, m2
ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 1 ; t5a t4a
ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567 ; t7a t6a
psubw m2, m0, m1 ; t2 t3 psubsw m2, m0, m1 ; t2 t3
paddw m0, m1 ; out0 -out7 paddsw m0, m1 ; out0 -out7
psubw m1, m4, m5 ; t7 t6 psubsw m1, m4, m5 ; t7 t6
paddw m4, m5 ; out6 -out1 paddsw m4, m5 ; out6 -out1
vpbroadcastd m5, [o(pw_2896x8)] vpbroadcastd m5, [o(pw_2896x8)]
vpblendd m3, m0, m4, 0x33 ; out6 -out7 vpblendd m3, m0, m4, 0x33 ; out6 -out7
vpblendd m0, m0, m4, 0xcc ; out0 -out1 vpblendd m0, m0, m4, 0xcc ; out0 -out1
...@@ -981,10 +981,10 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 ...@@ -981,10 +981,10 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a ITX_MUL2X_PACK 7, 2, 4, 10, 799, 4017, 1 ; t4a t7a
ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a ITX_MUL2X_PACK 3, 2, 4, 10, 3406, 2276, 1 ; t5a t6a
ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2 ITX_MUL2X_PACK 6, 2, 4, 10, 1567, 3784 ; t3 t2
psubw m2, m8, m0 ; t9 t14 psubsw m2, m8, m0 ; t9 t14
paddw m8, m0 ; t8 t15 paddsw m8, m0 ; t8 t15
psubw m0, m1, m5 ; t10 t13 psubsw m0, m1, m5 ; t10 t13
paddw m1, m5 ; t11 t12 paddsw m1, m5 ; t11 t12
%if mmsize > 16 %if mmsize > 16
vbroadcasti128 m5, [o(deint_shuf)] vbroadcasti128 m5, [o(deint_shuf)]
%else %else
...@@ -996,12 +996,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 ...@@ -996,12 +996,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a ITX_MUL2X_PACK 2, 4, _, 10, 4, 5, 4 ; t9a t14a
vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567 vpbroadcastd m4, [o(pw_m1567_m3784)] ; reuse pw_m3784_1567
ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a ITX_MUL2X_PACK 0, 5, _, 10, 5, 4, 4 ; t10a t13a
psubw m5, m7, m3 ; t5a t6a psubsw m5, m7, m3 ; t5a t6a
paddw m7, m3 ; t4 t7 paddsw m7, m3 ; t4 t7
psubw m4, m8, m1 ; t11a t12a psubsw m4, m8, m1 ; t11a t12a
paddw m8, m1 ; t8a t15a paddsw m8, m1 ; t8a t15a
paddw m1, m2, m0 ; t9 t14 paddsw m1, m2, m0 ; t9 t14
psubw m2, m0 ; t10 t13 psubsw m2, m0 ; t10 t13
punpckhqdq m0, m8, m1 ; t15a t14 punpckhqdq m0, m8, m1 ; t15a t14
punpcklqdq m8, m1 ; t8a t9 punpcklqdq m8, m1 ; t8a t9
pshufd m3, m5, q1032 pshufd m3, m5, q1032
...@@ -1019,20 +1019,20 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 ...@@ -1019,20 +1019,20 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw m5, m1 ; t12 t13a pmulhrsw m5, m1 ; t12 t13a
shufps m2, m7, m3, q1032 ; t7 t6 shufps m2, m7, m3, q1032 ; t7 t6
vpblendd m7, m7, m3, 0xcc ; t4 t5 vpblendd m7, m7, m3, 0xcc ; t4 t5
psubw m1, m9, m6 ; dct4 out3 out2 psubsw m1, m9, m6 ; dct4 out3 out2
paddw m9, m6 ; dct4 out0 out1 paddsw m9, m6 ; dct4 out0 out1
psubw m3, m9, m2 ; dct8 out7 out6 psubsw m3, m9, m2 ; dct8 out7 out6
paddw m9, m2 ; dct8 out0 out1 paddsw m9, m2 ; dct8 out0 out1
psubw m2, m1, m7 ; dct8 out4 out5 psubsw m2, m1, m7 ; dct8 out4 out5
paddw m1, m7 ; dct8 out3 out2 paddsw m1, m7 ; dct8 out3 out2
psubw m7, m9, m0 ; out15 out14 psubsw m7, m9, m0 ; out15 out14
paddw m0, m9 ; out0 out1 paddsw m0, m9 ; out0 out1
psubw m6, m1, m5 ; out12 out13 psubsw m6, m1, m5 ; out12 out13
paddw m1, m5 ; out3 out2 paddsw m1, m5 ; out3 out2
psubw m5, m2, m4 ; out11 out10 psubsw m5, m2, m4 ; out11 out10
paddw m2, m4 ; out4 out5 paddsw m2, m4 ; out4 out5
psubw m4, m3, m8 ; out8 out9 psubsw m4, m3, m8 ; out8 out9
paddw m3, m8 ; out7 out6 paddsw m3, m8 ; out7 out6
%endmacro %endmacro
INV_TXFM_4X16_FN dct, dct, 0 INV_TXFM_4X16_FN dct, dct, 0
...@@ -1153,20 +1153,20 @@ ALIGN function_align ...@@ -1153,20 +1153,20 @@ ALIGN function_align
ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3 ITX_MUL4X_PACK 1, 2, 5, 6, 8, 1751, 3703, 2440, 3290, 3
ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3 ITX_MUL4X_PACK 3, 2, 5, 6, 8, 3035, 2751, 3513, 2106, 3
ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3 ITX_MUL4X_PACK 4, 2, 5, 6, 8, 3857, 1380, 4052, 601, 3
psubw m2, m0, m3 ; t9a t8a t11a t10a psubsw m2, m0, m3 ; t9a t8a t11a t10a
paddw m0, m3 ; t1a t0a t3a t2a paddsw m0, m3 ; t1a t0a t3a t2a
psubw m3, m1, m4 ; t13a t12a t15a t14a psubsw m3, m1, m4 ; t13a t12a t15a t14a
paddw m1, m4 ; t5a t4a t7a t6a paddsw m1, m4 ; t5a t4a t7a t6a
ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3 ITX_MUL4X_PACK 2, 4, 5, 6, 8, 799, 4017, 3406, 2276, 3
psubw m6, m7, m5 psubw m6, m7, m5
ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6 ITX_MUL2X_PACK 3, 5, _, 8, 6, 4, 6
vpbroadcastd m6, [o(pw_m3784_1567)] vpbroadcastd m6, [o(pw_m3784_1567)]
vpbroadcastd m5, [o(pw_1567_3784)] vpbroadcastd m5, [o(pw_1567_3784)]
psubw m4, m0, m1 ; t5 t4 t7 t6 psubsw m4, m0, m1 ; t5 t4 t7 t6
paddw m0, m1 ; t1 t0 t3 t2 paddsw m0, m1 ; t1 t0 t3 t2
psubw m1, m2, m3 ; t13a t12a t15a t14a psubsw m1, m2, m3 ; t13a t12a t15a t14a
paddw m2, m3 ; t9a t8a t11a t10a paddsw m2, m3 ; t9a t8a t11a t10a
psubw m3, m7, m6 psubw m3, m7, m6 ; pw_3784_m1567
vpblendd m6, m6, m3, 0xf0 vpblendd m6, m6, m3, 0xf0
ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
...@@ -1179,10 +1179,10 @@ ALIGN function_align ...@@ -1179,10 +1179,10 @@ ALIGN function_align
vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13 vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13
vpbroadcastd m5, [o(pw_2896x8)] vpbroadcastd m5, [o(pw_2896x8)]
pshufd m2, m2, q1032 ; t6a t7a t14 t15 pshufd m2, m2, q1032 ; t6a t7a t14 t15
psubw m1, m0, m3 ; t3a t2a t11 t10 psubsw m1, m0, m3 ; t3a t2a t11 t10
paddw m0, m3 ; -out15 out0 out14 -out1 paddsw m0, m3 ; -out15 out0 out14 -out1
paddw m3, m4, m2 ; -out3 out12 out2 -out13 paddsw m3, m4, m2 ; -out3 out12 out2 -out13
psubw m4, m2 ; t6 t7 t14a t15a psubsw m4, m2 ; t6 t7 t14a t15a
shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a
vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a
paddw m1, m2, m4 paddw m1, m2, m4
...@@ -1902,53 +1902,53 @@ ALIGN function_align ...@@ -1902,53 +1902,53 @@ ALIGN function_align
ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11 ITX_MUL2X_PACK 6, 4, 9, 10, 3513, 2106, 3 ; t10 t11
ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13 ITX_MUL2X_PACK 7, 4, 9, 10, 3857, 1380, 3 ; t12 t13
ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15 ITX_MUL2X_PACK 8, 4, 9, 10, 4052, 601, 3 ; t14 t15
psubw m4, m0, m5 ; t9a t8a psubsw m4, m0, m5 ; t9a t8a
paddw m0, m5 ; t1a t0a paddsw m0, m5 ; t1a t0a
psubw m5, m1, m6 ; t11a t10a psubsw m5, m1, m6 ; t11a t10a
paddw m1, m6 ; t3a t2a paddsw m1, m6 ; t3a t2a
psubw m6, m2, m7 ; t13a t12a psubsw m6, m2, m7 ; t13a t12a
paddw m2, m7 ; t5a t4a paddsw m2, m7 ; t5a t4a
psubw m7, m3, m8 ; t15a t14a psubsw m7, m3, m8 ; t15a t14a
paddw m3, m8 ; t7a t6a paddsw m3, m8 ; t7a t6a
vpbroadcastd m11, [o(pw_m4017_799)] vpbroadcastd m11, [o(pw_m4017_799)]
vpbroadcastd m12, [o(pw_799_4017)] vpbroadcastd m12, [o(pw_799_4017)]
pxor m9, m9 pxor m9, m9
ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9 ITX_MUL2X_PACK 4, 8, _, 10, 11, 12, 6 ; t8 t9
psubw m8, m9, m11 psubw m8, m9, m11 ; pw_4017_m799
ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13 ITX_MUL2X_PACK 6, 12, _, 10, 12, 8, 6 ; t12 t13
vpbroadcastd m11, [o(pw_m2276_3406)] vpbroadcastd m11, [o(pw_m2276_3406)]
vpbroadcastd m12, [o(pw_3406_2276)] vpbroadcastd m12, [o(pw_3406_2276)]
ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11 ITX_MUL2X_PACK 5, 8, _, 10, 11, 12, 6 ; t10 t11
psubw m8, m9, m11 psubw m8, m9, m11 ; pw_2276_m3406
ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15 ITX_MUL2X_PACK 7, 12, _, 10, 12, 8, 6 ; t14 t15
psubw m8, m1, m3 ; t7 t6 psubsw m8, m1, m3 ; t7 t6
paddw m1, m3 ; t3 t2 paddsw m1, m3 ; t3 t2
psubw m3, m0, m2 ; t5 t4 psubsw m3, m0, m2 ; t5 t4
paddw m0, m2 ; t1 t0 paddsw m0, m2 ; t1 t0
psubw m2, m5, m7 ; t14a t15a psubsw m2, m5, m7 ; t14a t15a
paddw m7, m5 ; t10a t11a paddsw m7, m5 ; t10a t11a
psubw m5, m4, m6 ; t12a t13a psubsw m5, m4, m6 ; t12a t13a
paddw m4, m6 ; t8a t9a paddsw m4, m6 ; t8a t9a
vpbroadcastd m11, [o(pw_m3784_1567)] vpbroadcastd m11, [o(pw_m3784_1567)]
vpbroadcastd m12, [o(pw_1567_3784)] vpbroadcastd m12, [o(pw_1567_3784)]
ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a
psubw m6, m9, m11 psubw m6, m9, m11 ; pw_3784_m1567
ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a
vpbroadcastd m11, [o(pw_m1567_3784)] vpbroadcastd m11, [o(pw_m1567_3784)]
vpbroadcastd m12, [o(pw_3784_1567)] vpbroadcastd m12, [o(pw_3784_1567)]
ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14 ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14
psubw m6, m9, m11 psubw m6, m9, m11 ; pw_1567_m3784
ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12 ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12
vbroadcasti128 m11, [o(deint_shuf)] vbroadcasti128 m11, [o(deint_shuf)]
vpbroadcastd m12, [o(pw_2896x8)] vpbroadcastd m12, [o(pw_2896x8)]
psubw m6, m0, m1 ; t3a t2a psubsw m6, m0, m1 ; t3a t2a
paddw m0, m1 ; -out15 out0 paddsw m0, m1 ; -out15 out0
paddw m1, m2, m5 ; -out13 out2 paddsw m1, m2, m5 ; -out13 out2
psubw m5, m2 ; t15a t14a psubsw m5, m2 ; t15a t14a
paddw m2, m4, m7 ; -out1 out14 paddsw m2, m4, m7 ; -out1 out14
psubw m4, m7 ; t10 t11 psubsw m4, m7 ; t10 t11
psubw m7, m3, m8 ; t6 t7 psubsw m7, m3, m8 ; t6 t7
paddw m8, m3 ; -out3 out12 paddsw m8, m3 ; -out3 out12
REPX {pshufb x, m11}, m6, m4, m0, m2 REPX {pshufb x, m11}, m6, m4, m0, m2
vpblendd m3, m6, m4, 0xcc ; t3a t11 vpblendd m3, m6, m4, 0xcc ; t3a t11
shufps m6, m6, m4, q1032 ; t2a t10 shufps m6, m6, m4, q1032 ; t2a t10
...@@ -2580,25 +2580,25 @@ ALIGN function_align ...@@ -2580,25 +2580,25 @@ ALIGN function_align
ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a ITX_MULSUB_2W 3, 4, 8, 9, 10, 3166, 2598 ; t5a, t4a
ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a ITX_MULSUB_2W 1, 6, 8, 9, 10, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a ITX_MULSUB_2W 5, 2, 8, 9, 10, 1931, 3612 ; t3a, t2a
psubw m8, m2, m6 ; t6 psubsw m8, m2, m6 ; t6
paddw m2, m6 ; t2 paddsw m2, m6 ; t2
psubw m6, m0, m4 ; t4 psubsw m6, m0, m4 ; t4
paddw m0, m4 ; t0 paddsw m0, m4 ; t0
psubw m4, m5, m1 ; t7 psubsw m4, m5, m1 ; t7
paddw m5, m1 ; t3 paddsw m5, m1 ; t3
psubw m1, m7, m3 ; t5 psubsw m1, m7, m3 ; t5
paddw m7, m3 ; t1 paddsw m7, m3 ; t1
ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a ITX_MULSUB_2W 6, 1, 3, 9, 10, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a
psubw m9, m6, m8 ; t7 psubsw m9, m6, m8 ; t7
paddw m6, m8 ; out6 paddsw m6, m8 ; out6
vpbroadcastd m8, [o(pw_2896x8)] vpbroadcastd m8, [o(pw_2896x8)]
psubw m3, m7, m5 ; t3 psubsw m3, m7, m5 ; t3
paddw m7, m5 ; -out7 paddsw m7, m5 ; -out7
psubw m5, m0, m2 ; t2 psubsw m5, m0, m2 ; t2
paddw m0, m2 ; out0 paddsw m0, m2 ; out0
psubw m2, m1, m4 ; t6 psubsw m2, m1, m4 ; t6
paddw m1, m4 ; -out1 paddsw m1, m4 ; -out1
psubw m4, m5, m3 psubw m4, m5, m3
paddw m3, m5 paddw m3, m5
psubw m5, m2, m9 psubw m5, m2, m9
...@@ -2959,25 +2959,25 @@ ALIGN function_align ...@@ -2959,25 +2959,25 @@ ALIGN function_align
mova [rsp+gprsize+32*0], m6 ; tmp3 mova [rsp+gprsize+32*0], m6 ; tmp3
IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15 IDCT16_1D_ODDHALF 9, 3, 5, 7, 1, 11, 13, 14, 6, 10, 15
mova m6, [rsp+gprsize+32*1] ; tmp5 mova m6, [rsp+gprsize+32*1] ; tmp5
psubw m15, m0, m14 ; out15 psubsw m15, m0, m14 ; out15
paddw m0, m14 ; out0 paddsw m0, m14 ; out0
psubw m14, m2, m13 ; out14 psubsw m14, m2, m13 ; out14
paddw m2, m13 ; out1 paddsw m2, m13 ; out1
mova [rsp+gprsize+32*1], m2 mova [rsp+gprsize+32*1], m2
psubw m13, m4, m11 ; out13 psubsw m13, m4, m11 ; out13
paddw m2, m4, m11 ; out2 paddsw m2, m4, m11 ; out2
psubw m11, m8, m7 ; out11 psubsw m11, m8, m7 ; out11
paddw m4, m8, m7 ; out4 paddsw m4, m8, m7 ; out4
mova m7, [rsp+gprsize+32*2] ; tmp7 mova m7, [rsp+gprsize+32*2] ; tmp7
psubw m10, m6, m5 ; out10 psubsw m10, m6, m5 ; out10
paddw m5, m6 ; out5 paddsw m5, m6 ; out5
psubw m8, m7, m9 ; out8 psubsw m8, m7, m9 ; out8
paddw m7, m9 ; out7 paddsw m7, m9 ; out7
psubw m9, m12, m3 ; out9 psubsw m9, m12, m3 ; out9
paddw m6, m12, m3 ; out6 paddsw m6, m12, m3 ; out6
mova m3, [rsp+gprsize+32*0] ; tmp3 mova m3, [rsp+gprsize+32*0] ; tmp3
psubw m12, m3, m1 ; out12 psubsw m12, m3, m1 ; out12
paddw m3, m1 ; out3 paddsw m3, m1 ; out3
ret ret
INV_TXFM_16X16_FN adst, dct INV_TXFM_16X16_FN adst, dct
...@@ -3012,24 +3012,24 @@ ALIGN function_align ...@@ -3012,24 +3012,24 @@ ALIGN function_align
ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6 ITX_MULSUB_2W 9, 6, 0, 4, 15, 2440, 3290 ; t7, t6
ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10 ITX_MULSUB_2W 5, 10, 0, 4, 15, 3513, 2106 ; t11, t10
ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14 ITX_MULSUB_2W 1, 14, 0, 4, 15, 4052, 601 ; t15, t14
psubw m0, m2, m10 ; t10a psubsw m0, m2, m10 ; t10a
paddw m2, m10 ; t2a paddsw m2, m10 ; t2a
psubw m10, m13, m5 ; t11a psubsw m10, m13, m5 ; t11a
paddw m13, m5 ; t3a paddsw m13, m5 ; t3a
psubw m5, m6, m14 ; t14a psubsw m5, m6, m14 ; t14a
paddw m6, m14 ; t6a paddsw m6, m14 ; t6a
psubw m14, m9, m1 ; t15a psubsw m14, m9, m1 ; t15a
paddw m9, m1 ; t7a paddsw m9, m1 ; t7a
ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10 ITX_MULSUB_2W 0, 10, 1, 4, 15, 3406, 2276 ; t11, t10
ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15 ITX_MULSUB_2W 14, 5, 1, 4, 15, 2276, 3406 ; t14, t15
psubw m1, m10, m14 ; t14a psubsw m1, m10, m14 ; t14a
paddw m10, m14 ; t10a paddsw m10, m14 ; t10a
psubw m14, m0, m5 ; t15a psubsw m14, m0, m5 ; t15a
paddw m0, m5 ; t11a paddsw m0, m5 ; t11a
psubw m5, m2, m6 ; t6 psubsw m5, m2, m6 ; t6
paddw m2, m6 ; t2 paddsw m2, m6 ; t2
psubw m6, m13, m9 ; t7 psubsw m6, m13, m9 ; t7
paddw m13, m9 ; t3 paddsw m13, m9 ; t3
ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a ITX_MULSUB_2W 6, 5, 4, 9, 15, 3784, 1567 ; t6a, t7a
ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15 ITX_MULSUB_2W 14, 1, 4, 9, 15, 3784, 1567 ; t14, t15
mova m9, [rsp+gprsize+32*0] ; in15 mova m9, [rsp+gprsize+32*0] ; in15
...@@ -3042,46 +3042,46 @@ ALIGN function_align ...@@ -3042,46 +3042,46 @@ ALIGN function_align
ITX_MULSUB_2W 11, 6, 2, 10