Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
VideoLAN
dav1d
Commits
a755b6e3
Commit
a755b6e3
authored
Dec 15, 2018
by
Henrik Gramner
Browse files
Clip coefficients in SSSE3/AVX2 inverse transform asm
parent
eb01bdb9
Pipeline
#3601
passed with stages
in 4 minutes and 56 seconds
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/x86/itx.asm
View file @
a755b6e3
...
...
@@ -231,20 +231,20 @@ SECTION .text
psubw
m
%
1
,
m
%
3
pmulhrsw
m
%
1
,
m
%
6
; t1
pmulhrsw
m
%
5
,
m
%
6
; t0
psubw
m
%
3
,
m
%
1
,
m
%
2
paddw
m
%
2
,
m
%
1
paddw
m
%
1
,
m
%
5
,
m
%
4
psubw
m
%
4
,
m
%
5
,
m
%
4
psub
s
w
m
%
3
,
m
%
1
,
m
%
2
padd
s
w
m
%
2
,
m
%
1
padd
s
w
m
%
1
,
m
%
5
,
m
%
4
psub
s
w
m
%
4
,
m
%
5
,
m
%
4
%endmacro
%macro IDCT8_1D 11
; src[1-8], tmp[1-2], pd_2048
ITX_MULSUB_2W
%
6
,
%
4
,
%
9
,
%
10
,
%
11
,
3406
,
2276
; t5a, t6a
ITX_MULSUB_2W
%
2
,
%
8
,
%
9
,
%
10
,
%
11
,
799
,
4017
; t4a, t7a
ITX_MULSUB_2W
%
3
,
%
7
,
%
9
,
%
10
,
%
11
,
1567
,
3784
; t2, t3
paddw
m
%
9
,
m
%
2
,
m
%
6
; t4
psubw
m
%
2
,
m
%
6
; t5a
paddw
m
%
10
,
m
%
8
,
m
%
4
; t7
psubw
m
%
8
,
m
%
4
; t6a
padd
s
w
m
%
9
,
m
%
2
,
m
%
6
; t4
psub
s
w
m
%
2
,
m
%
6
; t5a
padd
s
w
m
%
10
,
m
%
8
,
m
%
4
; t7
psub
s
w
m
%
8
,
m
%
4
; t6a
vpbroadcastd
m
%
4
,
[
o
(
pw_2896x8
)]
psubw
m
%
6
,
m
%
1
,
m
%
5
paddw
m
%
1
,
m
%
5
...
...
@@ -254,18 +254,18 @@ SECTION .text
pmulhrsw
m
%
6
,
m
%
4
; t1
pmulhrsw
m
%
8
,
m
%
4
; t6
pmulhrsw
m
%
5
,
m
%
4
; t5
psubw
m
%
4
,
m
%
1
,
m
%
7
; dct4 out3
paddw
m
%
1
,
m
%
7
; dct4 out0
paddw
m
%
7
,
m
%
6
,
m
%
3
; dct4 out1
psubw
m
%
6
,
m
%
3
; dct4 out2
paddw
m
%
2
,
m
%
7
,
m
%
8
; out1
psubw
m
%
7
,
m
%
8
; out6
psubw
m
%
8
,
m
%
1
,
m
%
10
; out7
paddw
m
%
1
,
m
%
10
; out0
paddw
m
%
3
,
m
%
6
,
m
%
5
; out2
psubw
m
%
6
,
m
%
5
; out5
psubw
m
%
5
,
m
%
4
,
m
%
9
; out4
paddw
m
%
4
,
m
%
9
; out3
psub
s
w
m
%
4
,
m
%
1
,
m
%
7
; dct4 out3
padd
s
w
m
%
1
,
m
%
7
; dct4 out0
padd
s
w
m
%
7
,
m
%
6
,
m
%
3
; dct4 out1
psub
s
w
m
%
6
,
m
%
3
; dct4 out2
padd
s
w
m
%
2
,
m
%
7
,
m
%
8
; out1
psub
s
w
m
%
7
,
m
%
8
; out6
psub
s
w
m
%
8
,
m
%
1
,
m
%
10
; out7
padd
s
w
m
%
1
,
m
%
10
; out0
padd
s
w
m
%
3
,
m
%
6
,
m
%
5
; out2
psub
s
w
m
%
6
,
m
%
5
; out5
psub
s
w
m
%
5
,
m
%
4
,
m
%
9
; out4
padd
s
w
m
%
4
,
m
%
9
; out3
%endmacro
; in1 = %1, in3 = %2, in5 = %3, in7 = %4
...
...
@@ -275,25 +275,25 @@ SECTION .text
ITX_MULSUB_2W
%
5
,
%
4
,
%
9
,
%
10
,
%
11
,
3166
,
2598
; t9a, t14a
ITX_MULSUB_2W
%
3
,
%
6
,
%
9
,
%
10
,
%
11
,
1931
,
3612
; t10a, t13a
ITX_MULSUB_2W
%
7
,
%
2
,
%
9
,
%
10
,
%
11
,
3920
,
1189
; t11a, t12a
psubw
m
%
9
,
m
%
2
,
m
%
6
; t13
paddw
m
%
6
,
m
%
2
; t12
psubw
m
%
2
,
m
%
8
,
m
%
4
; t14
paddw
m
%
8
,
m
%
4
; t15
psubw
m
%
4
,
m
%
7
,
m
%
3
; t10
paddw
m
%
3
,
m
%
7
; t11
psubw
m
%
7
,
m
%
1
,
m
%
5
; t9
paddw
m
%
1
,
m
%
5
; t8
psub
s
w
m
%
9
,
m
%
2
,
m
%
6
; t13
padd
s
w
m
%
6
,
m
%
2
; t12
psub
s
w
m
%
2
,
m
%
8
,
m
%
4
; t14
padd
s
w
m
%
8
,
m
%
4
; t15
psub
s
w
m
%
4
,
m
%
7
,
m
%
3
; t10
padd
s
w
m
%
3
,
m
%
7
; t11
psub
s
w
m
%
7
,
m
%
1
,
m
%
5
; t9
padd
s
w
m
%
1
,
m
%
5
; t8
ITX_MULSUB_2W
%
2
,
%
7
,
%
5
,
%
10
,
%
11
,
1567
,
3784
; t9a, t14a
ITX_MULSUB_2W
%
9
,
%
4
,
%
5
,
%
10
,
%
11
,
m3784
,
1567
; t10a, t13a
vpbroadcastd
m
%
10
,
[
o
(
pw_2896x8
)]
psubw
m
%
5
,
m
%
2
,
m
%
9
; t10
paddw
m
%
2
,
m
%
9
; t9
psubw
m
%
9
,
m
%
1
,
m
%
3
; t11a
paddw
m
%
1
,
m
%
3
; t8a
psubw
m
%
3
,
m
%
7
,
m
%
4
; t13
paddw
m
%
7
,
m
%
4
; t14
psubw
m
%
4
,
m
%
8
,
m
%
6
; t12a
paddw
m
%
8
,
m
%
6
; t15a
psub
s
w
m
%
5
,
m
%
2
,
m
%
9
; t10
padd
s
w
m
%
2
,
m
%
9
; t9
psub
s
w
m
%
9
,
m
%
1
,
m
%
3
; t11a
padd
s
w
m
%
1
,
m
%
3
; t8a
psub
s
w
m
%
3
,
m
%
7
,
m
%
4
; t13
padd
s
w
m
%
7
,
m
%
4
; t14
psub
s
w
m
%
4
,
m
%
8
,
m
%
6
; t12a
padd
s
w
m
%
8
,
m
%
6
; t15a
paddw
m
%
6
,
m
%
3
,
m
%
5
; t13a
psubw
m
%
3
,
m
%
5
; t10a
paddw
m
%
5
,
m
%
4
,
m
%
9
; t12
...
...
@@ -458,8 +458,8 @@ ALIGN function_align
vpbroadcastd
m4
,
[
o
(
pw_2896x8
)]
pmulhrsw
m0
,
m4
; t0 t1
%endif
psubw
m1
,
m0
,
m2
; out3 out2
paddw
m0
,
m2
; out0 out1
psub
s
w
m1
,
m0
,
m2
; out3 out2
padd
s
w
m0
,
m2
; out0 out1
%endmacro
%macro IADST4_1D_PACKED 0
...
...
@@ -693,22 +693,22 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
ITX_MUL2X_PACK
4
,
1
,
2
,
6
,
3406
,
2276
,
1
; t5a t6a
ITX_MUL2X_PACK
3
,
1
,
2
,
6
,
1567
,
3784
; t3 t2
vpbroadcastd
m6
,
[
o
(
pw_2896x8
)]
psubw
m2
,
m5
,
m4
; t4 t7
paddw
m5
,
m4
; t5a t6a
psub
s
w
m2
,
m5
,
m4
; t4 t7
padd
s
w
m5
,
m4
; t5a t6a
pshufd
m4
,
m2
,
q1032
psubw
m1
,
m2
,
m4
paddw
m4
,
m2
vpblendd
m4
,
m4
,
m1
,
0xcc
pmulhrsw
m0
,
m6
; t0 t1
pmulhrsw
m4
,
m6
; t6 t5
psubw
m1
,
m0
,
m3
; tmp3 tmp2
paddw
m0
,
m3
; tmp0 tmp1
psub
s
w
m1
,
m0
,
m3
; tmp3 tmp2
padd
s
w
m0
,
m3
; tmp0 tmp1
shufps
m2
,
m5
,
m4
,
q1032
; t7 t6
vpblendd
m5
,
m5
,
m4
,
0xcc
; t4 t5
psubw
m3
,
m0
,
m2
; out7 out6
paddw
m0
,
m2
; out0 out1
psubw
m2
,
m1
,
m5
; out4 out5
paddw
m1
,
m5
; out3 out2
psub
s
w
m3
,
m0
,
m2
; out7 out6
padd
s
w
m0
,
m2
; out0 out1
psub
s
w
m2
,
m1
,
m5
; out4 out5
padd
s
w
m1
,
m5
; out3 out2
%endmacro
%macro IADST8_1D_PACKED 0
...
...
@@ -721,19 +721,19 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
ITX_MUL2X_PACK
1
,
4
,
5
,
6
,
1931
,
3612
; t2a t3a
ITX_MUL2X_PACK
2
,
4
,
5
,
6
,
3166
,
2598
; t4a t5a
ITX_MUL2X_PACK
3
,
4
,
5
,
6
,
3920
,
1189
; t6a t7a
psubw
m4
,
m0
,
m2
; t4 t5
paddw
m0
,
m2
; t0 t1
psubw
m5
,
m1
,
m3
; t6 t7
paddw
m1
,
m3
; t2 t3
psub
s
w
m4
,
m0
,
m2
; t4 t5
padd
s
w
m0
,
m2
; t0 t1
psub
s
w
m5
,
m1
,
m3
; t6 t7
padd
s
w
m1
,
m3
; t2 t3
shufps
m2
,
m5
,
m4
,
q1032
punpckhwd
m4
,
m2
punpcklwd
m5
,
m2
ITX_MUL2X_PACK
4
,
2
,
3
,
6
,
1567
,
3784
,
1
; t5a t4a
ITX_MUL2X_PACK
5
,
2
,
3
,
6
,
3784
,
1567
; t7a t6a
psubw
m2
,
m0
,
m1
; t2 t3
paddw
m0
,
m1
; out0 -out7
psubw
m1
,
m4
,
m5
; t7 t6
paddw
m4
,
m5
; out6 -out1
psub
s
w
m2
,
m0
,
m1
; t2 t3
padd
s
w
m0
,
m1
; out0 -out7
psub
s
w
m1
,
m4
,
m5
; t7 t6
padd
s
w
m4
,
m5
; out6 -out1
vpbroadcastd
m5
,
[
o
(
pw_2896x8
)]
vpblendd
m3
,
m0
,
m4
,
0x33
; out6 -out7
vpblendd
m0
,
m0
,
m4
,
0xcc
; out0 -out1
...
...
@@ -981,10 +981,10 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK
7
,
2
,
4
,
10
,
799
,
4017
,
1
; t4a t7a
ITX_MUL2X_PACK
3
,
2
,
4
,
10
,
3406
,
2276
,
1
; t5a t6a
ITX_MUL2X_PACK
6
,
2
,
4
,
10
,
1567
,
3784
; t3 t2
psubw
m2
,
m8
,
m0
; t9 t14
paddw
m8
,
m0
; t8 t15
psubw
m0
,
m1
,
m5
; t10 t13
paddw
m1
,
m5
; t11 t12
psub
s
w
m2
,
m8
,
m0
; t9 t14
padd
s
w
m8
,
m0
; t8 t15
psub
s
w
m0
,
m1
,
m5
; t10 t13
padd
s
w
m1
,
m5
; t11 t12
%if mmsize > 16
vbroadcasti128
m5
,
[
o
(
deint_shuf
)]
%else
...
...
@@ -996,12 +996,12 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
ITX_MUL2X_PACK
2
,
4
,
_
,
10
,
4
,
5
,
4
; t9a t14a
vpbroadcastd
m4
,
[
o
(
pw_m1567_m3784
)]
; reuse pw_m3784_1567
ITX_MUL2X_PACK
0
,
5
,
_
,
10
,
5
,
4
,
4
; t10a t13a
psubw
m5
,
m7
,
m3
; t5a t6a
paddw
m7
,
m3
; t4 t7
psubw
m4
,
m8
,
m1
; t11a t12a
paddw
m8
,
m1
; t8a t15a
paddw
m1
,
m2
,
m0
; t9 t14
psubw
m2
,
m0
; t10 t13
psub
s
w
m5
,
m7
,
m3
; t5a t6a
padd
s
w
m7
,
m3
; t4 t7
psub
s
w
m4
,
m8
,
m1
; t11a t12a
padd
s
w
m8
,
m1
; t8a t15a
padd
s
w
m1
,
m2
,
m0
; t9 t14
psub
s
w
m2
,
m0
; t10 t13
punpckhqdq
m0
,
m8
,
m1
; t15a t14
punpcklqdq
m8
,
m1
; t8a t9
pshufd
m3
,
m5
,
q1032
...
...
@@ -1019,20 +1019,20 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
pmulhrsw
m5
,
m1
; t12 t13a
shufps
m2
,
m7
,
m3
,
q1032
; t7 t6
vpblendd
m7
,
m7
,
m3
,
0xcc
; t4 t5
psubw
m1
,
m9
,
m6
; dct4 out3 out2
paddw
m9
,
m6
; dct4 out0 out1
psubw
m3
,
m9
,
m2
; dct8 out7 out6
paddw
m9
,
m2
; dct8 out0 out1
psubw
m2
,
m1
,
m7
; dct8 out4 out5
paddw
m1
,
m7
; dct8 out3 out2
psubw
m7
,
m9
,
m0
; out15 out14
paddw
m0
,
m9
; out0 out1
psubw
m6
,
m1
,
m5
; out12 out13
paddw
m1
,
m5
; out3 out2
psubw
m5
,
m2
,
m4
; out11 out10
paddw
m2
,
m4
; out4 out5
psubw
m4
,
m3
,
m8
; out8 out9
paddw
m3
,
m8
; out7 out6
psub
s
w
m1
,
m9
,
m6
; dct4 out3 out2
padd
s
w
m9
,
m6
; dct4 out0 out1
psub
s
w
m3
,
m9
,
m2
; dct8 out7 out6
padd
s
w
m9
,
m2
; dct8 out0 out1
psub
s
w
m2
,
m1
,
m7
; dct8 out4 out5
padd
s
w
m1
,
m7
; dct8 out3 out2
psub
s
w
m7
,
m9
,
m0
; out15 out14
padd
s
w
m0
,
m9
; out0 out1
psub
s
w
m6
,
m1
,
m5
; out12 out13
padd
s
w
m1
,
m5
; out3 out2
psub
s
w
m5
,
m2
,
m4
; out11 out10
padd
s
w
m2
,
m4
; out4 out5
psub
s
w
m4
,
m3
,
m8
; out8 out9
padd
s
w
m3
,
m8
; out7 out6
%endmacro
INV_TXFM_4X16_FN
dct
,
dct
,
0
...
...
@@ -1153,20 +1153,20 @@ ALIGN function_align
ITX_MUL4X_PACK
1
,
2
,
5
,
6
,
8
,
1751
,
3703
,
2440
,
3290
,
3
ITX_MUL4X_PACK
3
,
2
,
5
,
6
,
8
,
3035
,
2751
,
3513
,
2106
,
3
ITX_MUL4X_PACK
4
,
2
,
5
,
6
,
8
,
3857
,
1380
,
4052
,
601
,
3
psubw
m2
,
m0
,
m3
; t9a t8a t11a t10a
paddw
m0
,
m3
; t1a t0a t3a t2a
psubw
m3
,
m1
,
m4
; t13a t12a t15a t14a
paddw
m1
,
m4
; t5a t4a t7a t6a
psub
s
w
m2
,
m0
,
m3
; t9a t8a t11a t10a
padd
s
w
m0
,
m3
; t1a t0a t3a t2a
psub
s
w
m3
,
m1
,
m4
; t13a t12a t15a t14a
padd
s
w
m1
,
m4
; t5a t4a t7a t6a
ITX_MUL4X_PACK
2
,
4
,
5
,
6
,
8
,
799
,
4017
,
3406
,
2276
,
3
psubw
m6
,
m7
,
m5
ITX_MUL2X_PACK
3
,
5
,
_
,
8
,
6
,
4
,
6
vpbroadcastd
m6
,
[
o
(
pw_m3784_1567
)]
vpbroadcastd
m5
,
[
o
(
pw_1567_3784
)]
psubw
m4
,
m0
,
m1
; t5 t4 t7 t6
paddw
m0
,
m1
; t1 t0 t3 t2
psubw
m1
,
m2
,
m3
; t13a t12a t15a t14a
paddw
m2
,
m3
; t9a t8a t11a t10a
psubw
m3
,
m7
,
m6
psub
s
w
m4
,
m0
,
m1
; t5 t4 t7 t6
padd
s
w
m0
,
m1
; t1 t0 t3 t2
psub
s
w
m1
,
m2
,
m3
; t13a t12a t15a t14a
padd
s
w
m2
,
m3
; t9a t8a t11a t10a
psubw
m3
,
m7
,
m6
; pw_3784_m1567
vpblendd
m6
,
m6
,
m3
,
0xf0
ITX_MUL2X_PACK
4
,
3
,
_
,
8
,
6
,
5
,
4
; t4a t5a t7a t6a
ITX_MUL2X_PACK
1
,
3
,
_
,
8
,
6
,
5
,
4
; t12 t13 t15 t14
...
...
@@ -1179,10 +1179,10 @@ ALIGN function_align
vinserti128
m4
,
m4
,
xm1
,
1
; t4a t5a t12 t13
vpbroadcastd
m5
,
[
o
(
pw_2896x8
)]
pshufd
m2
,
m2
,
q1032
; t6a t7a t14 t15
psubw
m1
,
m0
,
m3
; t3a t2a t11 t10
paddw
m0
,
m3
; -out15 out0 out14 -out1
paddw
m3
,
m4
,
m2
; -out3 out12 out2 -out13
psubw
m4
,
m2
; t6 t7 t14a t15a
psub
s
w
m1
,
m0
,
m3
; t3a t2a t11 t10
padd
s
w
m0
,
m3
; -out15 out0 out14 -out1
padd
s
w
m3
,
m4
,
m2
; -out3 out12 out2 -out13
psub
s
w
m4
,
m2
; t6 t7 t14a t15a
shufps
m2
,
m1
,
m4
,
q1032
; t2a t6 t10 t14a
vpblendd
m4
,
m4
,
m1
,
0x33
; t3a t7 t11 t15a
paddw
m1
,
m2
,
m4
...
...
@@ -1902,53 +1902,53 @@ ALIGN function_align
ITX_MUL2X_PACK
6
,
4
,
9
,
10
,
3513
,
2106
,
3
; t10 t11
ITX_MUL2X_PACK
7
,
4
,
9
,
10
,
3857
,
1380
,
3
; t12 t13
ITX_MUL2X_PACK
8
,
4
,
9
,
10
,
4052
,
601
,
3
; t14 t15
psubw
m4
,
m0
,
m5
; t9a t8a
paddw
m0
,
m5
; t1a t0a
psubw
m5
,
m1
,
m6
; t11a t10a
paddw
m1
,
m6
; t3a t2a
psubw
m6
,
m2
,
m7
; t13a t12a
paddw
m2
,
m7
; t5a t4a
psubw
m7
,
m3
,
m8
; t15a t14a
paddw
m3
,
m8
; t7a t6a
psub
s
w
m4
,
m0
,
m5
; t9a t8a
padd
s
w
m0
,
m5
; t1a t0a
psub
s
w
m5
,
m1
,
m6
; t11a t10a
padd
s
w
m1
,
m6
; t3a t2a
psub
s
w
m6
,
m2
,
m7
; t13a t12a
padd
s
w
m2
,
m7
; t5a t4a
psub
s
w
m7
,
m3
,
m8
; t15a t14a
padd
s
w
m3
,
m8
; t7a t6a
vpbroadcastd
m11
,
[
o
(
pw_m4017_799
)]
vpbroadcastd
m12
,
[
o
(
pw_799_4017
)]
pxor
m9
,
m9
ITX_MUL2X_PACK
4
,
8
,
_
,
10
,
11
,
12
,
6
; t8 t9
psubw
m8
,
m9
,
m11
psubw
m8
,
m9
,
m11
; pw_4017_m799
ITX_MUL2X_PACK
6
,
12
,
_
,
10
,
12
,
8
,
6
; t12 t13
vpbroadcastd
m11
,
[
o
(
pw_m2276_3406
)]
vpbroadcastd
m12
,
[
o
(
pw_3406_2276
)]
ITX_MUL2X_PACK
5
,
8
,
_
,
10
,
11
,
12
,
6
; t10 t11
psubw
m8
,
m9
,
m11
psubw
m8
,
m9
,
m11
; pw_2276_m3406
ITX_MUL2X_PACK
7
,
12
,
_
,
10
,
12
,
8
,
6
; t14 t15
psubw
m8
,
m1
,
m3
; t7 t6
paddw
m1
,
m3
; t3 t2
psubw
m3
,
m0
,
m2
; t5 t4
paddw
m0
,
m2
; t1 t0
psubw
m2
,
m5
,
m7
; t14a t15a
paddw
m7
,
m5
; t10a t11a
psubw
m5
,
m4
,
m6
; t12a t13a
paddw
m4
,
m6
; t8a t9a
psub
s
w
m8
,
m1
,
m3
; t7 t6
padd
s
w
m1
,
m3
; t3 t2
psub
s
w
m3
,
m0
,
m2
; t5 t4
padd
s
w
m0
,
m2
; t1 t0
psub
s
w
m2
,
m5
,
m7
; t14a t15a
padd
s
w
m7
,
m5
; t10a t11a
psub
s
w
m5
,
m4
,
m6
; t12a t13a
padd
s
w
m4
,
m6
; t8a t9a
vpbroadcastd
m11
,
[
o
(
pw_m3784_1567
)]
vpbroadcastd
m12
,
[
o
(
pw_1567_3784
)]
ITX_MUL2X_PACK
3
,
6
,
_
,
10
,
11
,
12
,
4
; t4a t5a
psubw
m6
,
m9
,
m11
psubw
m6
,
m9
,
m11
; pw_3784_m1567
ITX_MUL2X_PACK
8
,
12
,
_
,
10
,
12
,
6
,
4
; t6a t7a
vpbroadcastd
m11
,
[
o
(
pw_m1567_3784
)]
vpbroadcastd
m12
,
[
o
(
pw_3784_1567
)]
ITX_MUL2X_PACK
2
,
6
,
_
,
10
,
11
,
12
,
4
; t15 t14
psubw
m6
,
m9
,
m11
psubw
m6
,
m9
,
m11
; pw_1567_m3784
ITX_MUL2X_PACK
5
,
12
,
_
,
10
,
12
,
6
,
4
; t13 t12
vbroadcasti128
m11
,
[
o
(
deint_shuf
)]
vpbroadcastd
m12
,
[
o
(
pw_2896x8
)]
psubw
m6
,
m0
,
m1
; t3a t2a
paddw
m0
,
m1
; -out15 out0
paddw
m1
,
m2
,
m5
; -out13 out2
psubw
m5
,
m2
; t15a t14a
paddw
m2
,
m4
,
m7
; -out1 out14
psubw
m4
,
m7
; t10 t11
psubw
m7
,
m3
,
m8
; t6 t7
paddw
m8
,
m3
; -out3 out12
psub
s
w
m6
,
m0
,
m1
; t3a t2a
padd
s
w
m0
,
m1
; -out15 out0
padd
s
w
m1
,
m2
,
m5
; -out13 out2
psub
s
w
m5
,
m2
; t15a t14a
padd
s
w
m2
,
m4
,
m7
; -out1 out14
psub
s
w
m4
,
m7
; t10 t11
psub
s
w
m7
,
m3
,
m8
; t6 t7
padd
s
w
m8
,
m3
; -out3 out12
REPX
{
pshufb
x
,
m11
}
,
m6
,
m4
,
m0
,
m2
vpblendd
m3
,
m6
,
m4
,
0xcc
; t3a t11
shufps
m6
,
m6
,
m4
,
q1032
; t2a t10
...
...
@@ -2580,25 +2580,25 @@ ALIGN function_align
ITX_MULSUB_2W
3
,
4
,
8
,
9
,
10
,
3166
,
2598
; t5a, t4a
ITX_MULSUB_2W
1
,
6
,
8
,
9
,
10
,
3920
,
1189
; t7a, t6a
ITX_MULSUB_2W
5
,
2
,
8
,
9
,
10
,
1931
,
3612
; t3a, t2a
psubw
m8
,
m2
,
m6
; t6
paddw
m2
,
m6
; t2
psubw
m6
,
m0
,
m4
; t4
paddw
m0
,
m4
; t0
psubw
m4
,
m5
,
m1
; t7
paddw
m5
,
m1
; t3
psubw
m1
,
m7
,
m3
; t5
paddw
m7
,
m3
; t1
psub
s
w
m8
,
m2
,
m6
; t6
padd
s
w
m2
,
m6
; t2
psub
s
w
m6
,
m0
,
m4
; t4
padd
s
w
m0
,
m4
; t0
psub
s
w
m4
,
m5
,
m1
; t7
padd
s
w
m5
,
m1
; t3
psub
s
w
m1
,
m7
,
m3
; t5
padd
s
w
m7
,
m3
; t1
ITX_MULSUB_2W
6
,
1
,
3
,
9
,
10
,
1567
,
3784
; t5a, t4a
ITX_MULSUB_2W
4
,
8
,
3
,
9
,
10
,
3784
,
1567
; t6a, t7a
psubw
m9
,
m6
,
m8
; t7
paddw
m6
,
m8
; out6
psub
s
w
m9
,
m6
,
m8
; t7
padd
s
w
m6
,
m8
; out6
vpbroadcastd
m8
,
[
o
(
pw_2896x8
)]
psubw
m3
,
m7
,
m5
; t3
paddw
m7
,
m5
; -out7
psubw
m5
,
m0
,
m2
; t2
paddw
m0
,
m2
; out0
psubw
m2
,
m1
,
m4
; t6
paddw
m1
,
m4
; -out1
psub
s
w
m3
,
m7
,
m5
; t3
padd
s
w
m7
,
m5
; -out7
psub
s
w
m5
,
m0
,
m2
; t2
padd
s
w
m0
,
m2
; out0
psub
s
w
m2
,
m1
,
m4
; t6
padd
s
w
m1
,
m4
; -out1
psubw
m4
,
m5
,
m3
paddw
m3
,
m5
psubw
m5
,
m2
,
m9
...
...
@@ -2959,25 +2959,25 @@ ALIGN function_align
mova
[
rsp
+
gprsize
+
32
*
0
],
m6
; tmp3
IDCT16_1D_ODDHALF
9
,
3
,
5
,
7
,
1
,
11
,
13
,
14
,
6
,
10
,
15
mova
m6
,
[
rsp
+
gprsize
+
32
*
1
]
; tmp5
psubw
m15
,
m0
,
m14
; out15
paddw
m0
,
m14
; out0
psubw
m14
,
m2
,
m13
; out14
paddw
m2
,
m13
; out1
psub
s
w
m15
,
m0
,
m14
; out15
padd
s
w
m0
,
m14
; out0
psub
s
w
m14
,
m2
,
m13
; out14
padd
s
w
m2
,
m13
; out1
mova
[
rsp
+
gprsize
+
32
*
1
],
m2
psubw
m13
,
m4
,
m11
; out13
paddw
m2
,
m4
,
m11
; out2
psubw
m11
,
m8
,
m7
; out11
paddw
m4
,
m8
,
m7
; out4
psub
s
w
m13
,
m4
,
m11
; out13
padd
s
w
m2
,
m4
,
m11
; out2
psub
s
w
m11
,
m8
,
m7
; out11
padd
s
w
m4
,
m8
,
m7
; out4
mova
m7
,
[
rsp
+
gprsize
+
32
*
2
]
; tmp7
psubw
m10
,
m6
,
m5
; out10
paddw
m5
,
m6
; out5
psubw
m8
,
m7
,
m9
; out8
paddw
m7
,
m9
; out7
psubw
m9
,
m12
,
m3
; out9
paddw
m6
,
m12
,
m3
; out6
psub
s
w
m10
,
m6
,
m5
; out10
padd
s
w
m5
,
m6
; out5
psub
s
w
m8
,
m7
,
m9
; out8
padd
s
w
m7
,
m9
; out7
psub
s
w
m9
,
m12
,
m3
; out9
padd
s
w
m6
,
m12
,
m3
; out6
mova
m3
,
[
rsp
+
gprsize
+
32
*
0
]
; tmp3
psubw
m12
,
m3
,
m1
; out12
paddw
m3
,
m1
; out3
psub
s
w
m12
,
m3
,
m1
; out12
padd
s
w
m3
,
m1
; out3
ret
INV_TXFM_16X16_FN
adst
,
dct
...
...
@@ -3012,24 +3012,24 @@ ALIGN function_align
ITX_MULSUB_2W
9
,
6
,
0
,
4
,
15
,
2440
,
3290
; t7, t6
ITX_MULSUB_2W
5
,
10
,
0
,
4
,
15
,
3513
,
2106
; t11, t10
ITX_MULSUB_2W
1
,
14
,
0
,
4
,
15
,
4052
,
601
; t15, t14
psubw
m0
,
m2
,
m10
; t10a
paddw
m2
,
m10
; t2a
psubw
m10
,
m13
,
m5
; t11a
paddw
m13
,
m5
; t3a
psubw
m5
,
m6
,
m14
; t14a
paddw
m6
,
m14
; t6a
psubw
m14
,
m9
,
m1
; t15a
paddw
m9
,
m1
; t7a
psub
s
w
m0
,
m2
,
m10
; t10a
padd
s
w
m2
,
m10
; t2a
psub
s
w
m10
,
m13
,
m5
; t11a
padd
s
w
m13
,
m5
; t3a
psub
s
w
m5
,
m6
,
m14
; t14a
padd
s
w
m6
,
m14
; t6a
psub
s
w
m14
,
m9
,
m1
; t15a
padd
s
w
m9
,
m1
; t7a
ITX_MULSUB_2W
0
,
10
,
1
,
4
,
15
,
3406
,
2276
; t11, t10
ITX_MULSUB_2W
14
,
5
,
1
,
4
,
15
,
2276
,
3406
; t14, t15
psubw
m1
,
m10
,
m14
; t14a
paddw
m10
,
m14
; t10a
psubw
m14
,
m0
,
m5
; t15a
paddw
m0
,
m5
; t11a
psubw
m5
,
m2
,
m6
; t6
paddw
m2
,
m6
; t2
psubw
m6
,
m13
,
m9
; t7
paddw
m13
,
m9
; t3
psub
s
w
m1
,
m10
,
m14
; t14a
padd
s
w
m10
,
m14
; t10a
psub
s
w
m14
,
m0
,
m5
; t15a
padd
s
w
m0
,
m5
; t11a
psub
s
w
m5
,
m2
,
m6
; t6
padd
s
w
m2
,
m6
; t2
psub
s
w
m6
,
m13
,
m9
; t7
padd
s
w
m13
,
m9
; t3
ITX_MULSUB_2W
6
,
5
,
4
,
9
,
15
,
3784
,
1567
; t6a, t7a
ITX_MULSUB_2W
14
,
1
,
4
,
9
,
15
,
3784
,
1567
; t14, t15
mova
m9
,
[
rsp
+
gprsize
+
32
*
0
]
; in15
...
...
@@ -3042,46 +3042,46 @@ ALIGN function_align
ITX_MULSUB_2W
11
,
6
,
2
,
10
,
15
,
1751
,
3703
; t5, t4
ITX_MULSUB_2W
7
,
8
,
2
,
10
,
15
,
3035
,
2751
; t9, t8
ITX_MULSUB_2W
3
,
12
,
2
,
10
,
15
,
3857
,
1380
; t13, t12
psubw
m10
,
m4
,
m8
; t8a
paddw
m8
,
m4
; t0a
psubw
m4
,
m9
,
m7
; t9a
paddw
m9
,
m7
; t1a
psubw
m7
,
m6
,
m12
; t12a
paddw
m6
,
m12
; t4a
psubw
m12
,
m11
,
m3
; t13a
paddw
m11
,
m3
; t5a
psub
s
w
m10
,
m4
,
m8
; t8a
padd
s
w
m8
,
m4
; t0a
psub
s
w
m4
,
m9
,
m7
; t9a
padd
s
w
m9
,
m7
; t1a
psub
s
w
m7
,
m6
,
m12
; t12a
padd
s
w
m6
,
m12
; t4a
psub
s
w
m12
,
m11
,
m3
; t13a
padd
s
w
m11
,
m3
; t5a
ITX_MULSUB_2W
10
,
4
,
2
,
3
,
15
,
799
,
4017
; t9, t8
ITX_MULSUB_2W
12
,
7
,
2
,
3
,
15
,
4017
,
799
; t12, t13
psubw
m3
,
m9
,
m11
; t5
paddw
m9
,
m11
; t1
psubw
m11
,
m4
,
m12
; t12a
paddw
m4
,
m12
; t8a
paddw
m12
,
m8
,
m6
; t0
psubw
m8
,
m6
; t4
paddw
m6
,
m10
,
m7
; t9a
psubw
m10
,
m7
; t13a
psub
s
w
m3
,
m9
,
m11
; t5
padd
s
w
m9
,
m11
; t1
psub
s
w
m11
,
m4
,
m12
; t12a
padd
s
w
m4
,
m12
; t8a
padd
s
w
m12
,
m8
,
m6
; t0
psub
s
w
m8
,
m6
; t4
padd
s
w
m6
,
m10
,
m7
; t9a
psub
s
w
m10
,
m7
; t13a
ITX_MULSUB_2W
8
,
3
,
2
,
7
,
15
,
1567
,
3784
; t5a, t4a
ITX_MULSUB_2W
11
,
10
,
2
,
7
,
15
,
1567
,
3784
; t13, t12
mova
m7
,
[
rsp
+
gprsize
+
32
*
0
]
; t10a
mova
m2
,
[
rsp
+
gprsize
+
32
*
1
]
; t6a
paddw
m15
,
m9
,
m13
; -out15
psubw
m9
,
m13
; t3a
paddw
m13
,
m11
,
m1
; -out13
psubw
m11
,
m1
; t15a
psubw
m1
,
m4
,
m7
; t10
paddw
m7
,
m4
; -out1
psubw
m4
,
m3
,
m2
; t6
paddw
m3
,
m2
; -out3
paddw
m2
,
m10
,
m14
; out2
psubw
m10
,
m14
; t14a
paddw
m14
,
m6
,
m0
; out14
psubw
m6
,
m0
; t11
padd
s
w
m15
,
m9
,
m13
; -out15
psub
s
w
m9
,
m13
; t3a
padd
s
w
m13
,
m11
,
m1
; -out13
psub
s
w
m11
,
m1
; t15a
psub
s
w
m1
,
m4
,
m7
; t10
padd
s
w
m7
,
m4
; -out1
psub
s
w
m4
,
m3
,
m2
; t6
padd
s
w
m3
,
m2
; -out3
padd
s
w
m2
,
m10
,
m14
; out2
psub
s
w
m10
,
m14
; t14a
padd
s
w
m14
,
m6
,
m0
; out14
psub
s
w
m6
,
m0
; t11
mova
m0
,
[
rsp
+
gprsize
+
32
*
2
]
; t2
mova
[
rsp
+
gprsize
+
32
*
1
],
m7
psubw
m7
,
m12
,
m0
; t2a
paddw
m0
,
m12
; out0
paddw
m12
,
m8
,
m5
; out12
psubw
m8
,
m5
; t7
psub
s
w
m7
,
m12
,
m0
; t2a
padd
s
w
m0
,
m12
; out0
padd
s
w
m12
,
m8
,
m5
; out12
psub
s
w
m8
,
m5
; t7
paddw
m5
,
m10
,
m11
; -out5
psubw
m10
,
m11
; out10
psubw
m11
,
m4
,
m8
; -out11
...
...
@@ -3475,74 +3475,74 @@ ALIGN function_align
ITX_MUL2X_PACK
13
,
6
,
12
,
10
,
2440
,
3290
,
3
; t22a, t25a
ITX_MUL2X_PACK
11
,
6
,
12
,
10
,
3035
,
2751
,
3
; t17a, t30a
.main2:
psubw
m6
,
m1
,
m11
; t17 t30
paddw
m1
,
m11
; t16 t31
psubw
m11
,
m9
,
m14
; t18 t29
paddw
m9
,
m14
; t19 t28
psubw
m14
,
m15
,
m0
; t21 t26
paddw
m15
,
m0
; t20 t27
psubw
m0
,
m8
,
m13
; t22 t25
paddw
m8
,
m13
; t23 t24
psub
s
w
m6
,
m1
,
m11
; t17 t30
padd
s
w
m1
,
m11
; t16 t31