Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
VideoLAN
dav1d
Commits
d4dfa85c
Commit
d4dfa85c
authored
Oct 01, 2019
by
Henrik Gramner
Browse files
x86: Increase precision of SSSE3 IDCT intermediates
parent
de561b3b
Pipeline
#9842
passed with stages
in 5 minutes and 35 seconds
Changes
1
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/x86/itx_ssse3.asm
View file @
d4dfa85c
...
...
@@ -202,7 +202,7 @@ SECTION .text
ret
%endmacro
; flags: 1 = swap, 2: coef_regs
; flags: 1 = swap, 2: coef_regs
, 4: no_pack
%macro ITX_MUL2X_PACK 5-6 0
; dst/src, tmp[1], rnd, coef[1-2], flags
%if %6 & 2
pmaddwd
m
%
2
,
m
%
4
,
m
%
1
...
...
@@ -218,24 +218,17 @@ SECTION .text
paddd
m
%
1
,
m
%
3
psrad
m
%
2
,
12
psrad
m
%
1
,
12
%if %6 & 4 == 0
packssdw
m
%
1
,
m
%
2
%endif
%endmacro
%macro IDCT4_1D_PACKED 0-1
;pw_2896x8
punpckhwd
m2
,
m0
,
m1
;unpacked in1 in3
psubw
m3
,
m0
,
m1
paddw
m0
,
m1
punpcklqdq
m0
,
m3
;high: in0-in2 ;low: in0+in2
mova
m3
,
[
o
(
pd_2048
)]
punpckhwd
m2
,
m0
,
m1
;unpacked in1 in3
punpcklwd
m0
,
m1
;unpacked in0 in2
ITX_MUL2X_PACK
2
,
1
,
3
,
1567
,
3784
%if %0 == 1
pmulhrsw
m0
,
m
%
1
%else
pmulhrsw
m0
,
[
o
(
pw_2896x8
)]
;high: t1 ;low: t0
%endif
ITX_MUL2X_PACK
0
,
1
,
3
,
2896
,
2896
psubsw
m1
,
m0
,
m2
;high: out2 ;low: out3
paddsw
m0
,
m2
;high: out1 ;low: out0
%endmacro
...
...
@@ -499,79 +492,81 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
%macro IDCT8_1D_PACKED 0
mova
m6
,
[
o
(
pd_2048
)]
punpckhwd
m5
,
m0
,
m3
;unpacked in1 in7
punpckhwd
m4
,
m2
,
m1
;unpacked in5 in3
punpckhwd
m4
,
m0
,
m3
;unpacked in1 in7
punpcklwd
m0
,
m2
;unpacked in0 in4
punpckhwd
m2
,
m1
;unpacked in5 in3
punpcklwd
m1
,
m3
;unpacked in2 in6
psubw
m3
,
m0
,
m2
paddw
m0
,
m2
punpcklqdq
m0
,
m3
;low: in0+in4 high: in0-in4
ITX_MUL2X_PACK
5
,
2
,
6
,
799
,
4017
,
1
;low: t4a high: t7a
ITX_MUL2X_PACK
4
,
2
,
6
,
3406
,
2276
,
1
;low: t5a high: t6a
ITX_MUL2X_PACK
1
,
2
,
6
,
1567
,
3784
;low: t3 high: t2
mova
m6
,
[
o
(
pw_2896x8
)]
psubsw
m2
,
m5
,
m4
;low: t5a high: t6a
paddsw
m5
,
m4
;low: t4 high: t7
punpckhqdq
m4
,
m2
,
m2
;low: t6a high: t6a
psubw
m3
,
m4
,
m2
;low: t6a - t5a
paddw
m4
,
m2
;low: t6a + t5a
punpcklqdq
m4
,
m3
;low: t6a + t5a high: t6a - t5a
pmulhrsw
m0
,
m6
;low: t0 high: t1
pmulhrsw
m4
,
m6
;low: t6 high: t5
shufps
m2
,
m5
,
m4
,
q1032
;low: t7 high: t6
shufps
m5
,
m4
,
q3210
;low: t4 high: t5
psubsw
m4
,
m0
,
m1
;low: tmp3 high: tmp2
ITX_MUL2X_PACK
4
,
3
,
6
,
799
,
4017
;low: t7a high: t4a
ITX_MUL2X_PACK
2
,
3
,
6
,
3406
,
2276
;low: t6a high: t5a
ITX_MUL2X_PACK
1
,
3
,
6
,
1567
,
3784
;low: t3 high: t2
psubsw
m3
,
m4
,
m2
;low: t6a high: t5a
paddsw
m4
,
m2
;low: t7 high: t4
pshufb
m3
,
[
o
(
deint_shuf1
)]
ITX_MUL2X_PACK
0
,
2
,
6
,
2896
,
2896
;low: t0 high: t1
ITX_MUL2X_PACK
3
,
2
,
6
,
2896
,
2896
;low: t6 high: t5
psubsw
m2
,
m0
,
m1
;low: tmp3 high: tmp2
paddsw
m0
,
m1
;low: tmp0 high: tmp1
psubsw
m3
,
m0
,
m2
;low: out7 high: out6
paddsw
m0
,
m2
;low: out0 high: out1
psubsw
m2
,
m4
,
m5
;low: out4 high: out5
paddsw
m1
,
m4
,
m5
;low: out3 high: out2
punpcklqdq
m1
,
m4
,
m3
;low: t7 high: t6
punpckhqdq
m4
,
m3
;low: t4 high: t5
psubsw
m3
,
m0
,
m1
;low: out7 high: out6
paddsw
m0
,
m1
;low: out0 high: out1
paddsw
m1
,
m2
,
m4
;low: out3 high: out2
psubsw
m2
,
m4
;low: out4 high: out5
%endmacro
;dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
;dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
%macro ITX_MULSUB_2W 7
; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
punpckhwd
m
%
3
,
m
%
1
,
m
%
2
%macro ITX_MULSUB_2W 7
-8 0
; dst/src[1-2], tmp[1-2], rnd, coef[1-2]
, dst2_in_tmp1
punpckhwd
m
%
4
,
m
%
1
,
m
%
2
punpcklwd
m
%
1
,
m
%
2
%if %7 < 8
pmaddwd
m
%
2
,
m
%
7
,
m
%
1
pmaddwd
m
%
4
,
m
%
7
,
m
%
3
pmaddwd
m
%
3
,
m
%
7
,
m
%
4
%else
mova
m
%
2
,
[
o
(
pw_
%
7
_
%
6
)]
pmaddwd
m
%
4
,
m
%
3
,
m
%
2
%if %8
pmaddwd
m
%
3
,
m
%
1
,
m
%
2
pmaddwd
m
%
2
,
m
%
4
%else
pmaddwd
m
%
3
,
m
%
4
,
m
%
2
pmaddwd
m
%
2
,
m
%
1
%endif
paddd
m
%
4
,
m
%
5
%endif
paddd
m
%
3
,
m
%
5
paddd
m
%
2
,
m
%
5
psrad
m
%
4
,
12
psrad
m
%
3
,
12
psrad
m
%
2
,
12
packssdw
m
%
2
,
m
%
4
;dst2
%if %8
packssdw
m
%
3
,
m
%
2
%else
packssdw
m
%
2
,
m
%
3
;dst2
%endif
%if %7 < 8
pmaddwd
m
%
3
,
m
%
6
pmaddwd
m
%
4
,
m
%
6
pmaddwd
m
%
1
,
m
%
6
%elif %8
mova
m
%
2
,
[
o
(
pw_
%
6
_m
%
7
)]
pmaddwd
m
%
4
,
m
%
2
pmaddwd
m
%
1
,
m
%
2
%else
mova
m
%
4
,
[
o
(
pw_
%
6
_m
%
7
)]
pmaddwd
m
%
3
,
m
%
4
pmaddwd
m
%
1
,
m
%
4
mova
m
%
3
,
[
o
(
pw_
%
6
_m
%
7
)]
pmaddwd
m
%
4
,
m
%
3
pmaddwd
m
%
1
,
m
%
3
%endif
paddd
m
%
3
,
m
%
5
paddd
m
%
4
,
m
%
5
paddd
m
%
1
,
m
%
5
psrad
m
%
3
,
12
psrad
m
%
4
,
12
psrad
m
%
1
,
12
packssdw
m
%
1
,
m
%
3
;dst1
packssdw
m
%
1
,
m
%
4
;dst1
%endmacro
%macro IDCT4_1D 7
; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W
%
2
,
%
4
,
%
5
,
%
6
,
%
7
,
1567
,
3784
;t2, t3
mova
m
%
6
,
[
o
(
pw_2896x8
)]
paddw
m
%
5
,
m
%
1
,
m
%
3
psubw
m
%
1
,
m
%
3
pmulhrsw
m
%
1
,
m
%
6
;t1
pmulhrsw
m
%
5
,
m
%
6
;t0
psubsw
m
%
3
,
m
%
1
,
m
%
2
;out2
paddsw
m
%
2
,
m
%
1
;out1
paddsw
m
%
1
,
m
%
5
,
m
%
4
;out0
psubsw
m
%
5
,
m
%
4
;out3
mova
m
%
4
,
m
%
5
ITX_MULSUB_2W
%
2
,
%
4
,
%
5
,
%
6
,
%
7
,
1567
,
3784
,
1
;t2, t3
ITX_MULSUB_2W
%
1
,
%
3
,
%
4
,
%
6
,
%
7
,
2896
,
2896
,
1
;t1, t0
psubsw
m
%
3
,
m
%
1
,
m
%
2
;out2
paddsw
m
%
2
,
m
%
1
;out1
paddsw
m
%
1
,
m
%
5
,
m
%
4
;out0
psubsw
m
%
4
,
m
%
5
;out3
%endmacro
%macro WRITE_4X8 4
;row[1-4]
...
...
@@ -1286,17 +1281,13 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%endmacro
%macro IDCT8_1D_ODDHALF 7
; src[1-4], tmp[1-2], pd_2048
ITX_MULSUB_2W
%
1
,
%
4
,
%
5
,
%
6
,
%
7
,
799
,
4017
;t4a, t7a
ITX_MULSUB_2W
%
3
,
%
2
,
%
5
,
%
6
,
%
7
,
3406
,
2276
;t5a, t6a
psubsw
m
%
5
,
m
%
1
,
m
%
3
;t5a
paddsw
m
%
1
,
m
%
3
;t4
psubsw
m
%
6
,
m
%
4
,
m
%
2
;t6a
paddsw
m
%
4
,
m
%
2
;t7
mova
m
%
3
,
[
o
(
pw_2896x8
)]
psubw
m
%
2
,
m
%
6
,
m
%
5
;t6a - t5a
paddw
m
%
6
,
m
%
5
;t6a + t5a
pmulhrsw
m
%
2
,
m
%
3
;t5
pmulhrsw
m
%
3
,
m
%
6
;t6
ITX_MULSUB_2W
%
1
,
%
4
,
%
5
,
%
6
,
%
7
,
799
,
4017
;t4a, t7a
ITX_MULSUB_2W
%
3
,
%
2
,
%
5
,
%
6
,
%
7
,
3406
,
2276
,
1
;t5a, t6a
psubsw
m
%
2
,
m
%
4
,
m
%
5
;t6a
paddsw
m
%
4
,
m
%
5
;t7
psubsw
m
%
5
,
m
%
1
,
m
%
3
;t5a
paddsw
m
%
1
,
m
%
3
;t4
ITX_MULSUB_2W
%
2
,
%
5
,
%
3
,
%
6
,
%
7
,
2896
,
2896
,
1
;t5, t6
%endmacro
INV_TXFM_8X8_FN
dct
,
dct
,
0
...
...
@@ -2063,37 +2054,34 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
%macro IDCT16_1D_PACKED_ODDHALF 7
;src[1-4], tmp[1-3]
punpckhwd
m
%
5
,
m
%
4
,
m
%
1
;packed in13 in3
punpcklwd
m
%
1
,
m
%
4
;packed in1 in15
punpcklwd
m
%
6
,
m
%
3
,
m
%
2
;packed in9 in7
punpcklwd
m
%
4
,
m
%
3
,
m
%
2
;packed in9 in7
punpckhwd
m
%
2
,
m
%
3
;packed in5 in11
mova
m
%
7
,
[
o
(
pd_2048
)]
ITX_MUL2X_PACK
%
1
,
%
4
,
%
7
,
401
,
4076
,
1
;low: t8a high: t15a
ITX_MUL2X_PACK
%
6
,
%
4
,
%
7
,
3166
,
2598
,
1
;low: t9a high: t14a
ITX_MUL2X_PACK
%
2
,
%
4
,
%
7
,
1931
,
3612
,
1
;low: t10a high: t13a
ITX_MUL2X_PACK
%
5
,
%
4
,
%
7
,
3920
,
1189
,
1
;low: t11a high: t12a
psubsw
m
%
4
,
m
%
1
,
m
%
6
;low: t9 high: t14
paddsw
m
%
1
,
m
%
6
;low: t8 high: t15
ITX_MUL2X_PACK
%
1
,
%
6
,
%
7
,
401
,
4076
,
1
;low: t8a high: t15a
ITX_MUL2X_PACK
%
4
,
%
6
,
%
7
,
3166
,
2598
,
1
;low: t9a high: t14a
ITX_MUL2X_PACK
%
2
,
%
6
,
%
7
,
1931
,
3612
,
1
;low: t10a high: t13a
ITX_MUL2X_PACK
%
5
,
%
6
,
%
7
,
3920
,
1189
,
1
;low: t11a high: t12a
psubsw
m
%
6
,
m
%
1
,
m
%
4
;low: t9 high: t14
paddsw
m
%
1
,
m
%
4
;low: t8 high: t15
psubsw
m
%
3
,
m
%
5
,
m
%
2
;low: t10 high: t13
paddsw
m
%
2
,
m
%
5
;low: t11 high: t12
punpcklqdq
m
%
5
,
m
%
4
,
m
%
3
;low: t9 high: t10
punpckhqdq
m
%
4
,
m
%
3
;low: t14 high: t13
punpcklwd
m
%
6
,
m
%
4
,
m
%
5
;packed t14 t9
punpckhwd
m
%
5
,
m
%
4
;packed t10 t13
paddsw
m
%
5
,
m
%
2
;low: t11 high: t12
mova
m
%
2
,
[
o
(
deint_shuf2
)]
pshufb
m
%
6
,
m
%
2
pshufb
m
%
3
,
[
o
(
deint_shuf1
)]
pxor
m
%
4
,
m
%
4
psubw
m
%
4
,
m
%
5
;packed -t10 -t13
psubw
m
%
4
,
m
%
3
;packed -t10 -t13
ITX_MUL2X_PACK
%
6
,
%
3
,
%
7
,
1567
,
3784
,
1
;low: t9a high: t14a
ITX_MUL2X_PACK
%
4
,
%
3
,
%
7
,
3784
,
1567
;low: t10a high: t13a
psubsw
m
%
3
,
m
%
1
,
m
%
2
;low: t11a high: t12a
paddsw
m
%
1
,
m
%
2
;low: t8a high: t15a
psubsw
m
%
3
,
m
%
1
,
m
%
5
;low: t11a high: t12a
paddsw
m
%
1
,
m
%
5
;low: t8a high: t15a
psubsw
m
%
5
,
m
%
6
,
m
%
4
;low: t10 high: t13
paddsw
m
%
6
,
m
%
4
;low: t9 high: t14
mova
m
%
7
,
[
o
(
pw_2896x8
)]
punpckhqdq
m
%
4
,
m
%
3
,
m
%
5
;low: t12a high: t13
punpcklqdq
m
%
3
,
m
%
5
;low: t11a high: t10
psubw
m
%
2
,
m
%
4
,
m
%
3
paddw
m
%
3
,
m
%
4
pmulhrsw
m
%
2
,
m
%
7
;low: t11 high: t10a
pmulhrsw
m
%
3
,
m
%
7
;low: t12 high: t13a
pshufb
m
%
3
,
m
%
2
pshufb
m
%
5
,
m
%
2
ITX_MUL2X_PACK
%
3
,
%
2
,
%
7
,
2896
,
2896
,
4
;t12, t11
ITX_MUL2X_PACK
%
5
,
%
4
,
%
7
,
2896
,
2896
,
4
;t13a, t10a
packssdw
m
%
2
,
m
%
4
;low: t11 high: t10a
packssdw
m
%
3
,
m
%
5
;low: t12 high: t13a
punpckhqdq
m
%
4
,
m
%
1
,
m
%
6
;low: t15a high: t14
punpcklqdq
m
%
1
,
m
%
6
;low: t8a high: t9
%endmacro
...
...
@@ -2918,19 +2906,14 @@ ALIGN function_align
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
1
]
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
2
]
mova
[
rsp
+
gprsize
*
2
+
16
*
1
],
m4
psubsw
m
4
,
m0
,
m3
;t13
psubsw
m
5
,
m0
,
m3
;t13
paddsw
m0
,
m3
;t14
psubsw
m3
,
m2
,
m1
;t12a
mova
m3
,
[
o
(
pd_2048
)]
psubsw
m4
,
m2
,
m1
;t12a
paddsw
m1
,
m2
;t15a
mova
m5
,
[
o
(
pw_2896x8
)]
psubw
m2
,
m4
,
m7
;t13-t10
paddw
m7
,
m4
;t13+t10
psubw
m4
,
m3
,
m6
;t12a-t11a
paddw
m6
,
m3
;t12a+t11a
pmulhrsw
m7
,
m5
;t13a
pmulhrsw
m4
,
m5
;t11
pmulhrsw
m6
,
m5
;t12
pmulhrsw
m5
,
m2
;t10a
mova
[
rsp
+
gprsize
*
2
+
16
*
2
],
m1
ITX_MULSUB_2W
5
,
7
,
1
,
2
,
3
,
2896
,
2896
;t10a, t13a
ITX_MULSUB_2W
4
,
6
,
1
,
2
,
3
,
2896
,
2896
;t11, t12
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
8
]
psubsw
m2
,
m3
,
m5
;out10
paddsw
m3
,
m5
;out5
...
...
@@ -2950,6 +2933,7 @@ ALIGN function_align
mova
[
rsp
+
gprsize
*
2
+
16
*
5
],
m6
psubsw
m6
,
m7
,
m0
;out14
paddsw
m7
,
m0
;out1
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
2
]
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
3
]
mova
[
rsp
+
gprsize
*
2
+
16
*
4
],
m7
psubsw
m7
,
m0
,
m1
;out15
...
...
@@ -4211,35 +4195,30 @@ ALIGN function_align
psubsw
m5
,
m3
,
m2
;t28a
paddsw
m3
,
m2
;t31a
ITX_MULSUB_2W
5
,
4
,
1
,
2
,
7
,
1567
,
3784
;t19, t28
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
15
]
;tmp12
psubsw
m1
,
m5
,
m6
;t20a
paddsw
m5
,
m6
;t19a
psubsw
m6
,
m2
,
m5
;out19
paddsw
m2
,
m5
;out12
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
30
]
;t27
mova
[
rsp
+
gprsize
*
2
+
16
*
22
],
m6
;out19
mova
[
rsp
+
gprsize
*
2
+
16
*
15
],
m2
;out12
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
30
]
;t27
psubsw
m6
,
m4
,
m5
;t27a
paddsw
m4
,
m5
;t28a
ITX_MULSUB_2W
6
,
1
,
2
,
5
,
7
,
2896
,
2896
;t20, t27
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
6
]
;tmp3
mova
m7
,
[
o
(
pw_2896x8
)]
psubw
m5
,
m6
,
m1
;t27a - t20a
paddw
m6
,
m1
;t27a + t20a
psubsw
m1
,
m2
,
m4
;out28
psubsw
m5
,
m2
,
m4
;out28
paddsw
m2
,
m4
;out3
pmulhrsw
m5
,
m7
;t20
pmulhrsw
m6
,
m7
;t27
mova
m4
,
[
rsp
+
gprsize
*
2
+
16
*
14
]
;tmp11
mova
[
rsp
+
gprsize
*
2
+
16
*
31
],
m
1
;out28
mova
[
rsp
+
gprsize
*
2
+
16
*
31
],
m
5
;out28
mova
[
rsp
+
gprsize
*
2
+
16
*
6
],
m2
;out3
psubsw
m
1
,
m4
,
m
5
;out20
paddsw
m4
,
m
5
;out11
psubsw
m
5
,
m4
,
m
6
;out20
paddsw
m4
,
m
6
;out11
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
7
]
;tmp4
mova
[
rsp
+
gprsize
*
2
+
16
*
23
],
m
1
;out20
mova
[
rsp
+
gprsize
*
2
+
16
*
23
],
m
5
;out20
mova
[
rsp
+
gprsize
*
2
+
16
*
14
],
m4
;out11
psubsw
m5
,
m2
,
m
6
;out27
paddsw
m2
,
m
6
;out4
psubsw
m5
,
m2
,
m
1
;out27
paddsw
m2
,
m
1
;out4
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
26
]
;t23a
mova
m4
,
[
rsp
+
gprsize
*
2
+
16
*
27
]
;t24a
mova
[
rsp
+
gprsize
*
2
+
16
*
30
],
m5
;out27
...
...
@@ -4248,27 +4227,24 @@ ALIGN function_align
paddsw
m0
,
m1
;t16
psubsw
m2
,
m3
,
m4
;t24
paddsw
m3
,
m4
;t31
ITX_MULSUB_2W
2
,
5
,
4
,
6
,
7
,
2896
,
2896
;t23a, t24a
mova
m6
,
[
rsp
+
gprsize
*
2
+
16
*
18
]
;tmp15
psubw
m1
,
m2
,
m5
;t24 - t23
paddw
m2
,
m5
;t24 + t23
psubsw
m4
,
m6
,
m0
;out16
paddsw
m6
,
m0
;out15
pmulhrsw
m1
,
m7
;t23a
pmulhrsw
m2
,
m7
;t24a
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
3
]
;tmp0
mova
m
5
,
[
rsp
+
gprsize
*
2
+
16
*
11
]
;tmp8
mova
m
1
,
[
rsp
+
gprsize
*
2
+
16
*
11
]
;tmp8
mova
[
rsp
+
gprsize
*
2
+
16
*
18
],
m6
;out15
mova
[
rsp
+
gprsize
*
2
+
16
*
19
],
m4
;out16
psubsw
m6
,
m0
,
m3
;out31
paddsw
m0
,
m3
;out0
psubsw
m4
,
m
5
,
m
1
;out23
paddsw
m
5
,
m
1
;out8
psubsw
m4
,
m
1
,
m
2
;out23
paddsw
m
1
,
m
2
;out8
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
10
]
;tmp7
mova
[
rsp
+
gprsize
*
2
+
16
*
34
],
m6
;out31
mova
[
rsp
+
gprsize
*
2
+
16
*
11
],
m
5
;out8
mova
[
rsp
+
gprsize
*
2
+
16
*
11
],
m
1
;out8
mova
[
rsp
+
gprsize
*
2
+
16
*
26
],
m4
;out23
paddsw
m6
,
m3
,
m
2
;out7
psubsw
m3
,
m
2
;out24
paddsw
m6
,
m3
,
m
5
;out7
psubsw
m3
,
m
5
;out24
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
20
]
;t17
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
25
]
;t22
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
17
]
;tmp14
...
...
@@ -4283,23 +4259,20 @@ ALIGN function_align
mova
[
rsp
+
gprsize
*
2
+
16
*
20
],
m3
;out17
psubsw
m2
,
m1
,
m5
;t25a
paddsw
m1
,
m5
;t30a
psubw
m3
,
m2
,
m4
;t25a - t22a
paddw
m2
,
m4
;t25a + t22a
ITX_MULSUB_2W
2
,
4
,
3
,
5
,
7
,
2896
,
2896
;t22, t25
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
4
]
;tmp1
pmulhrsw
m3
,
m7
;t22
pmulhrsw
m2
,
m7
;t25
psubsw
m4
,
m5
,
m1
;out30
psubsw
m3
,
m5
,
m1
;out30
paddsw
m5
,
m1
;out1
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
12
]
;tmp9
mova
[
rsp
+
gprsize
*
2
+
16
*
33
],
m
4
;out30
mova
[
rsp
+
gprsize
*
2
+
16
*
33
],
m
3
;out30
mova
[
rsp
+
gprsize
*
2
+
16
*
4
],
m5
;out1
psubsw
m
4
,
m1
,
m
3
;out22
paddsw
m1
,
m
3
;out9
psubsw
m
3
,
m1
,
m
2
;out22
paddsw
m1
,
m
2
;out9
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
9
]
;tmp6
mova
[
rsp
+
gprsize
*
2
+
16
*
25
],
m
4
;out22
mova
[
rsp
+
gprsize
*
2
+
16
*
25
],
m
3
;out22
mova
[
rsp
+
gprsize
*
2
+
16
*
12
],
m1
;out9
psubsw
m3
,
m5
,
m
2
;out25
paddsw
m5
,
m
2
;out6
psubsw
m3
,
m5
,
m
4
;out25
paddsw
m5
,
m
4
;out6
mova
m4
,
[
rsp
+
gprsize
*
2
+
16
*
21
]
;t18a
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
24
]
;t21a
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
16
]
;tmp13
...
...
@@ -4315,17 +4288,14 @@ ALIGN function_align
mova
[
rsp
+
gprsize
*
2
+
16
*
16
],
m2
;out13
psubsw
m5
,
m3
,
m1
;t26
paddsw
m3
,
m1
;t29
ITX_MULSUB_2W
5
,
4
,
1
,
2
,
7
,
2896
,
2896
;t21a, t26a
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
5
]
;tmp2
psubw
m1
,
m5
,
m4
;t26 - t21
paddw
m4
,
m5
;t26 + t21
psubsw
m5
,
m2
,
m3
;out29
psubsw
m1
,
m2
,
m3
;out29
paddsw
m2
,
m3
;out2
pmulhrsw
m1
,
m7
;t21a
pmulhrsw
m4
,
m7
;t26a
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
13
]
;tmp10
mova
[
rsp
+
gprsize
*
2
+
16
*
32
],
m
5
;out29
psubsw
m7
,
m3
,
m
1
;out21
paddsw
m3
,
m
1
;out10
mova
[
rsp
+
gprsize
*
2
+
16
*
32
],
m
1
;out29
psubsw
m7
,
m3
,
m
5
;out21
paddsw
m3
,
m
5
;out10
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
8
]
;tmp5
mova
[
rsp
+
gprsize
*
2
+
16
*
24
],
m7
;out21
mova
[
rsp
+
gprsize
*
2
+
16
*
13
],
m3
;out10
...
...
@@ -6010,262 +5980,237 @@ ALIGN function_align
psubw
m5
,
m6
,
m3
ITX_MULSUB_2W
5
,
4
,
2
,
3
,
7
,
1567
,
3784
;t43, t52
mova
m7
,
[
o
(
pw_2896x8
)]
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
38
]
;t35a
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
31
]
;tmp[28]
psubsw
m6
,
m2
,
m0
;t44
paddsw
m2
,
m0
;t35
psubsw
m0
,
m3
,
m2
;out35
paddsw
m2
,
m3
;out28
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
63
]
;t60a
mova
[
rsp
+
gprsize
*
2
+
16
*
38
],
m0
;out35
mova
[
rsp
+
gprsize
*
2
+
16
*
31
],
m2
;out28
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
63
]
;t60a
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
6
]
;tmp[3]
psubsw
m0
,
m3
,
m1
;t51
paddsw
m3
,
m1
;t60
psubw
m1
,
m0
,
m6
;t44a
paddw
m
0
,
m6
;t
51a
psubsw
m
6
,
m2
,
m3
;out60
ITX_MULSUB_2W
0
,
6
,
1
,
2
,
7
,
2896
,
2896
;t44a
, t51a
mova
m
2
,
[
rsp
+
gprsize
*
2
+
16
*
6
]
;t
mp[3]
psubsw
m
1
,
m2
,
m3
;out60
paddsw
m2
,
m3
;out3
pmulhrsw
m1
,
m7
;t44a
pmulhrsw
m0
,
m7
;t51a
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
22
]
;tmp[19]
mova
[
rsp
+
gprsize
*
2
+
16
*
63
],
m
6
;out60
mova
[
rsp
+
gprsize
*
2
+
16
*
63
],
m
1
;out60
mova
[
rsp
+
gprsize
*
2
+
16
*
6
],
m2
;out3
psubsw
m
6
,
m3
,
m
1
;out44
paddsw
m3
,
m
1
;out19
psubsw
m
1
,
m3
,
m
0
;out44
paddsw
m3
,
m
0
;out19
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
15
]
;tmp[12]
mova
[
rsp
+
gprsize
*
2
+
16
*
47
],
m6
;out44
mova
[
rsp
+
gprsize
*
2
+
16
*
22
],
m3
;out19
psubsw
m1
,
m2
,
m0
;out51
paddsw
m2
,
m0
;out12
mova
[
rsp
+
gprsize
*
2
+
16
*
54
],
m1
;out51
mova
[
rsp
+
gprsize
*
2
+
16
*
15
],
m2
;out12
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
39
]
;t36
mova
[
rsp
+
gprsize
*
2
+
16
*
47
],
m1
;out44
mova
[
rsp
+
gprsize
*
2
+
16
*
22
],
m3
;out19
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
62
]
;t59
psubsw
m3
,
m2
,
m6
;out51
paddsw
m2
,
m6
;out12
mova
[
rsp
+
gprsize
*
2
+
16
*
54
],
m3
;out51
mova
[
rsp
+
gprsize
*
2
+
16
*
15
],
m2
;out12
psubsw
m2
,
m0
,
m5
;t43a
paddsw
m0
,
m5
;t36a
mova
m5
,
[
rsp
+
gprsize
*
2
+
16
*
30
]
;tmp[27]
psubsw
m3
,
m1
,
m4
;t52a
paddsw
m1
,
m4
;t59a
psubw
m5
,
m3
,
m2
;t43
paddw
m3
,
m2
;t52
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
30
]
;tmp[27]
ITX_MULSUB_2W
3
,
2
,
4
,
6
,
7
,
2896
,
2896
;t43, t52
mova
m4
,
[
rsp
+
gprsize
*
2
+
16
*
7
]
;tmp[4 ]
pmulhrsw
m5
,
m7
;t43
pmulhrsw
m3
,
m7
;t52
psubsw
m6
,
m2
,
m0
;out36
paddsw
m2
,
m0
;out27
psubsw
m6
,
m5
,
m0
;out36
paddsw
m5
,
m0
;out27
psubsw
m0
,
m4
,
m1
;out59
paddsw
m4
,
m1
;out4
mova
[
rsp
+
gprsize
*
2
+
16
*
39
],
m6
;out36
mova
[
rsp
+
gprsize
*
2
+
16
*
30
],
m
2
;out27
mova
[
rsp
+
gprsize
*
2
+
16
*
30
],
m
5
;out27
mova
[
rsp
+
gprsize
*
2
+
16
*
62
],
m0
;out59
mova
[
rsp
+
gprsize
*
2
+
16
*
7
],
m4
;out4
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
23
]
;tmp[20]
mova
m
2
,
[
rsp
+
gprsize
*
2
+
16
*
14
]
;tmp[11]
psubsw
m4
,
m0
,
m
5
;out43
paddsw
m0
,
m
5
;out20
psubsw
m6
,
m
2
,
m
3
;out52
paddsw
m
2
,
m
3
;out11
mova
m
5
,
[
rsp
+
gprsize
*
2
+
16
*
14
]
;tmp[11]
psubsw
m4
,
m0
,
m
3
;out43
paddsw
m0
,
m
3
;out20
psubsw
m6
,
m
5
,
m
2
;out52
paddsw
m
5
,
m
2
;out11
mova
[
rsp
+
gprsize
*
2
+
16
*
46
],
m4
;out43
mova
[
rsp
+
gprsize
*
2
+
16
*
23
],
m0
;out20
mova
[
rsp
+
gprsize
*
2
+
16
*
55
],
m6
;out52
mova
[
rsp
+
gprsize
*
2
+
16
*
14
],
m
2
;out11
mova
[
rsp
+
gprsize
*
2
+
16
*
14
],
m
5
;out11
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
40
]
;t37a
mova
m
2
,
[
rsp
+
gprsize
*
2
+
16
*
45
]
;t42a
mova
m
5
,
[
rsp
+
gprsize
*
2
+
16
*
45
]
;t42a
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
56
]
;t53a
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
61
]
;t58a
psubsw
m4
,
m0
,
m2
;t42
paddsw
m0
,
m2
;t37
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
29
]
;tmp[26]
psubsw
m4
,
m0
,
m5
;t42
paddsw
m0
,
m5
;t37
psubsw
m5
,
m1
,
m3
;t53
paddsw
m1
,
m3
;t58
psubw
m6
,
m5
,
m4
;t42a
paddw
m5
,
m4
;t53a
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
29
]
;tmp[26]
ITX_MULSUB_2W
5
,
4
,
3
,
6
,
7
,
2896
,
2896
;t43, t52
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
8
]
;tmp[5 ]
pmulhrsw
m6
,
m7
;t42a
pmulhrsw
m5
,
m7
;t53a
psubsw
m4
,
m2
,
m0
;out37
psubsw
m6
,
m2
,
m0
;out37
paddsw
m2
,
m0
;out26
psubsw
m0
,
m3
,
m1
;out58
paddsw
m3
,
m1
;out5
mova
[
rsp
+
gprsize
*
2
+
16
*
40
],
m
4
;out37
mova
[
rsp
+
gprsize
*
2
+
16
*
40
],
m
6
;out37
mova
[
rsp
+
gprsize
*
2
+
16
*
29
],
m2
;out26
mova
[
rsp
+
gprsize
*
2
+
16
*
61
],
m0
;out58
mova
[
rsp
+
gprsize
*
2
+
16
*
8
],
m3
;out5
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
24
]
;tmp[21]
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
13
]
;tmp[10]
psubsw
m2
,
m0
,
m
6
;out42
paddsw
m0
,
m
6
;out21
psubsw
m3
,
m1
,
m
5
;out53
paddsw
m1
,
m
5
;out10
psubsw
m2
,
m0
,
m
5
;out42
paddsw
m0
,
m
5
;out21
psubsw
m3
,
m1
,
m
4
;out53
paddsw
m1
,
m
4
;out10
mova
[
rsp
+
gprsize
*
2
+
16
*
45
],
m2
;out42
mova
[
rsp
+
gprsize
*
2
+
16
*
24
],
m0
;out21
mova
[
rsp
+
gprsize
*
2
+
16
*
56
],
m3
;out53
mova
[
rsp
+
gprsize
*
2
+
16
*
13
],
m1
;out10
mova
m0
,
[
rsp
+
gprsize
*
2
+
16
*
41
]
;t38
mova
m
2
,
[
rsp
+
gprsize
*
2
+
16
*
44
]
;t41
mova
m
5
,
[
rsp
+
gprsize
*
2
+
16
*
44
]
;t41
mova
m3
,
[
rsp
+
gprsize
*
2
+
16
*
57
]
;t54
mova
m1
,
[
rsp
+
gprsize
*
2
+
16
*
60
]
;t57
psubsw
m4
,
m0
,
m2
;t41a
paddsw
m0
,
m2
;t38a
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
28
]
;tmp[25]
psubsw
m4
,
m0
,
m5
;t41a
paddsw
m0
,
m5
;t38a
psubsw
m5
,
m1
,
m3
;t54a
paddsw
m1
,
m3
;t57a
psubw
m6
,
m5
,
m4
;t41
paddw
m5
,
m4
;t54
mova
m2
,
[
rsp
+
gprsize
*
2
+
16
*
28
]
;tmp[25]
ITX_MULSUB_2W
5
,
4
,
3
,
6
,
7
,
2896
,
2896
;t41a, t54a