VideoLAN / x264 / Commits / 543601b8

Commit 543601b8
authored Jul 29, 2008 by Holger Lubitz
committed by Fiona Glaser, Jul 29, 2008

Refactor asm macros part 2: DCT

parent 60f7c47d
Changes: 2 files
common/x86/pixel-32.asm
...
...
@@ -22,18 +22,10 @@
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION .text
%macro SUMSUB_BADC 4
    paddw %1, %2
    paddw %3, %4
    paddw %2, %2
    paddw %4, %4
    psubw %2, %1
    psubw %4, %3
%endmacro
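SUMSUB_BADC performs two in-place sum/difference butterflies, recovering each difference as 2*b - (a+b) so that no scratch register is needed. A rough per-lane C sketch of the arithmetic (illustrative only, not part of the commit):

    #include <stdint.h>

    /* One SUMSUB_BADC pair: (a, b) -> (a+b, b-a).  The difference is formed as
     * 2*b - (a+b), mirroring paddw %1,%2 / paddw %2,%2 / psubw %2,%1.
     * Arithmetic wraps modulo 2^16, as with paddw/psubw. */
    static inline void sumsub_pair(uint16_t *a, uint16_t *b)
    {
        uint16_t sum  = (uint16_t)(*a + *b);   /* paddw %1, %2 */
        uint16_t twob = (uint16_t)(*b + *b);   /* paddw %2, %2 */
        *a = sum;
        *b = (uint16_t)(twob - sum);           /* psubw %2, %1  => original b - a */
    }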
%macro SBUTTERFLY 5
    mov%1 %5, %3
    punpckl%2 %3, %4
...
...
@@ -47,21 +39,6 @@ SECTION .text
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro

%macro ABS1 2  ; mma, tmp
    pxor   %2, %2
    psubw  %2, %1
    pmaxsw %1, %2
%endmacro

%macro ABS2 4  ; mma, mmb, tmp0, tmp1
    pxor   %3, %3
    pxor   %4, %4
    psubw  %3, %1
    psubw  %4, %2
    pmaxsw %1, %3
    pmaxsw %2, %4
%endmacro
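ABS1 and ABS2 emulate a packed absolute value (pabsw exists only from SSSE3 onward) as max(x, -x), built from pxor, psubw and pmaxsw. A rough per-lane C sketch (illustrative only, not part of the commit):

    #include <stdint.h>

    /* ABS1 per lane: abs(x) = max(x, 0 - x).  ABS2 applies the same operation
     * to two registers at once.  Like pmaxsw, this leaves -32768 unchanged,
     * since its negation wraps back to -32768. */
    static inline int16_t abs1_lane(int16_t x)
    {
        int16_t neg = (int16_t)(0 - x);    /* pxor %2, %2 ; psubw %2, %1 */
        return (x > neg) ? x : neg;        /* pmaxsw %1, %2              */
    }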
%macro LOAD_DIFF_4P 4  ; mmp, mmt, dx, dy
    movd %1, [eax+ebx*%4+%3]
    movd %2, [ecx+edx*%4+%3]
...
...
@@ -89,15 +66,6 @@ SECTION .text
    movq mm6, [spill]
%endmacro
%macro HADAMARD8_1D 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro
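The six SUMSUB_BADC calls pair the eight registers at distances 4, 2 and 1, i.e. an unnormalized 8-point Hadamard transform. A rough C sketch of the same butterfly schedule (illustrative only, not part of the commit):

    #include <stdint.h>

    /* Butterfly schedule of HADAMARD8_1D: three rounds, pairing lanes at
     * stride 4, then 2, then 1; each pair (a, b) becomes (a+b, b-a).
     * d[0..7] stands in for the macro arguments %1..%8. */
    static void hadamard8_1d_ref(int16_t d[8])
    {
        for (int stride = 4; stride >= 1; stride >>= 1)
            for (int base = 0; base < 8; base += 2 * stride)
                for (int i = base; i < base + stride; i++) {
                    int16_t a = d[i], b = d[i + stride];
                    d[i]          = (int16_t)(a + b);
                    d[i + stride] = (int16_t)(b - a);
                }
    }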
%macro SUM4x8_MM 0
    movq [spill],   mm6
    movq [spill+8], mm7
...
...
common/x86/pixel-a.asm
...
...
@@ -23,6 +23,7 @@
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA

pw_1: times 8 dw 1
...
...
@@ -166,29 +167,6 @@ SSD 8, 4, sse2
; SATD
;=============================================================================
%macro LOAD_DIFF_4P 4  ; dst, tmp, [pix1], [pix2]
    movd      %1, %3
    movd      %2, %4
    punpcklbw %1, %2
    punpcklbw %2, %2
    psubw     %1, %2
%endmacro

%macro LOAD_DIFF_8P 4  ; dst, tmp, [pix1], [pix2]
    movq      %1, %3
    movq      %2, %4
    punpcklbw %1, %2
    punpcklbw %2, %2
    psubw     %1, %2
%endmacro

%macro LOAD_DIFF_8x4P 6 ; 4x dest, 2x temp
    LOAD_DIFF_8P %1, %5, [r0],      [r2]
    LOAD_DIFF_8P %2, %6, [r0+r1],   [r2+r3]
    LOAD_DIFF_8P %3, %5, [r0+2*r1], [r2+2*r3]
    LOAD_DIFF_8P %4, %6, [r0+r4],   [r2+r5]
%endmacro
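LOAD_DIFF_4P/8P produce signed 16-bit pixel differences without an explicit zero-extend: interleaving pix1 with pix2 yields words p1 + 256*p2, interleaving pix2 with itself yields p2 + 256*p2, and the word-wise subtraction leaves exactly p1 - p2. A rough C sketch of the 8-pixel case (illustrative only, not part of the commit):

    #include <stdint.h>

    /* C view of the LOAD_DIFF_8P trick: the high bytes introduced by the two
     * punpcklbw interleaves cancel in the 16-bit subtraction, leaving
     * pix1[i] - pix2[i] as a signed word. */
    static void load_diff_8p_ref(int16_t dst[8], const uint8_t *pix1, const uint8_t *pix2)
    {
        for (int i = 0; i < 8; i++) {
            uint16_t a = (uint16_t)(pix1[i] | (pix2[i] << 8));  /* punpcklbw %1, %2 */
            uint16_t b = (uint16_t)(pix2[i] | (pix2[i] << 8));  /* punpcklbw %2, %2 */
            dst[i] = (int16_t)(a - b);                          /* psubw %1, %2     */
        }
    }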
; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
...
...
@@ -207,29 +185,11 @@ SSD 8, 4, sse2
    PHSUMSUB %5, %2, %3
%endmacro

%macro SUMSUB_BADC 4
    paddw %1, %2
    paddw %3, %4
    paddw %2, %2
    paddw %4, %4
    psubw %2, %1
    psubw %4, %3
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %1, %3, %2, %4
%endmacro

%macro HADAMARD8_1D 8
    SUMSUB_BADC %1, %5, %2, %6
    SUMSUB_BADC %3, %7, %4, %8
    SUMSUB_BADC %1, %3, %2, %4
    SUMSUB_BADC %5, %7, %6, %8
    SUMSUB_BADC %1, %2, %3, %4
    SUMSUB_BADC %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY 5
    mov%1 %5, %3
    punpckl%2 %3, %4
...
...
@@ -305,38 +265,6 @@ SSD 8, 4, sse2
%endmacro
%endif
%macro ABS1_MMX 2    ; a, tmp
    pxor    %2, %2
    psubw   %2, %1
    pmaxsw  %1, %2
%endmacro

%macro ABS2_MMX 4    ; a, b, tmp0, tmp1
    pxor    %3, %3
    pxor    %4, %4
    psubw   %3, %1
    psubw   %4, %2
    pmaxsw  %1, %3
    pmaxsw  %2, %4
%endmacro

%macro ABS1_SSSE3 2
    pabsw   %1, %1
%endmacro

%macro ABS2_SSSE3 4
    pabsw   %1, %1
    pabsw   %2, %2
%endmacro

%define ABS1 ABS1_MMX
%define ABS2 ABS2_MMX

%macro ABS4 6
    ABS2 %1, %2, %5, %6
    ABS2 %3, %4, %5, %6
%endmacro

%macro HADAMARD4x4_SUM 1    ; %1 = dest (row sum of one block)
    HADAMARD4_1D  mm4, mm5, mm6, mm7
    TRANSPOSE4x4W mm4, mm5, mm6, mm7, %1
...
...
@@ -354,10 +282,10 @@ SSD 8, 4, sse2
; clobber: mm3..mm7
; out: %1 = satd
%macro SATD_4x4_MMX 3
    LOAD_DIFF_4P mm4, mm3, [r0+%2],      [r2+%2]
    LOAD_DIFF_4P mm5, mm3, [r0+r1+%2],   [r2+r3+%2]
    LOAD_DIFF_4P mm6, mm3, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF_4P mm7, mm3, [r0+r4+%2],   [r2+r5+%2]
    LOAD_DIFF  mm4, mm3, none, [r0+%2],      [r2+%2]
    LOAD_DIFF  mm5, mm3, none, [r0+r1+%2],   [r2+r3+%2]
    LOAD_DIFF  mm6, mm3, none, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF  mm7, mm3, none, [r0+r4+%2],   [r2+r5+%2]
%if %3
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
...
...
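This last hunk swaps the four file-local LOAD_DIFF_4P loads for the shared LOAD_DIFF macro, presumably provided by the newly included x86util.asm, with none apparently filling an extra, unused register argument; the surrounding SATD_4x4_MMX lines are unchanged context. For reference, a rough C sketch of the quantity SATD_4x4_MMX computes, the sum of absolute values of the 2-D Hadamard transform of a 4x4 difference block (illustrative only, not part of the commit; any final scaling applied by the full SATD functions is omitted):

    #include <stdint.h>
    #include <stdlib.h>

    /* Reference 4x4 SATD: difference block, 1-D Hadamard on rows and columns
     * (the HADAMARD4_1D butterfly pattern), then sum of absolute values. */
    static int satd_4x4_ref(const uint8_t *pix1, int stride1,
                            const uint8_t *pix2, int stride2)
    {
        int16_t d[4][4];
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = (int16_t)(pix1[y*stride1 + x] - pix2[y*stride2 + x]);

        for (int y = 0; y < 4; y++) {        /* Hadamard on each row */
            int a = d[y][0] + d[y][1], b = d[y][1] - d[y][0];
            int c = d[y][2] + d[y][3], e = d[y][3] - d[y][2];
            d[y][0] = a + c;  d[y][2] = c - a;
            d[y][1] = b + e;  d[y][3] = e - b;
        }
        for (int x = 0; x < 4; x++) {        /* Hadamard on each column */
            int a = d[0][x] + d[1][x], b = d[1][x] - d[0][x];
            int c = d[2][x] + d[3][x], e = d[3][x] - d[2][x];
            d[0][x] = a + c;  d[2][x] = c - a;
            d[1][x] = b + e;  d[3][x] = e - b;
        }

        int sum = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                sum += abs(d[y][x]);
        return sum;
    }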