VideoLAN / x264

Commit d13a1868 authored Nov 03, 2005 by Loren Merritt

amd64 sse2 8x8dct. 1.45x faster than mmx.

git-svn-id: svn://svn.videolan.org/x264/trunk@353 df754926-b1dd-0310-bc7b-ec298dee348c

parent 08e19ed8
Changes 4
common/amd64/dct-a.asm
...
@@ -50,6 +50,14 @@ BITS 64
    psubw   %1, %2
%endmacro

%macro MMX_LOAD_DIFF_8P 5
    movq        %1, %4
    punpcklbw   %1, %3
    movq        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro

%macro MMX_SUMSUB_BA 2
    paddw   %1, %2
    paddw   %2, %2
...
@@ -82,26 +90,38 @@ BITS 64
    psubw   %4, %3
%endmacro

%macro SBUTTERFLYwd 3
    movq        %3, %1
    punpcklwd   %1, %2
    punpckhwd   %3, %2
%endmacro

%macro SBUTTERFLYdq 3
    movq        %3, %1
    punpckldq   %1, %2
    punpckhdq   %3, %2

%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

;-----------------------------------------------------------------------------
; input ABCD output ADTC
;-----------------------------------------------------------------------------
%macro MMX_TRANSPOSE 5
    SBUTTERFLYwd %1, %2, %5
    SBUTTERFLYwd %3, %4, %2
    SBUTTERFLYdq %1, %3, %4
    SBUTTERFLYdq %5, %2, %3
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro

;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro SSE2_TRANSPOSE8x8 9
    SBUTTERFLY dqa, wd, %1, %2, %9
    SBUTTERFLY dqa, wd, %3, %4, %2
    SBUTTERFLY dqa, wd, %5, %6, %4
    SBUTTERFLY dqa, wd, %7, %8, %6
    SBUTTERFLY dqa, dq, %1, %3, %8
    SBUTTERFLY dqa, dq, %9, %2, %3
    SBUTTERFLY dqa, dq, %5, %7, %2
    SBUTTERFLY dqa, dq, %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
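
For reference, the interleave ("butterfly") transpose that SSE2_TRANSPOSE8x8 builds from SBUTTERFLY can be modelled in scalar C: three rounds of low/high unpacks at word, dword and qword width turn eight rows of eight 16-bit values into eight columns. This is only an illustrative sketch, not code from the patch; the helper names unpck and transpose8x8_words are invented here, it uses temporaries instead of working in place, and it writes the rows back in natural order rather than the permuted AFHDTECB order used by the macro above.

#include <stdint.h>
#include <string.h>

/* Model punpckl/punpckh on 8-word registers with a block width w
 * (in 16-bit words): w=1 ~ punpck*wd, w=2 ~ punpck*dq, w=4 ~ punpck*qdq. */
static void unpck( const uint16_t a[8], const uint16_t b[8], int w,
                   uint16_t lo[8], uint16_t hi[8] )
{
    int half = 4 / w;                 /* blocks taken from each source half */
    for( int i = 0; i < half; i++ )
        for( int j = 0; j < w; j++ )
        {
            lo[(2*i+0)*w + j] = a[i*w + j];
            lo[(2*i+1)*w + j] = b[i*w + j];
            hi[(2*i+0)*w + j] = a[(half+i)*w + j];
            hi[(2*i+1)*w + j] = b[(half+i)*w + j];
        }
}

static void transpose8x8_words( uint16_t m[8][8] )
{
    uint16_t a[8][8], b[8][8], c[8][8];
    for( int i = 0; i < 4; i++ )      /* round 1: word interleave of row pairs */
        unpck( m[2*i], m[2*i+1], 1, a[2*i], a[2*i+1] );
    for( int i = 0; i < 2; i++ )      /* round 2: dword interleave */
    {
        unpck( a[4*i+0], a[4*i+2], 2, b[4*i+0], b[4*i+1] );
        unpck( a[4*i+1], a[4*i+3], 2, b[4*i+2], b[4*i+3] );
    }
    for( int i = 0; i < 4; i++ )      /* round 3: qword interleave -> columns */
        unpck( b[i], b[i+4], 4, c[2*i], c[2*i+1] );
    memcpy( m, c, sizeof(c) );
}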
%macro MMX_STORE_DIFF_4P 5
...
@@ -114,33 +134,22 @@ BITS 64
    movd    %5, %1
%endmacro

;%macro
;%endmacro

%macro MMX_STORE_DIFF_8P 4
    psraw       %1, 6
    movq        %2, %4
    punpcklbw   %2, %3
    paddsw      %1, %2
    packuswb    %1, %1
    movq        %4, %1
%endmacro

;=============================================================================
; Local Data (Read Only)
; Constants
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata
%endif

;-----------------------------------------------------------------------------
; Various memory constants (trigonometric values or rounding values)
;-----------------------------------------------------------------------------

ALIGN 16
x264_mmx_1:        dw  1,  1,  1,  1
x264_mmx_32:       dw 32, 32, 32, 32
x264_mmx_PPNN:     dw  1,  1, -1, -1
x264_mmx_PNPN:     dw  1, -1,  1, -1
x264_mmx_PNNP:     dw  1, -1, -1,  1
x264_mmx_PPPN:     dw  1,  1,  1, -1
x264_mmx_PPNP:     dw  1,  1, -1,  1
x264_mmx_2121:     dw  2,  1,  2,  1
x264_mmx_p2n2p1p1: dw  2, -2,  1,  1

SECTION .rodata align=16
pw_1:  times 8 dw 1
pw_32: times 8 dw 32

;=============================================================================
; Code
...
@@ -170,7 +179,7 @@ x264_dct4x4dc_mmxext:
    MMX_TRANSPOSE   mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4  out: mm0, mm4, mm1, mm3
    movq    mm6, [x264_mmx_1 GLOBAL]
    movq    mm6, [pw_1 GLOBAL]
    paddw   mm0, mm6
    paddw   mm4, mm6
    psraw   mm0, 1
...
@@ -304,7 +313,7 @@ x264_add4x4_idct_mmxext:
    MMX_SUMSUB_BADC     mm2, mm3, mm4, mm1  ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
    MMX_ZERO            mm7
    movq                mm6, [x264_mmx_32 GLOBAL]
    movq                mm6, [pw_32 GLOBAL]
    MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [rax]
    MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [rax+rcx]
...
@@ -319,402 +328,188 @@ x264_add4x4_idct_mmxext:
; 8x8 Transform
; =============================================================================

; -----------------------------------------------------------------------------
; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
; -----------------------------------------------------------------------------
%macro MMX_LOAD_DIFF_8P 7
    movq        %1, %5
    movq        %2, %1
    punpcklbw   %1, %7
    punpckhbw   %2, %7
    movq        %3, %6
    movq        %4, %3
    punpcklbw   %3, %7
    punpckhbw   %4, %7
    psubw       %1, %3
    psubw       %2, %4
%endmacro

%macro MMX_LOADSUMSUB 4    ; returns %1=%3+%4, %2=%3-%4
    movq        %2, %3
    movq        %1, %4
    MMX_SUMSUB_BA %1, %2
%endmacro

%macro MMX_STORE_DIFF_8P 6
    movq        %1, %3
    movq        %2, %1
    punpcklbw   %1, %6
    punpckhbw   %2, %6
    paddw       %1, %4
    paddw       %2, %5
    packuswb    %1, %2
    movq        %3, %1

; in: ABCDEFGH
; out: FBCGEDHI
%macro DCT8_1D 10
    MMX_SUMSUB_BA  %8, %1   ; %8=s07, %1=d07
    MMX_SUMSUB_BA  %7, %2   ; %7=s16, %2=d16
    MMX_SUMSUB_BA  %6, %3   ; %6=s25, %3=d25
    MMX_SUMSUB_BA  %5, %4   ; %5=s34, %4=d34

    MMX_SUMSUB_BA  %5, %8   ; %5=a0, %8=a2
    MMX_SUMSUB_BA  %6, %7   ; %6=a1, %7=a3

    movdqa  %9, %1
    psraw   %9, 1
    paddw   %9, %1
    paddw   %9, %2
    paddw   %9, %3          ; %9=a4

    movdqa  %10, %4
    psraw   %10, 1
    paddw   %10, %4
    paddw   %10, %2
    psubw   %10, %3         ; %10=a7

    MMX_SUMSUB_BA  %4, %1
    psubw   %1, %3
    psubw   %4, %2
    psraw   %3, 1
    psraw   %2, 1
    psubw   %1, %3          ; %1=a5
    psubw   %4, %2          ; %4=a6

    MMX_SUMSUB_BA  %6, %5   ; %6=b0, %5=b4

    movdqa  %2, %10
    psraw   %2, 2
    paddw   %2, %9          ; %2=b1
    psraw   %9, 2
    psubw   %9, %10         ; %9=b7

    movdqa  %3, %7
    psraw   %3, 1
    paddw   %3, %8          ; %3=b2
    psraw   %8, 1
    psubw   %8, %7          ; %8=b6

    movdqa  %7, %4
    psraw   %7, 2
    paddw   %7, %1          ; %7=b3
    psraw   %1, 2
    psubw   %4, %1          ; %4=b5
%endmacro
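
The same butterfly, written as scalar C for one 8-element row, may help in reading DCT8_1D; the intermediate names (s07, d07, a0..a7, b0..b7) follow the macro's comments and the comments in x264_ydct8_mmx further down. This is an illustrative sketch only, not code from the patch: dct8_1d_ref is an invented name, and the arithmetic is done in int before narrowing back to int16_t.

#include <stdint.h>

static void dct8_1d_ref( int16_t d[8] )
{
    const int s07 = d[0] + d[7], d07 = d[0] - d[7];
    const int s16 = d[1] + d[6], d16 = d[1] - d[6];
    const int s25 = d[2] + d[5], d25 = d[2] - d[5];
    const int s34 = d[3] + d[4], d34 = d[3] - d[4];

    const int a0 = s07 + s34;                       /* even half */
    const int a1 = s16 + s25;
    const int a2 = s07 - s34;
    const int a3 = s16 - s25;

    const int a4 = d16 + d25 + (d07 + (d07>>1));    /* odd half */
    const int a5 = d07 - d34 - (d25 + (d25>>1));
    const int a6 = d07 + d34 - (d16 + (d16>>1));
    const int a7 = d16 - d25 + (d34 + (d34>>1));

    d[0] = (int16_t)( a0 + a1 );        /* b0 */
    d[4] = (int16_t)( a0 - a1 );        /* b4 */
    d[2] = (int16_t)( a2 + (a3>>1) );   /* b2 */
    d[6] = (int16_t)((a2>>1) - a3 );    /* b6 */
    d[1] = (int16_t)( a4 + (a7>>2) );   /* b1 */
    d[3] = (int16_t)( a5 + (a6>>2) );   /* b3 */
    d[5] = (int16_t)( a6 - (a5>>2) );   /* b5 */
    d[7] = (int16_t)((a4>>2) - a7 );    /* b7 */
}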
cglobal x264_pixel_sub_8x8_mmx
cglobal x264_xdct8_mmxext
cglobal x264_ydct8_mmx
cglobal x264_xidct8_mmxext
cglobal x264_yidct8_mmx
cglobal x264_pixel_add_8x8_mmx
cglobal x264_sub8x8_dct8_sse2

ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;
; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;-----------------------------------------------------------------------------
x264_pixel_sub_8x8_mmx:
;   mov     rdi, rdi        ; diff
x264_sub8x8_dct8_sse2:
;   mov     rdi, rdi        ; dct
;   mov     rsi, rsi        ; pix1
;   movsxd  rdx, edx        ; i_pix1
    movsxd  rdx, edx        ; i_pix1
;   mov     rcx, rcx        ; pix2
    movsxd  r10, parm5d     ; i_pix2

    MMX_ZERO    mm7

    %assign disp 0
    %rep 8
    MMX_LOAD_DIFF_8P  mm0, mm1, mm2, mm3, [parm2q], [parm4q], mm7
    movq    [parm1q+disp], mm0
    movq    [parm1q+disp+8], mm1
    add     parm2q, parm3q
    add     parm4q, r10
    %assign disp disp+16
    %endrep

    ret
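
In scalar terms, the routine above (and the MMX_LOAD_DIFF_8P prologue of x264_sub8x8_dct8_sse2 below) computes nothing more than a strided 8x8 pixel difference. A minimal C equivalent, with the invented name pixel_sub_8x8_ref, would be:

#include <stdint.h>

static void pixel_sub_8x8_ref( int16_t diff[8][8],
                               const uint8_t *pix1, int i_pix1,
                               const uint8_t *pix2, int i_pix2 )
{
    /* 16-bit differences of two 8x8 blocks with independent strides */
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
            diff[y][x] = (int16_t)( pix1[y*i_pix1 + x] - pix2[y*i_pix2 + x] );
}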
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_xdct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xdct8_mmxext:
    movq    mm5, [x264_mmx_PPNN GLOBAL]
    movq    mm6, [x264_mmx_PNNP GLOBAL]
    movq    mm4, [x264_mmx_PPPN GLOBAL]
    movq    mm7, [x264_mmx_PPNP GLOBAL]

;-------------------------------------------------------------------------
; horizontal dct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
    %assign disp 0
    %rep 8
    movq    mm0, [parm1q+disp]
    movq    mm1, [parm1q+disp+8]

    pshufw  mm2, mm1, 00011011b
    movq    mm1, mm0
    paddw   mm0, mm2            ; (low)s07/s16/d25/s34(high)
    psubw   mm1, mm2            ; (low)d07/d16/d25/d34(high)

    pshufw  mm2, mm0, 00011011b ; (low)s34/s25/s16/s07(high)
    pmullw  mm0, mm5            ; (low)s07/s16/-s25/-s34(high)
    paddw   mm0, mm2            ; (low)a0/a1/a3/a2(high)

    movq    mm3, mm1
    psraw   mm1, 1              ; (low)d07/d16/d25/d34(high) (x>>1)
    pshufw  mm2, mm3, 10110001b ; (low)d16/d07/d34/d25(high)
    paddw   mm1, mm3            ; (low)d07/d16/d25/d34(high) (x+(x>>1))
    pshufw  mm3, mm2, 00011011b ; (low)d25/d34/d07/d16(high)
    pmullw  mm2, mm5            ; (low)d16/d07/-d34/-d25(high)
    pmullw  mm1, mm6            ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
    paddw   mm3, mm2
    paddw   mm1, mm3            ; (low)a4/a6/a5/a7(high)

    pshufw  mm2, mm0, 11001001b ; (low)a1/a3/a0/a2(high)
    pshufw  mm0, mm0, 10011100b ; (low)a0/a2/a1/a3(high)
    pmullw  mm2, [x264_mmx_2121 GLOBAL]
    pmullw  mm0, mm5            ; (low)a0/a2/-a1/-a3(high)
    psraw   mm2, 1              ; (low)a1/a3>>1/a0/a2>>1(high)
    paddw   mm0, mm2            ; (low)dst0/dst2/dst4/dst6(high)

    pshufw  mm1, mm1, 00100111b ; (low)a7/a6/a5/a4(high)
    pshufw  mm2, mm1, 00011011b ; (low)a4/a5/a6/a7(high)
    psraw   mm1, 2              ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
    pmullw  mm2, mm4            ; (low)a4/a5/a6/-a7(high)
    pmullw  mm1, mm7            ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
    paddw   mm1, mm2            ; (low)dst1/dst3/dst5/dst7(high)

    movq    mm2, mm0
    punpcklwd mm0, mm1          ; (low)dst0/dst1/dst2/dst3(high)
    punpckhwd mm2, mm1          ; (low)dst4/dst5/dst6/dst7(high)

    movq    [parm1q+disp], mm0
    movq    [parm1q+disp+8], mm2
    %assign disp disp+16
    %endrep
    movsxd  r8, r8d         ; i_pix2

    MMX_ZERO  xmm9
    MMX_LOAD_DIFF_8P  xmm0, xmm8, xmm9, [rsi], [rcx]
    MMX_LOAD_DIFF_8P  xmm1, xmm8, xmm9, [rsi+rdx], [rcx+r8]
    MMX_LOAD_DIFF_8P  xmm2, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
    lea     r9,  [rdx+rdx*2]
    lea     r10, [r8+r8*2]
    add     rsi, r9
    add     rcx, r10
    MMX_LOAD_DIFF_8P  xmm3, xmm8, xmm9, [rsi], [rcx]
    MMX_LOAD_DIFF_8P  xmm4, xmm8, xmm9, [rsi+rdx], [rcx+r8]
    MMX_LOAD_DIFF_8P  xmm5, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
    MMX_LOAD_DIFF_8P  xmm6, xmm8, xmm9, [rsi+r9], [rcx+r10]
    MMX_LOAD_DIFF_8P  xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]

    SSE2_TRANSPOSE8x8  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
    DCT8_1D            xmm0, xmm5, xmm7, xmm3, xmm8, xmm4, xmm2, xmm1, xmm6, xmm9
    SSE2_TRANSPOSE8x8  xmm4, xmm5, xmm7, xmm2, xmm8, xmm3, xmm1, xmm6, xmm0
    DCT8_1D            xmm4, xmm3, xmm6, xmm2, xmm0, xmm8, xmm7, xmm5, xmm1, xmm9

    movdqa  [rdi+0x00], xmm8
    movdqa  [rdi+0x10], xmm3
    movdqa  [rdi+0x20], xmm6
    movdqa  [rdi+0x30], xmm7
    movdqa  [rdi+0x40], xmm0
    movdqa  [rdi+0x50], xmm2
    movdqa  [rdi+0x60], xmm5
    movdqa  [rdi+0x70], xmm1
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_ydct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_ydct8_mmx:
;-------------------------------------------------------------------------
; vertical dct ( compute 4 columns at a time -> 2 loops )
;-------------------------------------------------------------------------
    %assign disp 0
    %rep 2

; in: ABCDEFGH
; out: IBHDEACG
%macro IDCT8_1D 10
    MMX_SUMSUB_BA  %5, %1   ; %5=a0, %1=a2

    movdqa  %10, %3
    psraw   %3, 1
    psubw   %3, %7          ; %3=a4
    psraw   %7, 1
    paddw   %7, %10         ; %7=a6

    movdqa  %9, %2
    psraw   %9, 1
    paddw   %9, %2
    paddw   %9, %4
    paddw   %9, %6          ; %9=a7

    MMX_LOADSUMSUB  mm2, mm3, [parm1q+disp+0*16], [parm1q+disp+7*16] ; mm2 = s07, mm3 = d07
    MMX_LOADSUMSUB  mm1, mm5, [parm1q+disp+1*16], [parm1q+disp+6*16] ; mm1 = s16, mm5 = d16
    MMX_LOADSUMSUB  mm0, mm6, [parm1q+disp+2*16], [parm1q+disp+5*16] ; mm0 = s25, mm6 = d25
    MMX_LOADSUMSUB  mm4, mm7, [parm1q+disp+3*16], [parm1q+disp+4*16] ; mm4 = s34, mm7 = d34

    MMX_SUMSUB_BA   mm4, mm2    ; mm4 = a0, mm2 = a2
    MMX_SUMSUB_BA   mm0, mm1    ; mm0 = a1, mm1 = a3
    MMX_SUMSUB_BA   mm0, mm4    ; mm0 = dst0, mm1 = dst4

    movq    [parm1q+disp+0*16], mm0
    movq    [parm1q+disp+4*16], mm4

    movq    mm0, mm1    ; a3
    psraw   mm0, 1      ; a3>>1
    paddw   mm0, mm2    ; a2 + (a3>>1)
    psraw   mm2, 1      ; a2>>1
    psubw   mm2, mm1    ; (a2>>1) - a3

    movq    [parm1q+disp+2*16], mm0
    movq    [parm1q+disp+6*16], mm2

    movq    mm0, mm6
    psraw   mm0, 1
    paddw   mm0, mm6    ; d25+(d25>>1)
    movq    mm1, mm3
    psubw   mm1, mm7    ; a5 = d07-d34-(d25+(d25>>1))
    psubw   mm1, mm0

    movq    mm0, mm5
    psraw   mm0, 1
    paddw   mm0, mm5    ; d16+(d16>>1)
    movq    mm2, mm3
    paddw   mm2, mm7    ; a6 = d07+d34-(d16+(d16>>1))
    psubw   mm2, mm0

    movq    mm0, mm3
    psraw   mm0, 1
    paddw   mm0, mm3    ; d07+(d07>>1)
    paddw   mm0, mm5
    paddw   mm0, mm6    ; a4 = d16+d25+(d07+(d07>>1))

    movq    mm3, mm7
    psraw   mm3, 1
    paddw   mm3, mm7    ; d34+(d34>>1)
    paddw   mm3, mm5
    psubw   mm3, mm6    ; a7 = d16-d25+(d34+(d34>>1))

    movq    mm7, mm3
    psraw   mm7, 2
    paddw   mm7, mm0    ; a4 + (a7>>2)
    movq    mm6, mm2
    psraw   mm6, 2
    paddw   mm6, mm1    ; a5 + (a6>>2)
    psraw   mm0, 2
    psraw   mm1, 2
    psubw   mm0, mm3    ; (a4>>2) - a7
    psubw   mm2, mm1    ; a6 - (a5>>2)

    movq    [parm1q+disp+1*16], mm7
    movq    [parm1q+disp+3*16], mm6
    movq    [parm1q+disp+5*16], mm2
    movq    [parm1q+disp+7*16], mm0

    %assign disp disp+8
    %endrep

    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_xidct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xidct8_mmxext:
    movq    mm4, [x264_mmx_PPNN GLOBAL]
    movq    mm5, [x264_mmx_PNPN GLOBAL]
    movq    mm6, [x264_mmx_PPNP GLOBAL]
    movq    mm7, [x264_mmx_PPPN GLOBAL]

;-------------------------------------------------------------------------
; horizontal idct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
    %assign disp 0
    %rep 8
    pshufw  mm0, [parm1q+disp], 11011000b   ; (low)d0,d2,d1,d3(high)
    pshufw  mm2, [parm1q+disp+8], 11011000b ; (low)d4,d6,d5,d7(high)
    movq    mm1, mm0
    punpcklwd mm0, mm2          ; (low)d0,d4,d2,d6(high)
    punpckhwd mm1, mm2          ; (low)d1,d5,d3,d7(high)

    pshufw  mm2, mm0, 10110001b ; (low)d4,d0,d6,d2(high)
    pmullw  mm0, [x264_mmx_p2n2p1p1 GLOBAL] ; (low)2*d0,-2*d4,d2,d6(high)
    pmullw  mm2, mm6            ; (low)d4,d0,-d6,d2(high)
    psraw   mm0, 1              ; (low)d0,-d4,d2>>1,d6>>1(high)
    paddw   mm0, mm2            ; (low)e0,e2,e4,e6(high)

    movq    mm3, mm1            ; (low)d1,d5,d3,d7(high)
    psraw   mm1, 1              ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
    pshufw  mm2, mm3, 10110001b ; (low)d5,d1,d7,d3(high)
    paddw   mm1, mm3            ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
    pshufw  mm3, mm2, 00011011b ; (low)d3,d7,d1,d5(high)
    pmullw  mm1, mm4            ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
    pmullw  mm2, mm5            ; (low)d5,-d1,d7,-d3(high)
    paddw   mm1, mm3
    paddw   mm1, mm2            ; (low)e7,e5,e3,e1(high)

    pshufw  mm2, mm0, 00011011b ; (low)e6,e4,e2,e0(high)
    pmullw  mm0, mm4            ; (low)e0,e2,-e4,-e6(high)
    pshufw  mm3, mm1, 00011011b ; (low)e1,e3,e5,e7(high)
    psraw   mm1, 2              ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
    pmullw  mm3, mm6            ; (low)e1,e3,-e5,e7(high)
    pmullw  mm1, mm7            ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
    paddw   mm0, mm2            ; (low)f0,f2,f4,f6(high)
    paddw   mm1, mm3            ; (low)f1,f3,f5,f7(high)

    pshufw  mm3, mm0, 00011011b ; (low)f6,f4,f2,f0(high)
    pshufw  mm2, mm1, 00011011b ; (low)f7,f5,f3,f1(high)
    psubw   mm3, mm1
    paddw   mm0, mm2
    movq    [parm1q+disp], mm0
    movq    [parm1q+disp+8], mm3
    %assign disp disp+16
    %endrep

    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_yidct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_yidct8_mmx:
;-------------------------------------------------------------------------
; vertical idct ( compute 4 columns at a time -> 2 loops )
;-------------------------------------------------------------------------
    %assign disp 0
    %rep 2
    movq    mm1, [parm1q+disp+1*16] ; mm1 = d1
    movq    mm3, [parm1q+disp+3*16] ; mm3 = d3
    movq    mm5, [parm1q+disp+5*16] ; mm5 = d5
    movq    mm7, [parm1q+disp+7*16] ; mm7 = d7

    movq    mm4, mm7
    psraw   mm4, 1
    movq    mm0, mm5
    psubw   mm0, mm7
    psubw   mm0, mm4
    psubw   mm0, mm3    ; mm0 = e1

    movq    mm6, mm3
    psraw   mm6, 1
    movq    mm2, mm7
    psubw   mm2, mm6
    psubw   mm2, mm3
    paddw   mm2, mm1    ; mm2 = e3

    movq    mm4, mm5
    psraw   mm4, 1
    paddw   mm4, mm5
    paddw   mm4, mm7
    psubw   mm4, mm1    ; mm4 = e5

    movq    mm6, mm1
    psraw   mm6, 1
    paddw   mm6, mm1
    paddw   mm6, mm5
    paddw   mm6, mm3    ; mm6 = e7

    movq    mm1, mm0
    movq    mm3, mm4
    movq    mm5, mm2
    movq    mm7, mm6
    psraw   mm6, 2
    psraw   mm3, 2
    psraw   mm5, 2
    psraw   mm0, 2
    paddw   mm1, mm6    ; mm1 = f1
    paddw   mm3, mm2    ; mm3 = f3
    psubw   mm5, mm4    ; mm5 = f5
    psubw   mm7, mm0    ; mm7 = f7

    movq    mm2, [parm1q+disp+2*16] ; mm2 = d2
    movq    mm6, [parm1q+disp+6*16] ; mm6 = d6
    movq    mm4, mm2
    movq    mm0, mm6
    psraw   mm4, 1
    psraw   mm6, 1
    psubw   mm4, mm0    ; mm4 = a4
    paddw   mm6, mm2    ; mm6 = a6

    movq    mm2, [parm1q+disp+0*16] ; mm2 = d0
    movq    mm0, [parm1q+disp+4*16] ; mm0 = d4
    MMX_SUMSUB_BA   mm0, mm2        ; mm0 = a0, mm2 = a2
    MMX_SUMSUB_BA   mm6, mm0        ; mm6 = f0, mm0 = f6
    MMX_SUMSUB_BA   mm4, mm2        ; mm4 = f2, mm2 = f4
    MMX_SUMSUB_BA   mm7, mm6        ; mm7 = g0, mm6 = g7
    MMX_SUMSUB_BA   mm5, mm4        ; mm5 = g1, mm4 = g6
    MMX_SUMSUB_BA   mm3, mm2        ; mm3 = g2, mm2 = g5
    MMX_SUMSUB_BA   mm1, mm0        ; mm1 = g3, mm0 = g4

    psraw   mm7, 6
    psraw   mm6, 6
    psraw   mm5, 6
    psraw   mm4, 6
    psraw   mm3, 6
    psraw   mm2, 6
    psraw   mm1, 6
    psraw   mm0, 6

    movq    [parm1q+disp+0*16], mm7
    movq    [parm1q+disp+1*16], mm5
    movq    [parm1q+disp+2*16], mm3
    movq    [parm1q+disp+3*16], mm1
    movq    [parm1q+disp+4*16], mm0
    movq    [parm1q+disp+5*16], mm2
    movq    [parm1q+disp+6*16], mm4
    movq    [parm1q+disp+7*16], mm6
    %assign disp disp+8
    %endrep
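
For reference, the per-column inverse butterfly performed by x264_yidct8_mmx above can be written in scalar C as below, reusing the e/f/g names from its comments. This is an illustrative sketch only, not code from the patch (idct8_1d_ref is an invented name); the final arithmetic shift by 6 matches this vertical pass, which carries the whole 2-D scaling of the inverse transform.

#include <stdint.h>

static void idct8_1d_ref( int16_t d[8] )
{
    const int a0 =  d[0] + d[4];
    const int a2 =  d[0] - d[4];
    const int a4 = (d[2]>>1) - d[6];
    const int a6 = (d[6]>>1) + d[2];

    const int e1 = d[5] - d[7] - (d[7]>>1) - d[3];
    const int e3 = d[7] - d[3] - (d[3]>>1) + d[1];
    const int e5 = d[5] + (d[5]>>1) + d[7] - d[1];
    const int e7 = d[1] + (d[1]>>1) + d[5] + d[3];

    const int f0 = a0 + a6, f6 = a0 - a6;
    const int f2 = a2 + a4, f4 = a2 - a4;
    const int f1 = e1 + (e7>>2);
    const int f3 = e3 + (e5>>2);
    const int f5 = (e3>>2) - e5;
    const int f7 = e7 - (e1>>2);

    d[0] = (int16_t)((f0 + f7) >> 6);   /* g0 */
    d[1] = (int16_t)((f2 + f5) >> 6);   /* g1 */
    d[2] = (int16_t)((f4 + f3) >> 6);   /* g2 */
    d[3] = (int16_t)((f6 + f1) >> 6);   /* g3 */
    d[4] = (int16_t)((f6 - f1) >> 6);   /* g4 */
    d[5] = (int16_t)((f4 - f3) >> 6);   /* g5 */
    d[6] = (int16_t)((f2 - f5) >> 6);   /* g6 */
    d[7] = (int16_t)((f0 - f7) >> 6);   /* g7 */
}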
    movdqa  %10, %6
    psraw   %10, 1
    paddw   %10, %6
    paddw   %10, %8
    psubw   %10,