VideoLAN / x264, commit 9168abfa

faster x86_32 dct8

Authored Apr 08, 2008 by Loren Merritt
Parent: 56bf7565
3 changed files
common/dct.c
...
@@ -415,14 +415,11 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
    if( cpu&X264_CPU_SSE2 )
    {
#ifdef ARCH_X86_64
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
#endif
        dctf->add8x8_idct8   = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
    }
#endif //HAVE_MMX

#ifdef ARCH_PPC
...

common/x86/dct-32.asm
...
@@ -37,15 +37,6 @@ SECTION .text
    psubw   %2, %1
%endmacro

%macro SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1
    psubw   %4, %3
%endmacro

%macro SBUTTERFLY 4
    mova       m%4, m%2
    punpckl%1  m%2, m%3
...
@@ -61,330 +52,91 @@ SECTION .text
    SWAP %2, %3
%endmacro

; input 2x8 unsigned bytes (%5,%6), zero (%7) output: difference (%1,%2)
%macro LOAD_DIFF_8P 7
    movq        %1, %5
    movq        %2, %1
    punpcklbw   %1, %7
    punpckhbw   %2, %7
    movq        %3, %6
    movq        %4, %3
    punpcklbw   %3, %7
    punpckhbw   %4, %7
    psubw       %1, %3
    psubw       %2, %4
%endmacro

%macro LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
    movq        %2, %3
    movq        %1, %4
    SUMSUB_BA   %1, %2

%macro LOAD_DIFF_8P 4
    movh        %1, %3
    movh        %2, %4
    punpcklbw   %1, %2
    punpcklbw   %2, %2
    psubw       %1, %2
%endmacro

%macro STORE_DIFF_8P 4
    psraw       %1, 6
    movq        %3, %2
    movh        %3, %2
    punpcklbw   %3, %4
    paddsw      %1, %3
    packuswb    %1, %1
    movq        %2, %1
    movh        %2, %1
%endmacro

; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
    SUMSUB_BA  m%8, m%1      ; %8 = s07,  %1 = d07
    SUMSUB_BA  m%7, m%2      ; %7 = s16,  %2 = d16
    SUMSUB_BA  m%6, m%3      ; %6 = s25,  %3 = d25
    SUMSUB_BA  m%5, m%4      ; %5 = s34,  %4 = d34
    SUMSUB_BA  m%5, m%8      ; %5 = a0,   %8 = a2
    SUMSUB_BA  m%6, m%7      ; %6 = a1,   %7 = a3
    SUMSUB_BA  m%6, m%5      ; %6 = dst0, %5 = dst4
    mova  [%9+0x00], m%6
    mova  [%9+0x40], m%5
    mova  m%6, m%7           ; a3
    psraw m%6, 1             ; a3>>1
    paddw m%6, m%8           ; a2 + (a3>>1)
    psraw m%8, 1             ; a2>>1
    psubw m%8, m%7           ; (a2>>1) - a3
    mova  [%9+0x60], m%8
    mova  m%5, m%3
    psraw m%5, 1
    paddw m%5, m%3           ; d25+(d25>>1)
    mova  m%7, m%1
    psubw m%7, m%4           ; a5 = d07-d34-(d25+(d25>>1))
    psubw m%7, m%5
    mova  m%5, m%2
    psraw m%5, 1
    paddw m%5, m%2           ; d16+(d16>>1)
    mova  m%8, m%1
    paddw m%8, m%4
    psubw m%8, m%5           ; a6 = d07+d34-(d16+(d16>>1))
    mova  m%5, m%1
    psraw m%5, 1
    paddw m%5, m%1           ; d07+(d07>>1)
    paddw m%5, m%2
    paddw m%5, m%3           ; a4 = d16+d25+(d07+(d07>>1))
    mova  m%1, m%4
    psraw m%1, 1
    paddw m%1, m%4           ; d34+(d34>>1)
    paddw m%1, m%2
    psubw m%1, m%3           ; a7 = d16-d25+(d34+(d34>>1))
    mova  m%4, m%1
    psraw m%4, 2
    paddw m%4, m%5           ; a4 + (a7>>2)
    mova  m%3, m%8
    psraw m%3, 2
    paddw m%3, m%7           ; a5 + (a6>>2)
    psraw m%5, 2
    psraw m%7, 2
    psubw m%5, m%1           ; (a4>>2) - a7
    psubw m%8, m%7           ; a6 - (a5>>2)
    SWAP %2, %4, %3, %6, %8, %5
%endmacro
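
For reference, here is a minimal C sketch of the 1-D butterfly that DCT8_1D implements, reconstructed from the s/d/a register comments above; the standalone function, its name, and the use of int intermediates are illustrative (the asm works in packed 16-bit lanes), not code from this commit.

/* minimal C sketch of the DCT8_1D butterfly, names illustrative */
static void dct8_1d_sketch( int16_t d[8] )
{
    int s07 = d[0]+d[7], d07 = d[0]-d[7];
    int s16 = d[1]+d[6], d16 = d[1]-d[6];
    int s25 = d[2]+d[5], d25 = d[2]-d[5];
    int s34 = d[3]+d[4], d34 = d[3]-d[4];
    int a0 = s07+s34, a2 = s07-s34;           /* even half */
    int a1 = s16+s25, a3 = s16-s25;
    int a4 = d16+d25 + (d07+(d07>>1));        /* odd half  */
    int a5 = d07-d34 - (d25+(d25>>1));
    int a6 = d07+d34 - (d16+(d16>>1));
    int a7 = d16-d25 + (d34+(d34>>1));
    d[0] = a0+a1;        d[4] = a0-a1;
    d[2] = a2+(a3>>1);   d[6] = (a2>>1)-a3;
    d[1] = a4+(a7>>2);   d[3] = a5+(a6>>2);
    d[5] = a6-(a5>>2);   d[7] = (a4>>2)-a7;
}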
;-----------------------------------------------------------------------------
; void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_sub_8x8_mmx:
    pxor mm7, mm7
%assign i 0
%rep 8
    LOAD_DIFF_8P mm0, mm1, mm2, mm3, [r1], [r2], mm7
    movq  [r0+i],   mm0
    movq  [r0+i+8], mm1
    add   r1, FENC_STRIDE
    add   r2, FDEC_STRIDE
%assign i i+16
%endrep
    ret

;-----------------------------------------------------------------------------
; void x264_ydct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_ydct8_mmx:
    ;-------------------------------------------------------------------------
    ; vertical dct ( compute 4 columns at a time -> 2 loops )
    ;-------------------------------------------------------------------------
%assign i 0
%rep 2
    LOADSUMSUB  mm2, mm3, [r0+i+0*16], [r0+i+7*16] ; mm2 = s07, mm3 = d07
    LOADSUMSUB  mm1, mm5, [r0+i+1*16], [r0+i+6*16] ; mm1 = s16, mm5 = d16
    LOADSUMSUB  mm0, mm6, [r0+i+2*16], [r0+i+5*16] ; mm0 = s25, mm6 = d25
    LOADSUMSUB  mm4, mm7, [r0+i+3*16], [r0+i+4*16] ; mm4 = s34, mm7 = d34
    SUMSUB_BA   mm4, mm2                           ; mm4 = a0,  mm2 = a2
    SUMSUB_BA   mm0, mm1                           ; mm0 = a1,  mm1 = a3
    SUMSUB_BA   mm0, mm4                           ; mm0 = dst0, mm4 = dst4
    movq  [r0+i+0*16], mm0
    movq  [r0+i+4*16], mm4
    movq  mm0, mm1        ; a3
    psraw mm0, 1          ; a3>>1
    paddw mm0, mm2        ; a2 + (a3>>1)
    psraw mm2, 1          ; a2>>1
    psubw mm2, mm1        ; (a2>>1) - a3
    movq  [r0+i+2*16], mm0
    movq  [r0+i+6*16], mm2
    movq  mm0, mm6
    psraw mm0, 1
    paddw mm0, mm6        ; d25+(d25>>1)
    movq  mm1, mm3
    psubw mm1, mm7        ; a5 = d07-d34-(d25+(d25>>1))
    psubw mm1, mm0
    movq  mm0, mm5
    psraw mm0, 1
    paddw mm0, mm5        ; d16+(d16>>1)
    movq  mm2, mm3
    paddw mm2, mm7        ; a6 = d07+d34-(d16+(d16>>1))
    psubw mm2, mm0
    movq  mm0, mm3
    psraw mm0, 1
    paddw mm0, mm3        ; d07+(d07>>1)
    paddw mm0, mm5
    paddw mm0, mm6        ; a4 = d16+d25+(d07+(d07>>1))
    movq  mm3, mm7
    psraw mm3, 1
    paddw mm3, mm7        ; d34+(d34>>1)
    paddw mm3, mm5
    psubw mm3, mm6        ; a7 = d16-d25+(d34+(d34>>1))
    movq  mm7, mm3
    psraw mm7, 2
    paddw mm7, mm0        ; a4 + (a7>>2)
    movq  mm6, mm2
    psraw mm6, 2
    paddw mm6, mm1        ; a5 + (a6>>2)
    psraw mm0, 2
    psraw mm1, 2
    psubw mm0, mm3        ; (a4>>2) - a7
    psubw mm2, mm1        ; a6 - (a5>>2)
    movq  [r0+i+1*16], mm7
    movq  [r0+i+3*16], mm6
    movq  [r0+i+5*16], mm2
    movq  [r0+i+7*16], mm0
%assign i i+8
%endrep
    ret

;-----------------------------------------------------------------------------
; void x264_yidct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_yidct8_mmx:
    ;-------------------------------------------------------------------------
    ; vertical idct ( compute 4 columns at a time -> 2 loops )
    ;-------------------------------------------------------------------------
%assign i 0
%rep 2
    movq  mm1, [r0+i+1*16]        ; mm1 = d1
    movq  mm3, [r0+i+3*16]        ; mm3 = d3
    movq  mm5, [r0+i+5*16]        ; mm5 = d5
    movq  mm7, [r0+i+7*16]        ; mm7 = d7
    movq  mm4, mm7
    psraw mm4, 1
    movq  mm0, mm5
    psubw mm0, mm7
    psubw mm0, mm4
    psubw mm0, mm3                ; mm0 = e1
    movq  mm6, mm3
    psraw mm6, 1
    movq  mm2, mm7
    psubw mm2, mm6
    psubw mm2, mm3
    paddw mm2, mm1                ; mm2 = e3
    movq  mm4, mm5
    psraw mm4, 1
    paddw mm4, mm5
    paddw mm4, mm7
    psubw mm4, mm1                ; mm4 = e5
    movq  mm6, mm1
    psraw mm6, 1
    paddw mm6, mm1
    paddw mm6, mm5
    paddw mm6, mm3                ; mm6 = e7
    movq  mm1, mm0
    movq  mm3, mm4
    movq  mm5, mm2
    movq  mm7, mm6
    psraw mm6, 2
    psraw mm3, 2
    psraw mm5, 2
    psraw mm0, 2
    paddw mm1, mm6                ; mm1 = f1
    paddw mm3, mm2                ; mm3 = f3
    psubw mm5, mm4                ; mm5 = f5
    psubw mm7, mm0                ; mm7 = f7
    movq  mm2, [r0+i+2*16]        ; mm2 = d2
    movq  mm6, [r0+i+6*16]        ; mm6 = d6
    movq  mm4, mm2
    movq  mm0, mm6
    psraw mm4, 1
    psraw mm6, 1
    psubw mm4, mm0                ; mm4 = a4
    paddw mm6, mm2                ; mm6 = a6
    movq  mm2, [r0+i+0*16]        ; mm2 = d0
    movq  mm0, [r0+i+4*16]        ; mm0 = d4
    SUMSUB_BA   mm0, mm2           ; mm0 = a0, mm2 = a2
    SUMSUB_BADC mm6, mm0, mm4, mm2 ; mm6 = f0, mm0 = f6
                                   ; mm4 = f2, mm2 = f4
    SUMSUB_BADC mm7, mm6, mm5, mm4 ; mm7 = g0, mm6 = g7
                                   ; mm5 = g1, mm4 = g6
    SUMSUB_BADC mm3, mm2, mm1, mm0 ; mm3 = g2, mm2 = g5
                                   ; mm1 = g3, mm0 = g4
    movq  [r0+i+0*16], mm7
    movq  [r0+i+1*16], mm5
    movq  [r0+i+2*16], mm3
    movq  [r0+i+3*16], mm1
    movq  [r0+i+4*16], mm0
    movq  [r0+i+5*16], mm2
    movq  [r0+i+6*16], mm4
    movq  [r0+i+7*16], mm6
%assign i i+8
%endrep
    ret
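
The per-column inverse butterfly above can likewise be read as the following C sketch, reconstructed from the d/e/f/a/g register comments in the code; the function and its in-place interface are illustrative only, not code from this commit.

/* minimal C sketch of the per-column inverse butterfly, names illustrative */
static void idct8_1d_sketch( int16_t d[8] )
{
    int a0 = d[0]+d[4], a2 = d[0]-d[4];
    int a4 = (d[2]>>1) - d[6];
    int a6 = (d[6]>>1) + d[2];
    int e1 = d[5] - d[3] - d[7] - (d[7]>>1);
    int e3 = d[1] + d[7] - d[3] - (d[3]>>1);
    int e5 = d[7] - d[1] + d[5] + (d[5]>>1);
    int e7 = d[3] + d[5] + d[1] + (d[1]>>1);
    int f0 = a0+a6, f6 = a0-a6, f2 = a2+a4, f4 = a2-a4;
    int f1 = e1+(e7>>2), f7 = e7-(e1>>2);
    int f3 = e3+(e5>>2), f5 = (e3>>2)-e5;
    d[0] = f0+f7;  d[7] = f0-f7;
    d[1] = f2+f5;  d[6] = f2-f5;
    d[2] = f4+f3;  d[5] = f4-f3;
    d[3] = f6+f1;  d[4] = f6-f1;
}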
;-----------------------------------------------------------------------------
; void x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_pixel_add_8x8_mmx:
    pxor mm7, mm7
%assign i 0
%rep 8
    movq  mm0, [r0]
    movq  mm2, [r1+i]
    movq  mm3, [r1+i+8]
    movq  mm1, mm0
    psraw mm2, 6
    psraw mm3, 6
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    paddw mm0, mm2
    paddw mm1, mm3
    packuswb mm0, mm1
    movq  [r0], mm0
    add   r0, FDEC_STRIDE
%assign i i+16
%endrep
    ret
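
Per row, this amounts to the C sketch below; the >>6 undoes the transform scaling, and the +32 rounding bias is folded into the DC coefficient by the caller (see the add word [...], 32 in add8x8_idct8). The helper name is illustrative; the caller advances dst by FDEC_STRIDE and src by 8 per row.

/* hedged C sketch of one iteration of the loop above */
static void pixel_add_row_sketch( uint8_t *dst, const int16_t *src )
{
    for( int x = 0; x < 8; x++ )
    {
        int v = dst[x] + (src[x] >> 6);          /* psraw 6 + paddw              */
        dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* packuswb saturates to 0..255 */
    }
}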
;-----------------------------------------------------------------------------
; void x264_transpose_8x8_mmx( int16_t src[8][8] );
;-----------------------------------------------------------------------------
ALIGN 16
x264_transpose_8x8_mmx:
    movq  m0, [r0    ]
    movq  m1, [r0+ 16]
    movq  m2, [r0+ 32]
    movq  m3, [r0+ 48]
    TRANSPOSE4x4W 0,1,2,3,4
    movq  [r0    ], m0
    movq  [r0+ 16], m1
    movq  [r0+ 32], m2
    movq  [r0+ 48], m3

    movq  m0, [r0+ 72]
    movq  m1, [r0+ 88]
    movq  m2, [r0+104]
    movq  m3, [r0+120]
    TRANSPOSE4x4W 0,1,2,3,4
    movq  [r0+ 72], m0
    movq  [r0+ 88], m1
    movq  [r0+104], m2
    movq  [r0+120], m3

    movq  m0, [r0+  8]
    movq  m1, [r0+ 24]
    movq  m2, [r0+ 40]
    movq  m3, [r0+ 56]
    TRANSPOSE4x4W 0,1,2,3,4
    movq  m4, [r0+ 64]
    movq  m5, [r0+ 80]
    movq  m6, [r0+ 96]
    movq  m7, [r0+112]
    movq  [r0+ 64], m0
    movq  [r0+ 80], m1
    movq  [r0+ 96], m2
    movq  [r0+112], m3
    TRANSPOSE4x4W 4,5,6,7,0
    movq  [r0+  8], m4
    movq  [r0+ 24], m5
    movq  [r0+ 40], m6
    movq  [r0+ 56], m7
    ret
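
The routine above performs a plain 8x8 transpose of int16_t words, implemented as four 4x4 word transposes: the two diagonal 4x4 blocks are transposed in place, and the two off-diagonal blocks are transposed and exchanged. A hedged C sketch of the net effect (function name and layout are illustrative):

static void transpose8x8_sketch( int16_t a[8][8] )
{
    for( int i = 0; i < 8; i++ )
        for( int j = 0; j < i; j++ )
        {
            int16_t t = a[i][j];   /* swap across the diagonal */
            a[i][j] = a[j][i];
            a[j][i] = t;
        }
}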
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
    call x264_pixel_sub_8x8_mmx
    call x264_ydct8_mmx
    call x264_transpose_8x8_mmx
    jmp  x264_ydct8_mmx

;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 0,1
    mov  r0, r1m
    add  word [r0], 32
    call x264_yidct8_mmx
    call x264_transpose_8x8_mmx
    call x264_yidct8_mmx
    mov  r1, r0
    mov  r0, r0m
    jmp  x264_pixel_add_8x8_mmx
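
Chaining the helpers, the two mmx entry points above amount to the following composition. This is a hedged C-style sketch that uses the labels above as if they were C functions (they are not; they use the asm calling convention), purely to show the data flow.

void sub8x8_dct8_sketch( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
    x264_pixel_sub_8x8_mmx( dct, pix1, pix2 );  /* dct = pix1 - pix2               */
    x264_ydct8_mmx( dct );                      /* 1-D pass down the columns       */
    x264_transpose_8x8_mmx( dct );
    x264_ydct8_mmx( dct );                      /* second 1-D pass on the transpose */
}

void add8x8_idct8_sketch( uint8_t *dst, int16_t dct[8][8] )
{
    dct[0][0] += 32;                            /* rounding bias folded into DC    */
    x264_yidct8_mmx( dct );
    x264_transpose_8x8_mmx( dct );
    x264_yidct8_mmx( dct );
    x264_pixel_add_8x8_mmx( dst, dct );         /* dst = clip(dst + (dct>>6))      */
}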
INIT_XMM
%macro IDCT8_1D 8
    movdqa m%1, m%3
    movdqa m%5, m%7
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
    mova  m%1, m%3
    mova  m%5, m%7
    psraw m%3, 1
    psraw m%7, 1
    psubw m%3, m%5
    paddw m%7, m%1
    movdqa m%5, m%2
    mova   m%5, m%2
    psraw m%5, 1
    paddw m%5, m%2
    paddw m%5, m%4
    paddw m%5, m%6
    movdqa m%1, m%6
    mova   m%1, m%6
    psraw m%1, 1
    paddw m%1, m%6
    paddw m%1, m%8
...
@@ -397,8 +149,8 @@ INIT_XMM
    psraw m%8, 1
    psubw m%2, m%4
    psubw m%6, m%8
    movdqa m%4, m%5
    movdqa m%8, m%1
    mova   m%4, m%5
    mova   m%8, m%1
    psraw m%4, 2
    psraw m%8, 2
    paddw m%4, m%6
...
@@ -407,8 +159,8 @@ INIT_XMM
    psraw m%2, 2
    psubw m%5, m%6
    psubw m%2, m%1
    movdqa m%1, [r1+0x00]
    movdqa m%6, [r1+0x40]
    mova   m%1, [%9+0x00]
    mova   m%6, [%9+0x40]
    SUMSUB_BA m%6, m%1
    SUMSUB_BA m%7, m%6
    SUMSUB_BA m%3, m%1
...
@@ -420,19 +172,227 @@ INIT_XMM
    SWAP %3, %8, %7
%endmacro

; in: m0..m7
; out: all except m4, which is in [%9+0x40]
INIT_MMX
ALIGN 16
load_diff_4x8_mmx:
    LOAD_DIFF_8P m0, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF_8P m1, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF_8P m2, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF_8P m3, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    LOAD_DIFF_8P m4, m7, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    LOAD_DIFF_8P m5, m7, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
    movq [r0], m0
    LOAD_DIFF_8P m6, m7, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF_8P m7, m0, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
    movq m0, [r0]
    ret

INIT_MMX
ALIGN 16
dct8_mmx:
    DCT8_1D 0,1,2,3,4,5,6,7,r0
    SAVE_MM_PERMUTATION dct8_mmx
    ret

%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
    %xdefine %%base %1
    %rep %0/2
    %xdefine %%tmp m%2
    %rotate %0/2
    mova [%%base + %2*16], %%tmp
    %rotate 1-%0/2
    %endrep
%endmacro

%macro UNSPILL_SHUFFLE 3-*
    %xdefine %%base %1
    %rep %0/2
    %xdefine %%tmp m%2
    %rotate %0/2
    mova %%tmp, [%%base + %2*16]
    %rotate 1-%0/2
    %endrep
%endmacro
%macro SPILL 2+ ; assume offsets are the same as reg numbers
    SPILL_SHUFFLE %1, %2, %2
%endmacro

%macro UNSPILL 2+
    UNSPILL_SHUFFLE %1, %2, %2
%endmacro
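
A brief usage note for these spill helpers (an illustration, not code from the commit): with the offsets-equal-register-numbers convention stated on SPILL, SPILL r0, 1, 2, 3 expands to SPILL_SHUFFLE r0, 1, 2, 3, 1, 2, 3, which stores m1 to [r0+0x10], m2 to [r0+0x20] and m3 to [r0+0x30]; UNSPILL reloads registers the same way, and the _SHUFFLE variants take separate register and offset lists so a transpose pass can write rows back in a permuted order.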
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_mmx, 3,3
global x264_sub8x8_dct8_mmx %+ .skip_prologue
.skip_prologue:
    INIT_MMX
    call load_diff_4x8_mmx
    call dct8_mmx
    UNSPILL r0, 0
    TRANSPOSE4x4W 0,1,2,3,4
    SPILL r0, 0,1,2,3
    UNSPILL r0, 4,6
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r0, 4,5,6,7
    INIT_MMX
    add  r1, 4
    add  r2, 4
    add  r0, 8
    call load_diff_4x8_mmx
    sub  r1, 4
    sub  r2, 4
    call dct8_mmx
    sub  r0, 8
    UNSPILL r0+8, 4,6
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r0+8, 4,5,6,7
    UNSPILL r0+8, 0
    TRANSPOSE4x4W 0,1,2,3,5
    UNSPILL r0, 4,5,6,7
    SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
    movq mm4, m6 ; depends on the permutation to not produce conflicts
    movq mm0, m4
    movq mm1, m5
    movq mm2, mm4
    movq mm3, m7
    INIT_MMX
    UNSPILL r0+8, 4,5,6,7
    add  r0, 8
    call dct8_mmx
    sub  r0, 8
    SPILL r0+8, 1,2,3,5,7
    INIT_MMX
    UNSPILL r0, 0,1,2,3,4,5,6,7
    call dct8_mmx
    SPILL r0, 1,2,3,5,7
    ret

INIT_MMX
ALIGN 16
idct8_mmx:
    IDCT8_1D 0,1,2,3,4,5,6,7,r1
    SAVE_MM_PERMUTATION idct8_mmx
    ret

%macro ADD_STORE_ROW 3
    movq  m1, [r0+%1*FDEC_STRIDE]
    movq  m2, m1
    punpcklbw m1, m0
    punpckhbw m2, m0
    paddw m1, %2
    paddw m2, %3
    packuswb m1, m2
    movq  [r0+%1*FDEC_STRIDE], m1
%endmacro

;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_mmx, 2,2
global x264_add8x8_idct8_mmx %+ .skip_prologue
.skip_prologue:
    INIT_MMX
    add  word [r1], 32
    UNSPILL r1, 1,2,3,5,6,7
    call idct8_mmx
    SPILL r1, 7
    TRANSPOSE4x4W 0,1,2,3,7
    SPILL r1, 0,1,2,3
    UNSPILL r1, 7
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r1, 4,5,6,7
    INIT_MMX
    UNSPILL r1+8, 1,2,3,5,6,7
    add  r1, 8
    call idct8_mmx
    sub  r1, 8
    SPILL r1+8, 7
    TRANSPOSE4x4W 0,1,2,3,7
    SPILL r1+8, 0,1,2,3
    UNSPILL r1+8, 7
    TRANSPOSE4x4W 4,5,6,7,0
    SPILL r1+8, 4,5,6,7
    INIT_MMX
    movq m3, [r1+0x08]
    movq m0, [r1+0x40]
    movq [r1+0x40], m3
    movq [r1+0x08], m0
    ; memory layout at this time:
    ; A0------ A1------
    ; B0------ F0------
    ; C0------ G0------
    ; D0------ H0------
    ; E0------ E1------
    ; B1------ F1------
    ; C1------ G1------
    ; D1------ H1------
    UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7