Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
539103a5
Commit
539103a5
authored
Apr 08, 2008
by
Loren Merritt
Browse files
dct4 sse2
parent
9168abfa
Changes
3
Hide whitespace changes
Inline
Side-by-side
common/dct.c
View file @
539103a5
...
...
@@ -420,6 +420,13 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf
->
add8x8_idct8
=
x264_add8x8_idct8_sse2
;
dctf
->
add16x16_idct8
=
x264_add16x16_idct8_sse2
;
}
if
(
(
cpu
&
X264_CPU_SSE2
)
&&
!
(
cpu
&
X264_CPU_3DNOW
)
)
{
dctf
->
sub8x8_dct
=
x264_sub8x8_dct_sse2
;
dctf
->
sub16x16_dct
=
x264_sub16x16_dct_sse2
;
dctf
->
add8x8_idct
=
x264_add8x8_idct_sse2
;
dctf
->
add16x16_idct
=
x264_add16x16_idct_sse2
;
}
#endif //HAVE_MMX
#ifdef ARCH_PPC
...
...
common/x86/dct-a.asm
View file @
539103a5
...
...
@@ -32,9 +32,9 @@ pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
SECTION
.text
%macro LOAD_DIFF_4P 5
mov
d
%
1
,
%
4
mov
h
%
1
,
%
4
punpcklbw
%
1
,
%
3
mov
d
%
2
,
%
5
mov
h
%
2
,
%
5
punpcklbw
%
2
,
%
3
psubw
%
1
,
%
2
%endmacro
...
...
@@ -55,7 +55,7 @@ SECTION .text
%endmacro
%macro SUMSUB2_AB 3
mov
q
%
3
,
%
1
mov
a
%
3
,
%
1
paddw
%
1
,
%
1
paddw
%
1
,
%
2
psubw
%
3
,
%
2
...
...
@@ -63,8 +63,8 @@ SECTION .text
%endmacro
%macro SUMSUBD2_AB 4
mov
q
%
4
,
%
1
mov
q
%
3
,
%
2
mov
a
%
4
,
%
1
mov
a
%
3
,
%
2
psraw
%
2
,
1
psraw
%
4
,
1
paddw
%
1
,
%
2
...
...
@@ -86,14 +86,22 @@ SECTION .text
SWAP
%
2
,
%
3
%endmacro
%macro STORE_DIFF_4P 5
paddw
%
1
,
%
3
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY
wd
,
%
1
,
%
2
,
%
5
SBUTTERFLY
wd
,
%
3
,
%
4
,
%
5
SBUTTERFLY
dq
,
%
1
,
%
3
,
%
5
SBUTTERFLY
dq
,
%
2
,
%
4
,
%
5
SBUTTERFLY
qdq
,
%
1
,
%
2
,
%
5
SBUTTERFLY
qdq
,
%
3
,
%
4
,
%
5
%endmacro
%macro STORE_DIFF_4P 4
psraw
%
1
,
6
mov
d
%
2
,
%
5
punpcklbw
%
2
,
%
4
mov
h
%
2
,
%
4
punpcklbw
%
2
,
%
3
paddsw
%
1
,
%
2
packuswb
%
1
,
%
1
mov
d
%
5
,
%
1
mov
h
%
4
,
%
1
%endmacro
%macro HADAMARD4_1D 4
...
...
@@ -164,17 +172,20 @@ cglobal x264_idct4x4dc_mmx, 1,1
;-----------------------------------------------------------------------------
cglobal
x264_sub4x4_dct_mmx
,
3
,
3
.skip_prologue:
%macro SUB_DCT4 1
LOAD_DIFF_4P
m0
,
m6
,
m7
,
[
r1
+
0
*
FENC_STRIDE
],
[
r2
+
0
*
FDEC_STRIDE
]
LOAD_DIFF_4P
m1
,
m6
,
m7
,
[
r1
+
1
*
FENC_STRIDE
],
[
r2
+
1
*
FDEC_STRIDE
]
LOAD_DIFF_4P
m2
,
m6
,
m7
,
[
r1
+
2
*
FENC_STRIDE
],
[
r2
+
2
*
FDEC_STRIDE
]
LOAD_DIFF_4P
m3
,
m6
,
m7
,
[
r1
+
3
*
FENC_STRIDE
],
[
r2
+
3
*
FDEC_STRIDE
]
DCT4_1D
0
,
1
,
2
,
3
,
4
TRANSPOSE
4x4W
0
,
1
,
2
,
3
,
4
TRANSPOSE
%
1
0
,
1
,
2
,
3
,
4
DCT4_1D
0
,
1
,
2
,
3
,
4
movq
[
r0
+
0
],
m0
movq
[
r0
+
8
],
m1
movq
[
r0
+
16
],
m2
movq
[
r0
+
24
],
m3
%endmacro
SUB_DCT4
4
x4W
RET
;-----------------------------------------------------------------------------
...
...
@@ -186,18 +197,52 @@ cglobal x264_add4x4_idct_mmx, 2,2,1
movq
m1
,
[
r1
+
8
]
movq
m2
,
[
r1
+
16
]
movq
m3
,
[
r1
+
24
]
%macro ADD_IDCT4 1
IDCT4_1D
0
,
1
,
2
,
3
,
4
,
5
TRANSPOSE4x4W
0
,
1
,
2
,
3
,
4
TRANSPOSE
%
1
0
,
1
,
2
,
3
,
4
paddw
m0
,
[
pw_32
GLOBAL
]
IDCT4_1D
0
,
1
,
2
,
3
,
4
,
5
pxor
m7
,
m7
movq
m6
,
[
pw_32
GLOBAL
]
STORE_DIFF_4P
m0
,
m4
,
m6
,
m7
,
[
r0
+
0
*
FDEC_STRIDE
]
STORE_DIFF_4P
m1
,
m4
,
m6
,
m7
,
[
r0
+
1
*
FDEC_STRIDE
]
STORE_DIFF_4P
m2
,
m4
,
m6
,
m7
,
[
r0
+
2
*
FDEC_STRIDE
]
STORE_DIFF_4P
m3
,
m4
,
m6
,
m7
,
[
r0
+
3
*
FDEC_STRIDE
]
STORE_DIFF_4P
m0
,
m4
,
m7
,
[
r0
+
0
*
FDEC_STRIDE
]
STORE_DIFF_4P
m1
,
m4
,
m7
,
[
r0
+
1
*
FDEC_STRIDE
]
STORE_DIFF_4P
m2
,
m4
,
m7
,
[
r0
+
2
*
FDEC_STRIDE
]
STORE_DIFF_4P
m3
,
m4
,
m7
,
[
r0
+
3
*
FDEC_STRIDE
]
%endmacro
ADD_IDCT4
4
x4W
RET
INIT_XMM
cglobal
x264_sub8x8_dct_sse2
,
3
,
3
.skip_prologue:
call
.8x4
add
r0
,
64
add
r1
,
4
*
FENC_STRIDE
add
r2
,
4
*
FDEC_STRIDE
.8x4:
SUB_DCT4
2
x4x4W
movhps
[
r0
+
32
],
m0
movhps
[
r0
+
40
],
m1
movhps
[
r0
+
48
],
m2
movhps
[
r0
+
56
],
m3
ret
cglobal
x264_add8x8_idct_sse2
,
2
,
2
,
1
.skip_prologue:
call
.8x4
add
r1
,
64
add
r0
,
4
*
FDEC_STRIDE
.8x4:
movq
m0
,
[
r1
+
0
]
movq
m1
,
[
r1
+
8
]
movq
m2
,
[
r1
+
16
]
movq
m3
,
[
r1
+
24
]
movhps
m0
,
[
r1
+
32
]
movhps
m1
,
[
r1
+
40
]
movhps
m2
,
[
r1
+
48
]
movhps
m3
,
[
r1
+
56
]
ADD_IDCT4
2
x4x4W
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
...
...
@@ -207,16 +252,16 @@ cglobal %1, 3,3
.skip_prologue:
call
%
2
add
r0
,
%
3
add
r1
,
%
4
-%
5
*
FENC_STRIDE
add
r2
,
%
4
-%
5
*
FDEC_STRIDE
add
r1
,
%
4
-%
5
-%
6
*
FENC_STRIDE
add
r2
,
%
4
-%
5
-%
6
*
FDEC_STRIDE
call
%
2
add
r0
,
%
3
add
r1
,
%
4
*
FENC_STRIDE
-%
6
add
r2
,
%
4
*
FDEC_STRIDE
-%
6
add
r1
,
(
%
4
-%
6
)
*
FENC_STRIDE
-%
5
-%
4
add
r2
,
(
%
4
-%
6
)
*
FDEC_STRIDE
-%
5
-%
4
call
%
2
add
r0
,
%
3
add
r1
,
%
4
-%
5
*
FENC_STRIDE
add
r2
,
%
4
-%
5
*
FDEC_STRIDE
add
r1
,
%
4
-%
5
-%
6
*
FENC_STRIDE
add
r2
,
%
4
-%
5
-%
6
*
FDEC_STRIDE
jmp
%
2
%endmacro
...
...
@@ -227,36 +272,39 @@ cglobal %1, 3,3
cglobal
%
1
,
2
,
2
,
1
.skip_prologue:
call
%
2
add
r0
,
%
4
-%
5
*
FDEC_STRIDE
add
r0
,
%
4
-%
5
-%
6
*
FDEC_STRIDE
add
r1
,
%
3
call
%
2
add
r0
,
%
4
*
FDEC_STRIDE
-%
6
add
r0
,
(
%
4
-%
6
)
*
FDEC_STRIDE
-%
5
-%
4
add
r1
,
%
3
call
%
2
add
r0
,
%
4
-%
5
*
FDEC_STRIDE
add
r0
,
%
4
-%
5
-%
6
*
FDEC_STRIDE
add
r1
,
%
3
jmp
%
2
%endmacro
SUB_NxN_DCT
x264_sub8x8_dct_mmx
,
x264_sub4x4_dct_mmx
%+
.skip_prologue
,
32
,
4
,
0
,
4
ADD_NxN_IDCT
x264_add8x8_idct_mmx
,
x264_add4x4_idct_mmx
%+
.skip_prologue
,
32
,
4
,
0
,
4
SUB_NxN_DCT
x264_sub8x8_dct_mmx
,
x264_sub4x4_dct_mmx
%+
.skip_prologue
,
32
,
4
,
0
,
0
ADD_NxN_IDCT
x264_add8x8_idct_mmx
,
x264_add4x4_idct_mmx
%+
.skip_prologue
,
32
,
4
,
0
,
0
SUB_NxN_DCT
x264_sub16x16_dct_mmx
,
x264_sub8x8_dct_mmx
%+
.skip_prologue
,
32
,
8
,
4
,
4
ADD_NxN_IDCT
x264_add16x16_idct_mmx
,
x264_add8x8_idct_mmx
%+
.skip_prologue
,
32
,
8
,
4
,
4
SUB_NxN_DCT
x264_sub16x16_dct_
mmx
,
x264_sub8x8_dct_
mmx
%+
.skip_prologue
,
32
,
4
,
4
,
12
ADD_NxN_IDCT
x264_add16x16_idct_
mmx
,
x264_add8x8_idct_
mmx
%+
.skip_prologue
,
32
,
4
,
4
,
12
SUB_NxN_DCT
x264_sub16x16_dct_
sse2
,
x264_sub8x8_dct_
sse2
%+
.skip_prologue
,
64
,
8
,
0
,
4
ADD_NxN_IDCT
x264_add16x16_idct_
sse2
,
x264_add8x8_idct_
sse2
%+
.skip_prologue
,
64
,
8
,
0
,
4
%ifndef ARCH_X86_64
cextern
x264_sub8x8_dct8_mmx.skip_prologue
cextern
x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT
x264_sub16x16_dct8_mmx
,
x264_sub8x8_dct8_mmx
%+
.skip_prologue
,
128
,
8
,
0
,
8
ADD_NxN_IDCT
x264_add16x16_idct8_mmx
,
x264_add8x8_idct8_mmx
%+
.skip_prologue
,
128
,
8
,
0
,
8
SUB_NxN_DCT
x264_sub16x16_dct8_mmx
,
x264_sub8x8_dct8_mmx
%+
.skip_prologue
,
128
,
8
,
0
,
0
ADD_NxN_IDCT
x264_add16x16_idct8_mmx
,
x264_add8x8_idct8_mmx
%+
.skip_prologue
,
128
,
8
,
0
,
0
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif
cextern
x264_sub8x8_dct8_sse2
cextern
x264_add8x8_idct8_sse2
SUB_NxN_DCT
x264_sub16x16_dct8_sse2
,
x264_sub8x8_dct8_sse2
,
128
,
8
,
0
,
8
ADD_NxN_IDCT
x264_add16x16_idct8_sse2
,
x264_add8x8_idct8_sse2
,
128
,
8
,
0
,
8
SUB_NxN_DCT
x264_sub16x16_dct8_sse2
,
x264_sub8x8_dct8_sse2
,
128
,
8
,
0
,
0
ADD_NxN_IDCT
x264_add16x16_idct8_sse2
,
x264_add8x8_idct8_sse2
,
128
,
8
,
0
,
0
...
...
common/x86/dct.h
View file @
539103a5
...
...
@@ -26,23 +26,25 @@
void
x264_sub4x4_dct_mmx
(
int16_t
dct
[
4
][
4
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_sub8x8_dct_mmx
(
int16_t
dct
[
4
][
4
][
4
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_sub16x16_dct_mmx
(
int16_t
dct
[
16
][
4
][
4
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_sub8x8_dct_sse2
(
int16_t
dct
[
4
][
4
][
4
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_sub16x16_dct_sse2
(
int16_t
dct
[
16
][
4
][
4
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_add4x4_idct_mmx
(
uint8_t
*
p_dst
,
int16_t
dct
[
4
][
4
]
);
void
x264_add8x8_idct_mmx
(
uint8_t
*
p_dst
,
int16_t
dct
[
4
][
4
][
4
]
);
void
x264_add16x16_idct_mmx
(
uint8_t
*
p_dst
,
int16_t
dct
[
16
][
4
][
4
]
);
void
x264_add8x8_idct_sse2
(
uint8_t
*
p_dst
,
int16_t
dct
[
4
][
4
][
4
]
);
void
x264_add16x16_idct_sse2
(
uint8_t
*
p_dst
,
int16_t
dct
[
16
][
4
][
4
]
);
void
x264_dct4x4dc_mmx
(
int16_t
d
[
4
][
4
]
);
void
x264_idct4x4dc_mmx
(
int16_t
d
[
4
][
4
]
);
void
x264_sub8x8_dct8_mmx
(
int16_t
dct
[
8
][
8
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_sub16x16_dct8_mmx
(
int16_t
dct
[
4
][
8
][
8
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_add8x8_idct8_mmx
(
uint8_t
*
dst
,
int16_t
dct
[
8
][
8
]
);
void
x264_add16x16_idct8_mmx
(
uint8_t
*
dst
,
int16_t
dct
[
4
][
8
][
8
]
);
void
x264_sub8x8_dct8_sse2
(
int16_t
dct
[
8
][
8
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_sub16x16_dct8_sse2
(
int16_t
dct
[
4
][
8
][
8
],
uint8_t
*
pix1
,
uint8_t
*
pix2
);
void
x264_add8x8_idct8_mmx
(
uint8_t
*
dst
,
int16_t
dct
[
8
][
8
]
);
void
x264_add16x16_idct8_mmx
(
uint8_t
*
dst
,
int16_t
dct
[
4
][
8
][
8
]
);
void
x264_add8x8_idct8_sse2
(
uint8_t
*
dst
,
int16_t
dct
[
8
][
8
]
);
void
x264_add16x16_idct8_sse2
(
uint8_t
*
dst
,
int16_t
dct
[
4
][
8
][
8
]
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment