Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
10
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Open sidebar
VideoLAN
x264
Commits
f9ad5ee2
Commit
f9ad5ee2
authored
Jun 06, 2008
by
Loren Merritt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
enable ssse3 phadd satd on Penryn.
parent
b8670681
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
54 additions
and
13 deletions
+54
-13
common/cpu.c
common/cpu.c
+3
-0
common/pixel.c
common/pixel.c
+9
-0
common/x86/pixel-a.asm
common/x86/pixel-a.asm
+34
-13
common/x86/pixel.h
common/x86/pixel.h
+1
-0
tools/checkasm.c
tools/checkasm.c
+6
-0
x264.h
x264.h
+1
-0
No files found.
common/cpu.c
View file @
f9ad5ee2
...
...
@@ -47,6 +47,7 @@ const struct {
{
"SSE2"
,
X264_CPU_MMX
|
X264_CPU_MMXEXT
|
X264_CPU_SSE
|
X264_CPU_SSE2
},
{
"SSE3"
,
X264_CPU_MMX
|
X264_CPU_MMXEXT
|
X264_CPU_SSE
|
X264_CPU_SSE2
|
X264_CPU_SSE3
},
{
"SSSE3"
,
X264_CPU_MMX
|
X264_CPU_MMXEXT
|
X264_CPU_SSE
|
X264_CPU_SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
},
{
"SSE4"
,
X264_CPU_MMX
|
X264_CPU_MMXEXT
|
X264_CPU_SSE
|
X264_CPU_SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
|
X264_CPU_SSE4
},
{
"3DNow"
,
X264_CPU_3DNOW
},
{
"Altivec"
,
X264_CPU_ALTIVEC
},
{
"Cache32"
,
X264_CPU_CACHELINE_SPLIT
|
X264_CPU_CACHELINE_32
},
...
...
@@ -88,6 +89,8 @@ uint32_t x264_cpu_detect( void )
cpu
|=
X264_CPU_SSE3
;
if
(
ecx
&
0x00000200
)
cpu
|=
X264_CPU_SSSE3
;
if
(
ecx
&
0x00080000
)
cpu
|=
X264_CPU_SSE4
;
x264_cpu_cpuid
(
0x80000000
,
&
eax
,
&
ebx
,
&
ecx
,
&
edx
);
max_extended_cap
=
eax
;
...
...
common/pixel.c
View file @
f9ad5ee2
...
...
@@ -360,6 +360,7 @@ SATD_X_DECL7()
SATD_X_DECL7
(
_mmxext
)
SATD_X_DECL5
(
_sse2
)
SATD_X_DECL7
(
_ssse3
)
SATD_X_DECL5
(
_ssse3_phadd
)
#endif
/****************************************************************************
...
...
@@ -649,6 +650,14 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2
(
sad_x4
,
_cache64_ssse3
);
}
}
if
(
cpu
&
X264_CPU_SSE4
)
{
// enabled on Penryn, but slower on Conroe
INIT5
(
satd
,
_ssse3_phadd
);
INIT5
(
satd_x3
,
_ssse3_phadd
);
INIT5
(
satd_x4
,
_ssse3_phadd
);
}
#endif //HAVE_MMX
#ifdef ARCH_PPC
...
...
common/x86/pixel-a.asm
View file @
f9ad5ee2
...
...
@@ -274,19 +274,23 @@ SSD_SSE2 8, 4
LOAD_DIFF_8P
%
4
,
%
6
,
[
r0
+
r4
],
[
r2
+
r5
]
%endmacro
;;; row transform not used, because phaddw is much slower than paddw on a Conroe
;%macro PHSUMSUB 3
; movdqa %3, %1
; phaddw %1, %2
; phsubw %3, %2
;%endmacro
;%macro HADAMARD4_ROW_SSSE3 5 ; abcd-t -> adtc
; PHSUMSUB %1, %2, %5
; PHSUMSUB %3, %4, %2
; PHSUMSUB %1, %3, %4
; PHSUMSUB %5, %2, %3
;%endmacro
; phaddw is used only in 4x4 hadamard, because in 8x8 it's slower:
; even on Penryn, phaddw has latency 3 while paddw and punpck* have 1.
; 4x4 is special in that 4x4 transpose in xmmregs takes extra munging,
; whereas phaddw-based transform doesn't care what order the coefs end up in.
; PHSUMSUB dst_sum, src, dst_diff
;   %3 receives a copy of the original %1, then:
;     %1 = phaddw(%1, %2)   horizontal add of adjacent word pairs
;     %3 = phsubw(%3, %2)   horizontal subtract of adjacent word pairs
;   %2 is read only; %3 is overwritten, so it must not alias %1 or %2.
%macro PHSUMSUB 3
    movdqa  %3, %1          ; keep the pre-add value of %1 for the subtract
    phaddw  %1, %2
    phsubw  %3, %2
%endmacro
; HADAMARD4_ROW_PHADD a, b, c, d, t
;   Two rounds of PHSUMSUB butterflies over four input registers (%1..%4),
;   using %5 as the extra register.
;   Register flow ("abcd-t -> adtc"): after the macro the four results are
;   held in %1, %4, %5 and %3 (in that order); %2 has been overwritten as
;   scratch and carries no result.
%macro HADAMARD4_ROW_PHADD 5 ; abcd-t -> adtc
    ; first round: pair (%1,%2) -> sum in %1, diff in %5
    PHSUMSUB %1, %2, %5
    ; first round: pair (%3,%4) -> sum in %3, diff in %2
    PHSUMSUB %3, %4, %2
    ; second round on the two sums -> %1 and %4
    PHSUMSUB %1, %3, %4
    ; second round on the two diffs -> %5 and %3
    PHSUMSUB %5, %2, %3
%endmacro
%macro SUMSUB_BADC 4
paddw
%
1
,
%
2
...
...
@@ -494,6 +498,21 @@ SSD_SSE2 8, 4
paddusw
xmm6
,
xmm2
%endmacro
; SATD_8x4_PHADD advance
;   Processes one block through LOAD_DIFF_8x4P, HADAMARD4_1D,
;   HADAMARD4_ROW_PHADD and ABS4, then adds the four resulting registers
;   into xmm6 with unsigned saturating word adds (paddusw).
;   %1 != 0: after the load, r0 is advanced by 4*r1 and r2 by 4*r3
;            (r1/r3 appear to be the two strides -- see SATD_START_MMX;
;            confirm against the callers).
;   Uses xmm0-xmm5 as working registers; xmm6 is the running accumulator
;   and is expected to be initialised by the caller.
;   NOTE(review): LOAD_DIFF_8x4P, HADAMARD4_1D and ABS4 are defined
;   elsewhere in the file; their exact register contracts are not visible here.
%macro SATD_8x4_PHADD 1
    LOAD_DIFF_8x4P      xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
%if %1
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
%endif
    HADAMARD4_1D        xmm0, xmm1, xmm2, xmm3
    HADAMARD4_ROW_PHADD xmm0, xmm1, xmm2, xmm3, xmm4
    ; row transform leaves its results in xmm0, xmm3, xmm4, xmm2;
    ; xmm1 and xmm5 are passed to ABS4 as the remaining two operands
    ABS4                xmm0, xmm3, xmm4, xmm2, xmm1, xmm5
    paddusw xmm0, xmm3
    paddusw xmm2, xmm4
    paddusw xmm6, xmm0      ; fold both partial sums into the accumulator
    paddusw xmm6, xmm2
%endmacro
%macro SATD_START_MMX 0
lea
r4
,
[
3
*
r1
]
; 3*stride1
lea
r5
,
[
3
*
r3
]
; 3*stride2
...
...
@@ -1279,6 +1298,8 @@ SA8D_16x16_32 ssse3
INTRA_SA8D_SSE2
ss
se3
INTRA_SATDS_MMX
ss
se3
SATD_W4
ss
se3
; mmx, but uses pabsw from ssse3.
%define SATD_8x4_SSE2 SATD_8x4_PHADD
SATDS_SSE2
ss
se3_phadd
...
...
common/x86/pixel.h
View file @
f9ad5ee2
...
...
@@ -51,6 +51,7 @@ DECL_X1( ssd, sse2 )
DECL_X1
(
satd
,
mmxext
)
DECL_X1
(
satd
,
sse2
)
DECL_X1
(
satd
,
ssse3
)
DECL_X1
(
satd
,
ssse3_phadd
)
DECL_X1
(
sa8d
,
mmxext
)
DECL_X1
(
sa8d
,
sse2
)
DECL_X1
(
sa8d
,
ssse3
)
...
...
tools/checkasm.c
View file @
f9ad5ee2
...
...
@@ -120,6 +120,7 @@ static void print_bench(void)
for
(
k
=
0
;
k
<
j
&&
benchs
[
i
].
vers
[
k
].
pointer
!=
b
->
pointer
;
k
++
);
if
(
k
<
j
)
continue
;
printf
(
"%s_%s%s: %"
PRId64
"
\n
"
,
benchs
[
i
].
name
,
b
->
cpu
&
X264_CPU_SSE4
?
"sse4"
:
b
->
cpu
&
X264_CPU_SSSE3
?
"ssse3"
:
b
->
cpu
&
X264_CPU_SSE3
?
"sse3"
:
b
->
cpu
&
X264_CPU_SSE2
?
"sse2"
:
...
...
@@ -1142,6 +1143,11 @@ int check_all_flags( void )
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_SSSE3
,
"SSSE3"
);
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_CACHELINE_SPLIT
|
X264_CPU_CACHELINE_64
,
"SSSE3 Cache64"
);
}
if
(
x264_cpu_detect
()
&
X264_CPU_SSSE3
)
{
cpu1
&=
~
(
X264_CPU_CACHELINE_SPLIT
|
X264_CPU_CACHELINE_64
);
ret
|=
add_flags
(
&
cpu0
,
&
cpu1
,
X264_CPU_SSE4
,
"SSE4"
);
}
#elif ARCH_PPC
if
(
x264_cpu_detect
()
&
X264_CPU_ALTIVEC
)
{
...
...
x264.h
View file @
f9ad5ee2
...
...
@@ -58,6 +58,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_CACHELINE_SPLIT 0x200
/* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_32 0x0400
/* size of a cacheline in bytes */
#define X264_CPU_CACHELINE_64 0x0800
#define X264_CPU_SSE4 0x001000
/* sse 4.1 */
/* Analyse flags
*/
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment