Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
X
x264
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
8
Issues
8
List
Boards
Labels
Service Desk
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
VideoLAN
x264
Commits
b8670681
Commit
b8670681
authored
Jun 06, 2008
by
Loren Merritt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
benchmark most of the asm functions (checkasm --bench).
parent
c24df7da
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
340 additions
and
82 deletions
+340
-82
Makefile
Makefile
+2
-1
tools/checkasm-a.asm
tools/checkasm-a.asm
+15
-0
tools/checkasm.c
tools/checkasm.c
+323
-81
No files found.
Makefile
View file @
b8670681
...
@@ -32,7 +32,7 @@ ASMSRC = $(X86SRC) common/x86/pixel-32.asm
...
@@ -32,7 +32,7 @@ ASMSRC = $(X86SRC) common/x86/pixel-32.asm
OBJASM
=
$(ASMSRC:%.asm=%.o)
OBJASM
=
$(ASMSRC:%.asm=%.o)
ASFLAGS
+=
-Icommon
/x86/
ASFLAGS
+=
-Icommon
/x86/
$(OBJASM)
:
common/x86/x86inc.asm common/x86/x86inc-32.asm
$(OBJASM)
:
common/x86/x86inc.asm common/x86/x86inc-32.asm
checkasm
:
tools/checkasm-
32
.o
checkasm
:
tools/checkasm-
a
.o
endif
endif
ifeq
($(ARCH),X86_64)
ifeq
($(ARCH),X86_64)
...
@@ -41,6 +41,7 @@ ASMSRC = $(X86SRC:-32.asm=-64.asm)
...
@@ -41,6 +41,7 @@ ASMSRC = $(X86SRC:-32.asm=-64.asm)
OBJASM
=
$(ASMSRC:%.asm=%.o)
OBJASM
=
$(ASMSRC:%.asm=%.o)
ASFLAGS
+=
-Icommon
/x86/
-DARCH_X86_64
ASFLAGS
+=
-Icommon
/x86/
-DARCH_X86_64
$(OBJASM)
:
common/x86/x86inc.asm common/x86/x86inc-64.asm
$(OBJASM)
:
common/x86/x86inc.asm common/x86/x86inc-64.asm
checkasm
:
tools/checkasm-a.o
endif
endif
endif
endif
...
...
tools/checkasm-
32
.asm
→
tools/checkasm-
a
.asm
View file @
b8670681
...
@@ -37,6 +37,7 @@ cextern printf
...
@@ -37,6 +37,7 @@ cextern printf
%define n5 dword 0xb78d0d1d
%define n5 dword 0xb78d0d1d
%define n6 dword 0x33627ba7
%define n6 dword 0x33627ba7
%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
; long x264_checkasm_call( long (*func)(), int *ok, ... )
; long x264_checkasm_call( long (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
;-----------------------------------------------------------------------------
...
@@ -71,3 +72,17 @@ cglobal x264_checkasm_call, 1,7
...
@@ -71,3 +72,17 @@ cglobal x264_checkasm_call, 1,7
mov
eax
,
r3
mov
eax
,
r3
.ok:
.ok:
RET
RET
%endif
; ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_stack_pagealign( int (*func)(), int align )
;-----------------------------------------------------------------------------
cglobal
x264_stack_pagealign
,
2
,
2
push
rbp
mov
rbp
,
rsp
and
rsp
,
~
0xfff
sub
rsp
,
r1
call
r0
leave
RET
tools/checkasm.c
View file @
b8670681
#include <ctype.h>
#include <stdlib.h>
#include <stdlib.h>
#include <limits.h>
#include <math.h>
#include <math.h>
#include "common/common.h"
#include "common/common.h"
...
@@ -9,22 +11,177 @@ uint8_t * buf1, * buf2;
...
@@ -9,22 +11,177 @@ uint8_t * buf1, * buf2;
/* buf3, buf4: used to store output */
/* buf3, buf4: used to store output */
uint8_t
*
buf3
,
*
buf4
;
uint8_t
*
buf3
,
*
buf4
;
int
quiet
=
0
;
#define report( name ) { \
#define report( name ) { \
if( used_asm ) \
if( used_asm
&& !quiet
) \
fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
if( !ok ) ret = -1; \
if( !ok ) ret = -1; \
}
}
#define BENCH_RUNS 100 // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 10 // number of different combinations of cpu flags
/* Timing record for one implementation (one cpu-flag variant) of a function. */
typedef struct
{
    void *pointer;   /* address of the benchmarked function; just for detecting duplicates */
    uint32_t cpu;    /* cpu flags this record was registered under */
    uint32_t cycles; /* running sum of the accepted read_time() deltas */
    uint32_t den;    /* number of accepted samples added into cycles */
} bench_t;
/* Every recorded variant of one named function. */
typedef struct
{
    char *name;             /* lookup key; a NULL name marks an unused table entry */
    bench_t vers[MAX_CPUS]; /* slot 0 is handed out for cpu==0, later slots are claimed per cpu value */
} bench_func_t;
/* Non-zero turns on the timing code inside call_bench. */
int do_bench = 0;

/* Name of the function currently being tested; filled in by set_func_name(). */
char func_name[100];

/* Benchmark table. Static storage, so it starts zeroed; entries are used
 * front to back and the first entry with a NULL name ends the used part. */
static bench_func_t benchs[MAX_FUNCS];
/* Printable suffixes used when building benchmark names with set_func_name().
 * Each table is indexed by the same index as the function-pointer array it labels. */

/* Block-size labels. */
static const char *pixel_names[10] =
{
    "16x16", "16x8", "8x16", "8x8", "8x4",
    "4x8",   "4x4",  "4x2",  "2x4", "2x2"
};

/* 16x16 intra prediction mode labels. */
static const char *intra_predict_16x16_names[7] =
{
    "v", "h", "dc", "p", "dcl", "dct", "dc8"
};

/* 8x8 chroma intra prediction mode labels (note: "dc" comes first here). */
static const char *intra_predict_8x8c_names[7] =
{
    "dc", "h", "v", "p", "dcl", "dct", "dc8"
};

/* 4x4 intra prediction mode labels. */
static const char *intra_predict_4x4_names[12] =
{
    "v",  "h",  "dc", "ddl", "ddr", "vr",
    "hd", "vl", "hu", "dcl", "dct", "dc8"
};

/* 8x8 intra prediction shares the 4x4 label table. */
static const char **intra_predict_8x8_names = intra_predict_4x4_names;
#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
/* Return a 32-bit timestamp for benchmarking.
 * When built with GCC for ARCH_X86 or ARCH_X86_64 this executes rdtsc and
 * returns the value left in eax (edx is declared clobbered and ignored).
 * On every other build it returns 0. */
static inline uint32_t read_time(void)
{
    uint32_t stamp = 0;
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
    asm volatile( "rdtsc" : "=a"(stamp) :: "edx" );
#endif
    return stamp;
}
/* Find (or create) the benchmark record for function `name` and cpu flags `cpu`.
 *
 * name: function name; it is copied with strdup() the first time it is seen.
 * cpu:  cpu flags of the variant; 0 selects the dedicated slot vers[0].
 *
 * Returns a pointer into the static benchs[] table.
 * Asserts if the table has no room left for a new name (MAX_FUNCS) or a new
 * cpu value (MAX_CPUS).
 *
 * The bounds are tested in the loop conditions, before the element is read,
 * so a full table is never indexed one past its end. */
static bench_t* get_bench( const char *name, int cpu )
{
    int i, j;

    /* stop at the first unused entry or at the entry that already has this name */
    for( i = 0; i < MAX_FUNCS && benchs[i].name && strcmp( name, benchs[i].name ); i++ )
        ;
    assert( i < MAX_FUNCS );
    if( !benchs[i].name )
        benchs[i].name = strdup( name );

    if( !cpu )
        return &benchs[i].vers[0];

    /* slots from 1 up are claimed in order; a zero cpu field marks a free slot */
    for( j = 1; j < MAX_CPUS && benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ )
        ;
    assert( j < MAX_CPUS );
    benchs[i].vers[j].cpu = cpu;
    return &benchs[i].vers[j];
}
/* qsort() comparator for uint16_t samples, ascending.
 * Returns the difference of the two values: negative, zero or positive.
 * Both operands are promoted to int before the subtraction, so the result is
 * exact wherever int is wider than 16 bits. */
int cmp_nop( const void *a, const void *b )
{
    /* keep the const qualifier: the comparator only reads its arguments */
    const uint16_t *lhs = a;
    const uint16_t *rhs = b;
    return *lhs - *rhs;
}
int
cmp_bench
(
const
void
*
a
,
const
void
*
b
)
{
// asciibetical sort except preserving numbers
const
char
*
sa
=
((
bench_func_t
*
)
a
)
->
name
;
const
char
*
sb
=
((
bench_func_t
*
)
b
)
->
name
;
for
(;;
sa
++
,
sb
++
)
{
if
(
!*
sa
&&
!*
sb
)
return
0
;
if
(
isdigit
(
*
sa
)
&&
isdigit
(
*
sb
)
&&
isdigit
(
sa
[
1
])
!=
isdigit
(
sb
[
1
])
)
return
isdigit
(
sa
[
1
])
-
isdigit
(
sb
[
1
]);
if
(
*
sa
!=
*
sb
)
return
*
sa
-
*
sb
;
}
}
static
void
print_bench
(
void
)
{
uint16_t
nops
[
10000
]
=
{
0
};
int
i
,
j
,
k
,
nfuncs
,
nop_time
=
0
;
for
(
i
=
0
;
i
<
10000
;
i
++
)
{
int
t
=
read_time
();
nops
[
i
]
=
read_time
()
-
t
;
}
qsort
(
nops
,
10000
,
sizeof
(
uint16_t
),
cmp_nop
);
for
(
i
=
500
;
i
<
9500
;
i
++
)
nop_time
+=
nops
[
i
];
nop_time
/=
900
;
printf
(
"nop: %d
\n
"
,
nop_time
);
for
(
i
=
0
;
i
<
MAX_FUNCS
&&
benchs
[
i
].
name
;
i
++
);
nfuncs
=
i
;
qsort
(
benchs
,
nfuncs
,
sizeof
(
bench_func_t
),
cmp_bench
);
for
(
i
=
0
;
i
<
nfuncs
;
i
++
)
for
(
j
=
0
;
j
<
MAX_CPUS
&&
(
!
j
||
benchs
[
i
].
vers
[
j
].
cpu
);
j
++
)
{
bench_t
*
b
=
&
benchs
[
i
].
vers
[
j
];
if
(
!
b
->
den
)
continue
;
for
(
k
=
0
;
k
<
j
&&
benchs
[
i
].
vers
[
k
].
pointer
!=
b
->
pointer
;
k
++
);
if
(
k
<
j
)
continue
;
printf
(
"%s_%s%s: %"
PRId64
"
\n
"
,
benchs
[
i
].
name
,
b
->
cpu
&
X264_CPU_SSSE3
?
"ssse3"
:
b
->
cpu
&
X264_CPU_SSE3
?
"sse3"
:
b
->
cpu
&
X264_CPU_SSE2
?
"sse2"
:
b
->
cpu
&
X264_CPU_MMX
?
"mmx"
:
"c"
,
b
->
cpu
&
X264_CPU_CACHELINE_32
?
"_c32"
:
b
->
cpu
&
X264_CPU_CACHELINE_64
?
"_c64"
:
""
,
((
int64_t
)
10
*
b
->
cycles
/
b
->
den
-
nop_time
)
/
4
);
}
}
#if defined(ARCH_X86) || defined(ARCH_X86_64)
int
x264_stack_pagealign
(
int
(
*
func
)(),
int
align
);
#else
#define x264_stack_pagealign( func, align ) func()
#endif
#define call_c1(func,...) func(__VA_ARGS__)
#ifdef ARCH_X86
/* detect when callee-saved regs aren't saved.
/* detect when callee-saved regs aren't saved.
* needs an explicit asm check because it only sometimes crashes in normal use. */
* needs an explicit asm check because it only sometimes crashes in normal use. */
#define call_c(func,...) func(__VA_ARGS__)
#ifdef ARCH_X86
long
x264_checkasm_call
(
long
(
*
func
)(),
int
*
ok
,
...
);
long
x264_checkasm_call
(
long
(
*
func
)(),
int
*
ok
,
...
);
#define call_a(func,...) x264_checkasm_call((long(*)())func, &ok, __VA_ARGS__)
#define call_a
1
(func,...) x264_checkasm_call((long(*)())func, &ok, __VA_ARGS__)
#else
#else
#define call_a
call_c
#define call_a
1 call_c1
#endif
#endif
/* call_bench( func, cpu, args... ): time func(args...) when do_bench is set.
 *
 * - func is first called once through call_a1 before any timing starts.
 * - Then BENCH_RUNS samples are taken when cpu is non-zero, BENCH_RUNS/4 when
 *   it is zero. Each sample is the read_time() delta around four
 *   back-to-back calls of func.
 * - A sample is accepted only if it is not the first one (ti > 0) and is at
 *   most four times the running mean of the samples accepted so far
 *   (t*tcount <= tsum*4).
 * - The accepted total and count are added to the bench_t returned by
 *   get_bench( func_name, cpu ), so set_func_name() must have been called
 *   beforehand; the function pointer is stored there as well.
 *
 * The expansion is a bare if-block that declares a variable after statements;
 * the wrappers below always place it inside a ({ ... }) statement expression. */
#define call_bench(func,cpu,...)\
if(do_bench)\
{\
uint32_t tsum = 0;\
int tcount = 0;\
int ti;\
call_a1(func, __VA_ARGS__);\
for( ti=0; ti<(cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
{\
uint32_t t = read_time();\
func(__VA_ARGS__);\
func(__VA_ARGS__);\
func(__VA_ARGS__);\
func(__VA_ARGS__);\
t = read_time() - t;\
if( t*tcount <= tsum*4 && ti > 0 )\
{\
tsum += t;\
tcount++;\
}\
}\
bench_t *b = get_bench( func_name, cpu );\
b->cycles += tsum;\
b->den += tcount;\
b->pointer = func;\
}
/* for most functions, run benchmark and correctness test at the same time.
 * for those that modify their inputs, run the above macros separately.
 *
 * call_a / call_c: benchmark first (call_a2 / call_c2), then make the checked
 * call (call_a1 / call_c1); the statement expression yields that last call's value.
 * call_a2 records under cpu_new, which must be in scope at the call site;
 * call_c2 records under cpu 0. */
#define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); })
#define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
#define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
#define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })
static
int
check_pixel
(
int
cpu_ref
,
int
cpu_new
)
static
int
check_pixel
(
int
cpu_ref
,
int
cpu_new
)
{
{
x264_pixel_function_t
pixel_c
;
x264_pixel_function_t
pixel_c
;
...
@@ -54,11 +211,12 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -54,11 +211,12 @@ static int check_pixel( int cpu_ref, int cpu_new )
int res_c, res_asm; \
int res_c, res_asm; \
if( pixel_asm.name[i] != pixel_ref.name[i] ) \
if( pixel_asm.name[i] != pixel_ref.name[i] ) \
{ \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] ); \
for( j=0; j<64; j++ ) \
for( j=0; j<64; j++ ) \
{ \
{ \
used_asm = 1; \
used_asm = 1; \
res_c = call_c( pixel_c.name[i], buf1,
32, buf2+j*!align, 16
); \
res_c = call_c( pixel_c.name[i], buf1,
16, buf2+j*!align, 64
); \
res_asm = call_a( pixel_asm.name[i], buf1,
32, buf2+j*!align, 16
); \
res_asm = call_a( pixel_asm.name[i], buf1,
16, buf2+j*!align, 64
); \
if( res_c != res_asm ) \
if( res_c != res_asm ) \
{ \
{ \
ok = 0; \
ok = 0; \
...
@@ -81,20 +239,21 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -81,20 +239,21 @@ static int check_pixel( int cpu_ref, int cpu_new )
int res_c[4]={0}, res_asm[4]={0}; \
int res_c[4]={0}, res_asm[4]={0}; \
if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
{ \
{ \
set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
for( j=0; j<64; j++) \
for( j=0; j<64; j++) \
{ \
{ \
uint8_t *pix2 = buf2+j; \
uint8_t *pix2 = buf2+j; \
used_asm = 1; \
used_asm = 1; \
res_c[0] = pixel_c.sad[i]( buf1, 16, pix2,
32
); \
res_c[0] = pixel_c.sad[i]( buf1, 16, pix2,
64
); \
res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+
30, 32
); \
res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+
6, 64
); \
res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1,
32
); \
res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1,
64
); \
if(N==4) \
if(N==4) \
{ \
{ \
res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+
99, 32
); \
res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+
10, 64
); \
call_a( pixel_asm.sad_x4[i], buf1, pix2, pix2+
30, pix2+1, pix2+99, 32
, res_asm ); \
call_a( pixel_asm.sad_x4[i], buf1, pix2, pix2+
6, pix2+1, pix2+10, 64
, res_asm ); \
} \
} \
else \
else \
call_a( pixel_asm.sad_x3[i], buf1, pix2, pix2+
30, pix2+1, 32
, res_asm ); \
call_a( pixel_asm.sad_x3[i], buf1, pix2, pix2+
6, pix2+1, 64
, res_asm ); \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
{ \
{ \
ok = 0; \
ok = 0; \
...
@@ -102,6 +261,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -102,6 +261,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
i, res_c[0], res_c[1], res_c[2], res_c[3], \
i, res_c[0], res_c[1], res_c[2], res_c[3], \
res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
} \
} \
if(N==4) \
call_c2( pixel_c.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
else \
call_c2( pixel_c.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
} \
} \
} \
} \
} \
} \
...
@@ -114,6 +277,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -114,6 +277,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
{ \
int res_c[3], res_asm[3]; \
int res_c[3], res_asm[3]; \
set_func_name( #name );\
used_asm = 1; \
used_asm = 1; \
memcpy( buf3, buf2, 1024 ); \
memcpy( buf3, buf2, 1024 ); \
for( i=0; i<3; i++ ) \
for( i=0; i<3; i++ ) \
...
@@ -142,7 +306,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -142,7 +306,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
pixel_asm
.
ssim_end4
!=
pixel_ref
.
ssim_end4
)
pixel_asm
.
ssim_end4
!=
pixel_ref
.
ssim_end4
)
{
{
float
res_c
,
res_a
;
float
res_c
,
res_a
;
ok
=
1
;
int
sums
[
5
][
4
]
=
{{
0
}};
used_asm
=
ok
=
1
;
x264_emms
();
x264_emms
();
res_c
=
x264_pixel_ssim_wxh
(
&
pixel_c
,
buf1
+
2
,
32
,
buf2
+
2
,
32
,
32
,
28
);
res_c
=
x264_pixel_ssim_wxh
(
&
pixel_c
,
buf1
+
2
,
32
,
buf2
+
2
,
32
,
32
,
28
);
res_a
=
x264_pixel_ssim_wxh
(
&
pixel_asm
,
buf1
+
2
,
32
,
buf2
+
2
,
32
,
32
,
28
);
res_a
=
x264_pixel_ssim_wxh
(
&
pixel_asm
,
buf1
+
2
,
32
,
buf2
+
2
,
32
,
32
,
28
);
...
@@ -151,6 +316,12 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -151,6 +316,12 @@ static int check_pixel( int cpu_ref, int cpu_new )
ok
=
0
;
ok
=
0
;
fprintf
(
stderr
,
"ssim: %.7f != %.7f [FAILED]
\n
"
,
res_c
,
res_a
);
fprintf
(
stderr
,
"ssim: %.7f != %.7f [FAILED]
\n
"
,
res_c
,
res_a
);
}
}
set_func_name
(
"ssim_core"
);
call_c2
(
pixel_c
.
ssim_4x4x2_core
,
buf1
+
2
,
32
,
buf2
+
2
,
32
,
sums
);
call_a2
(
pixel_asm
.
ssim_4x4x2_core
,
buf1
+
2
,
32
,
buf2
+
2
,
32
,
sums
);
set_func_name
(
"ssim_end"
);
call_c2
(
pixel_c
.
ssim_end4
,
sums
,
sums
,
4
);
call_a2
(
pixel_asm
.
ssim_end4
,
sums
,
sums
,
4
);
report
(
"ssim :"
);
report
(
"ssim :"
);
}
}
...
@@ -165,6 +336,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
...
@@ -165,6 +336,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
int16_t
mvs_a
[
32
],
mvs_c
[
32
];
int16_t
mvs_a
[
32
],
mvs_c
[
32
];
int
mvn_a
,
mvn_c
;
int
mvn_a
,
mvn_c
;
int
thresh
=
rand
()
&
0x3fff
;
int
thresh
=
rand
()
&
0x3fff
;
set_func_name
(
"esa_ads"
);
for
(
j
=
0
;
j
<
72
;
j
++
)
for
(
j
=
0
;
j
<
72
;
j
++
)
sums
[
j
]
=
rand
()
&
0x3fff
;
sums
[
j
]
=
rand
()
&
0x3fff
;
for
(
j
=
0
;
j
<
4
;
j
++
)
for
(
j
=
0
;
j
<
4
;
j
++
)
...
@@ -195,7 +367,7 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -195,7 +367,7 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_dct_function_t
dct_ref
;
x264_dct_function_t
dct_ref
;
x264_dct_function_t
dct_asm
;
x264_dct_function_t
dct_asm
;
x264_quant_function_t
qf
;
x264_quant_function_t
qf
;
int
ret
=
0
,
ok
,
used_asm
,
i
;
int
ret
=
0
,
ok
,
used_asm
,
i
,
interlace
;
DECLARE_ALIGNED_16
(
int16_t
dct1
[
16
][
4
][
4
]
);
DECLARE_ALIGNED_16
(
int16_t
dct1
[
16
][
4
][
4
]
);
DECLARE_ALIGNED_16
(
int16_t
dct2
[
16
][
4
][
4
]
);
DECLARE_ALIGNED_16
(
int16_t
dct2
[
16
][
4
][
4
]
);
DECLARE_ALIGNED_16
(
int16_t
dct4
[
16
][
4
][
4
]
);
DECLARE_ALIGNED_16
(
int16_t
dct4
[
16
][
4
][
4
]
);
...
@@ -221,6 +393,7 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -221,6 +393,7 @@ static int check_dct( int cpu_ref, int cpu_new )
#define TEST_DCT( name, t1, t2, size ) \
#define TEST_DCT( name, t1, t2, size ) \
if( dct_asm.name != dct_ref.name ) \
if( dct_asm.name != dct_ref.name ) \
{ \
{ \
set_func_name( #name );\
used_asm = 1; \
used_asm = 1; \
call_c( dct_c.name, t1, buf1, buf2 ); \
call_c( dct_c.name, t1, buf1, buf2 ); \
call_a( dct_asm.name, t2, buf1, buf2 ); \
call_a( dct_asm.name, t2, buf1, buf2 ); \
...
@@ -260,18 +433,21 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -260,18 +433,21 @@ static int check_dct( int cpu_ref, int cpu_new )
#define TEST_IDCT( name, src ) \
#define TEST_IDCT( name, src ) \
if( dct_asm.name != dct_ref.name ) \
if( dct_asm.name != dct_ref.name ) \
{ \
{ \
set_func_name( #name );\
used_asm = 1; \
used_asm = 1; \
memcpy( buf3, buf1, 32*32 ); \
memcpy( buf3, buf1, 32*32 ); \
memcpy( buf4, buf1, 32*32 ); \
memcpy( buf4, buf1, 32*32 ); \
memcpy( dct1, src, 512 ); \
memcpy( dct1, src, 512 ); \
memcpy( dct2, src, 512 ); \
memcpy( dct2, src, 512 ); \
call_c( dct_c.name, buf3, (void*)dct1 ); \
call_c
1
( dct_c.name, buf3, (void*)dct1 ); \
call_a( dct_asm.name, buf4, (void*)dct2 ); \
call_a
1
( dct_asm.name, buf4, (void*)dct2 ); \
if( memcmp( buf3, buf4, 32*32 ) ) \
if( memcmp( buf3, buf4, 32*32 ) ) \
{ \
{ \
ok = 0; \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
fprintf( stderr, #name " [FAILED]\n" ); \
} \
} \
call_c2( dct_c.name, buf3, (void*)dct1 ); \
call_a2( dct_asm.name, buf4, (void*)dct2 ); \
}
}
ok
=
1
;
used_asm
=
0
;
ok
=
1
;
used_asm
=
0
;
TEST_IDCT
(
add4x4_idct
,
dct4
);
TEST_IDCT
(
add4x4_idct
,
dct4
);
...
@@ -290,27 +466,33 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -290,27 +466,33 @@ static int check_dct( int cpu_ref, int cpu_new )
{
{
DECLARE_ALIGNED_16
(
int16_t
dct1
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
DECLARE_ALIGNED_16
(
int16_t
dct1
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
set_func_name
(
"dct4x4dc"
);
used_asm
=
1
;
used_asm
=
1
;
call_c
(
dct_c
.
dct4x4dc
,
dct1
);
call_c
1
(
dct_c
.
dct4x4dc
,
dct1
);
call_a
(
dct_asm
.
dct4x4dc
,
dct2
);
call_a
1
(
dct_asm
.
dct4x4dc
,
dct2
);
if
(
memcmp
(
dct1
,
dct2
,
32
)
)
if
(
memcmp
(
dct1
,
dct2
,
32
)
)
{
{
ok
=
0
;
ok
=
0
;
fprintf
(
stderr
,
" - dct4x4dc : [FAILED]
\n
"
);
fprintf
(
stderr
,
" - dct4x4dc : [FAILED]
\n
"
);
}
}
call_c2
(
dct_c
.
dct4x4dc
,
dct1
);
call_a2
(
dct_asm
.
dct4x4dc
,
dct2
);
}
}
if
(
dct_asm
.
idct4x4dc
!=
dct_ref
.
idct4x4dc
)
if
(
dct_asm
.
idct4x4dc
!=
dct_ref
.
idct4x4dc
)
{
{
DECLARE_ALIGNED_16
(
int16_t
dct1
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
DECLARE_ALIGNED_16
(
int16_t
dct1
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
4
][
4
]
)
=
{{
-
12
,
42
,
23
,
67
},{
2
,
90
,
89
,
56
},{
67
,
43
,
-
76
,
91
},{
56
,
-
78
,
-
54
,
1
}};
set_func_name
(
"idct4x4dc"
);
used_asm
=
1
;
used_asm
=
1
;
call_c
(
dct_c
.
idct4x4dc
,
dct1
);
call_c
1
(
dct_c
.
idct4x4dc
,
dct1
);
call_a
(
dct_asm
.
idct4x4dc
,
dct2
);
call_a
1
(
dct_asm
.
idct4x4dc
,
dct2
);
if
(
memcmp
(
dct1
,
dct2
,
32
)
)
if
(
memcmp
(
dct1
,
dct2
,
32
)
)
{
{
ok
=
0
;
ok
=
0
;
fprintf
(
stderr
,
" - idct4x4dc : [FAILED]
\n
"
);
fprintf
(
stderr
,
" - idct4x4dc : [FAILED]
\n
"
);
}
}
call_c2
(
dct_c
.
idct4x4dc
,
dct1
);
call_a2
(
dct_asm
.
idct4x4dc
,
dct2
);
}
}
report
(
"(i)dct4x4dc :"
);
report
(
"(i)dct4x4dc :"
);
...
@@ -319,6 +501,7 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -319,6 +501,7 @@ static int check_dct( int cpu_ref, int cpu_new )
{
{
DECLARE_ALIGNED_16
(
int16_t
dct1
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
DECLARE_ALIGNED_16
(
int16_t
dct1
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
set_func_name
(
"dct2x2dc"
);
used_asm
=
1
;
used_asm
=
1
;
call_c
(
dct_c
.
dct2x2dc
,
dct1
);
call_c
(
dct_c
.
dct2x2dc
,
dct1
);
call_a
(
dct_asm
.
dct2x2dc
,
dct2
);
call_a
(
dct_asm
.
dct2x2dc
,
dct2
);
...
@@ -332,6 +515,7 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -332,6 +515,7 @@ static int check_dct( int cpu_ref, int cpu_new )
{
{
DECLARE_ALIGNED_16
(
int16_t
dct1
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
DECLARE_ALIGNED_16
(
int16_t
dct1
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
DECLARE_ALIGNED_16
(
int16_t
dct2
[
2
][
2
]
)
=
{{
-
12
,
42
},{
2
,
90
}};
set_func_name
(
"idct2x2dc"
);
used_asm
=
1
;
used_asm
=
1
;
call_c
(
dct_c
.
idct2x2dc
,
dct1
);
call_c
(
dct_c
.
idct2x2dc
,
dct1
);
call_a
(
dct_asm
.
idct2x2dc
,
dct2
);
call_a
(
dct_asm
.
idct2x2dc
,
dct2
);
...
@@ -353,6 +537,7 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -353,6 +537,7 @@ static int check_dct( int cpu_ref, int cpu_new )
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
{ \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
used_asm = 1; \
call_c( zigzag_c.name, t1, dct ); \
call_c( zigzag_c.name, t1, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
...
@@ -366,18 +551,22 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -366,18 +551,22 @@ static int check_dct( int cpu_ref, int cpu_new )
#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
{ \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
used_asm = 1; \
memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
call_c( zigzag_c.name, t1, buf2, buf3 ); \
call_c
1
( zigzag_c.name, t1, buf2, buf3 ); \
call_a
( zigzag_asm.name, t2, buf2, buf4 );
\
call_a
1( zigzag_asm.name, t2, buf2, buf4 );
\
if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
{ \
{ \
ok = 0; \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
fprintf( stderr, #name " [FAILED]\n" ); \
} \
} \
call_c2( zigzag_c.name, t1, buf2, buf3 ); \
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
}
}
interlace
=
0
;
x264_zigzag_init
(
0
,
&
zigzag_c
,
0
);
x264_zigzag_init
(
0
,
&
zigzag_c
,
0
);
x264_zigzag_init
(
cpu_ref
,
&
zigzag_ref
,
0
);
x264_zigzag_init
(
cpu_ref
,
&
zigzag_ref
,
0
);
x264_zigzag_init
(
cpu_new
,
&
zigzag_asm
,
0
);
x264_zigzag_init
(
cpu_new
,
&
zigzag_asm
,
0
);
...
@@ -388,6 +577,7 @@ static int check_dct( int cpu_ref, int cpu_new )
...
@@ -388,6 +577,7 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SUB
(
sub_4x4
,
level1
,
level2
,
16
);
TEST_ZIGZAG_SUB
(
sub_4x4
,
level1
,
level2
,
16
);
report
(
"zigzag_frame :"
);
report
(
"zigzag_frame :"
);
interlace
=
1
;
x264_zigzag_init
(
0
,
&
zigzag_c
,
1
);
x264_zigzag_init
(
0
,
&
zigzag_c
,
1
);
x264_zigzag_init
(
cpu_ref
,
&
zigzag_ref
,
1
);
x264_zigzag_init
(
cpu_ref
,
&
zigzag_ref
,
1
);
x264_zigzag_init
(
cpu_new
,
&
zigzag_asm
,
1
);
x264_zigzag_init
(
cpu_new
,
&
zigzag_asm
,
1
);
...
@@ -411,10 +601,10 @@ static int check_mc( int cpu_ref, int cpu_new )
...
@@ -411,10 +601,10 @@ static int check_mc( int cpu_ref, int cpu_new )
x264_pixel_function_t
pixel
;
x264_pixel_function_t
pixel
;
uint8_t
*
src
=
&
buf1
[
2
*
32
+
2
];
uint8_t
*
src
=
&
buf1
[
2
*
32
+
2
];
uint8_t
*
src2
[
4
]
=
{
&
buf1
[
2
*
32
+
2
],
&
buf1
[
6
*
32
+
2
],
uint8_t
*
src2
[
4
]
=
{
&
buf1
[
3
*
64
+
2
],
&
buf1
[
5
*
64
+
2
],
&
buf1
[
10
*
32
+
2
],
&
buf1
[
14
*
32
+
2
]
};
&
buf1
[
7
*
64
+
2
],
&
buf1
[
9
*
64
+
2
]
};
uint8_t
*
dst1
=
&
buf3
[
2
*
32
]
;
uint8_t
*
dst1
=
buf3
;
uint8_t
*
dst2
=
&
buf4
[
2
*
32
]
;
uint8_t
*
dst2
=
buf4
;
int
dx
,
dy
,
i
,
j
,
k
,
w
;
int
dx
,
dy
,
i
,
j
,
k
,
w
;
int
ret
=
0
,
ok
,
used_asm
;
int
ret
=
0
,
ok
,
used_asm
;
...
@@ -427,11 +617,12 @@ static int check_mc( int cpu_ref, int cpu_new )
...
@@ -427,11 +617,12 @@ static int check_mc( int cpu_ref, int cpu_new )
#define MC_TEST_LUMA( w, h ) \
#define MC_TEST_LUMA( w, h ) \
if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
{ \
{ \
set_func_name( "mc_luma_%dx%d", w, h );\
used_asm = 1; \
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
call_c( mc_c.mc_luma, dst1, 32, src2,
16
, dx, dy, w, h ); \
call_c( mc_c.mc_luma, dst1, 32, src2,
64
, dx, dy, w, h ); \
call_a( mc_a.mc_luma, dst2, 32, src2,
16
, dx, dy, w, h ); \
call_a( mc_a.mc_luma, dst2, 32, src2,
64
, dx, dy, w, h ); \
if( memcmp( buf3, buf4, 1024 ) ) \
if( memcmp( buf3, buf4, 1024 ) ) \
{ \
{ \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
...
@@ -442,11 +633,12 @@ static int check_mc( int cpu_ref, int cpu_new )
...
@@ -442,11 +633,12 @@ static int check_mc( int cpu_ref, int cpu_new )
{ \
{ \
uint8_t *ref = dst2; \
uint8_t *ref = dst2; \
int ref_stride = 32; \
int ref_stride = 32; \
set_func_name( "get_ref_%dx%d", w, h );\
used_asm = 1; \
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
call_c( mc_c.mc_luma, dst1, 32, src2,
16
, dx, dy, w, h ); \