Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Open sidebar
VideoLAN
x264
Commits
45e36790
Commit
45e36790
authored
Aug 16, 2008
by
David Pethes
Committed by
Fiona Glaser
Aug 16, 2008
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Add dedicated variance function instead of using SAD+SSD
Faster variance calculation
parent
25976441
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
170 additions
and
5 deletions
+170
-5
common/pixel.c
common/pixel.c
+34
-1
common/pixel.h
common/pixel.h
+2
-0
common/x86/pixel-a.asm
common/x86/pixel-a.asm
+106
-0
common/x86/pixel.h
common/x86/pixel.h
+5
-0
encoder/ratecontrol.c
encoder/ratecontrol.c
+2
-4
tools/checkasm.c
tools/checkasm.c
+21
-0
No files found.
common/pixel.c
View file @
45e36790
...
...
@@ -152,6 +152,33 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
}
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
/* PIXEL_VAR_C( name, w, shift ): emits a C reference routine `name` that
 * scans a w x w block of 8-bit pixels.
 *   pix      - first pixel of the block
 *   i_stride - added to pix after every row
 *   sad      - out: receives the plain sum of all pixels in the block
 * Returns sqr - ((sum * sum) >> shift), where sqr is the sum of the squared
 * pixels.  The instantiations below pass shift = log2(w*w), so the result is
 * (up to the truncating shift) w*w times the block variance.
 * Both accumulators are uint32_t; for a 16x16 block the largest possible
 * sum * sum is 65280 * 65280, which still fits in 32 bits.
 * Macro arguments are parenthesised so that expression arguments cannot be
 * re-associated by the surrounding operators. */
#define PIXEL_VAR_C( name, w, shift ) \
static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
{ \
    uint32_t sum = 0, sqr = 0; \
    int x, y; \
    for( y = 0; y < (w); y++ ) \
    { \
        for( x = 0; x < (w); x++ ) \
        { \
            sum += pix[x]; \
            sqr += pix[x] * pix[x]; \
        } \
        pix += i_stride; \
    } \
    *sad = sum; \
    return (int)( sqr - ((sum * sum) >> (shift)) ); \
}

PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
/****************************************************************************
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
...
...
@@ -532,6 +559,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4
(
sa8d
,
);
INIT_ADS
(
);
pixf
->
var
[
PIXEL_16x16
]
=
x264_pixel_var_16x16
;
pixf
->
var
[
PIXEL_8x8
]
=
x264_pixel_var_8x8
;
pixf
->
ssim_4x4x2_core
=
ssim_4x4x2_core
;
pixf
->
ssim_end4
=
ssim_end4
;
...
...
@@ -550,7 +580,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7
(
satd_x3
,
_mmxext
);
INIT7
(
satd_x4
,
_mmxext
);
INIT_ADS
(
_mmxext
);
pixf
->
var
[
PIXEL_16x16
]
=
x264_pixel_var_16x16_mmxext
;
pixf
->
var
[
PIXEL_8x8
]
=
x264_pixel_var_8x8_mmxext
;
#ifdef ARCH_X86
pixf
->
sa8d
[
PIXEL_16x16
]
=
x264_pixel_sa8d_16x16_mmxext
;
pixf
->
sa8d
[
PIXEL_8x8
]
=
x264_pixel_sa8d_8x8_mmxext
;
...
...
@@ -592,6 +623,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2
(
sad_x3
,
_sse2
);
INIT2
(
sad_x4
,
_sse2
);
INIT_ADS
(
_sse2
);
pixf
->
var
[
PIXEL_8x8
]
=
x264_pixel_var_8x8_sse2
;
#ifdef ARCH_X86
if
(
cpu
&
X264_CPU_CACHELINE_64
)
...
...
@@ -608,6 +640,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5
(
satd
,
_sse2
);
INIT5
(
satd_x3
,
_sse2
);
INIT5
(
satd_x4
,
_sse2
);
pixf
->
var
[
PIXEL_16x16
]
=
x264_pixel_var_16x16_sse2
;
pixf
->
ssim_4x4x2_core
=
x264_pixel_ssim_4x4x2_core_sse2
;
pixf
->
ssim_end4
=
x264_pixel_ssim_end4_sse2
;
pixf
->
sa8d
[
PIXEL_16x16
]
=
x264_pixel_sa8d_16x16_sse2
;
...
...
common/pixel.h
View file @
45e36790
...
...
@@ -26,6 +26,7 @@
typedef
int
(
*
x264_pixel_cmp_t
)
(
uint8_t
*
,
int
,
uint8_t
*
,
int
);
typedef
void
(
*
x264_pixel_cmp_x3_t
)
(
uint8_t
*
,
uint8_t
*
,
uint8_t
*
,
uint8_t
*
,
int
,
int
[
3
]
);
typedef
void
(
*
x264_pixel_cmp_x4_t
)
(
uint8_t
*
,
uint8_t
*
,
uint8_t
*
,
uint8_t
*
,
uint8_t
*
,
int
,
int
[
4
]
);
typedef
int
(
*
x264_pixel_var_t
)
(
uint8_t
*
,
int
,
uint32_t
*
);
enum
{
...
...
@@ -71,6 +72,7 @@ typedef struct
x264_pixel_cmp_t
fpelcmp
[
7
];
/* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t
fpelcmp_x3
[
7
];
x264_pixel_cmp_x4_t
fpelcmp_x4
[
7
];
x264_pixel_var_t
var
[
4
];
void
(
*
ssim_4x4x2_core
)(
const
uint8_t
*
pix1
,
int
stride1
,
const
uint8_t
*
pix2
,
int
stride2
,
int
sums
[
2
][
4
]
);
...
...
common/x86/pixel-a.asm
View file @
45e36790
...
...
@@ -162,6 +162,112 @@ SSD 8, 8, sse2
SSD
8
,
4
,
ss
e2
;=============================================================================
; variance
;=============================================================================
; VAR_START: shared prologue of the variance kernels.
; Clears the three registers the accumulation loops depend on:
;   m5 = running pixel sum, m6 = running sum of squares, m7 = all-zero operand.
; Also selects the loop counter alias t3d: r3d when ARCH_X86_64 is defined,
; r2d otherwise.
; NOTE(review): on the non-x86_64 path t3d shares r2 with the register VAR_END
; stores through; VAR_END's `movifnidn r2d, r2m` presumably restores the
; pointer argument there - confirm against the x86inc definitions.
%macro VAR_START 0
pxor
m5
,
m5
; sum
pxor
m6
,
m6
; sum squared
pxor
m7
,
m7
; zero
%ifdef ARCH_X86_64
%define t3d r3d
%else
%define t3d r2d
%endif
%endmacro
; VAR_END shift: shared epilogue of the variance kernels.
;   %1 - right shift applied to sum*sum (this file passes 8 for 16x16 and
;        6 for 8x8)
; Steps:
;   - when mmsize == 16, the upper half of m5 is moved down with movhlps and
;     added to the lower half;
;   - the low dword of m5 (the pixel sum) is stored to [r2] and copied to r1d;
;   - r1d is squared and shifted right by %1;
;   - m6 is reduced with HADDD (m1 is passed as its second operand) and its
;     low dword is moved to eax;
;   - eax -= r1d, i.e. the value left in eax is sqr - (sum * sum >> %1).
%macro VAR_END 1
%if mmsize == 16
movhlps
m0
,
m5
paddw
m5
,
m0
%endif
movifnidn
r2d
,
r2m
movd
r1d
,
m5
movd
[
r2
],
m5
; return sum
imul
r1d
,
r1d
HADDD
m6
,
m1
shr
r1d
,
%
1
movd
eax
,
m6
sub
eax
,
r1d
; sqr - (sum * sum >> shift)
RET
%endmacro
; VAR_2ROW offset, count: accumulation loop shared by the variance kernels.
;   %1 - offset of the second load relative to r0 (a constant, or r1)
;   %2 - number of loop iterations, loaded into t3d
; Every iteration:
;   - loads one register from [r0] and one from [r0+%1];
;   - adds the byte sum of each load (psadbw against the zero register m7)
;     to m5;
;   - unpacks each load into low/high words against m7, squares and pair-sums
;     them with pmaddwd, and adds the four results to m6;
;   - advances r0 to [r0+%1*2] when %1 is r1, otherwise adds r1 to r0.
; Expects m5, m6 and m7 to have been cleared beforehand (see VAR_START).
; NOTE(review): `dec t3d` is issued well before `jg .loop`; this relies on
; none of the instructions in between modifying the flags - confirm.
%macro VAR_2ROW 2
mov
t3d
,
%
2
.loop:
mova
m0
,
[
r0
]
mova
m1
,
m0
mova
m3
,
[
r0
+%
1
]
mova
m2
,
m0
punpcklbw
m0
,
m7
mova
m4
,
m3
punpckhbw
m1
,
m7
%ifidn %1, r1
lea
r0
,
[
r0
+%
1
*
2
]
%else
add
r0
,
r1
%endif
punpckhbw
m4
,
m7
psadbw
m2
,
m7
paddw
m5
,
m2
mova
m2
,
m3
punpcklbw
m3
,
m7
dec
t3d
psadbw
m2
,
m7
pmaddwd
m0
,
m0
paddw
m5
,
m2
pmaddwd
m1
,
m1
paddd
m6
,
m0
pmaddwd
m3
,
m3
paddd
m6
,
m1
pmaddwd
m4
,
m4
paddd
m6
,
m3
paddd
m6
,
m4
jg
.loop
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int, uint32_t * )
;-----------------------------------------------------------------------------
; 16x16 variance after INIT_MMX.
; VAR_2ROW 8, 16: sixteen iterations, each loading [r0] and [r0+8] and then
; adding r1 to r0.  VAR_END 8: sum*sum is shifted right by 8.
INIT_MMX
cglobal
x264_pixel_var_16x16_mmxext
,
2
,
3
VAR_START
VAR_2ROW
8
,
16
VAR_END
8
; 8x8 variance, same register setup as the preceding mmxext function.
; VAR_2ROW r1, 4: four iterations, each loading [r0] and [r0+r1] and then
; advancing r0 by 2*r1.  VAR_END 6: sum*sum is shifted right by 6.
cglobal
x264_pixel_var_8x8_mmxext
,
2
,
3
VAR_START
VAR_2ROW
r1
,
4
VAR_END
6
; 16x16 variance after INIT_XMM.
; VAR_2ROW r1, 8: eight iterations, each loading [r0] and [r0+r1] and then
; advancing r0 by 2*r1.  VAR_END 8: sum*sum is shifted right by 8.
INIT_XMM
cglobal
x264_pixel_var_16x16_sse2
,
2
,
3
VAR_START
VAR_2ROW
r1
,
8
VAR_END
8
; 8x8 variance with its own loop instead of VAR_2ROW.
; Four iterations; each one fills m0 from [r0] (movh) and [r0+r1] (movhps),
; then advances r0 by 2*r1.  The combined register is
;   - byte-summed with psadbw against the zero register m7 and added to m5;
;   - unpacked to low/high words against m7, squared and pair-summed with
;     pmaddwd, and added to m6.
; The loop repeats while the `dec t3d` result is non-zero (jnz).
; VAR_END 6: sum*sum is shifted right by 6.
cglobal
x264_pixel_var_8x8_sse2
,
2
,
3
VAR_START
mov
t3d
,
4
.loop:
movh
m0
,
[
r0
]
movhps
m0
,
[
r0
+
r1
]
lea
r0
,
[
r0
+
r1
*
2
]
mova
m1
,
m0
punpcklbw
m0
,
m7
mova
m2
,
m1
punpckhbw
m1
,
m7
dec
t3d
pmaddwd
m0
,
m0
pmaddwd
m1
,
m1
psadbw
m2
,
m7
paddw
m5
,
m2
paddd
m6
,
m0
paddd
m6
,
m1
jnz
.loop
VAR_END
6
;=============================================================================
; SATD
...
...
common/x86/pixel.h
View file @
45e36790
...
...
@@ -68,6 +68,11 @@ DECL_X4( sad, cache64_ssse3 );
#undef DECL_X1
#undef DECL_X4
int
x264_pixel_var_16x16_mmxext
(
uint8_t
*
pix
,
int
i_stride
,
uint32_t
*
sad
);
int
x264_pixel_var_16x16_sse2
(
uint8_t
*
pix
,
int
i_stride
,
uint32_t
*
sad
);
int
x264_pixel_var_8x8_mmxext
(
uint8_t
*
pix
,
int
i_stride
,
uint32_t
*
sad
);
int
x264_pixel_var_8x8_sse2
(
uint8_t
*
pix
,
int
i_stride
,
uint32_t
*
sad
);
void
x264_intra_satd_x3_4x4_mmxext
(
uint8_t
*
,
uint8_t
*
,
int
*
);
void
x264_intra_satd_x3_4x4_ssse3
(
uint8_t
*
,
uint8_t
*
,
int
*
);
void
x264_intra_satd_x3_8x8c_mmxext
(
uint8_t
*
,
uint8_t
*
,
int
*
);
...
...
encoder/ratecontrol.c
View file @
45e36790
...
...
@@ -187,7 +187,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
/* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
* array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
DECLARE_ALIGNED_16
(
static
uint8_t
zero
[
17
]
)
=
{
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
1
};
unsigned
int
var
=
0
,
sad
,
ssd
,
i
;
unsigned
int
var
=
0
,
sad
,
i
;
if
(
satd
||
h
->
param
.
rc
.
i_aq_mode
==
X264_AQ_GLOBAL
)
{
for
(
i
=
0
;
i
<
3
;
i
++
)
...
...
@@ -199,9 +199,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
:
w
*
(
mb_x
+
mb_y
*
stride
);
int
pix
=
i
?
PIXEL_8x8
:
PIXEL_16x16
;
stride
<<=
h
->
mb
.
b_interlaced
;
sad
=
h
->
pixf
.
sad
[
pix
](
zero
,
0
,
h
->
fenc
->
plane
[
i
]
+
offset
,
stride
);
ssd
=
h
->
pixf
.
ssd
[
pix
](
zero
,
0
,
h
->
fenc
->
plane
[
i
]
+
offset
,
stride
);
var
+=
ssd
-
(
sad
*
sad
>>
(
i
?
6
:
8
));
var
+=
h
->
pixf
.
var
[
pix
](
h
->
fenc
->
plane
[
i
]
+
offset
,
stride
,
&
sad
);
// SATD to represent the block's overall complexity (bit cost) for intra encoding.
// exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
if
(
var
&&
satd
)
...
...
tools/checkasm.c
View file @
45e36790
...
...
@@ -302,6 +302,27 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL_X
(
3
);
TEST_PIXEL_X
(
4
);
/* TEST_PIXEL_VAR( i ): compares the C and the optimised `var` function for
 * pixel size index `i`.
 * The check only runs when pixel_asm.var[i] differs from pixel_ref.var[i].
 * Both functions are called on buf1 with a stride of 16; the return value and
 * the value written through the third argument must both match, otherwise
 * `ok` is cleared and the four values are printed to stderr.
 * The values are uint32_t, so they are printed with %u (via an unsigned cast)
 * rather than %d.  The body is wrapped in do { } while( 0 ) so the macro
 * behaves as a single statement when followed by `;` or an `else`. */
#define TEST_PIXEL_VAR( i ) \
    do { \
        if( pixel_asm.var[i] != pixel_ref.var[i] ) \
        { \
            uint32_t res_c, res_asm; \
            uint32_t sad_c, sad_asm; \
            set_func_name( "%s_%s", "var", pixel_names[i] ); \
            used_asm = 1; \
            res_c   = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
            res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
            if( (res_c != res_asm) || (sad_c != sad_asm) ) \
            { \
                ok = 0; \
                fprintf( stderr, "var[%d]: %u,%u != %u,%u [FAILED]\n", i, \
                         (unsigned)res_c, (unsigned)sad_c, \
                         (unsigned)res_asm, (unsigned)sad_asm ); \
            } \
        } \
    } while( 0 )
ok
=
1
;
used_asm
=
0
;
TEST_PIXEL_VAR
(
PIXEL_16x16
);
TEST_PIXEL_VAR
(
PIXEL_8x8
);
report
(
"pixel var :"
);
#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment