Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Open sidebar
VideoLAN
x264
Commits
bc29c635
Commit
bc29c635
authored
Dec 11, 2008
by
Loren Merritt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
faster ESA init
reduce memory if using ESA and not p4x4
parent
8e5d63a5
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
227 additions
and
36 deletions
+227
-36
common/common.h
common/common.h
+1
-0
common/frame.c
common/frame.c
+1
-1
common/mc.c
common/mc.c
+57
-14
common/mc.h
common/mc.h
+6
-0
common/x86/mc-a2.asm
common/x86/mc-a2.asm
+98
-0
common/x86/mc-c.c
common/x86/mc-c.c
+16
-0
encoder/encoder.c
encoder/encoder.c
+3
-0
tools/checkasm.c
tools/checkasm.c
+45
-21
No files found.
common/common.h
View file @
bc29c635
...
...
@@ -338,6 +338,7 @@ struct x264_t
int
i_max_ref1
;
int
i_delay
;
/* Number of frames buffered for B reordering */
int
b_have_lowres
;
/* Whether 1/2 resolution luma planes are being used */
int
b_have_sub8x8_esa
;
}
frames
;
/* current frame being encoded */
...
...
common/frame.c
View file @
bc29c635
...
...
@@ -99,7 +99,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
if
(
h
->
param
.
analyse
.
i_me_method
>=
X264_ME_ESA
)
{
CHECKED_MALLOC
(
frame
->
buffer
[
3
],
2
*
frame
->
i_stride
[
0
]
*
(
frame
->
i_lines
[
0
]
+
2
*
i_padv
)
*
sizeof
(
uint16_t
)
);
frame
->
i_stride
[
0
]
*
(
frame
->
i_lines
[
0
]
+
2
*
i_padv
)
*
sizeof
(
uint16_t
)
<<
h
->
frames
.
b_have_sub8x8_esa
);
frame
->
integral
=
(
uint16_t
*
)
frame
->
buffer
[
3
]
+
frame
->
i_stride
[
0
]
*
i_padv
+
PADH
;
}
...
...
common/mc.c
View file @
bc29c635
...
...
@@ -269,6 +269,42 @@ static void memzero_aligned( void * dst, int n )
memset
(
dst
,
0
,
n
);
}
/* Horizontal pass of the 4-wide integral used by the successive elimination
 * prefilter.
 *
 * For every column x in [0, stride-4) this stores
 *     sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x-stride]
 * truncated to 16 bits (wrap-around is expected).
 *
 * sum    - output row; the row `stride` entries before it is read as input.
 * pix    - source pixels; pix[0..3] are always read, and pix[stride-1] is the
 *          last byte read when the loop runs.
 * stride - distance between rows, in elements; also bounds the loop.
 */
static void integral_init4h( uint16_t *sum, uint8_t *pix, int stride )
{
    int col;
    int window = 0;
    const int last = stride - 4;

    /* Prime the sliding window with the first four pixels. */
    for( col = 0; col < 4; col++ )
        window += pix[col];

    for( col = 0; col < last; col++ )
    {
        const int above = sum[col - stride];

        sum[col] = (uint16_t)( window + above );

        /* Slide one pixel right: admit pix[col+4], retire pix[col]. */
        window += pix[col + 4];
        window -= pix[col];
    }
}
/* Horizontal pass of the 8-wide integral used by the successive elimination
 * prefilter.
 *
 * For every column x in [0, stride-8) this stores
 *     sum[x] = pix[x] + pix[x+1] + ... + pix[x+7] + sum[x-stride]
 * truncated to 16 bits (wrap-around is expected).
 *
 * sum    - output row; the row `stride` entries before it is read as input.
 * pix    - source pixels; pix[0..7] are always read, and pix[stride-1] is the
 *          last byte read when the loop runs.
 * stride - distance between rows, in elements; also bounds the loop.
 */
static void integral_init8h( uint16_t *sum, uint8_t *pix, int stride )
{
    int col;
    int window = 0;
    const int last = stride - 8;

    /* Prime the sliding window with the first eight pixels. */
    for( col = 0; col < 8; col++ )
        window += pix[col];

    for( col = 0; col < last; col++ )
    {
        const int above = sum[col - stride];

        sum[col] = (uint16_t)( window + above );

        /* Slide one pixel right: admit pix[col+8], retire pix[col]. */
        window += pix[col + 8];
        window -= pix[col];
    }
}
/* Vertical pass producing both the 4-row and the 8-row differences.
 *
 * Pass 1, for x in [0, stride-8):
 *     sum4[x] = sum8[x + 4*stride] - sum8[x]
 * Pass 2, for x in [0, stride-8), in increasing x:
 *     sum8[x] = sum8[x + 8*stride] + sum8[x + 8*stride + 4]
 *             - sum8[x]            - sum8[x + 4]
 * All results are truncated to 16 bits (wrap-around is expected).
 *
 * The passes are kept separate and in this order: pass 1 must see the
 * original sum8[x] values, and pass 2 reads sum8[x+4] before that entry is
 * overwritten four iterations later.
 *
 * sum8   - row of horizontal sums; rewritten in place by pass 2.
 * sum4   - destination row for pass 1.
 * stride - distance between rows, in elements.
 */
static void integral_init4v( uint16_t *sum8, uint16_t *sum4, int stride )
{
    int col;

    col = 0;
    while( col < stride - 8 )
    {
        const int lower = sum8[col + 4 * stride];
        const int upper = sum8[col];

        sum4[col] = (uint16_t)( lower - upper );
        col++;
    }

    col = 0;
    while( col < stride - 8 )
    {
        const int lower = sum8[col + 8 * stride] + sum8[col + 8 * stride + 4];
        const int upper = sum8[col] + sum8[col + 4];

        sum8[col] = (uint16_t)( lower - upper );
        col++;
    }
}
/* Vertical pass for the 8-row difference only.
 *
 * For x in [0, stride-8):
 *     sum8[x] = sum8[x + 8*stride] - sum8[x]
 * truncated to 16 bits (wrap-around is expected).
 *
 * sum8   - row of horizontal sums; rewritten in place.
 * stride - distance between rows, in elements.
 */
static void integral_init8v( uint16_t *sum8, int stride )
{
    int col = 0;

    while( col < stride - 8 )
    {
        const int lower = sum8[col + 8 * stride];
        const int upper = sum8[col];

        sum8[col] = (uint16_t)( lower - upper );
        col++;
    }
}
void
x264_frame_init_lowres
(
x264_t
*
h
,
x264_frame_t
*
frame
)
{
uint8_t
*
src
=
frame
->
plane
[
0
];
...
...
@@ -353,6 +389,11 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf
->
memzero_aligned
=
memzero_aligned
;
pf
->
frame_init_lowres_core
=
frame_init_lowres_core
;
pf
->
integral_init4h
=
integral_init4h
;
pf
->
integral_init8h
=
integral_init8h
;
pf
->
integral_init4v
=
integral_init4v
;
pf
->
integral_init8v
=
integral_init8v
;
#ifdef HAVE_MMX
x264_mc_init_mmx
(
cpu
,
pf
);
#endif
...
...
@@ -370,7 +411,7 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
int
start
=
(
mb_y
*
16
>>
b_interlaced
)
-
8
;
// buffer = 4 for deblock + 3 for 6tap, rounded to 8
int
height
=
((
b_end
?
frame
->
i_lines
[
0
]
:
mb_y
*
16
)
>>
b_interlaced
)
+
8
;
int
offs
=
start
*
stride
-
8
;
// buffer = 3 for 6tap, aligned to 8 for simd
int
x
,
y
;
int
y
;
if
(
mb_y
&
b_interlaced
)
return
;
...
...
@@ -401,20 +442,22 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
height
+=
PADV
-
8
;
for
(
y
=
start
;
y
<
height
;
y
++
)
{
uint8_t
*
ref
=
frame
->
plane
[
0
]
+
y
*
stride
-
PADH
;
uint16_t
*
line
=
frame
->
integral
+
(
y
+
1
)
*
stride
-
PADH
+
1
;
uint16_t
v
=
line
[
0
]
=
0
;
for
(
x
=
1
;
x
<
stride
-
1
;
x
++
)
line
[
x
]
=
v
+=
ref
[
x
]
+
line
[
x
-
stride
]
-
line
[
x
-
stride
-
1
];
line
-=
8
*
stride
;
if
(
y
>=
9
-
PADV
)
uint8_t
*
pix
=
frame
->
plane
[
0
]
+
y
*
stride
-
PADH
;
uint16_t
*
sum8
=
frame
->
integral
+
(
y
+
1
)
*
stride
-
PADH
;
uint16_t
*
sum4
;
if
(
h
->
frames
.
b_have_sub8x8_esa
)
{
h
->
mc
.
integral_init4h
(
sum8
,
pix
,
stride
);
sum8
-=
8
*
stride
;
sum4
=
sum8
+
stride
*
(
frame
->
i_lines
[
0
]
+
PADV
*
2
);
if
(
y
>=
8
-
PADV
)
h
->
mc
.
integral_init4v
(
sum8
,
sum4
,
stride
);
}
else
{
uint16_t
*
sum4
=
line
+
stride
*
(
frame
->
i_lines
[
0
]
+
PADV
*
2
);
for
(
x
=
1
;
x
<
stride
-
8
;
x
++
,
line
++
,
sum4
++
)
{
sum4
[
0
]
=
line
[
4
+
4
*
stride
]
-
line
[
4
]
-
line
[
4
*
stride
]
+
line
[
0
];
line
[
0
]
+=
line
[
8
+
8
*
stride
]
-
line
[
8
]
-
line
[
8
*
stride
];
}
h
->
mc
.
integral_init8h
(
sum8
,
pix
,
stride
);
if
(
y
>=
8
-
PADV
)
h
->
mc
.
integral_init8v
(
sum8
-
8
*
stride
,
stride
);
}
}
}
...
...
common/mc.h
View file @
bc29c635
...
...
@@ -66,6 +66,12 @@ typedef struct
void
*
(
*
memcpy_aligned
)(
void
*
dst
,
const
void
*
src
,
size_t
n
);
void
(
*
memzero_aligned
)(
void
*
dst
,
int
n
);
/* successive elimination prefilter */
void
(
*
integral_init4h
)(
uint16_t
*
sum
,
uint8_t
*
pix
,
int
stride
);
void
(
*
integral_init8h
)(
uint16_t
*
sum
,
uint8_t
*
pix
,
int
stride
);
void
(
*
integral_init4v
)(
uint16_t
*
sum8
,
uint16_t
*
sum4
,
int
stride
);
void
(
*
integral_init8v
)(
uint16_t
*
sum8
,
int
stride
);
void
(
*
frame_init_lowres_core
)(
uint8_t
*
src0
,
uint8_t
*
dst0
,
uint8_t
*
dsth
,
uint8_t
*
dstv
,
uint8_t
*
dstc
,
int
src_stride
,
int
dst_stride
,
int
width
,
int
height
);
}
x264_mc_functions_t
;
...
...
common/x86/mc-a2.asm
View file @
bc29c635
...
...
@@ -694,6 +694,104 @@ MEMZERO sse2
;-----------------------------------------------------------------------------
; void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride )
;-----------------------------------------------------------------------------
cglobal
x264_integral_init4h_sse4
,
3
,
4
lea
r3
,
[
r0
+
r2
*
2
]
add
r1
,
r2
neg
r2
pxor
m4
,
m4
.loop:
movdqa
m0
,
[
r1
+
r2
]
movdqu
m1
,
[
r1
+
r2
+
8
]
mpsadbw
m0
,
m4
,
0
mpsadbw
m1
,
m4
,
0
paddw
m0
,
[
r0
+
r2
*
2
]
paddw
m1
,
[
r0
+
r2
*
2
+
16
]
movdqa
[
r3
+
r2
*
2
],
m0
movdqa
[
r3
+
r2
*
2
+
16
],
m1
add
r2
,
16
jl
.loop
REP_RET
cglobal
x264_integral_init8h_sse4
,
3
,
4
lea
r3
,
[
r0
+
r2
*
2
]
add
r1
,
r2
neg
r2
pxor
m4
,
m4
.loop:
movdqa
m0
,
[
r1
+
r2
]
movdqu
m1
,
[
r1
+
r2
+
8
]
movdqa
m2
,
m0
movdqa
m3
,
m1
mpsadbw
m0
,
m4
,
0
mpsadbw
m1
,
m4
,
0
mpsadbw
m2
,
m4
,
4
mpsadbw
m3
,
m4
,
4
paddw
m0
,
[
r0
+
r2
*
2
]
paddw
m1
,
[
r0
+
r2
*
2
+
16
]
paddw
m0
,
m2
paddw
m1
,
m3
movdqa
[
r3
+
r2
*
2
],
m0
movdqa
[
r3
+
r2
*
2
+
16
],
m1
add
r2
,
16
jl
.loop
REP_RET
%macro INTEGRAL_INIT 1
;-----------------------------------------------------------------------------
; void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride )
;-----------------------------------------------------------------------------
cglobal
x264_integral_init4v_
%
1
,
3
,
5
shl
r2
,
1
add
r0
,
r2
add
r1
,
r2
lea
r3
,
[
r0
+
r2
*
4
]
lea
r4
,
[
r0
+
r2
*
8
]
neg
r2
.loop:
movu
m0
,
[
r0
+
r2
+
8
]
mova
m2
,
[
r0
+
r2
]
movu
m1
,
[
r4
+
r2
+
8
]
paddw
m0
,
m2
paddw
m1
,
[
r4
+
r2
]
mova
m3
,
[
r3
+
r2
]
psubw
m1
,
m0
psubw
m3
,
m2
mova
[
r0
+
r2
],
m1
mova
[
r1
+
r2
],
m3
add
r2
,
mmsize
jl
.loop
REP_RET
;-----------------------------------------------------------------------------
; void x264_integral_init8v_mmx( uint16_t *sum8, int stride )
;-----------------------------------------------------------------------------
cglobal
x264_integral_init8v_
%
1
,
3
,
3
shl
r1
,
1
add
r0
,
r1
lea
r2
,
[
r0
+
r1
*
8
]
neg
r1
.loop:
mova
m0
,
[
r2
+
r1
]
mova
m1
,
[
r2
+
r1
+
mmsize
]
psubw
m0
,
[
r0
+
r1
]
psubw
m1
,
[
r0
+
r1
+
mmsize
]
mova
[
r0
+
r1
],
m0
mova
[
r0
+
r1
+
mmsize
],
m1
add
r1
,
2
*
mmsize
jl
.loop
REP_RET
%endmacro
INIT_MMX
INTEGRAL_INIT
mmx
INIT_XMM
INTEGRAL_INIT
ss
e2
%macro FILT8x4 7
mova
%
3
,
[
r0
+%
7
]
mova
%
4
,
[
r0
+
r5
+%
7
]
...
...
common/x86/mc-c.c
View file @
bc29c635
...
...
@@ -64,6 +64,12 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern
void
*
x264_memcpy_aligned_sse2
(
void
*
dst
,
const
void
*
src
,
size_t
n
);
extern
void
x264_memzero_aligned_mmx
(
void
*
dst
,
int
n
);
extern
void
x264_memzero_aligned_sse2
(
void
*
dst
,
int
n
);
extern
void
x264_integral_init4h_sse4
(
uint16_t
*
sum
,
uint8_t
*
pix
,
int
stride
);
extern
void
x264_integral_init8h_sse4
(
uint16_t
*
sum
,
uint8_t
*
pix
,
int
stride
);
extern
void
x264_integral_init4v_mmx
(
uint16_t
*
sum8
,
uint16_t
*
sum4
,
int
stride
);
extern
void
x264_integral_init4v_sse2
(
uint16_t
*
sum8
,
uint16_t
*
sum4
,
int
stride
);
extern
void
x264_integral_init8v_mmx
(
uint16_t
*
sum8
,
int
stride
);
extern
void
x264_integral_init8v_sse2
(
uint16_t
*
sum8
,
int
stride
);
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
...
...
@@ -242,6 +248,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf
->
copy
[
PIXEL_4x4
]
=
x264_mc_copy_w4_mmx
;
pf
->
memcpy_aligned
=
x264_memcpy_aligned_mmx
;
pf
->
memzero_aligned
=
x264_memzero_aligned_mmx
;
pf
->
integral_init4v
=
x264_integral_init4v_mmx
;
pf
->
integral_init8v
=
x264_integral_init8v_mmx
;
if
(
!
(
cpu
&
X264_CPU_MMXEXT
)
)
return
;
...
...
@@ -286,6 +294,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf
->
memcpy_aligned
=
x264_memcpy_aligned_sse2
;
pf
->
memzero_aligned
=
x264_memzero_aligned_sse2
;
pf
->
integral_init4v
=
x264_integral_init4v_sse2
;
pf
->
integral_init8v
=
x264_integral_init8v_sse2
;
pf
->
hpel_filter
=
x264_hpel_filter_sse2_amd
;
if
(
cpu
&
X264_CPU_SSE2_IS_SLOW
)
...
...
@@ -331,4 +341,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf
->
hpel_filter
=
x264_hpel_filter_ssse3
;
pf
->
frame_init_lowres_core
=
x264_frame_init_lowres_core_ssse3
;
pf
->
mc_chroma
=
x264_mc_chroma_ssse3
;
if
(
!
(
cpu
&
X264_CPU_SSE4
)
)
return
;
pf
->
integral_init4h
=
x264_integral_init4h_sse4
;
pf
->
integral_init8h
=
x264_integral_init8h_sse4
;
}
encoder/encoder.c
View file @
bc29c635
...
...
@@ -713,6 +713,7 @@ x264_t *x264_encoder_open ( x264_param_t *param )
||
h
->
param
.
i_bframe_adaptive
||
h
->
param
.
b_pre_scenecut
);
h
->
frames
.
b_have_lowres
|=
(
h
->
param
.
rc
.
b_stat_read
&&
h
->
param
.
rc
.
i_vbv_buffer_size
>
0
);
h
->
frames
.
b_have_sub8x8_esa
=
!!
(
h
->
param
.
analyse
.
inter
&
X264_ANALYSE_PSUB8x8
);
h
->
frames
.
i_last_idr
=
-
h
->
param
.
i_keyint_max
;
h
->
frames
.
i_input
=
0
;
...
...
@@ -839,6 +840,8 @@ int x264_encoder_reconfig( x264_t *h, x264_param_t *param )
// can only twiddle these if they were enabled to begin with:
if
(
h
->
param
.
analyse
.
i_me_method
>=
X264_ME_ESA
||
param
->
analyse
.
i_me_method
<
X264_ME_ESA
)
COPY
(
analyse
.
i_me_method
);
if
(
h
->
param
.
analyse
.
i_me_method
>=
X264_ME_ESA
&&
!
h
->
frames
.
b_have_sub8x8_esa
)
h
->
param
.
analyse
.
inter
&=
~
X264_ANALYSE_PSUB8x8
;
if
(
h
->
pps
->
b_transform_8x8_mode
)
COPY
(
analyse
.
b_transform_8x8
);
if
(
h
->
frames
.
i_max_ref1
>
1
)
...
...
tools/checkasm.c
View file @
bc29c635
...
...
@@ -822,33 +822,57 @@ static int check_mc( int cpu_ref, int cpu_new )
uint8_t
*
dstc
[
4
]
=
{
buf3
,
buf3
+
1024
,
buf3
+
2048
,
buf3
+
3072
};
uint8_t
*
dsta
[
4
]
=
{
buf4
,
buf4
+
1024
,
buf4
+
2048
,
buf3
+
3072
};
set_func_name
(
"lowres_init"
);
ok
=
1
;
used_asm
=
1
;
for
(
w
=
40
;
w
<=
48
;
w
+=
8
)
if
(
mc_a
.
frame_init_lowres_core
!=
mc_ref
.
frame_init_lowres_core
)
{
int
stride
=
(
w
+
8
)
&~
15
;
call_c
(
mc_c
.
frame_init_lowres_core
,
buf1
,
dstc
[
0
],
dstc
[
1
],
dstc
[
2
],
dstc
[
3
],
w
*
2
,
stride
,
w
,
16
);
call_a
(
mc_a
.
frame_init_lowres_core
,
buf1
,
dsta
[
0
],
dsta
[
1
],
dsta
[
2
],
dsta
[
3
],
w
*
2
,
stride
,
w
,
16
);
for
(
i
=
0
;
i
<
16
;
i
++
)
{
int
stride
=
(
w
+
8
)
&~
15
;
used_asm
=
1
;
call_c
(
mc_c
.
frame_init_lowres_core
,
buf1
,
dstc
[
0
],
dstc
[
1
],
dstc
[
2
],
dstc
[
3
],
w
*
2
,
stride
,
w
,
16
);
call_a
(
mc_a
.
frame_init_lowres_core
,
buf1
,
dsta
[
0
],
dsta
[
1
],
dsta
[
2
],
dsta
[
3
],
w
*
2
,
stride
,
w
,
16
);
for
(
i
=
0
;
i
<
16
;
i
++
)
{
for
(
j
=
0
;
j
<
4
;
j
++
)
if
(
memcmp
(
dstc
[
j
]
+
i
*
stride
,
dsta
[
j
]
+
i
*
stride
,
w
)
)
{
ok
=
0
;
fprintf
(
stderr
,
"frame_init_lowres differs at plane %d line %d
\n
"
,
j
,
i
);
for
(
k
=
0
;
k
<
w
;
k
++
)
printf
(
"%d "
,
dstc
[
j
][
k
+
i
*
stride
]
);
printf
(
"
\n
"
);
for
(
k
=
0
;
k
<
w
;
k
++
)
printf
(
"%d "
,
dsta
[
j
][
k
+
i
*
stride
]
);
printf
(
"
\n
"
);
break
;
}
}
for
(
j
=
0
;
j
<
4
;
j
++
)
if
(
memcmp
(
dstc
[
j
]
+
i
*
stride
,
dsta
[
j
]
+
i
*
stride
,
w
)
)
{
ok
=
0
;
fprintf
(
stderr
,
"frame_init_lowres differs at plane %d line %d
\n
"
,
j
,
i
);
for
(
k
=
0
;
k
<
w
;
k
++
)
printf
(
"%d "
,
dstc
[
j
][
k
+
i
*
stride
]
);
printf
(
"
\n
"
);
for
(
k
=
0
;
k
<
w
;
k
++
)
printf
(
"%d "
,
dsta
[
j
][
k
+
i
*
stride
]
);
printf
(
"
\n
"
);
break
;
}
}
}
report
(
"lowres init :"
);
}
/* INTEGRAL_INIT( name, size, ... ): compare mc_a.name against mc_c.name.
 *
 * name - member of the mc function table to exercise (also used as the
 *        reported function name via #name).
 * size - number of rows of `stride` 16-bit entries to seed and to verify.
 * ...  - argument list forwarded to the function; it may refer to the
 *        macro-local `sum` and `stride`.
 *
 * Nothing runs when mc_a.name == mc_ref.name.  Otherwise buf3 and buf4 are
 * both seeded with the same size*2*stride bytes of buf1, the mc_c version is
 * invoked with `sum` pointing at buf3 and the mc_a version with `sum`
 * pointing at buf4, and `ok` is cleared if the two buffers disagree.
 *
 * Only the first stride-8 entries of each row are compared, matching the
 * narrowest width the C vertical routines write.  Every one of the `size`
 * rows is checked: the previous form only looked at row 0 (and row 9 when
 * size > 9), so the horizontal routines, which are invoked with sum+stride
 * and therefore write row 1, were never actually verified.
 *
 * NOTE(review): call_c1/call_a1/call_c2/call_a2 are defined elsewhere; the
 * second pair presumably re-runs the calls for timing - confirm.
 */
#define INTEGRAL_INIT( name, size, ... )\
if( mc_a.name != mc_ref.name )\
{\
    int stride = 80;\
    int row;\
    set_func_name( #name );\
    used_asm = 1;\
    memcpy( buf3, buf1, size*2*stride );\
    memcpy( buf4, buf1, size*2*stride );\
    uint16_t *sum = (uint16_t*)buf3;\
    call_c1( mc_c.name, __VA_ARGS__ );\
    sum = (uint16_t*)buf4;\
    call_a1( mc_a.name, __VA_ARGS__ );\
    for( row = 0; row < (size); row++ )\
        if( memcmp( buf3+row*2*stride, buf4+row*2*stride, (stride-8)*2 ) )\
            ok = 0;\
    call_c2( mc_c.name, __VA_ARGS__ );\
    call_a2( mc_a.name, __VA_ARGS__ );\
}
ok
=
1
;
used_asm
=
0
;
INTEGRAL_INIT
(
integral_init4h
,
2
,
sum
+
stride
,
buf2
,
stride
);
INTEGRAL_INIT
(
integral_init8h
,
2
,
sum
+
stride
,
buf2
,
stride
);
INTEGRAL_INIT
(
integral_init4v
,
14
,
sum
,
sum
+
9
*
stride
,
stride
);
INTEGRAL_INIT
(
integral_init8v
,
9
,
sum
,
stride
);
report
(
"integral init :"
);
return
ret
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment