Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
e9ff8c4b
Commit
e9ff8c4b
authored
Aug 09, 2009
by
Loren Merritt
Browse files
simd part of x264_macroblock_tree_propagate.
1.6x faster on conroe.
parent
5599c478
Changes
8
Hide whitespace changes
Inline
Side-by-side
common/macroblock.c
View file @
e9ff8c4b
...
...
@@ -743,7 +743,8 @@ int x264_macroblock_cache_init( x264_t *h )
int
me_range
=
X264_MIN
(
h
->
param
.
analyse
.
i_me_range
,
h
->
param
.
analyse
.
i_mv_range
);
int
buf_tesa
=
(
h
->
param
.
analyse
.
i_me_method
>=
X264_ME_ESA
)
*
((
me_range
*
2
+
18
)
*
sizeof
(
int16_t
)
+
(
me_range
+
4
)
*
(
me_range
+
1
)
*
4
*
sizeof
(
mvsad_t
));
CHECKED_MALLOC
(
h
->
scratch_buffer
,
X264_MAX3
(
buf_hpel
,
buf_ssim
,
buf_tesa
)
);
int
buf_mbtree
=
h
->
param
.
rc
.
b_mb_tree
*
((
h
->
sps
->
i_mb_width
+
3
)
&~
3
)
*
sizeof
(
int
);
CHECKED_MALLOC
(
h
->
scratch_buffer
,
X264_MAX4
(
buf_hpel
,
buf_ssim
,
buf_tesa
,
buf_mbtree
)
);
return
0
;
fail:
return
-
1
;
...
...
common/mc.c
View file @
e9ff8c4b
...
...
@@ -356,6 +356,33 @@ static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
}
}
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/* Signed 64-bit by 32-bit division producing a 32-bit quotient, issued as a
 * single x86 "idiv" instruction (gcc isn't smart enough to use it on its own).
 *
 * x: dividend; its low 32 bits are passed in eax and its high 32 bits in edx.
 * y: divisor; passed in any general-purpose register.
 * Returns the quotient left in eax. The remainder left in edx is discarded.
 *
 * NOTE(review): "idiv" raises a divide error when y is 0 or when the quotient
 * does not fit in 32 bits, so callers are expected to keep operands in range. */
static ALWAYS_INLINE int32_t div_64_32( int64_t x, int32_t y )
{
    int32_t quotient, remainder;
    /* __asm__ instead of asm: the plain keyword is unavailable when gcc is run
     * in a strict ISO mode (-std=c99 / -std=c11). */
    __asm__( "idiv %4"
             :"=a"(quotient), "=d"(remainder)
             :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
    );
    return quotient;
}
#else
/* Portable fallback: plain C division. The result is narrowed to int32_t so
 * the expression has the same type as the inline-asm variant above. */
#define div_64_32(x,y) ((int32_t)((x)/(y)))
#endif
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock.
 *
 * For every i in [0,len):
 *   amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128) >> 8)
 *   dst[i] = amount * (intra_costs[i] - inter_costs[i]) / intra_costs[i]
 *
 * dst:          output, len ints.
 * propagate_in: per-element propagate cost.
 * intra_costs:  per-element intra cost (the divisor).
 * inter_costs:  per-element inter cost.
 * inv_qscales:  per-element inverse qscale factor, applied with 8 fractional bits.
 * len:          number of elements to process.
 *
 * An element whose intra cost is 0 produces 0 instead of dividing by zero.
 * NOTE(review): the quotient is assumed to fit in 32 bits (div_64_32 returns
 * int32_t); that holds when inter_costs[i] <= intra_costs[i] -- confirm at the caller. */
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len )
{
    int i;
    for( i = 0; i < len; i++ )
    {
        int intra_cost = intra_costs[i];
        if( !intra_cost )
        {
            /* Nothing can be attributed to a block with no intra cost, and
             * the division below would be undefined. */
            dst[i] = 0;
            continue;
        }
        /* Multiply as unsigned 32-bit: the uint16_t operands promote to signed
         * int, whose product can overflow for values above 0x7fff. The largest
         * possible value, 65535*65535+128, still fits in uint32_t. */
        uint32_t scaled_intra = ((uint32_t)intra_cost * inv_qscales[i] + 128) >> 8;
        int propagate_amount = propagate_in[i] + (int)scaled_intra;
        dst[i] = div_64_32( (int64_t)propagate_amount * (intra_cost - inter_costs[i]), intra_cost );
    }
}
void
x264_mc_init
(
int
cpu
,
x264_mc_functions_t
*
pf
)
{
pf
->
mc_luma
=
mc_luma
;
...
...
@@ -392,6 +419,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf
->
integral_init4v
=
integral_init4v
;
pf
->
integral_init8v
=
integral_init8v
;
pf
->
mbtree_propagate_cost
=
mbtree_propagate_cost
;
#ifdef HAVE_MMX
x264_mc_init_mmx
(
cpu
,
pf
);
#endif
...
...
common/mc.h
View file @
e9ff8c4b
...
...
@@ -74,6 +74,9 @@ typedef struct
void
(
*
frame_init_lowres_core
)(
uint8_t
*
src0
,
uint8_t
*
dst0
,
uint8_t
*
dsth
,
uint8_t
*
dstv
,
uint8_t
*
dstc
,
int
src_stride
,
int
dst_stride
,
int
width
,
int
height
);
void
(
*
mbtree_propagate_cost
)(
int
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
int
len
);
}
x264_mc_functions_t
;
void
x264_mc_init
(
int
cpu
,
x264_mc_functions_t
*
pf
);
...
...
common/x86/mc-a2.asm
View file @
e9ff8c4b
...
...
@@ -34,6 +34,7 @@ filt_mul51: times 8 db 1, -5
pw_1:
times
8
dw
1
pw_16:
times
8
dw
16
pw_32:
times
8
dw
32
pd_128:
times
4
dd
128
SECTION
.text
...
...
@@ -1081,3 +1082,43 @@ INIT_XMM
FRAME_INIT_LOWRES
ss
e2
,
12
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES
ss
se3
,
12
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
;
; SSE2 version: for each element computes
;   dst = (prop + ((intra*invq + 128) >> 8)) * (intra - inter) / intra
; The multiply and divide are done in single-precision float, so results can
; differ slightly from an exact integer computation.
;
; Registers: r0=dst, r1=propagate_in, r2=intra_costs, r3=inter_costs,
;            r4=inv_qscales, r5=len (turned into a negative byte offset).
; Four elements are handled per iteration and the loop body always runs at
; least once, so a len that is not a multiple of 4 reads and writes up to the
; next multiple of 4. dst is stored with movdqa, i.e. it must be 16-byte aligned.
; NOTE(review): pmaddwd multiplies signed 16-bit words, so intra/invq values
; above 0x7fff would be treated as negative -- confirm the callers' value range.
;-----------------------------------------------------------------------------
cglobal x264_mbtree_propagate_cost_sse2, 6,6
    ; r5 = len*2 = size in bytes of each uint16_t input row
    shl r5d, 1
    ; point every array at its end; dst elements are 4 bytes, hence r5*2
    lea r0, [r0+r5*2]
    lea r1, [r1+r5]
    lea r2, [r2+r5]
    lea r3, [r3+r5]
    lea r4, [r4+r5]
    ; index from -size up to 0 so the loop needs no separate compare
    neg r5
    pxor      xmm5, xmm5                ; zero, used to widen words to dwords
    movdqa    xmm4, [pd_128 GLOBAL]     ; rounding constant for the >>8
.loop:
    movq      xmm2, [r2+r5] ; intra
    movq      xmm0, [r4+r5] ; invq
    punpcklwd xmm2, xmm5    ; zero-extend 4 words to 4 dwords
    punpcklwd xmm0, xmm5
    pmaddwd   xmm0, xmm2    ; intra*invq (the high word of each dword is 0)
    paddd     xmm0, xmm4
    psrld     xmm0, 8       ; intra*invq>>8
    movq      xmm1, [r1+r5] ; prop
    movq      xmm3, [r3+r5] ; inter
    punpcklwd xmm1, xmm5
    punpcklwd xmm3, xmm5
    paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
    cvtdq2ps  xmm1, xmm2    ; intra
    psubd     xmm2, xmm3    ; intra - inter
    cvtdq2ps  xmm0, xmm0
    cvtdq2ps  xmm2, xmm2
    mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
    divps     xmm0, xmm1    ; / intra
    cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
    movdqa    [r0+r5*2], xmm0 ; store 4 int results
    add r5, 8               ; advance by 4 uint16_t elements
    jl .loop
    REP_RET
common/x86/mc-c.c
View file @
e9ff8c4b
...
...
@@ -74,6 +74,8 @@ extern void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int strid
extern
void
x264_integral_init8v_mmx
(
uint16_t
*
sum8
,
int
stride
);
extern
void
x264_integral_init8v_sse2
(
uint16_t
*
sum8
,
int
stride
);
extern
void
x264_integral_init4v_ssse3
(
uint16_t
*
sum8
,
uint16_t
*
sum4
,
int
stride
);
extern
void
x264_mbtree_propagate_cost_sse2
(
int
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
int
len
);
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
...
...
@@ -303,6 +305,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf
->
integral_init4v
=
x264_integral_init4v_sse2
;
pf
->
integral_init8v
=
x264_integral_init8v_sse2
;
pf
->
hpel_filter
=
x264_hpel_filter_sse2_amd
;
pf
->
mbtree_propagate_cost
=
x264_mbtree_propagate_cost_sse2
;
if
(
cpu
&
X264_CPU_SSE2_IS_SLOW
)
return
;
...
...
encoder/encoder.c
View file @
e9ff8c4b
...
...
@@ -648,6 +648,7 @@ static int x264_validate_parameters( x264_t *h )
BOOLIFY
(
analyse
.
b_fast_pskip
);
BOOLIFY
(
rc
.
b_stat_write
);
BOOLIFY
(
rc
.
b_stat_read
);
BOOLIFY
(
rc
.
b_mb_tree
);
#undef BOOLIFY
return
0
;
...
...
encoder/slicetype.c
View file @
e9ff8c4b
...
...
@@ -406,22 +406,21 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
int
dist_scale_factor
=
(
((
b
-
p0
)
<<
8
)
+
((
p1
-
p0
)
>>
1
)
)
/
(
p1
-
p0
);
int
i_bipred_weight
=
h
->
param
.
analyse
.
b_weighted_bipred
?
64
-
(
dist_scale_factor
>>
2
)
:
32
;
int16_t
(
*
mvs
[
2
])[
2
]
=
{
frames
[
b
]
->
lowres_mvs
[
0
][
b
-
p0
-
1
],
frames
[
b
]
->
lowres_mvs
[
1
][
p1
-
b
-
1
]
};
int
*
buf
=
h
->
scratch_buffer
;
for
(
h
->
mb
.
i_mb_y
=
0
;
h
->
mb
.
i_mb_y
<
h
->
sps
->
i_mb_height
;
h
->
mb
.
i_mb_y
++
)
{
int
mb_index
=
h
->
mb
.
i_mb_y
*
h
->
mb
.
i_mb_stride
;
h
->
mc
.
mbtree_propagate_cost
(
buf
,
frames
[
b
]
->
i_propagate_cost
+
mb_index
,
frames
[
b
]
->
i_intra_cost
+
mb_index
,
frames
[
b
]
->
lowres_costs
[
b
-
p0
][
p1
-
b
]
+
mb_index
,
frames
[
b
]
->
i_inv_qscale_factor
+
mb_index
,
h
->
sps
->
i_mb_width
);
for
(
h
->
mb
.
i_mb_x
=
0
;
h
->
mb
.
i_mb_x
<
h
->
sps
->
i_mb_width
;
h
->
mb
.
i_mb_x
++
,
mb_index
++
)
{
int
inter_cost
=
frames
[
b
]
->
lowres_costs
[
b
-
p0
][
p1
-
b
][
mb_index
];
int
intra_cost
=
frames
[
b
]
->
i_intra_cost
[
mb_index
];
int
propagate_amount
=
buf
[
h
->
mb
.
i_mb_x
];
/* Don't propagate for an intra block. */
if
(
inter_cost
<
intra_cost
)
if
(
propagate_amount
>
0
)
{
int
lists_used
=
frames
[
b
]
->
lowres_inter_types
[
b
-
p0
][
p1
-
b
][
mb_index
];
/* The approximate amount of data that this block contains. */
int
propagate_amount
=
frames
[
b
]
->
i_propagate_cost
[
mb_index
]
+
((
intra_cost
*
frames
[
b
]
->
i_inv_qscale_factor
[
mb_index
]
+
128
)
>>
8
);
propagate_amount
=
((
uint64_t
)
propagate_amount
*
(
intra_cost
-
inter_cost
))
/
intra_cost
;
int
list
;
/* Follow the MVs to the previous frame(s). */
for
(
list
=
0
;
list
<
2
;
list
++
)
...
...
tools/checkasm.c
View file @
e9ff8c4b
...
...
@@ -960,6 +960,32 @@ static int check_mc( int cpu_ref, int cpu_new )
INTEGRAL_INIT
(
integral_init8v
,
9
,
sum
,
stride
);
report
(
"integral init :"
);
if
(
mc_a
.
mbtree_propagate_cost
!=
mc_ref
.
mbtree_propagate_cost
)
{
ok
=
1
;
used_asm
=
1
;
set_func_name
(
"mbtree_propagate"
);
int
*
dsta
=
(
int
*
)
buf3
;
int
*
dstc
=
dsta
+
400
;
uint16_t
*
prop
=
(
uint16_t
*
)
buf1
;
uint16_t
*
intra
=
(
uint16_t
*
)
buf4
;
uint16_t
*
inter
=
intra
+
400
;
uint16_t
*
qscale
=
inter
+
400
;
uint16_t
*
rand
=
(
uint16_t
*
)
buf2
;
for
(
i
=
0
;
i
<
400
;
i
++
)
{
intra
[
i
]
=
*
rand
++
&
0x7fff
;
intra
[
i
]
+=
!
intra
[
i
];
inter
[
i
]
=
*
rand
++
&
0x7fff
;
qscale
[
i
]
=
*
rand
++
&
0x7fff
;
}
call_c
(
mc_c
.
mbtree_propagate_cost
,
dstc
,
prop
,
intra
,
inter
,
qscale
,
400
);
call_a
(
mc_a
.
mbtree_propagate_cost
,
dsta
,
prop
,
intra
,
inter
,
qscale
,
400
);
// I don't care about exact rounding, this is just how close the floating-point implementation happens to be
for
(
i
=
0
;
i
<
400
;
i
++
)
ok
&=
abs
(
dstc
[
i
]
-
dsta
[
i
])
<=
(
abs
(
dstc
[
i
])
>
512
)
||
fabs
((
double
)
dstc
[
i
]
/
dsta
[
i
]
-
1
)
<
1e-6
;
report
(
"mbtree propagate :"
);
}
return
ret
;
}
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment