Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Open sidebar
VideoLAN
x264
Commits
ecb04a3b
Commit
ecb04a3b
authored
Nov 26, 2008
by
Fiona Glaser
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
dequant_4x4_dc assembly
About 3.5x faster DC dequant on Conroe
parent
6ce71ce7
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
106 additions
and
13 deletions
+106
-13
common/quant.c
common/quant.c
+4
-1
common/quant.h
common/quant.h
+2
-3
common/x86/quant-a.asm
common/x86/quant-a.asm
+71
-7
common/x86/quant.h
common/x86/quant.h
+2
-0
encoder/macroblock.c
encoder/macroblock.c
+1
-1
tools/checkasm.c
tools/checkasm.c
+26
-1
No files found.
common/quant.c
View file @
ecb04a3b
...
...
@@ -139,7 +139,7 @@ static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
}
}
void
x264_mb_
dequant_4x4_dc
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
)
static
void
dequant_4x4_dc
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
)
{
const
int
i_qbits
=
i_qp
/
6
-
6
;
int
y
;
...
...
@@ -253,6 +253,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf
->
quant_2x2_dc
=
quant_2x2_dc
;
pf
->
dequant_4x4
=
dequant_4x4
;
pf
->
dequant_4x4_dc
=
dequant_4x4_dc
;
pf
->
dequant_8x8
=
dequant_8x8
;
pf
->
denoise_dct
=
x264_denoise_dct
;
...
...
@@ -267,6 +268,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf
->
quant_4x4
=
x264_quant_4x4_mmx
;
pf
->
quant_8x8
=
x264_quant_8x8_mmx
;
pf
->
dequant_4x4
=
x264_dequant_4x4_mmx
;
pf
->
dequant_4x4_dc
=
x264_dequant_4x4dc_mmxext
;
pf
->
dequant_8x8
=
x264_dequant_8x8_mmx
;
if
(
h
->
param
.
i_cqm_preset
==
X264_CQM_FLAT
)
{
...
...
@@ -294,6 +296,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf
->
quant_4x4
=
x264_quant_4x4_sse2
;
pf
->
quant_8x8
=
x264_quant_8x8_sse2
;
pf
->
dequant_4x4
=
x264_dequant_4x4_sse2
;
pf
->
dequant_4x4_dc
=
x264_dequant_4x4dc_sse2
;
pf
->
dequant_8x8
=
x264_dequant_8x8_sse2
;
if
(
h
->
param
.
i_cqm_preset
==
X264_CQM_FLAT
)
{
...
...
common/quant.h
View file @
ecb04a3b
...
...
@@ -30,8 +30,9 @@ typedef struct
void
(
*
quant_4x4_dc
)(
int16_t
dct
[
4
][
4
],
int
mf
,
int
bias
);
void
(
*
quant_2x2_dc
)(
int16_t
dct
[
2
][
2
],
int
mf
,
int
bias
);
void
(
*
dequant_4x4
)(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
(
*
dequant_8x8
)(
int16_t
dct
[
8
][
8
],
int
dequant_mf
[
6
][
8
][
8
],
int
i_qp
);
void
(
*
dequant_4x4
)(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
(
*
dequant_4x4_dc
)(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
(
*
denoise_dct
)(
int16_t
*
dct
,
uint32_t
*
sum
,
uint16_t
*
offset
,
int
size
);
...
...
@@ -42,6 +43,4 @@ typedef struct
void
x264_quant_init
(
x264_t
*
h
,
int
cpu
,
x264_quant_function_t
*
pf
);
void
x264_mb_dequant_4x4_dc
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qscale
);
#endif
common/x86/quant-a.asm
View file @
ecb04a3b
...
...
@@ -255,26 +255,30 @@ QUANT_DC x264_quant_2x2_dc_ssse3, 1
%define t2d r1d
%endif
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
cglobal
x264_dequant_
%
2
x
%
2
_
%
1
,
0
,
3
%macro DEQUANT_START 2
movifnidn
t2d
,
r2m
imul
t0d
,
t2d
,
0x2b
shr
t0d
,
8
; i_qbits = i_qp / 6
lea
t1
,
[
t0
*
3
]
sub
t2d
,
t1d
sub
t2d
,
t1d
; i_mf = i_qp % 6
shl
t2d
,
%
3
+
2
shl
t2d
,
%
1
%ifdef ARCH_X86_64
add
r1
,
t2
; dequant_mf[i_mf]
%else
add
r1
,
r1m
; dequant_mf[i_mf]
mov
r0
,
r0m
; dct
%endif
sub
t0d
,
%
3
sub
t0d
,
%
2
jl
.rshift32
; negative qbits => rightshift
%endmacro
;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
%macro DEQUANT 4
cglobal
x264_dequant_
%
2
x
%
2
_
%
1
,
0
,
3
DEQUANT_START
%
3
+
2
,
%
3
.lshift:
movd
m5
,
t0d
...
...
@@ -339,7 +343,67 @@ INIT_XMM
DEQUANT
ss
e2
,
4
,
4
,
2
DEQUANT
ss
e2
,
8
,
6
,
2
; DEQUANT_DC <suffix>
; Emits the function x264_dequant_4x4dc_<suffix>, which rescales a 4x4 block of
; int16 DC coefficients in place at r0 using a single multiplier: the first
; dword found at r1 after DEQUANT_START has advanced r1 to dequant_mf[i_qp%6].
; The vector width follows mmsize (8 or 16), so the same body serves both
; instantiations that follow this macro.
%macro DEQUANT_DC 1
cglobal
x264_dequant_4x4dc_
%
1
,
0
,
3
; DEQUANT_START (defined earlier in this file) derives the shift count in t0d
; from i_qp, points r1 at dequant_mf[i_mf], and jumps to .rshift32 when the
; shift count is negative.
DEQUANT_START
6
,
6
; ---- non-negative shift: coef *= (mf << qbits) ----
.lshift:
; m6 = first dword at r1 (the multiplier), m5 = shift count
movd
m6
,
[
r1
]
movd
m5
,
t0d
; pre-shift the multiplier once instead of shifting every product
; NOTE(review): only the low 16 bits of the shifted value are used below -
; presumably mf << qbits always fits in a word; confirm against the C version
pslld
m6
,
m5
; broadcast the low word of m6 to every word lane
%if mmsize==16
pshuflw
m6
,
m6
,
0
punpcklqdq
m6
,
m6
%else
pshufw
m6
,
m6
,
0
%endif
; each iteration handles 2*mmsize bytes; 16/mmsize iterations cover 32 bytes
%assign x 0
%rep 16/mmsize
mova
m0
,
[
r0
+
mmsize
*
0
+
x
]
mova
m1
,
[
r0
+
mmsize
*
1
+
x
]
; 16-bit low multiply of every coefficient by the broadcast multiplier
pmullw
m0
,
m6
pmullw
m1
,
m6
; store the results back over the input
mova
[
r0
+
mmsize
*
0
+
x
],
m0
mova
[
r0
+
mmsize
*
1
+
x
],
m1
%assign x x+mmsize*2
%endrep
RET
; ---- negative shift: coef = (coef*mf + round) >> -qbits, in 32-bit lanes ----
.rshift32:
; make the shift count positive and keep it in m5
neg
t0d
movd
m5
,
t0d
; m6 = pw_1, m7 = copy of it
; NOTE(review): pw_1 is defined outside this view - presumably a vector of
; 16-bit ones; confirm
mova
m6
,
[
pw_1
GLOBAL
]
mova
m7
,
m6
; (pw_1 << shift) >> 1: if pw_1 holds word ones this is the rounding term
; 1 << (shift-1) in each word
pslld
m6
,
m5
psrld
m6
,
1
; m4 = multiplier loaded from [r1]
movd
m4
,
[
r1
]
; replicate the multiplier's low word so the interleave below pairs it
; with the rounding term in every dword
%if mmsize==8
punpcklwd
m4
,
m4
%else
pshuflw
m4
,
m4
,
0
%endif
; m4 words become (mf, round, mf, round, ...)
punpcklwd
m4
,
m6
; each iteration handles mmsize bytes; 32/mmsize iterations cover 32 bytes
%assign x 0
%rep 32/mmsize
mova
m0
,
[
r0
+
x
]
mova
m1
,
m0
; interleave coefficients with m7 so each dword holds a (coef, m7 word) pair
punpcklwd
m0
,
m7
punpckhwd
m1
,
m7
; pmaddwd multiplies each pair by (mf, round) and sums them into one dword
pmaddwd
m0
,
m4
pmaddwd
m1
,
m4
; arithmetic right shift of the 32-bit sums by the shift count
psrad
m0
,
m5
psrad
m1
,
m5
; pack back to int16 with signed saturation and store in place
packssdw
m0
,
m1
mova
[
r0
+
x
],
m0
%assign x x+mmsize
%endrep
RET
%endmacro
INIT_MMX
DEQUANT_DC
mmxext
INIT_XMM
DEQUANT_DC
ss
e2
;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
...
...
common/x86/quant.h
View file @
ecb04a3b
...
...
@@ -36,8 +36,10 @@ void x264_quant_4x4_dc_ssse3( int16_t dct[4][4], int mf, int bias );
void
x264_quant_4x4_ssse3
(
int16_t
dct
[
4
][
4
],
uint16_t
mf
[
16
],
uint16_t
bias
[
16
]
);
void
x264_quant_8x8_ssse3
(
int16_t
dct
[
8
][
8
],
uint16_t
mf
[
64
],
uint16_t
bias
[
64
]
);
void
x264_dequant_4x4_mmx
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
x264_dequant_4x4dc_mmxext
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
x264_dequant_8x8_mmx
(
int16_t
dct
[
8
][
8
],
int
dequant_mf
[
6
][
8
][
8
],
int
i_qp
);
void
x264_dequant_4x4_sse2
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
x264_dequant_4x4dc_sse2
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
x264_dequant_8x8_sse2
(
int16_t
dct
[
8
][
8
],
int
dequant_mf
[
6
][
8
][
8
],
int
i_qp
);
void
x264_dequant_4x4_flat16_mmx
(
int16_t
dct
[
4
][
4
],
int
dequant_mf
[
6
][
4
][
4
],
int
i_qp
);
void
x264_dequant_8x8_flat16_mmx
(
int16_t
dct
[
8
][
8
],
int
dequant_mf
[
6
][
8
][
8
],
int
i_qp
);
...
...
encoder/macroblock.c
View file @
ecb04a3b
...
...
@@ -188,7 +188,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
/* output samples to fdec */
h
->
dctf
.
idct4x4dc
(
dct_dc4x4
);
x264_mb_
dequant_4x4_dc
(
dct_dc4x4
,
h
->
dequant4_mf
[
CQM_4IY
],
i_qp
);
/* XXX not inversed */
h
->
quantf
.
dequant_4x4_dc
(
dct_dc4x4
,
h
->
dequant4_mf
[
CQM_4IY
],
i_qp
);
/* XXX not inversed */
/* calculate dct coeffs */
for
(
i
=
0
;
i
<
16
;
i
++
)
...
...
tools/checkasm.c
View file @
ecb04a3b
...
...
@@ -1050,7 +1050,7 @@ static int check_quant( int cpu_ref, int cpu_new )
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT##w() \
call_c( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
call_c
1
( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
memcpy( dct2, dct1, w*w*2 ); \
call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
...
...
@@ -1070,6 +1070,31 @@ static int check_quant( int cpu_ref, int cpu_new )
TEST_DEQUANT
(
quant_4x4
,
dequant_4x4
,
CQM_4IY
,
4
);
TEST_DEQUANT
(
quant_4x4
,
dequant_4x4
,
CQM_4PY
,
4
);
#define TEST_DEQUANT_DC( qname, dqname, block, w ) \
if( qf_a.dqname != qf_ref.dqname ) \
{ \
set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
used_asms[1] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
for( i = 0; i < 16; i++ ) \
dct1[i] = rand(); \
call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*2 ); \
call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
if( memcmp( dct1, dct2, w*w*2 ) ) \
{ \
oks[1] = 0; \
fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
} \
call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
} \
}
TEST_DEQUANT_DC
(
quant_4x4_dc
,
dequant_4x4_dc
,
CQM_4IY
,
4
);
x264_cqm_delete
(
h
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment