Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
2b2f0395
Commit
2b2f0395
authored
Mar 25, 2017
by
Henrik Gramner
Browse files
x86: AVX-512 zigzag_scan_4x4_frame
parent
1878c7f2
Changes
8
Hide whitespace changes
Inline
Side-by-side
common/common.h
View file @
2b2f0395
...
...
@@ -635,11 +635,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
ALIGNED_
32
(
dctcoef
luma16x16_dc
[
3
][
16
]
);
ALIGNED_
64
(
dctcoef
luma16x16_dc
[
3
][
16
]
);
ALIGNED_16
(
dctcoef
chroma_dc
[
2
][
8
]
);
// FIXME share memory?
ALIGNED_32
(
dctcoef
luma8x8
[
12
][
64
]
);
ALIGNED_
32
(
dctcoef
luma4x4
[
16
*
3
][
16
]
);
ALIGNED_
64
(
dctcoef
luma4x4
[
16
*
3
][
16
]
);
}
dct
;
/* MB table and cache for current frame/mb */
...
...
common/dct.c
View file @
2b2f0395
...
...
@@ -986,6 +986,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive
->
scan_8x8
=
x264_zigzag_scan_8x8_frame_avx
;
}
#endif // ARCH_X86_64
if
(
cpu
&
X264_CPU_AVX512
)
{
pf_progressive
->
scan_4x4
=
x264_zigzag_scan_4x4_frame_avx512
;
}
#endif // HAVE_MMX
#else
#if HAVE_MMX
...
...
@@ -1026,6 +1030,10 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive
->
scan_8x8
=
x264_zigzag_scan_8x8_frame_xop
;
pf_interlaced
->
scan_8x8
=
x264_zigzag_scan_8x8_field_xop
;
}
if
(
cpu
&
X264_CPU_AVX512
)
{
pf_progressive
->
scan_4x4
=
x264_zigzag_scan_4x4_frame_avx512
;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if
(
cpu
&
X264_CPU_ALTIVEC
)
...
...
common/x86/dct-a.asm
View file @
2b2f0395
...
...
@@ -30,7 +30,13 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION
_RODATA
32
SECTION
_RODATA
64
%if HIGH_BIT_DEPTH
scan_frame_avx512:
dd
0
,
4
,
1
,
2
,
5
,
8
,
12
,
9
,
6
,
3
,
7
,
10
,
13
,
14
,
11
,
15
%else
scan_frame_avx512:
dw
0
,
4
,
1
,
2
,
5
,
8
,
12
,
9
,
6
,
3
,
7
,
10
,
13
,
14
,
11
,
15
%endif
pw_ppmmmmpp:
dw
1
,
1
,
-
1
,
-
1
,
-
1
,
-
1
,
1
,
1
pb_sub4frame:
db
0
,
1
,
4
,
8
,
5
,
2
,
3
,
6
,
9
,
12
,
13
,
10
,
7
,
11
,
14
,
15
pb_sub4field:
db
0
,
4
,
1
,
8
,
12
,
5
,
9
,
13
,
2
,
6
,
10
,
14
,
3
,
7
,
11
,
15
...
...
@@ -1883,3 +1889,19 @@ cglobal zigzag_interleave_8x8_cavlc, 3,3,6
mov
[
r2
+
8
],
r0w
RET
%endif
; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
INIT_ZMM
avx512
cglobal
zigzag_scan_4x4_frame
,
2
,
2
mova
m0
,
[
scan_frame_avx512
]
vpermd
m0
,
m0
,
[
r1
]
mova
[
r0
],
m0
RET
%else
; !HIGH_BIT_DEPTH
INIT_YMM
avx512
cglobal
zigzag_scan_4x4_frame
,
2
,
2
mova
m0
,
[
scan_frame_avx512
]
vpermw
m0
,
m0
,
[
r1
]
mova
[
r0
],
m0
RET
%endif
; !HIGH_BIT_DEPTH
common/x86/dct.h
View file @
2b2f0395
...
...
@@ -106,11 +106,12 @@ void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void
x264_zigzag_scan_8x8_frame_ssse3
(
int16_t
level
[
64
],
int16_t
dct
[
64
]
);
void
x264_zigzag_scan_8x8_frame_sse2
(
dctcoef
level
[
64
],
dctcoef
dct
[
64
]
);
void
x264_zigzag_scan_8x8_frame_mmx2
(
int16_t
level
[
64
],
int16_t
dct
[
64
]
);
void
x264_zigzag_scan_4x4_frame_xop
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_avx
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_ssse3
(
int16_t
level
[
16
],
int16_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_sse2
(
int32_t
level
[
16
],
int32_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_mmx
(
int16_t
level
[
16
],
int16_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_mmx
(
int16_t
level
[
16
],
int16_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_sse2
(
int32_t
level
[
16
],
int32_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_ssse3
(
int16_t
level
[
16
],
int16_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_avx
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_xop
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_avx512
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_field_sse2
(
int32_t
level
[
16
],
int32_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_field_sse
(
int16_t
level
[
16
],
int16_t
dct
[
16
]
);
void
x264_zigzag_scan_8x8_field_xop
(
int16_t
level
[
64
],
int16_t
dct
[
64
]
);
...
...
encoder/macroblock.c
View file @
2b2f0395
...
...
@@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
pixel
*
p_src
=
h
->
mb
.
pic
.
p_fenc
[
p
];
pixel
*
p_dst
=
h
->
mb
.
pic
.
p_fdec
[
p
];
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
16
],[
16
]
);
ALIGNED_ARRAY_
32
(
dctcoef
,
dct_dc4x4
,[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
16
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct_dc4x4
,[
16
]
);
int
nz
,
block_cbp
=
0
;
int
decimate_score
=
h
->
mb
.
b_dct_decimate
?
0
:
9
;
...
...
@@ -350,7 +350,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
int
i_decimate_score
=
b_decimate
?
0
:
7
;
int
nz_ac
=
0
;
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
8
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
8
],[
16
]
);
if
(
h
->
mb
.
b_lossless
)
{
...
...
@@ -824,7 +824,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else
{
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
16
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
16
],[
16
]
);
for
(
int
p
=
0
;
p
<
plane_count
;
p
++
,
i_qp
=
h
->
mb
.
i_chroma_qp
)
{
int
quant_cat
=
p
?
CQM_4PC
:
CQM_4PY
;
...
...
@@ -965,8 +965,8 @@ void x264_macroblock_encode( x264_t *h )
*****************************************************************************/
static
ALWAYS_INLINE
int
x264_macroblock_probe_skip_internal
(
x264_t
*
h
,
int
b_bidir
,
int
plane_count
,
int
chroma
)
{
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
8
],[
16
]
);
ALIGNED_ARRAY_
1
6
(
dctcoef
,
dctscan
,[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
8
],[
16
]
);
ALIGNED_ARRAY_6
4
(
dctcoef
,
dctscan
,[
16
]
);
ALIGNED_4
(
int16_t
mvp
[
2
]
);
int
i_qp
=
h
->
mb
.
i_qp
;
...
...
@@ -1252,7 +1252,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
pixel
*
p_fenc
=
h
->
mb
.
pic
.
p_fenc
[
p
]
+
8
*
x
+
8
*
y
*
FENC_STRIDE
;
pixel
*
p_fdec
=
h
->
mb
.
pic
.
p_fdec
[
p
]
+
8
*
x
+
8
*
y
*
FDEC_STRIDE
;
int
i_decimate_8x8
=
b_decimate
?
0
:
4
;
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
4
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
4
],[
16
]
);
int
nnz8x8
=
0
;
h
->
dctf
.
sub8x8_dct
(
dct4x4
,
p_fenc
,
p_fdec
);
...
...
@@ -1311,7 +1311,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
i_qp
=
h
->
mb
.
i_chroma_qp
;
for
(
int
ch
=
0
;
ch
<
2
;
ch
++
)
{
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
2
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
2
],[
16
]
);
pixel
*
p_fenc
=
h
->
mb
.
pic
.
p_fenc
[
1
+
ch
]
+
4
*
x
+
(
chroma422
?
8
:
4
)
*
y
*
FENC_STRIDE
;
pixel
*
p_fdec
=
h
->
mb
.
pic
.
p_fdec
[
1
+
ch
]
+
4
*
x
+
(
chroma422
?
8
:
4
)
*
y
*
FDEC_STRIDE
;
...
...
@@ -1376,7 +1376,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
}
else
{
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
16
]
);
h
->
dctf
.
sub4x4_dct
(
dct4x4
,
p_fenc
,
p_fdec
);
nz
=
x264_quant_4x4
(
h
,
dct4x4
,
i_qp
,
ctx_cat_plane
[
DCT_LUMA_4x4
][
p
],
0
,
p
,
i4
);
h
->
mb
.
cache
.
non_zero_count
[
x264_scan8
[
p
*
16
+
i4
]]
=
nz
;
...
...
encoder/macroblock.h
View file @
2b2f0395
...
...
@@ -116,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
int
nz
;
pixel
*
p_src
=
&
h
->
mb
.
pic
.
p_fenc
[
p
][
block_idx_xy_fenc
[
idx
]];
pixel
*
p_dst
=
&
h
->
mb
.
pic
.
p_fdec
[
p
][
block_idx_xy_fdec
[
idx
]];
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4x4
,[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4x4
,[
16
]
);
if
(
b_predict
)
{
...
...
encoder/rdo.c
View file @
2b2f0395
...
...
@@ -635,7 +635,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
int
b_chroma
,
int
dc
,
int
num_coefs
,
int
idx
)
{
ALIGNED_ARRAY_32
(
dctcoef
,
orig_coefs
,
[
64
]
);
ALIGNED_ARRAY_
32
(
dctcoef
,
quant_coefs
,
[
64
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
quant_coefs
,
[
64
]
);
const
uint32_t
*
coef_weight1
=
num_coefs
==
64
?
x264_dct8_weight_tab
:
x264_dct4_weight_tab
;
const
uint32_t
*
coef_weight2
=
num_coefs
==
64
?
x264_dct8_weight2_tab
:
x264_dct4_weight2_tab
;
const
int
b_interlaced
=
MB_INTERLACED
;
...
...
tools/checkasm.c
View file @
2b2f0395
...
...
@@ -839,9 +839,9 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_dct_function_t
dct_asm
;
x264_quant_function_t
qf
;
int
ret
=
0
,
ok
,
used_asm
,
interlace
=
0
;
ALIGNED_ARRAY_
32
(
dctcoef
,
dct1
,
[
16
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct1
,
[
16
],[
16
]
);
ALIGNED_ARRAY_32
(
dctcoef
,
dct2
,
[
16
],[
16
]
);
ALIGNED_ARRAY_
32
(
dctcoef
,
dct4
,
[
16
],[
16
]
);
ALIGNED_ARRAY_
64
(
dctcoef
,
dct4
,
[
16
],[
16
]
);
ALIGNED_ARRAY_32
(
dctcoef
,
dct8
,
[
4
],[
64
]
);
ALIGNED_16
(
dctcoef
dctdc
[
2
][
8
]
);
x264_t
h_buf
;
...
...
@@ -1044,8 +1044,8 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_function_t
zigzag_ref
[
2
];
x264_zigzag_function_t
zigzag_asm
[
2
];
ALIGNED_ARRAY_
1
6
(
dctcoef
,
level1
,[
64
]
);
ALIGNED_ARRAY_
1
6
(
dctcoef
,
level2
,[
64
]
);
ALIGNED_ARRAY_6
4
(
dctcoef
,
level1
,[
64
]
);
ALIGNED_ARRAY_6
4
(
dctcoef
,
level2
,[
64
]
);
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment