Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
49fb50a6
Commit
49fb50a6
authored
May 01, 2017
by
Henrik Gramner
Browse files
x86: AVX-512 pixel_var2_8x8 and 8x16
parent
92c074e2
Changes
5
Hide whitespace changes
Inline
Side-by-side
common/common.h
View file @
49fb50a6
...
...
@@ -778,8 +778,8 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_
32
(
pixel
fenc_buf
[
48
*
FENC_STRIDE
]
);
ALIGNED_
32
(
pixel
fdec_buf
[
52
*
FDEC_STRIDE
]
);
ALIGNED_
64
(
pixel
fenc_buf
[
48
*
FENC_STRIDE
]
);
ALIGNED_
64
(
pixel
fdec_buf
[
52
*
FDEC_STRIDE
]
);
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16
(
pixel
i4x4_fdec_buf
[
16
*
16
]
);
...
...
common/macroblock.c
View file @
49fb50a6
...
...
@@ -532,16 +532,17 @@ void x264_macroblock_thread_init( x264_t *h )
h
->
mb
.
pic
.
p_fenc
[
0
]
=
h
->
mb
.
pic
.
fenc_buf
;
h
->
mb
.
pic
.
p_fdec
[
0
]
=
h
->
mb
.
pic
.
fdec_buf
+
2
*
FDEC_STRIDE
;
h
->
mb
.
pic
.
p_fenc
[
1
]
=
h
->
mb
.
pic
.
fenc_buf
+
16
*
FENC_STRIDE
;
h
->
mb
.
pic
.
p_fdec
[
1
]
=
h
->
mb
.
pic
.
fdec_buf
+
19
*
FDEC_STRIDE
;
if
(
CHROMA444
)
{
h
->
mb
.
pic
.
p_fenc
[
2
]
=
h
->
mb
.
pic
.
fenc_buf
+
32
*
FENC_STRIDE
;
h
->
mb
.
pic
.
p_fdec
[
1
]
=
h
->
mb
.
pic
.
fdec_buf
+
19
*
FDEC_STRIDE
;
h
->
mb
.
pic
.
p_fdec
[
2
]
=
h
->
mb
.
pic
.
fdec_buf
+
36
*
FDEC_STRIDE
;
}
else
{
h
->
mb
.
pic
.
p_fenc
[
2
]
=
h
->
mb
.
pic
.
fenc_buf
+
16
*
FENC_STRIDE
+
8
;
h
->
mb
.
pic
.
p_fdec
[
2
]
=
h
->
mb
.
pic
.
fdec_buf
+
19
*
FDEC_STRIDE
+
16
;
h
->
mb
.
pic
.
p_fdec
[
1
]
=
h
->
mb
.
pic
.
fdec_buf
+
20
*
FDEC_STRIDE
;
h
->
mb
.
pic
.
p_fdec
[
2
]
=
h
->
mb
.
pic
.
fdec_buf
+
20
*
FDEC_STRIDE
+
16
;
}
}
...
...
common/pixel.c
View file @
49fb50a6
...
...
@@ -1049,6 +1049,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
pixf
->
var
[
PIXEL_8x16
]
=
x264_pixel_var_8x16_avx512
;
pixf
->
var
[
PIXEL_16x16
]
=
x264_pixel_var_16x16_avx512
;
pixf
->
var2
[
PIXEL_8x8
]
=
x264_pixel_var2_8x8_avx512
;
pixf
->
var2
[
PIXEL_8x16
]
=
x264_pixel_var2_8x16_avx512
;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
...
...
@@ -1351,6 +1353,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf
->
var
[
PIXEL_8x8
]
=
x264_pixel_var_8x8_avx512
;
pixf
->
var
[
PIXEL_8x16
]
=
x264_pixel_var_8x16_avx512
;
pixf
->
var
[
PIXEL_16x16
]
=
x264_pixel_var_16x16_avx512
;
pixf
->
var2
[
PIXEL_8x8
]
=
x264_pixel_var2_8x8_avx512
;
pixf
->
var2
[
PIXEL_8x16
]
=
x264_pixel_var2_8x16_avx512
;
}
#endif //HAVE_MMX
...
...
common/x86/pixel-a.asm
View file @
49fb50a6
...
...
@@ -1128,10 +1128,17 @@ VAR2_8x8_SSSE3 16, 7
%macro VAR2_AVX2_LOAD 3
; offset_reg, row1_offset, row2_offset
%if HIGH_BIT_DEPTH
%if mmsize == 64
mova
m2
,
[
r1
+
2
*%
1
+%
2
*
FDEC_STRIDEB
]
vshufi32x4
m2
,
[
r1
+
2
*%
1
+%
2
*
FDEC_STRIDEB
+
64
],
q2020
mova
m3
,
[
r1
+
2
*%
1
+%
3
*
FDEC_STRIDEB
]
vshufi32x4
m3
,
[
r1
+
2
*%
1
+%
3
*
FDEC_STRIDEB
+
64
],
q2020
%else
mova
xm2
,
[
r1
+
2
*%
1
+%
2
*
FDEC_STRIDEB
]
vinserti128
m2
,
[
r1
+
2
*%
1
+%
2
*
FDEC_STRIDEB
+
32
],
1
mova
xm3
,
[
r1
+
2
*%
1
+%
3
*
FDEC_STRIDEB
]
vinserti128
m3
,
[
r1
+
2
*%
1
+%
3
*
FDEC_STRIDEB
+
32
],
1
%endif
psubw
m2
,
[
r0
+
1
*%
1
+%
2
*
FENC_STRIDEB
]
psubw
m3
,
[
r0
+
1
*%
1
+%
3
*
FENC_STRIDEB
]
%else
...
...
@@ -1174,6 +1181,44 @@ INIT_YMM avx2
VAR2_8x8_AVX2
8
,
6
VAR2_8x8_AVX2
16
,
7
%macro VAR2_AVX512_END 1
; shift
vbroadcasti32x4
m2
,
[
pw_1
]
pmaddwd
m0
,
m2
SBUTTERFLY
qdq
,
0
,
1
,
2
paddd
m0
,
m1
vextracti32x8
ym1
,
m0
,
1
paddd
ym0
,
ym1
psrlq
ym1
,
ym0
,
32
paddd
ym0
,
ym1
vpmovqd
xmm0
,
ym0
; sum_u, sqr_u, sum_v, sqr_v
VAR2_END
xmm0
,
xmm1
,
%
1
%endmacro
INIT_ZMM
avx512
cglobal
pixel_var2_8x8
,
2
,
3
%if HIGH_BIT_DEPTH == 0
pxor
xm6
,
xm6
%endif
VAR2_AVX2_LOAD
0
,
0
,
2
VAR2_CORE
m2
,
m3
,
0
VAR2_AVX2_LOAD
0
,
4
,
6
VAR2_CORE
m2
,
m3
,
1
VAR2_AVX512_END
6
cglobal
pixel_var2_8x16
,
2
,
3
%if HIGH_BIT_DEPTH == 0
pxor
xm6
,
xm6
%endif
mov
t0d
,
10
*
FENC_STRIDEB
VAR2_AVX2_LOAD
0
,
14
,
12
VAR2_CORE
m2
,
m3
,
0
.loop:
VAR2_AVX2_LOAD
t0
,
0
,
-
2
VAR2_CORE
m2
,
m3
,
1
sub
t0d
,
4
*
FENC_STRIDEB
jg
.loop
VAR2_AVX512_END
7
;=============================================================================
; SATD
;=============================================================================
...
...
common/x86/pixel.h
View file @
49fb50a6
...
...
@@ -169,9 +169,11 @@ float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
int
x264_pixel_var2_8x8_sse2
(
pixel
*
fenc
,
pixel
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x8_ssse3
(
uint8_t
*
fenc
,
uint8_t
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x8_avx2
(
pixel
*
fenc
,
pixel
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x8_avx512
(
pixel
*
fenc
,
pixel
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x16_sse2
(
pixel
*
fenc
,
pixel
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x16_ssse3
(
uint8_t
*
fenc
,
uint8_t
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x16_avx2
(
pixel
*
fenc
,
pixel
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_var2_8x16_avx512
(
pixel
*
fenc
,
pixel
*
fdec
,
int
ssd
[
2
]
);
int
x264_pixel_vsad_mmx2
(
pixel
*
src
,
intptr_t
stride
,
int
height
);
int
x264_pixel_vsad_sse2
(
pixel
*
src
,
intptr_t
stride
,
int
height
);
int
x264_pixel_vsad_ssse3
(
pixel
*
src
,
intptr_t
stride
,
int
height
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment