VideoLAN / x264 / Commits

Commit 993eb207
authored May 07, 2017 by Henrik Gramner
x86: AVX-512 pixel_sad
Covers all variants: 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16.
parent 2463174c

Changes 5
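For reference, each pixel_sad_WxH function returns the sum of absolute differences between a WxH block of the frame being encoded (fenc) and a candidate block in a reference frame. A minimal scalar sketch in C, assuming 8-bit pixels; the helper below is illustrative only (x264's actual C reference implementations live in common/pixel.c):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar SAD over a w x h block; the AVX-512 kernels in this commit
     * compute the same value with psadbw on 32/64-byte vectors. */
    static int sad_wxh( const uint8_t *pix1, intptr_t i_stride1,
                        const uint8_t *pix2, intptr_t i_stride2,
                        int w, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++, pix1 += i_stride1, pix2 += i_stride2 )
            for( int x = 0; x < w; x++ )
                sum += abs( pix1[x] - pix2[x] );
        return sum;
    }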
common/pixel.c
...
@@ -1348,6 +1348,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_AVX512 )
     {
+        INIT8( sad, _avx512 );
+        INIT8_NAME( sad_aligned, sad, _avx512 );
         INIT8( satd, _avx512 );
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512;
...
common/x86/pixel.h
...
@@ -52,6 +52,7 @@ DECL_X1( sad, sse2_aligned )
 DECL_X1( sad, ssse3 )
 DECL_X1( sad, ssse3_aligned )
 DECL_X1( sad, avx2 )
+DECL_X1( sad, avx512 )
 DECL_X4( sad, mmx2 )
 DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
...
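The added DECL_X1( sad, avx512 ) line declares the assembly entry points for all eight block sizes. As a rough sketch of what it expands to, assuming the x264_pixel_<name>_<WxH>_<suffix> naming seen in pixel.c above and the (pixel *, intptr_t, pixel *, intptr_t) signature used by the checkasm harness below (the exact DECL_PIXELS macro sits earlier in this header):

    #include <stdint.h>

    typedef uint8_t pixel; /* 8-bit-depth build */

    /* Assumed expansion, one prototype per block size: */
    int x264_pixel_sad_16x16_avx512( pixel *, intptr_t, pixel *, intptr_t );
    int x264_pixel_sad_16x8_avx512 ( pixel *, intptr_t, pixel *, intptr_t );
    int x264_pixel_sad_8x16_avx512 ( pixel *, intptr_t, pixel *, intptr_t );
    /* ...likewise for 8x8, 8x4, 4x16, 4x8 and 4x4 */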
common/x86/sad-a.asm
...
@@ -122,6 +122,9 @@ SAD 4, 4
 ;-----------------------------------------------------------------------------
 %macro SAD_W16 1 ; h
 cglobal pixel_sad_16x%1, 4,4
+%ifidn cpuname, sse2
+.skip_prologue:
+%endif
 %assign %%i 0
 %if ARCH_X86_64
     lea  r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
...
@@ -201,6 +204,132 @@ cglobal pixel_sad_8x16_sse2, 4,4
     SAD_INC_4x8P_SSE 1
     SAD_END_SSE2
 
+%macro SAD_W48_AVX512 3 ; w, h, d/q
+cglobal pixel_sad_%1x%2, 4,4
+    kxnorb        k1, k1, k1
+    kaddb         k1, k1, k1
+%assign %%i 0
+%if ARCH_X86_64 && %2 != 4
+    lea           r6, [3*r1]
+    lea           r5, [3*r3]
+%rep %2/4
+    mov%3         m1, [r0]
+    vpbroadcast%3 m1 {k1}, [r0+r1]
+    mov%3         m3, [r2]
+    vpbroadcast%3 m3 {k1}, [r2+r3]
+    mov%3         m2, [r0+2*r1]
+    vpbroadcast%3 m2 {k1}, [r0+r6]
+    mov%3         m4, [r2+2*r3]
+    vpbroadcast%3 m4 {k1}, [r2+r5]
+%if %%i != %2/4-1
+    lea           r0, [r0+4*r1]
+    lea           r2, [r2+4*r3]
+%endif
+    psadbw        m1, m3
+    psadbw        m2, m4
+    ACCUM  paddd, 0, 1, %%i
+    paddd         m0, m2
+%assign %%i %%i+1
+%endrep
+%else
+%rep %2/2
+    mov%3         m1, [r0]
+    vpbroadcast%3 m1 {k1}, [r0+r1]
+    mov%3         m2, [r2]
+    vpbroadcast%3 m2 {k1}, [r2+r3]
+%if %%i != %2/2-1
+    lea           r0, [r0+2*r1]
+    lea           r2, [r2+2*r3]
+%endif
+    psadbw        m1, m2
+    ACCUM  paddd, 0, 1, %%i
+%assign %%i %%i+1
+%endrep
+%endif
+%if %1 == 8
+    punpckhqdq    m1, m0, m0
+    paddd         m0, m1
+%endif
+    movd         eax, m0
+    RET
+%endmacro
+
+INIT_XMM avx512
+SAD_W48_AVX512 4,  4, d
+SAD_W48_AVX512 4,  8, d
+SAD_W48_AVX512 4, 16, d
+SAD_W48_AVX512 8,  4, q
+SAD_W48_AVX512 8,  8, q
+SAD_W48_AVX512 8, 16, q
+
+%macro SAD_W16_AVX512_START 1 ; h
+    cmp  r1d, FENC_STRIDE                  ; optimized for the most common fenc case, which
+    jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
+    lea   r1, [3*r3]
+%endmacro
+
+%macro SAD_W16_AVX512_END 0
+    paddd          m0, m1
+    paddd          m0, m2
+    paddd          m0, m3
+%if mmsize == 64
+    vextracti32x8 ym1, m0, 1
+    paddd         ym0, ym1
+%endif
+    vextracti128  xm1, ym0, 1
+    paddd        xmm0, xm0, xm1
+    punpckhqdq   xmm1, xmm0, xmm0
+    paddd        xmm0, xmm1
+    movd          eax, xmm0
+    RET
+%endmacro
+
+INIT_YMM avx512
+cglobal pixel_sad_16x8, 4,4
+    SAD_W16_AVX512_START 8
+    movu         xm0, [r2]
+    vinserti128   m0, [r2+r3], 1
+    psadbw        m0, [r0+0*32]
+    movu         xm1, [r2+2*r3]
+    vinserti128   m1, [r2+r1], 1
+    lea           r2, [r2+4*r3]
+    psadbw        m1, [r0+1*32]
+    movu         xm2, [r2]
+    vinserti128   m2, [r2+r3], 1
+    psadbw        m2, [r0+2*32]
+    movu         xm3, [r2+2*r3]
+    vinserti128   m3, [r2+r1], 1
+    psadbw        m3, [r0+3*32]
+    SAD_W16_AVX512_END
+
+INIT_ZMM avx512
+cglobal pixel_sad_16x16, 4,4
+    SAD_W16_AVX512_START 16
+    movu         xm0, [r2]
+    vinserti128  ym0, [r2+r3], 1
+    movu         xm1, [r2+4*r3]
+    vinserti32x4  m0, [r2+2*r3], 2
+    vinserti32x4  m1, [r2+2*r1], 2
+    vinserti32x4  m0, [r2+r1], 3
+    lea           r2, [r2+4*r3]
+    vinserti32x4  m1, [r2+r3], 1
+    psadbw        m0, [r0+0*64]
+    vinserti32x4  m1, [r2+r1], 3
+    lea           r2, [r2+4*r3]
+    psadbw        m1, [r0+1*64]
+    movu         xm2, [r2]
+    vinserti128  ym2, [r2+r3], 1
+    movu         xm3, [r2+4*r3]
+    vinserti32x4  m2, [r2+2*r3], 2
+    vinserti32x4  m3, [r2+2*r1], 2
+    vinserti32x4  m2, [r2+r1], 3
+    lea           r2, [r2+4*r3]
+    vinserti32x4  m3, [r2+r3], 1
+    psadbw        m2, [r0+2*64]
+    vinserti32x4  m3, [r2+r1], 3
+    psadbw        m3, [r0+3*64]
+    SAD_W16_AVX512_END
+
 ;-----------------------------------------------------------------------------
 ; void pixel_vsad( pixel *src, intptr_t stride );
 ;-----------------------------------------------------------------------------
...
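A note on the dispatch in SAD_W16_AVX512_START: the 16-wide AVX-512 kernels handle only the common case where the fenc stride equals FENC_STRIDE, i.e. the rows of the cached encode block sit contiguously in memory, so the whole 16xh fenc block can be read from r0 as a few 32- or 64-byte vectors. Any other stride tail-jumps into the existing SSE2 routine at its new .skip_prologue label, which is safe because both functions have already loaded the same four arguments. A C analogue of that control flow, as a sketch with illustrative names and a scalar stand-in for both bodies:

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 16

    /* Scalar stand-in for the generic (SSE2) path; illustrative only. */
    static int sad_16xh_generic( const uint8_t *fenc, intptr_t i_fenc,
                                 const uint8_t *ref, intptr_t i_ref, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++, fenc += i_fenc, ref += i_ref )
            for( int x = 0; x < 16; x++ )
                sum += abs( fenc[x] - ref[x] );
        return sum;
    }

    /* C analogue of SAD_W16_AVX512_START: take the wide-vector path only
     * when the fenc rows are contiguous, else defer to the SSE2 routine. */
    static int sad_16xh_avx512( const uint8_t *fenc, intptr_t i_fenc,
                                const uint8_t *ref, intptr_t i_ref, int h )
    {
        if( i_fenc != FENC_STRIDE )        /* cmp r1d, FENC_STRIDE / jne */
            return sad_16xh_generic( fenc, i_fenc, ref, i_ref, h );
        /* Fast path: the 16xh fenc block occupies h*16 contiguous bytes,
         * which the asm loads as a few 32/64-byte vectors; scalar here. */
        return sad_16xh_generic( fenc, FENC_STRIDE, ref, i_ref, h );
    }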
encoder/slicetype.c
...
@@ -267,7 +267,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
     int i_lines = fenc->i_lines[p];
     int i_width = fenc->i_width[p];
     pixel *src = fenc->plane[p];
-    ALIGNED_ARRAY_16( pixel, buf, [16*16] );
+    ALIGNED_ARRAY_64( pixel, buf, [16*16] );
     int pixoff = 0;
     if( w )
     {
...
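The bump from ALIGNED_ARRAY_16 to ALIGNED_ARRAY_64 is presumably because buf is fed to the SAD kernels as the fenc argument with stride FENC_STRIDE, and the AVX-512 fast path reads that buffer in whole 64-byte chunks, which benefit from 64-byte alignment. A portable C11 analogue of the stricter stack alignment (x264 uses its own macros, so this is a sketch, not the actual expansion):

    #include <stdalign.h>
    #include <stdint.h>

    typedef uint8_t pixel; /* 8-bit-depth build */

    void weight_cost_sketch( void )
    {
        /* 64-byte alignment keeps 64-byte vector loads within one
         * cache line; the buffer was only 16-byte aligned before. */
        alignas(64) pixel buf[16*16];
        (void)buf;
    }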
tools/checkasm.c
...
@@ -370,8 +370,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int j = 0; j < 64; j++ ) \
         { \
-            res_c   = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
-            res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
+            intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; \
+            res_c   = call_c( pixel_c.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
+            res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
             if( res_c != res_asm ) \
             { \
                 ok = 0; \
...