Commit 4c48f9e7 authored by Henrik Gramner

x86: AVX-512 pixel_var_8x8, 8x16, and 16x16

Make the SSE2, AVX, and AVX2 versions a bit faster.

Drop the MMX and XOP versions.
parent 1cf7baa4
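
For context: each pixel_var_wxh function computes a block's pixel sum and sum of squared pixels in one pass and returns both packed into a uint64_t, which the caller unpacks to derive the variance. A minimal scalar sketch of that contract (the parametrized helper below is illustrative; x264's C reference generates a fixed-size function per block size):

    #include <stdint.h>

    /* Illustrative scalar model of pixel_var_wxh: pixel sum in the low
     * 32 bits, sum of squares in the high 32 bits. In x264, pixel is
     * uint8_t at 8-bit depth and uint16_t in high bit-depth builds. */
    static uint64_t pixel_var_wxh( const uint8_t *pix, intptr_t stride,
                                   int w, int h )
    {
        uint32_t sum = 0, sqr = 0;
        for( int y = 0; y < h; y++ )
        {
            for( int x = 0; x < w; x++ )
            {
                sum += pix[x];
                sqr += pix[x] * pix[x];
            }
            pix += stride;
        }
        return sum + ((uint64_t)sqr << 32);
    }

The dispatch changes in x264_pixel_init() come first, followed by the assembly and the header declarations.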
@@ -884,9 +884,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT4( hadamard_ac, _mmx2 );
         INIT8( ssd, _mmx2 );
         INIT_ADS( _mmx2 );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
 #if ARCH_X86
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
@@ -1028,8 +1025,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT5( sad_x3, _xop );
         INIT5( sad_x4, _xop );
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->vsad = x264_pixel_vsad_xop;
         pixf->asd8 = x264_pixel_asd8_xop;
 #if ARCH_X86_64
@@ -1048,6 +1043,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
         pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
     }
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx512;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
@@ -1067,9 +1067,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( satd_x4, _mmx2 );
         INIT4( hadamard_ac, _mmx2 );
         INIT_ADS( _mmx2 );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
 #if ARCH_X86
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
@@ -1321,9 +1318,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
         pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
 #if ARCH_X86_64
@@ -1356,6 +1350,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     {
         INIT8( satd, _avx512 );
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx512;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx512;
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx512;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
     }
 #endif //HAVE_MMX
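The assembly rework below centers on the shared VAR_* macros: VAR_CORE accumulates pixel sums as packed words (paddw) and sums of squares as packed dwords (pmaddwd), and the new argument-less VAR_END widens the word sums with one pmaddwd against pw_1, folds the two accumulators together, and returns the packed result. A rough SSE2 intrinsics model of the scheme for an 8-bit 8x8 block, for illustration only (the helper name is made up, and the real VAR_END interleaves the two reductions with SBUTTERFLY instead of finishing them separately):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Hypothetical C model of the VAR_CORE/VAR_END strategy: row sums
     * accumulate as 16-bit words, squares as 32-bit dwords (pmaddwd),
     * and both are reduced to scalars at the end. */
    static uint64_t var_8x8_model( const uint8_t *pix, intptr_t stride )
    {
        __m128i zero = _mm_setzero_si128();
        __m128i sum = zero, sqr = zero;
        for( int y = 0; y < 8; y++ )
        {
            /* Load 8 pixels, zero-extend bytes to words (punpcklbw). */
            __m128i v = _mm_unpacklo_epi8(
                _mm_loadl_epi64( (const __m128i *)pix ), zero );
            sum = _mm_add_epi16( sum, v );                      /* paddw   */
            sqr = _mm_add_epi32( sqr, _mm_madd_epi16( v, v ) ); /* pmaddwd */
            pix += stride;
        }
        /* Widen word sums to dwords: pmaddwd against a vector of ones,
         * the same trick VAR_END uses with pw_1. */
        sum = _mm_madd_epi16( sum, _mm_set1_epi16( 1 ) );
        uint32_t s[4], q[4];
        _mm_storeu_si128( (__m128i *)s, sum );
        _mm_storeu_si128( (__m128i *)q, sqr );
        return (s[0]+s[1]+s[2]+s[3])
             + ((uint64_t)(q[0]+q[1]+q[2]+q[3]) << 32);
    }

On x86-64 the packed result is returned in rax with a single movq; 32-bit builds return it in edx:eax, which is why the non-AVX path below needs a pshuflw to move the second dword into position for the final movd.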
@@ -32,6 +32,8 @@
 %include "x86util.asm"
 SECTION_RODATA 32
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
+                 db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
 hmul_16p:  times 16 db 1
            times 8 db 1, -1
 hmul_8p:   times 8 db 1
@@ -701,25 +703,32 @@ SSD_NV12
 %if HIGH_BIT_DEPTH == 0
 %if %1
     mova      m7, [pw_00ff]
-%elif mmsize < 32
+%elif mmsize == 16
     pxor      m7, m7 ; zero
 %endif
 %endif ; !HIGH_BIT_DEPTH
 %endmacro
-%macro VAR_END 2
-%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
-    HADDUW    m5, m2
-%else
-    HADDW     m5, m2
+%macro VAR_END 0
+    pmaddwd   m5, [pw_1]
+    SBUTTERFLY dq, 5, 6, 0
+    paddd     m5, m6
+%if mmsize == 32
+    vextracti128 xm6, m5, 1
+    paddd    xm5, xm6
 %endif
-    HADDD     m6, m1
+    MOVHL    xm6, xm5
+    paddd    xm5, xm6
 %if ARCH_X86_64
-    punpckldq m5, m6
-    movq     rax, m5
+    movq     rax, xm5
 %else
-    movd     eax, m5
-    movd     edx, m6
+    movd     eax, xm5
+%if cpuflag(avx)
+    pextrd   edx, xm5, 1
+%else
+    pshuflw  xm5, xm5, q1032
+    movd     edx, xm5
+%endif
 %endif
     RET
 %endmacro
@@ -739,61 +748,25 @@ SSD_NV12
     paddd     m6, m4
 %endmacro
-%macro VAR_2ROW 2
-    mov      r2d, %2
-.loop:
-%if HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m1, [r0+mmsize]
-    mova      m3, [r0+%1]
-    mova      m4, [r0+%1+mmsize]
-%else ; !HIGH_BIT_DEPTH
-    mova      m0, [r0]
-    mova      m3, [r0+%1]
-    punpckhbw m1, m0, m7
-    punpcklbw m0, m7
-    punpckhbw m4, m3, m7
-    punpcklbw m3, m7
-%endif ; HIGH_BIT_DEPTH
-%ifidn %1, r1
-    lea       r0, [r0+%1*2]
-%else
-    add       r0, r1
-%endif
-    VAR_CORE
-    dec r2d
-    jg .loop
-%endmacro
 ;-----------------------------------------------------------------------------
 ; int pixel_var_wxh( uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_var_16x16, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW 8*SIZEOF_PIXEL, 16
-    VAR_END 16, 16
-cglobal pixel_var_8x16, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 8
-    VAR_END 8, 16
-cglobal pixel_var_8x8, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 4
-    VAR_END 8, 8
 %if HIGH_BIT_DEPTH
 %macro VAR 0
 cglobal pixel_var_16x16, 2,3,8
     FIX_STRIDES r1
     VAR_START 0
-    VAR_2ROW r1, 8
-    VAR_END 16, 16
+    mov      r2d, 8
+.loop:
+    mova      m0, [r0]
+    mova      m1, [r0+mmsize]
+    mova      m3, [r0+r1]
+    mova      m4, [r0+r1+mmsize]
+    lea       r0, [r0+r1*2]
+    VAR_CORE
+    dec r2d
+    jg .loop
+    VAR_END
 cglobal pixel_var_8x8, 2,3,8
     lea       r2, [r1*3]
@@ -809,18 +782,16 @@ cglobal pixel_var_8x8, 2,3,8
     mova      m3, [r0+r1*4]
     mova      m4, [r0+r2*2]
     VAR_CORE
-    VAR_END 8, 8
+    VAR_END
 %endmacro ; VAR
 INIT_XMM sse2
 VAR
 INIT_XMM avx
 VAR
-INIT_XMM xop
-VAR
-%endif ; HIGH_BIT_DEPTH
-%if HIGH_BIT_DEPTH == 0
+%else ; HIGH_BIT_DEPTH == 0
 %macro VAR 0
 cglobal pixel_var_16x16, 2,3,8
     VAR_START 1
@@ -833,7 +804,7 @@ cglobal pixel_var_16x16, 2,3,8
     VAR_CORE
     dec r2d
     jg .loop
-    VAR_END 16, 16
+    VAR_END
 cglobal pixel_var_8x8, 2,4,8
     VAR_START 1
@@ -849,7 +820,7 @@ cglobal pixel_var_8x8, 2,4,8
     VAR_CORE
     dec r2d
     jg .loop
-    VAR_END 8, 8
+    VAR_END
 cglobal pixel_var_8x16, 2,4,8
     VAR_START 1
@@ -865,15 +836,13 @@ cglobal pixel_var_8x16, 2,4,8
     VAR_CORE
     dec r2d
     jg .loop
-    VAR_END 8, 16
+    VAR_END
 %endmacro ; VAR
 INIT_XMM sse2
 VAR
 INIT_XMM avx
 VAR
-INIT_XMM xop
-VAR
 %endif ; !HIGH_BIT_DEPTH
 INIT_YMM avx2
@@ -898,21 +867,120 @@ cglobal pixel_var_16x16, 2,4,7
     VAR_CORE
     dec r2d
     jg .loop
-    vextracti128 xm0, m5, 1
-    vextracti128 xm1, m6, 1
-    paddw    xm5, xm0
-    paddd    xm6, xm1
-    HADDW    xm5, xm2
-    HADDD    xm6, xm1
+    VAR_END
+%macro VAR_AVX512_CORE 1 ; accum
+%if %1
+    paddw     m0, m2
+    pmaddwd   m2, m2
+    paddw     m0, m3
+    pmaddwd   m3, m3
+    paddd     m1, m2
+    paddd     m1, m3
+%else
+    paddw     m0, m2, m3
+    pmaddwd   m2, m2
+    pmaddwd   m3, m3
+    paddd     m1, m2, m3
+%endif
+%endmacro
+%macro VAR_AVX512_CORE_16x16 1 ; accum
+%if HIGH_BIT_DEPTH
+    mova            ym2, [r0]
+    vinserti64x4     m2, [r0+r1], 1
+    mova            ym3, [r0+2*r1]
+    vinserti64x4     m3, [r0+r3], 1
+%else
+    vbroadcasti64x2 ym2, [r0]
+    vbroadcasti64x2  m2 {k1}, [r0+r1]
+    vbroadcasti64x2 ym3, [r0+2*r1]
+    vbroadcasti64x2  m3 {k1}, [r0+r3]
+    pshufb           m2, m4
+    pshufb           m3, m4
+%endif
+    VAR_AVX512_CORE %1
+%endmacro
+%macro VAR_AVX512_CORE_8x8 1 ; accum
+%if HIGH_BIT_DEPTH
+    mova            xm2, [r0]
+    mova            xm3, [r0+r1]
+%else
+    movq            xm2, [r0]
+    movq            xm3, [r0+r1]
+%endif
+    vinserti128     ym2, [r0+2*r1], 1
+    vinserti128     ym3, [r0+r2], 1
+    lea              r0, [r0+4*r1]
+    vinserti32x4     m2, [r0], 2
+    vinserti32x4     m3, [r0+r1], 2
+    vinserti32x4     m2, [r0+2*r1], 3
+    vinserti32x4     m3, [r0+r2], 3
+%if HIGH_BIT_DEPTH == 0
+    punpcklbw        m2, m4
+    punpcklbw        m3, m4
+%endif
+    VAR_AVX512_CORE %1
+%endmacro
+INIT_ZMM avx512
+cglobal pixel_var_16x16, 2,4
+    FIX_STRIDES      r1
+    mov             r2d, 0xf0
+    lea              r3, [3*r1]
+%if HIGH_BIT_DEPTH == 0
+    vbroadcasti64x4  m4, [var_shuf_avx512]
+    kmovb            k1, r2d
+%endif
+    VAR_AVX512_CORE_16x16 0
+.loop:
+    lea              r0, [r0+4*r1]
+    VAR_AVX512_CORE_16x16 1
+    sub             r2d, 0x50
+    jg .loop
+%if ARCH_X86_64 == 0
+    pop             r3d
+%assign regs_used 3
+%endif
+var_avx512_end:
+    vbroadcasti32x4  m2, [pw_1]
+    pmaddwd          m0, m2
+    SBUTTERFLY       dq, 0, 1, 2
+    paddd            m0, m1
+    vextracti32x8   ym1, m0, 1
+    paddd           ym0, ym1
+    vextracti128    xm1, ym0, 1
+    paddd          xmm0, xm0, xm1
+    punpckhqdq     xmm1, xmm0, xmm0
+    paddd          xmm0, xmm1
 %if ARCH_X86_64
-    punpckldq xm5, xm6
-    movq     rax, xm5
+    movq            rax, xmm0
 %else
-    movd     eax, xm5
-    movd     edx, xm6
+    movd            eax, xmm0
+    pextrd          edx, xmm0, 1
 %endif
     RET
+%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
+cglobal pixel_var_8x8, 2,3
+    lea              r2, [3*r1]
+    pxor            xm4, xm4
+    VAR_AVX512_CORE_8x8 0
+    jmp var_avx512_end
+%endif
+cglobal pixel_var_8x16, 2,3
+    FIX_STRIDES      r1
+    lea              r2, [3*r1]
+%if HIGH_BIT_DEPTH == 0
+    pxor            xm4, xm4
+%endif
+    VAR_AVX512_CORE_8x8 0
+    lea              r0, [r0+4*r1]
+    VAR_AVX512_CORE_8x8 1
+    jmp var_avx512_end
 %macro VAR2_END 3
     HADDW     %2, xm1
     movd     r1d, %2
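In the 8-bit AVX-512 16x16 kernel above, each zmm register carries two rows: vbroadcasti64x2 fills all four 128-bit lanes with one row, a masked broadcast with k1 = 0xf0 (the upper four 64-bit elements) overwrites the top half with the next row, and a single in-lane pshufb against var_shuf_avx512 zero-extends the 32 bytes to 32 words. The 0xf0 constant pulls double duty: the same register feeds kmovb and then serves as the loop counter, with three sub 0x50 steps covering the remaining twelve rows. A rough intrinsics model of the row packing (the helper name is hypothetical):

    #include <immintrin.h>
    #include <stdint.h>

    /* Model of the masked-broadcast packing: row a lands in the low two
     * 128-bit lanes, row b in the high two, then an in-lane pshufb
     * zero-extends bytes to words. shuf is the 32-byte var_shuf_avx512
     * pattern broadcast to both zmm halves (vbroadcasti64x4): byte
     * indices interleaved with -1, so pshufb writes zeros into the odd
     * bytes. */
    static __m512i pack_two_rows( const uint8_t *a, const uint8_t *b,
                                  __m512i shuf )
    {
        __m512i v = _mm512_broadcast_i64x2(
            _mm_loadu_si128( (const __m128i *)a ) );
        v = _mm512_mask_broadcast_i64x2( v, 0xf0,
            _mm_loadu_si128( (const __m128i *)b ) );
        return _mm512_shuffle_epi8( v, shuf );
    }

The 8x8 and 8x16 versions skip the shuffle (a punpcklbw against a zero register suffices once four rows are gathered per register) and tail-jump into the shared var_avx512_end reduction. The header declarations follow.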
@@ -94,11 +94,10 @@ DECL_X4( sad, cache64_mmx2 );
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride ))
-DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
+DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))