Commit 95dc64c4 authored by Henrik Gramner

x86: AVX-512 memzero_aligned

Reorder some elements in the x264_t.mb.pic struct to reduce the amount
of padding required.

Also drop the MMX implementation in favor of SSE.
parent c0cd7650
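The padding reduction mentioned in the commit message comes from member ordering: a member that must be 64-byte aligned forces the compiler to insert padding whenever it follows smaller fields. A minimal, self-contained C illustration of the effect, using hypothetical members rather than the actual x264_t.mb.pic layout:

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical layouts for illustration only (not the real x264_t.mb.pic). */
struct padded
{
    int cbp;                              /* 4 bytes, then ~60 bytes of padding */
    alignas(64) uint32_t satd_cache[32];  /* 128 bytes, must start at a 64-byte boundary */
    int other;                            /* 4 bytes plus tail padding */
};                                        /* typically 256 bytes */

struct reordered
{
    alignas(64) uint32_t satd_cache[32];  /* 128 bytes at offset 0 */
    int cbp, other;                       /* 8 bytes plus tail padding */
};                                        /* typically 192 bytes */

int main( void )
{
    printf( "padded: %zu  reordered: %zu\n",
            sizeof(struct padded), sizeof(struct reordered) );
    return 0;
}

Ordering members from most to least strictly aligned, as the struct hunk below does, removes the interior hole in front of the strictly aligned buffers.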
@@ -788,16 +788,17 @@ struct x264_t
ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
/* Psy trellis DCT data */
ALIGNED_16( dctcoef fenc_dct8[4][64] );
ALIGNED_16( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
ALIGNED_32( uint64_t fenc_hadamard_cache[9] );
ALIGNED_32( uint32_t fenc_satd_cache[32] );
ALIGNED_64( uint32_t fenc_satd_cache[32] );
ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
int i4x4_cbp;
int i8x8_cbp;
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
......
@@ -1507,12 +1507,32 @@ cglobal memcpy_aligned, 3,3
RET
%endmacro
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 0
cglobal memzero_aligned, 2,2
xorps m0, m0
.loop:
%assign %%i mmsize
%rep 128 / mmsize
movaps [r0 + r1 - %%i], m0
%assign %%i %%i+mmsize
%endrep
sub r1d, 128
jg .loop
RET
%endmacro
INIT_XMM sse
MEMCPY
MEMZERO
INIT_YMM avx
MEMCPY
MEMZERO
INIT_ZMM avx512
MEMZERO
cglobal memcpy_aligned, 3,4
dec r2d ; offset of the last byte
rorx r3d, r2d, 2
@@ -1533,36 +1553,6 @@ cglobal memcpy_aligned, 3,4
.ret:
RET
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
cglobal memzero_aligned, 2,2
add r0, r1
neg r1
%if mmsize == 8
pxor m0, m0
%else
xorps m0, m0
%endif
.loop:
%assign i 0
%rep %1
mova [r0 + r1 + i], m0
%assign i i+mmsize
%endrep
add r1, mmsize*%1
jl .loop
RET
%endmacro
INIT_MMX mmx
MEMZERO 8
INIT_XMM sse
MEMZERO 8
INIT_YMM avx
MEMZERO 4
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
......
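For reference, the new MEMZERO loop in the hunk above zeroes 128 bytes per iteration, working backwards from dst + n, and it stores in whole 128-byte blocks, so callers are expected to pass a positive multiple of 128 (the only sizes the checkasm test below exercises). A rough C sketch of that behaviour under those assumptions; this is an illustration, not code that ships in x264:

#include <stddef.h>
#include <string.h>

/* Sketch of the new loop's behaviour: clear the last 128 bytes, then the 128
 * bytes before those, until the count reaches zero. Assumes dst is suitably
 * aligned and n is a positive multiple of 128. */
static void memzero_aligned_ref( void *dst, size_t n )
{
    unsigned char *p = dst;
    do
    {
        n -= 128;
        memset( p + n, 0, 128 );
    } while( n );
}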
@@ -146,9 +146,9 @@ void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intp
void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_memzero_aligned_avx( void *dst, size_t n );
void x264_memzero_aligned_sse ( void *dst, size_t n );
void x264_memzero_aligned_avx ( void *dst, size_t n );
void x264_memzero_aligned_avx512( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@@ -559,7 +559,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
@@ -871,5 +870,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_AVX512) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_avx512;
pf->memzero_aligned = x264_memzero_aligned_avx512;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
}
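x264_mc_init_mmx assigns function pointers progressively: each CPU-flag block overrides the previous assignment and an early return skips everything the host CPU lacks, which is why this diff only needs to drop the MMX assignment and add an AVX-512 one. A condensed sketch of the resulting dispatch for memzero_aligned, assuming the existing SSE and AVX assignments sit under their respective flag checks (not shown in this diff) and that x264's internal headers are included:

/* Condensed dispatch sketch (not the full x264_mc_init_mmx): each block
 * overrides the previous pointer, so the best supported ISA wins. */
static void mc_init_memzero_sketch( int cpu, x264_mc_functions_t *pf )
{
    if( !(cpu&X264_CPU_SSE) )
        return;
    pf->memzero_aligned = x264_memzero_aligned_sse;

    if( !(cpu&X264_CPU_AVX) )
        return;
    pf->memzero_aligned = x264_memzero_aligned_avx;

    if( !(cpu&X264_CPU_AVX512) )
        return;
    pf->memzero_aligned = x264_memzero_aligned_avx512;
}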
@@ -695,8 +695,12 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
if( !h->mb.i_psy_rd )
return;
/* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
h->mb.pic.fenc_hadamard_cache[8] = 0;
if( b_satd )
h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
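The removed comment explains the old approach: memzero_aligned wrote past the end of the 72-byte fenc_hadamard_cache, which was harmless only because fenc_satd_cache sat right behind it. After the struct reorder that neighbour is gone, so the new code clears the array with four 16-byte stores plus one scalar store, staying inside its bounds. A rough C equivalent of the pattern, assuming only that M128/M128_ZERO amount to a 16-byte store of zeros (illustrative helper, not x264's actual macros):

#include <stdint.h>
#include <string.h>

/* Illustration of the new clearing pattern:
 * 9 * sizeof(uint64_t) = 72 bytes = four 16-byte stores + one 8-byte store. */
static void clear_hadamard_cache( uint64_t cache[9] )
{
    static const uint64_t zero[2] = {0};
    memcpy( &cache[0], zero, 16 );
    memcpy( &cache[2], zero, 16 );
    memcpy( &cache[4], zero, 16 );
    memcpy( &cache[6], zero, 16 );
    cache[8] = 0;
}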
......
@@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
uint64_t bcostrd = COST_MAX64;
uint16_t amvd;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] );
ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
......
@@ -1878,10 +1878,10 @@ static int check_mc( int cpu_ref, int cpu_new )
ok = 1; used_asm = 1;
for( size_t size = 128; size < 1024; size += 128 )
{
memset( buf4, 0xAA, size + 1 );
memset( buf4-1, 0xAA, size + 2 );
call_c( mc_c.memzero_aligned, buf3, size );
call_a( mc_a.memzero_aligned, buf4, size );
if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
{
ok = 0;
fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
......
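The checkasm change poisons one byte on each side of the destination (memset( buf4-1, 0xAA, size+2 )) and now also verifies buf4[-1]: since the new implementation stores backwards from dst + n, an out-of-bounds write would tend to land before the buffer rather than after it. A minimal sketch of that guard-byte check, with hypothetical names rather than the checkasm harness:

#include <stddef.h>
#include <stdint.h>

/* Guard-byte sketch: poison one byte on each side of the buffer, run the
 * function under test, then confirm neither guard was touched. Assumes buf
 * points at least one byte into a larger allocation (as buf4 does above). */
static int memzero_stays_in_bounds( void (*memzero)( void *dst, size_t n ),
                                    uint8_t *buf, size_t size )
{
    buf[-1]   = 0xAA;
    buf[size] = 0xAA;
    memzero( buf, size );
    return buf[-1] == 0xAA && buf[size] == 0xAA;
}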