Commit c0cd7650 authored by Henrik Gramner

x86: AVX and AVX-512 memcpy_aligned

Reorder some elements in the x264_mb_analysis_list_t struct to reduce the
amount of padding required.

Also drop the MMX implementation in favor of SSE.
parent f29fbc6f
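
For reference, the sketch below is a scalar model of the contract these routines implement; it is illustrative only, not x264 code. Both pointers are assumed suitably aligned (the buffers involved are 64-byte aligned after this commit), and n must be a positive multiple of 16 (memzero_aligned additionally requires a multiple of 128).

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reference for memcpy_aligned's contract; the SIMD versions in
     * this commit copy 16/32/64 bytes per step instead of a byte loop.
     * Assumes: dst/src aligned, n a positive multiple of 16. */
    static void *memcpy_aligned_ref( void *dst, const void *src, size_t n )
    {
        uint8_t *d = dst;
        const uint8_t *s = src;
        for( size_t i = 0; i < n; i++ )
            d[i] = s[i];
        return dst;
    }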
@@ -42,7 +42,7 @@ typedef struct
     uint8_t *p_end;
     /* aligned for memcpy_aligned starting here */
-    ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
+    ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
     /* context */
     uint8_t state[1024];
@@ -784,8 +784,8 @@ struct x264_t
     /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
     ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
     ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
-    ALIGNED_16( dctcoef i8x8_dct_buf[3][64] );
-    ALIGNED_16( dctcoef i4x4_dct_buf[15][16] );
+    ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
+    ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
     uint32_t i4x4_nnz_buf[4];
     uint32_t i8x8_nnz_buf[4];
     int i4x4_cbp;
@@ -100,7 +100,7 @@ struc cb
     .start: pointer 1
     .p: pointer 1
     .end: pointer 1
-    align 16, resb 1
+    align 64, resb 1
     .bits_encoded: resd 1
     .state: resb 1024
 endstruc
@@ -1473,52 +1473,65 @@ LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 PLANE_DEINTERLEAVE_RGB
 %endif
-; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size.
-; memzero SSE will fail for non-mod128.
+; These functions are not general-use; not only do they require aligned input, but memcpy
+; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128.
 ;-----------------------------------------------------------------------------
 ; void *memcpy_aligned( void *dst, const void *src, size_t n );
 ;-----------------------------------------------------------------------------
 %macro MEMCPY 0
 cglobal memcpy_aligned, 3,3
-%if mmsize == 16
+%if mmsize == 32
     test r2d, 16
-    jz .copy2
-    mova  m0, [r1+r2-16]
-    mova [r0+r2-16], m0
+    jz .copy32
+    mova  xm0, [r1+r2-16]
+    mova [r0+r2-16], xm0
     sub  r2d, 16
-.copy2:
-%endif
-    test r2d, 2*mmsize
-    jz .copy4start
+    jle .ret
+.copy32:
+%endif
+    test r2d, mmsize
+    jz .loop
+    mova  m0, [r1+r2-mmsize]
+    mova [r0+r2-mmsize], m0
+    sub  r2d, mmsize
+    jle .ret
+.loop:
     mova  m0, [r1+r2-1*mmsize]
    mova  m1, [r1+r2-2*mmsize]
     mova [r0+r2-1*mmsize], m0
     mova [r0+r2-2*mmsize], m1
     sub  r2d, 2*mmsize
-.copy4start:
-    test r2d, r2d
-    jz .ret
-.copy4:
-    mova  m0, [r1+r2-1*mmsize]
-    mova  m1, [r1+r2-2*mmsize]
-    mova  m2, [r1+r2-3*mmsize]
-    mova  m3, [r1+r2-4*mmsize]
-    mova [r0+r2-1*mmsize], m0
-    mova [r0+r2-2*mmsize], m1
-    mova [r0+r2-3*mmsize], m2
-    mova [r0+r2-4*mmsize], m3
-    sub  r2d, 4*mmsize
-    jg .copy4
+    jg .loop
 .ret:
-    REP_RET
+    RET
 %endmacro
-INIT_MMX mmx
-MEMCPY
 INIT_XMM sse
 MEMCPY
+INIT_YMM avx
+MEMCPY
+INIT_ZMM avx512
+cglobal memcpy_aligned, 3,4
+    dec       r3d, r2d, 2
+    and       r2d, ~63
+    and       r3d, 15         ; n = number of dwords minus one to copy in the tail
+    mova      m0, [r1+r2]
+    not       r3d             ; bits 0-4: (n^15)+16, bits 16-31: 0xffff
+    shrx      r3d, r3d, r3d   ; 0xffff >> (n^15)
+    kmovw     k1, r3d         ; (1 << (n+1)) - 1
+    vmovdqa32 [r0+r2] {k1}, m0
+    sub       r2d, 64
+    jl .ret
+.loop:
+    mova      m0, [r1+r2]
+    mova [r0+r2], m0
+    sub       r2d, 64
+    jge .loop
+.ret:
+    RET
 ;-----------------------------------------------------------------------------
 ; void *memzero_aligned( void *dst, size_t n );
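
The AVX-512 version above avoids tail branches entirely: it stores the final (possibly partial) 64-byte block first with a masked vmovdqa32, then copies the remaining full blocks in descending order. A minimal C model of the mask computation (hypothetical helper, not part of x264) shows why 0xffff >> (n^15) equals (1 << (n+1)) - 1:

    #include <assert.h>
    #include <stdint.h>

    /* Model of the mask math in the avx512 path. For an n-byte copy
     * (n a positive multiple of 16), the last 64-byte block contains
     * between 4 and 16 dwords; return the k-mask selecting them. */
    static uint16_t tail_mask( uint32_t n )
    {
        uint32_t d = ((n - 1) >> 2) & 15; /* dwords minus one in the tail;
                                           * the asm gets this via rorx+and */
        uint32_t x = ~d;                  /* bits 0-4: (d^15)+16, bits 16-31 all set */
        return (uint16_t)(x >> (x & 31)); /* 0xffff >> (d^15) == (1 << (d+1)) - 1 */
    }

    int main( void )
    {
        for( uint32_t n = 16; n <= 4096; n += 16 )
        {
            uint32_t tail_dwords = (((n - 1) >> 2) & 15) + 1;
            assert( tail_mask( n ) == (1u << tail_dwords) - 1 );
        }
        return 0;
    }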
@@ -143,8 +143,9 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i
 void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
-void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n );
+void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
 void x264_memzero_aligned_mmx( void *dst, size_t n );
 void x264_memzero_aligned_sse( void *dst, size_t n );
 void x264_memzero_aligned_avx( void *dst, size_t n );
@@ -558,7 +559,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
     pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
-    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
     pf->memzero_aligned = x264_memzero_aligned_mmx;
     pf->integral_init4v = x264_integral_init4v_mmx;
     pf->integral_init8v = x264_integral_init8v_mmx;
@@ -847,6 +847,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_AVX) )
         return;
+    pf->memcpy_aligned = x264_memcpy_aligned_avx;
     pf->memzero_aligned = x264_memzero_aligned_avx;
     pf->plane_copy = x264_plane_copy_avx;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
@@ -869,5 +870,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_AVX512) )
         return;
+    pf->memcpy_aligned = x264_memcpy_aligned_avx512;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
 }
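
The init code above relies on each ISA level overwriting the pointer set by the one before it, so after initialization the table holds the best implementation the CPU flags permit; the AVX-512 assignment only runs if the AVX check didn't return early. A self-contained sketch of the pattern (names, flags, and bodies are hypothetical stand-ins):

    #include <string.h>
    #include <stddef.h>

    #define CPU_AVX    (1<<0)
    #define CPU_AVX512 (1<<1)

    typedef struct { void *(*memcpy_aligned)( void *, const void *, size_t ); } mc_functions_t;

    static void *my_memcpy_sse( void *d, const void *s, size_t n )    { return memcpy( d, s, n ); }
    static void *my_memcpy_avx( void *d, const void *s, size_t n )    { return memcpy( d, s, n ); }
    static void *my_memcpy_avx512( void *d, const void *s, size_t n ) { return memcpy( d, s, n ); }

    static void mc_init( int cpu, mc_functions_t *pf )
    {
        pf->memcpy_aligned = my_memcpy_sse;     /* baseline */
        if( !(cpu & CPU_AVX) )
            return;
        pf->memcpy_aligned = my_memcpy_avx;     /* AVX overrides SSE */
        if( !(cpu & CPU_AVX512) )
            return;
        pf->memcpy_aligned = my_memcpy_avx512;  /* AVX-512 overrides AVX */
    }

    int main( void )
    {
        char src[64] = "x264", dst[64];
        mc_functions_t mc;
        mc_init( CPU_AVX, &mc );           /* AVX but not AVX-512: AVX variant wins */
        mc.memcpy_aligned( dst, src, 64 );
        return dst[0] != 'x';
    }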
@@ -34,37 +34,23 @@
 typedef struct
 {
     /* 16x16 */
-    int i_rd16x16;
     x264_me_t me16x16;
     x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
     /* 8x8 */
-    int i_cost8x8;
-    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
-    ALIGNED_4( int16_t mvc[32][5][2] );
     x264_me_t me8x8[4];
     /* Sub 4x4 */
-    int i_cost4x4[4]; /* cost per 8x8 partition */
     x264_me_t me4x4[4][4];
     /* Sub 8x4 */
-    int i_cost8x4[4]; /* cost per 8x8 partition */
     x264_me_t me8x4[4][2];
     /* Sub 4x8 */
-    int i_cost4x8[4]; /* cost per 8x8 partition */
     x264_me_t me4x8[4][2];
     /* 16x8 */
-    int i_cost16x8;
     x264_me_t me16x8[2];
     /* 8x16 */
-    int i_cost8x16;
     x264_me_t me8x16[2];
+    int i_rd16x16;
+    int i_cost8x8;
+    int i_cost4x4[4]; /* cost per 8x8 partition */
+    int i_cost8x4[4]; /* cost per 8x8 partition */
+    int i_cost4x8[4]; /* cost per 8x8 partition */
+    int i_cost16x8;
+    int i_cost8x16;
+    /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
+    ALIGNED_4( int16_t mvc[32][5][2] );
 } x264_mb_analysis_list_t;
 typedef struct
@@ -32,10 +32,10 @@
 typedef struct
 {
-    /* aligning the first member is a gcc hack to force the struct to be
-     * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
+    /* aligning the first member is a gcc hack to force the struct to be aligned,
+     * as well as force sizeof(struct) to be a multiple of the alignment. */
     /* input */
-    ALIGNED_16( int i_pixel ); /* PIXEL_WxH */
+    ALIGNED_64( int i_pixel ); /* PIXEL_WxH */
     uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
     int i_ref_cost;
     int i_ref;
@@ -53,7 +53,7 @@ typedef struct
     int cost_mv; /* lambda * nbits for the chosen mv */
     int cost;    /* satd + lambda * nbits */
     ALIGNED_4( int16_t mv[2] );
-} ALIGNED_16( x264_me_t );
+} ALIGNED_64( x264_me_t );
 void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
 #define x264_me_search( h, m, mvc, i_mvc )\
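
Why the struct reorder above pays off: x264_me_t is now 64-byte aligned, so in x264_mb_analysis_list_t every scalar placed between two me members forces up to 60 bytes of padding, while grouping the scalars after the aligned members leaves a single shared tail pad. A self-contained illustration with hypothetical stand-in types:

    #include <stdio.h>

    /* 64-byte-aligned stand-in for an x264_me_t-like member. */
    typedef struct { _Alignas(64) int i_pixel; } me_t;

    /* ints scattered between aligned members vs. grouped after them */
    typedef struct { me_t a; int cost_a; me_t b; int cost_b; } scattered_t;
    typedef struct { me_t a; me_t b; int cost_a; int cost_b; } grouped_t;

    int main( void )
    {
        printf( "scattered: %zu\n", sizeof(scattered_t) ); /* 256: ~60B pad after each int */
        printf( "grouped:   %zu\n", sizeof(grouped_t) );   /* 192: one shared tail pad */
        return 0;
    }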
@@ -64,9 +64,8 @@ static uint16_t cabac_size_5ones[128];
 #include "cabac.c"
 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
-                                         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
-#define COPY_CABAC_PART( pos, size )\
-    memcpy( &cb->state[pos], &h->cabac.state[pos], size )
+                                         sizeof(int) + (CHROMA444 ? 1024+12 : 460) )
+#define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size )
 static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
 {
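
The rewritten COPY_CABAC length states the same quantity directly: the f8_bits_encoded counter plus the portion of state[] the current chroma format uses. 4:4:4 snapshots all 1024 bytes (the +12 rounds the total up to a multiple of 16, which memcpy_aligned requires), other formats only the first 460 (4+460 is already a multiple of 16). A rough model with an assumed layout (not the full x264_cabac_t):

    #include <string.h>
    #include <stdint.h>

    typedef struct
    {
        uint8_t *p_start, *p, *p_end;
        _Alignas(64) int f8_bits_encoded; /* aligned copy starts here */
        uint8_t state[1024];
    } cabac_model_t;

    static void copy_cabac_model( cabac_model_t *dst, const cabac_model_t *src, int chroma444 )
    {
        /* sizes mirror the new macro; the copy may read trailing struct
         * padding, which the 64-byte alignment guarantees exists */
        memcpy( &dst->f8_bits_encoded, &src->f8_bits_encoded,
                sizeof(int) + (chroma444 ? 1024+12 : 460) );
    }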
@@ -634,7 +633,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
                          const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
                          int b_chroma, int dc, int num_coefs, int idx )
 {
-    ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] );
+    ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] );
     ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] );
     const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
     const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
@@ -1855,12 +1855,14 @@ static int check_mc( int cpu_ref, int cpu_new )
     {
         set_func_name( "memcpy_aligned" );
         ok = 1; used_asm = 1;
-        for( size_t size = 16; size < 256; size += 16 )
+        for( size_t size = 16; size < 512; size += 16 )
         {
-            memset( buf4, 0xAA, size + 1 );
             for( int i = 0; i < size; i++ )
                 buf1[i] = rand();
+            memset( buf4-1, 0xAA, size + 2 );
             call_c( mc_c.memcpy_aligned, buf3, buf1, size );
             call_a( mc_a.memcpy_aligned, buf4, buf1, size );
-            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+            if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
             {
                 ok = 0;
                 fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
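
The widened test brackets the destination with guard bytes on both sides, which is what would catch an AVX-512 masked store straying outside [dst, dst+size). The same pattern in isolation (hypothetical helper; the caller must guarantee one writable byte before the buffer):

    #include <stddef.h>
    #include <stdint.h>

    /* Guard-byte check: returns nonzero if copy() wrote only inside
     * [dst, dst+size). dst must have one accessible byte before it. */
    static int check_no_oob_writes( uint8_t *dst, const uint8_t *src, size_t size,
                                    void *(*copy)( void *, const void *, size_t ) )
    {
        dst[-1]   = 0xAA; /* canary before */
        dst[size] = 0xAA; /* canary after  */
        copy( dst, src, size );
        return dst[-1] == 0xAA && dst[size] == 0xAA;
    }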