Commit 3d9ec58f authored by Henrik Gramner

x86: AVX-512 plane_copy and plane_copy_swap

Avoid the scalar C wrapper by utilizing opmasks to prevent overreading the
input buffer.
parent 698c5a32
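For readers unfamiliar with the trick: an AVX-512 masked load only touches the lanes selected by the opmask, so the final partial chunk of each row can be loaded without reading past the end of the source buffer, which is what previously forced a scalar C wrapper around the asm. A minimal sketch of the idea in C intrinsics (requires AVX512BW + BMI2; the function name and loop structure are illustrative, not the commit's code):

    #include <immintrin.h>
    #include <stdint.h>

    /* Copy one row of w bytes. Masked-off lanes of the final load/store
     * are architecturally guaranteed not to be accessed, so the last
     * (possibly partial) 64-byte chunk cannot overrun the buffer. */
    static void copy_row_avx512( uint8_t *dst, const uint8_t *src, int w )
    {
        int x = 0;
        for( ; x + 64 <= w; x += 64 )
            _mm512_storeu_si512( dst + x, _mm512_loadu_si512( src + x ) );
        if( x < w )
        {
            /* Low (w - x) bits set: exactly the valid lanes of the tail. */
            __mmask64 k = _bzhi_u64( ~0ULL, (unsigned)(w - x) );
            _mm512_mask_storeu_epi8( dst + x, k,
                                     _mm512_maskz_loadu_epi8( k, src + x ) );
        }
    }

The asm below follows the same pattern, with the mask precomputed once per call in k1.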
common/frame.c
@@ -76,6 +76,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     int disalign = 1<<10;
 #endif

+    /* ensure frame alignment after PADH is added */
+    int padh_align = X264_MAX( align - PADH * sizeof(pixel), 0 ) / sizeof(pixel);
+
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
     PREALLOC_INIT
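A worked example of the padh_align expression (values assumed, not part of the diff): with align = 64 for AVX-512 and x264's PADH of 32 pixels, at 8-bit depth padh_align = X264_MAX( 64 - 32*1, 0 ) / 1 = 32, so every plane pointer below is shifted by 32 extra pixels and buffer + PADH + padh_align lands on a 64-byte boundary. At high bit depth (sizeof(pixel) == 2), PADH already spans 64 bytes and padh_align = 0.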
@@ -149,9 +152,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     {
         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
         int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
-        PREALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
+        PREALLOC( frame->buffer[1], (chroma_plane_size + padh_align) * sizeof(pixel) );
         if( PARAM_INTERLACED )
-            PREALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
+            PREALLOC( frame->buffer_fld[1], (chroma_plane_size + padh_align) * sizeof(pixel) );
     }

     /* all 4 luma planes allocated together, since the cacheline split code
@@ -161,18 +164,12 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     {
         int luma_plane_size = align_plane_size( frame->i_stride[p] * (frame->i_lines[p] + 2*i_padv), disalign );
         if( h->param.analyse.i_subpel_refine && b_fdec )
-        {
-            /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-            PREALLOC( frame->buffer[p], 4*luma_plane_size * sizeof(pixel) );
-            if( PARAM_INTERLACED )
-                PREALLOC( frame->buffer_fld[p], 4*luma_plane_size * sizeof(pixel) );
-        }
-        else
-        {
-            PREALLOC( frame->buffer[p], luma_plane_size * sizeof(pixel) );
-            if( PARAM_INTERLACED )
-                PREALLOC( frame->buffer_fld[p], luma_plane_size * sizeof(pixel) );
-        }
+            luma_plane_size *= 4;
+
+        /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
+        PREALLOC( frame->buffer[p], (luma_plane_size + padh_align) * sizeof(pixel) );
+        if( PARAM_INTERLACED )
+            PREALLOC( frame->buffer_fld[p], (luma_plane_size + padh_align) * sizeof(pixel) );
     }

     frame->b_duplicate = 0;
@@ -210,7 +207,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     {
         int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );

-        PREALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
+        PREALLOC( frame->buffer_lowres, (4 * luma_plane_size + padh_align) * sizeof(pixel) );

         for( int j = 0; j <= !!h->param.i_bframe; j++ )
             for( int i = 0; i <= h->param.i_bframe; i++ )
@@ -240,9 +237,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
     {
         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align;
         if( PARAM_INTERLACED )
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align;
     }

     for( int p = 0; p < luma_plane_count; p++ )
@@ -252,16 +249,16 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
         {
             for( int i = 0; i < 4; i++ )
             {
-                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
-                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH;
+                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align;
+                frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align;
             }
             frame->plane[p] = frame->filtered[p][0];
             frame->plane_fld[p] = frame->filtered_fld[p][0];
         }
         else
         {
-            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH;
-            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH;
+            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH + padh_align;
+            frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH + padh_align;
         }
     }
@@ -279,7 +276,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     {
         int luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
         for( int i = 0; i < 4; i++ )
-            frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
+            frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH + padh_align + i * luma_plane_size;

         for( int j = 0; j <= !!h->param.i_bframe; j++ )
             for( int i = 0; i <= h->param.i_bframe; i++ )
common/frame.h
@@ -86,7 +86,7 @@ typedef struct x264_frame
      * allocated data are stored in buffer */
     pixel *buffer[4];
     pixel *buffer_fld[4];
-    pixel *buffer_lowres[4];
+    pixel *buffer_lowres;
     x264_weight_t weight[X264_REF_MAX][3]; /* [ref_index][plane] */
     pixel *weighted[X264_REF_MAX]; /* plane[0] weighted of the reference frames */
common/x86/mc-a2.asm
@@ -1017,6 +1017,143 @@ PLANE_COPY_CORE 0
 INIT_YMM avx2
 PLANE_COPY_CORE 1

+%macro PLANE_COPY_AVX512 1 ; swap
+%if %1
+cglobal plane_copy_swap, 6,7
+    vbroadcasti32x4 m4, [copy_swap_shuf]
+%else
+cglobal plane_copy, 6,7
+%endif
+    movsxdifnidn r4, r4d
+%if %1 && HIGH_BIT_DEPTH
+    %define %%mload vmovdqu32
+    lea       r2, [r2+4*r4-64]
+    lea       r0, [r0+4*r4-64]
+    neg       r4
+    mov      r6d, r4d
+    shl       r4, 2
+    or       r6d, 0xffff0010
+    shrx     r6d, r6d, r6d ; (1 << (w & 15)) - 1
+    kmovw     k1, r6d
+%elif %1 || HIGH_BIT_DEPTH
+    %define %%mload vmovdqu16
+    lea       r2, [r2+2*r4-64]
+    lea       r0, [r0+2*r4-64]
+    mov      r6d, -1
+    neg       r4
+    shrx     r6d, r6d, r4d
+    add       r4, r4
+    kmovd     k1, r6d
+%else
+    %define %%mload vmovdqu8
+    lea       r2, [r2+1*r4-64]
+    lea       r0, [r0+1*r4-64]
+    mov       r6, -1
+    neg       r4
+    shrx      r6, r6, r4
+%if ARCH_X86_64
+    kmovq     k1, r6
+%else
+    kmovd     k1, r6d
+    test     r4d, 32
+    jnz .l32
+    kxnord    k2, k2, k2
+    kunpckdq  k1, k1, k2
+.l32:
+%endif
+%endif
+    FIX_STRIDES r3, r1
+    add       r4, 4*64
+    jge .small
+    mov       r6, r4
+
+.loop: ; >256 bytes/row
+    PREFETCHNT_ITER r2+r4+64, 4*64
+    movu      m0, [r2+r4-3*64]
+    movu      m1, [r2+r4-2*64]
+    movu      m2, [r2+r4-1*64]
+    movu      m3, [r2+r4-0*64]
+%if %1
+    pshufb    m0, m4
+    pshufb    m1, m4
+    pshufb    m2, m4
+    pshufb    m3, m4
+%endif
+    movnta [r0+r4-3*64], m0
+    movnta [r0+r4-2*64], m1
+    movnta [r0+r4-1*64], m2
+    movnta [r0+r4-0*64], m3
+    add       r4, 4*64
+    jl .loop
+    PREFETCHNT_ITER r2+r4+64, 4*64
+    sub       r4, 3*64
+    jge .tail
+.loop2:
+    movu      m0, [r2+r4]
+%if %1
+    pshufb    m0, m4
+%endif
+    movnta [r0+r4], m0
+    add       r4, 64
+    jl .loop2
+.tail:
+    %%mload   m0 {k1}{z}, [r2+r4]
+%if %1
+    pshufb    m0, m4
+%endif
+    movnta [r0+r4], m0
+    add       r2, r3
+    add       r0, r1
+    mov       r4, r6
+    dec      r5d
+    jg .loop
+    sfence
+    RET
+
+.small: ; 65-256 bytes/row. skip non-temporal stores
+    sub       r4, 3*64
+    jge .tiny
+    mov       r6, r4
+.small_loop:
+    PREFETCHNT_ITER r2+r4+64, 64
+    movu      m0, [r2+r4]
+%if %1
+    pshufb    m0, m4
+%endif
+    mova [r0+r4], m0
+    add       r4, 64
+    jl .small_loop
+    PREFETCHNT_ITER r2+r4+64, 64
+    %%mload   m0 {k1}{z}, [r2+r4]
+%if %1
+    pshufb    m0, m4
+%endif
+    mova [r0+r4], m0
+    add       r2, r3
+    add       r0, r1
+    mov       r4, r6
+    dec      r5d
+    jg .small_loop
+    RET
+
+.tiny: ; 1-64 bytes/row. skip non-temporal stores
+    PREFETCHNT_ITER r2+r4+64, 64
+    %%mload   m0 {k1}{z}, [r2+r4]
+%if %1
+    pshufb    m0, m4
+%endif
+    mova [r0+r4], m0
+    add       r2, r3
+    add       r0, r1
+    dec      r5d
+    jg .tiny
+    RET
+%endmacro
+
+INIT_ZMM avx512
+PLANE_COPY_AVX512 0
+PLANE_COPY_AVX512 1

 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
 %if HIGH_BIT_DEPTH
 %assign x 0
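The tail masks above are built branchlessly. In the 8-bit path, shifting an all-ones register right by (-w & 63) leaves exactly the low (w & 63) bits set (all 64 when w is a multiple of 64), which becomes the per-row mask in k1; per the inline comment, the high-bit-depth swap path folds the equivalent "(1 << (w & 15)) - 1" computation into a single shrx by or-ing constant bits into the shift source. A scalar sketch of the 8-bit case (hypothetical helper, shown only to make the shrx trick concrete):

    #include <stdint.h>

    /* Mask for the final <=64-byte chunk of a w-byte row: the low (w & 63)
     * bits, or all ones when w is a multiple of 64 (full final vector).
     * Mirrors: mov r6, -1 ; neg r4 ; shrx r6, r6, r4 */
    static inline uint64_t tail_mask64( int w )
    {
        return UINT64_MAX >> (-w & 63);
    }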
common/x86/mc.h
@@ -171,10 +171,14 @@ void x264_prefetch_ref_mmx2( pixel *, intptr_t, int );
 void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
 #define x264_plane_copy_core_avx x264_template(plane_copy_core_avx)
 void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+#define x264_plane_copy_avx512 x264_template(plane_copy_avx512)
+void x264_plane_copy_avx512( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
 #define x264_plane_copy_swap_core_ssse3 x264_template(plane_copy_swap_core_ssse3)
 void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
 #define x264_plane_copy_swap_core_avx2 x264_template(plane_copy_swap_core_avx2)
 void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
+#define x264_plane_copy_swap_avx512 x264_template(plane_copy_swap_avx512)
+void x264_plane_copy_swap_avx512( pixel *, intptr_t, pixel *, intptr_t, int w, int h );
 #define x264_plane_copy_interleave_core_mmx2 x264_template(plane_copy_interleave_core_mmx2)
 void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst,
                                            pixel *srcu, intptr_t i_srcu,
common/x86/mc-c.c
@@ -1122,6 +1126,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;
     pf->memcpy_aligned = x264_memcpy_aligned_avx512;
     pf->memzero_aligned = x264_memzero_aligned_avx512;
+    pf->plane_copy = x264_plane_copy_avx512;
+    pf->plane_copy_swap = x264_plane_copy_swap_avx512;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
 #if ARCH_X86_64
     pf->mbtree_propagate_list = mbtree_propagate_list_avx512;
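Note that unlike the SSE/AVX2 versions, which export _core_ kernels that are wrapped in C helpers for edge handling, the AVX-512 functions are registered directly since the opmask handles each row's tail inside the asm. Call sites are unchanged and keep going through the dispatch table; a hypothetical example (variable names assumed, matching the prototype declared in mc.h above):

    /* Strides and dimensions are in pixels; the kernel scales strides for
     * high bit depth (FIX_STRIDES). No separate C pass for row tails. */
    h->mc.plane_copy( dst_plane, dst_stride, src_plane, src_stride, width, height );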
encoder/slicetype.c
@@ -483,7 +483,7 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
     if( weights[0].weightfn && b_lookahead )
     {
         //scale lowres in lookahead for slicetype_frame_cost
-        pixel *src = ref->buffer_lowres[0];
+        pixel *src = ref->buffer_lowres;
         pixel *dst = h->mb.p_weight_buf[0];
         int width = ref->i_width_lowres + PADH*2;
         int height = ref->i_lines_lowres + PADV*2;