Commit 49fb50a6 authored by Henrik Gramner

x86: AVX-512 pixel_var2_8x8 and 8x16

parent 92c074e2
common/common.h
@@ -778,8 +778,8 @@ struct x264_t
         /* space for p_fenc and p_fdec */
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
-        ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
-        ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
+        ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
+        ALIGNED_64( pixel fdec_buf[52*FDEC_STRIDE] );
         /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
         ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
......
common/macroblock.c
@@ -532,16 +532,17 @@ void x264_macroblock_thread_init( x264_t *h )
     h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
     h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
     h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
-    h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
     if( CHROMA444 )
     {
         h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
+        h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
         h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
     }
     else
     {
         h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
-        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+        h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
+        h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
     }
 }
......
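The common.h and macroblock.c changes above go hand in hand: the new AVX-512 var2 routines use aligned loads on the macroblock fenc/fdec buffers, so the buffers become ALIGNED_64 and, in the non-4:4:4 case, the chroma fdec pointers move from row 19 to row 20 so that they start on a 64-byte boundary in the 8-bit build (one fdec row is FDEC_STRIDE = 32 bytes there). That reading of the offsets is inferred from the patch rather than stated by it; the throwaway C check below only spells out the arithmetic, with the constants copied from the hunks above.

#include <assert.h>

#define FENC_STRIDE 16 /* pixels per fenc row, as in common.h */
#define FDEC_STRIDE 32 /* pixels per fdec row, as in common.h */

int main( void )
{
    /* 8-bit build: one pixel is one byte */
    assert( (16 * FENC_STRIDE) % 64 == 0 );  /* p_fenc[1] offset: already 64-byte aligned */
    assert( (19 * FDEC_STRIDE) % 64 == 32 ); /* old chroma p_fdec offset: only 32-byte aligned */
    assert( (20 * FDEC_STRIDE) % 64 == 0 );  /* new chroma p_fdec offset: 64-byte aligned */
    return 0;
}

In the high-bit-depth build a row is twice as wide, so rows 19 and 20 are both 64-byte multiples and presumably only the ALIGNED_64 change matters there.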
common/pixel.c
@@ -1049,6 +1049,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     {
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx512;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_avx512;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
     }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
@@ -1351,6 +1353,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx512;
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx512;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+        pixf->var2[PIXEL_8x8]  = x264_pixel_var2_8x8_avx512;
+        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
     }
 #endif //HAVE_MMX
......
common/x86/pixel-a.asm
@@ -1128,10 +1128,17 @@ VAR2_8x8_SSSE3 16, 7
 
 %macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset
 %if HIGH_BIT_DEPTH
+%if mmsize == 64
+    mova          m2, [r1+2*%1+%2*FDEC_STRIDEB]
+    vshufi32x4    m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020
+    mova          m3, [r1+2*%1+%3*FDEC_STRIDEB]
+    vshufi32x4    m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020
+%else
     mova         xm2, [r1+2*%1+%2*FDEC_STRIDEB]
     vinserti128   m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1
     mova         xm3, [r1+2*%1+%3*FDEC_STRIDEB]
     vinserti128   m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1
+%endif
     psubw         m2, [r0+1*%1+%2*FENC_STRIDEB]
     psubw         m3, [r0+1*%1+%3*FENC_STRIDEB]
 %else
@@ -1174,6 +1181,44 @@ INIT_YMM avx2
 VAR2_8x8_AVX2 8, 6
 VAR2_8x8_AVX2 16, 7
 
+%macro VAR2_AVX512_END 1 ; shift
+    vbroadcasti32x4 m2, [pw_1]
+    pmaddwd       m0, m2
+    SBUTTERFLY   qdq, 0, 1, 2
+    paddd         m0, m1
+    vextracti32x8 ym1, m0, 1
+    paddd        ym0, ym1
+    psrlq        ym1, ym0, 32
+    paddd        ym0, ym1
+    vpmovqd     xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v
+    VAR2_END    xmm0, xmm1, %1
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_var2_8x8, 2,3
+%if HIGH_BIT_DEPTH == 0
+    pxor         xm6, xm6
+%endif
+    VAR2_AVX2_LOAD 0, 0, 2
+    VAR2_CORE     m2, m3, 0
+    VAR2_AVX2_LOAD 0, 4, 6
+    VAR2_CORE     m2, m3, 1
+    VAR2_AVX512_END 6
+
+cglobal pixel_var2_8x16, 2,3
+%if HIGH_BIT_DEPTH == 0
+    pxor         xm6, xm6
+%endif
+    mov          t0d, 10*FENC_STRIDEB
+    VAR2_AVX2_LOAD 0, 14, 12
+    VAR2_CORE     m2, m3, 0
+.loop:
+    VAR2_AVX2_LOAD t0, 0, -2
+    VAR2_CORE     m2, m3, 1
+    sub          t0d, 4*FENC_STRIDEB
+    jg .loop
+    VAR2_AVX512_END 7
+
 ;=============================================================================
 ; SATD
 ;=============================================================================
......
common/x86/pixel.h
@@ -169,9 +169,11 @@ float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
 int x264_pixel_var2_8x8_sse2   ( pixel *fenc, pixel *fdec, int ssd[2] );
 int x264_pixel_var2_8x8_ssse3  ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
 int x264_pixel_var2_8x8_avx2   ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
 int x264_pixel_var2_8x16_sse2  ( pixel *fenc, pixel *fdec, int ssd[2] );
 int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
 int x264_pixel_var2_8x16_avx2  ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
 int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
 int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
 int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
......
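For orientation, the declarations above and the "sum_u, sqr_u, sum_v, sqr_v" comment in the assembly suggest the following plain-C sketch of what the var2 routines compute. It is a reconstruction, not code taken from x264: U and V sit side by side in the fixed-stride macroblock buffers (V at +8 pixels in fenc and +16 pixels in fdec, per the macroblock.c hunk), and the function returns the summed variance of the fenc-fdec residual over both chroma planes while writing the two SSDs to ssd[].

#define FENC_STRIDE 16
#define FDEC_STRIDE 32
typedef unsigned char pixel; /* 8-bit build */

/* h = 8, shift = 6 for pixel_var2_8x8; h = 16, shift = 7 for pixel_var2_8x16 */
static int pixel_var2_sketch( pixel *fenc, pixel *fdec, int ssd[2], int h, int shift )
{
    int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0;
    for( int y = 0; y < h; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            int du = fenc[x]   - fdec[x];    /* U residual */
            int dv = fenc[x+8] - fdec[x+16]; /* V residual */
            sum_u += du; sqr_u += du*du;
            sum_v += dv; sqr_v += dv*dv;
        }
        fenc += FENC_STRIDE;
        fdec += FDEC_STRIDE;
    }
    ssd[0] = sqr_u;
    ssd[1] = sqr_v;
    /* residual variance summed over both planes; the shift divides by the
     * pixel count (64 or 128), matching VAR2_AVX512_END 6 and 7 above */
    return sqr_u - ((sum_u * sum_u) >> shift) + sqr_v - ((sum_v * sum_v) >> shift);
}

Note that the sketch carries no stride parameters: both strides are compile-time constants, which is what lets the assembly hard-code its FENC_STRIDEB/FDEC_STRIDEB offsets.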