Commit 90340852 authored by Henrik Gramner
Browse files

x86: AVX-512 sub16x16_dct

parent 774c6c76
......@@ -778,7 +778,7 @@ struct x264_t
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_64( pixel fdec_buf[52*FDEC_STRIDE] );
ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
......
......@@ -716,6 +716,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
}
#endif //HAVE_MMX
......
......@@ -532,16 +532,15 @@ void x264_macroblock_thread_init( x264_t *h )
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
if( CHROMA444 )
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
}
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
}
}
......
......@@ -47,10 +47,10 @@ cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:
dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
dct_avx512: dd 0x00000000, 0x00000104, 0x0000014c, 0x00000048 ; bits 0-4: dct8x8_fenc
dd 0x00000210, 0x00000314, 0x0000035c, 0x00000258 ; bits 5-9: dct8x8_fdec
dd 0x00000021, 0x00000125, 0x0000016d, 0x00000069
dd 0x00000231, 0x00000335, 0x0000037d, 0x00000279
dct_avx512: dd 0x00000000, 0x00021104, 0x0006314c, 0x00042048 ; bits 0-4: dct8x8_fenc
dd 0x00008a10, 0x00029b14, 0x0006bb5c, 0x0004aa58 ; bits 5-9: dct8x8_fdec
dd 0x00004421, 0x00025525, 0x0006756d, 0x00046469 ; bits 10-13: dct16x16_fenc
dd 0x0000ce31, 0x0002df35, 0x0006ff7d, 0x0004ee79 ; bits 14-18: dct16x16_fdec
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
......@@ -699,6 +699,31 @@ cglobal sub8x8_dct, 3,3
mova [r0], m0
mova [r0+64], m1
RET
; SUB4x16_DCT_AVX512 dst, src
; Processes one 64-byte slice of the r1 buffer against one 128-byte slice of
; the r2 buffer and writes two 64-byte result vectors to the r0 buffer.
;   dst: output slot; results land at r0 + dst*64 and r0 + dst*64 + 128
;   src: input slice; reads r1 + src*64 and r2 + src*128 (two 64-byte halves)
; Expects m5/m6 to hold the dword permutation indices set up by the caller.
; NOTE(review): with FENC_STRIDE 16 / FDEC_STRIDE 32 and 8-bit pixels each
; slice would correspond to four pixel rows - confirm against the callers.
; NOTE(review): dct4x4x4_internal_avx512 is defined outside this view; it is
; assumed to take its inputs in m1/m3 and return its results in m0/m1 - confirm.
%macro SUB4x16_DCT_AVX512 2 ; dst, src
vpermd m1, m5, [r1+1*%2*64]             ; gather dwords of the r1 slice in the order given by m5
mova m3, [r2+2*%2*64]                   ; first 64 bytes of the r2 slice
vpermt2d m3, m6, [r2+2*%2*64+64]        ; pick dwords from m3 and the following 64 bytes as selected by m6
call dct4x4x4_internal_avx512
mova [r0+%1*64 ], m0                    ; first result vector
mova [r0+%1*64+128], m1                 ; second result vector, 128 bytes further on
%endmacro
;-----------------------------------------------------------------------------
; sub16x16_dct (AVX-512): three arguments loaded into r0, r1, r2 by PROLOGUE.
; r0 is only written (64-byte stores); r1 and r2 are only read.
; NOTE(review): presumably r0 = dct output, r1 = pix1, r2 = pix2 - confirm
; against the C prototype of the exported symbol.
;
; Runs SUB4x16_DCT_AVX512 four times. Input slices 0..3 go to output slots
; 0, 1, 4, 5; since each slot stores at slot*64 and slot*64+128, the four
; calls together fill 512 contiguous bytes at r0.
;
; Register setup performed here:
;   m5 = dct_avx512 dwords >> 10 (vpermd indices for the r1 loads)
;   m6 = m5 >> 4, i.e. dct_avx512 >> 14 (vpermt2d indices for the r2 loads)
;   k1 = 0xaaaaaaaa, k2 = 0xf0f0f0f0, k3 = 16-bit complement of k2
;   xm4 = 0
; k1-k3 and m4 are not referenced in this function.
; NOTE(review): presumably they are consumed by dct4x4x4_internal_avx512 - confirm.
;-----------------------------------------------------------------------------
cglobal sub16x16_dct
; Constants are built before PROLOGUE; none of this depends on the arguments.
psrld m5, [dct_avx512], 10              ; drop the low 10 bits of every table dword
mov eax, 0xaaaaaaaa
kmovd k1, eax                           ; k1: every odd bit set
mov eax, 0xf0f0f0f0
kmovd k2, eax                           ; k2: upper nibble of every byte set
PROLOGUE 3,3
pxor xm4, xm4                           ; zero register
knotw k3, k2                            ; k3 = ~k2 (low 16 bits)
psrld m6, m5, 4                         ; next index field of the same table
SUB4x16_DCT_AVX512 0, 0
SUB4x16_DCT_AVX512 1, 1
SUB4x16_DCT_AVX512 4, 2
SUB4x16_DCT_AVX512 5, 3
RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
......
......@@ -44,6 +44,7 @@ void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment