Commit f6727954 authored by Henrik Gramner
Browse files

x86: AVX-512 sub8x8_dct_dc

parent 0af1c6d0
...@@ -717,6 +717,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) ...@@ -717,6 +717,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub4x4_dct = x264_sub4x4_dct_avx512; dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512; dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512; dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
dctf->add8x8_idct = x264_add8x8_idct_avx512; dctf->add8x8_idct = x264_add8x8_idct_avx512;
} }
#endif //HAVE_MMX #endif //HAVE_MMX
......
...@@ -725,6 +725,30 @@ cglobal sub16x16_dct ...@@ -725,6 +725,30 @@ cglobal sub16x16_dct
SUB4x16_DCT_AVX512 5, 3 SUB4x16_DCT_AVX512 5, 3
RET RET
;-----------------------------------------------------------------------------
; void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
;
; Sums the bytes of two loaded pixel vectors per 8-byte group, subtracts the
; sums, reduces them to four values labelled 0..3 in the comments below, and
; stores the four 16-bit combinations
;     0+1+2+3, 0+1-2-3, 0-1+2-3, 0-1-2+3
; to [r0].
;
; NOTE(review): dct_avx512, DCT8x8_LOAD_FENC_AVX512 and DCT8x8_LOAD_FDEC_AVX512
; are defined outside this excerpt, as is the active vector width of m0..m3.
; Remarks about them below are assumptions to be confirmed, not facts.
; "3,3" is assumed to follow the usual cglobal convention (3 arguments,
; 3 general-purpose registers) -- TODO confirm.
;-----------------------------------------------------------------------------
cglobal sub8x8_dct_dc, 3,3
; m3 = constant table consumed by the load macros below.
mova m3, [dct_avx512]
; m0 = first source block (presumably read through pix1; see macro definition).
DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
; r1 is reused as scratch from here on -- presumably the load above was its
; last use as a pointer; verify against the macro.
; k1 = 0xaa = 10101010b: selects the odd-numbered elements (1, 3, 5, 7).
mov r1d, 0xaa
kmovb k1, r1d
; Shift every dword of the table right by 5 bits before the second load.
; Presumably the table packs a second set of indices in its higher bits --
; confirm against the definition of dct_avx512.
psrld m3, 5
; m1 = second source block (m2 is passed as an extra register to the macro).
DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
; Zero for the byte sums below. psadbw reads the full-width m3, so this relies
; on the xm3 write also clearing the upper lanes (the case for VEX/EVEX
; encodings) -- confirm the encoding the macro layer emits here.
pxor xm3, xm3
; psadbw against zero: each qword becomes the sum of its eight unsigned bytes.
psadbw m0, m3
psadbw m1, m3
; Per-group difference of the two byte sums, kept as 16-bit words.
psubw m0, m1
; Narrow each qword to its low word, packing the differences into xmm0.
vpmovqw xmm0, m0
; Rotating each dword by 16 swaps its two words, so the following add leaves
; the sum of each adjacent word pair in both words of that dword.
vprold xmm1, xmm0, 16
paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
; xmm2 = upper qword of xmm0 duplicated into both halves.
punpckhqdq xmm2, xmm0, xmm0
psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
; Interleave the low dwords of the sums and the differences.
punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
; xmm1 = lower qword of xmm0 duplicated into both halves.
punpcklqdq xmm1, xmm0, xmm0
; Merge-masked negate: words selected by k1 (odd positions) become 0 - value,
; the remaining words keep their current contents.
psubw xmm0 {k1}, xm3, xmm0
paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
; Only the upper 64 bits (four words) are written to the output at [r0].
movhps [r0], xmm0
RET
%macro SARSUMSUB 3 ; a, b, tmp %macro SARSUMSUB 3 ; a, b, tmp
mova m%3, m%1 mova m%3, m%1
vpsraw m%1 {k1}, 1 vpsraw m%1 {k1}, 1
......
...@@ -45,8 +45,9 @@ void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ...@@ -45,8 +45,9 @@ void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment