Commit 0ea5be85 authored by Fiona Glaser

x86: more AVX2 framework, AVX2 functions, plus some existing asm tweaks

AVX2 functions:
mc_chroma
intra_sad_x3_16x16
last64
ads
hpel
dct4
idct4
sub16x16_dct8
quant_4x4x4
quant_4x4
quant_4x4_dc
quant_8x8
SAD_X3/X4
SATD
var
var2
SSD
zigzag interleave
weightp
weightb
intra_sad_8x8_x9
decimate
integral
hadamard_ac
sa8d_satd
sa8d
lowres_init
denoise
parent 19e1a2bb
@@ -1180,17 +1180,14 @@ void x264_picture_clean( x264_picture_t *pic )
void *x264_malloc( int i_size )
{
uint8_t *align_buf = NULL;
#if SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
/* Mac OS X and Win x64 always returns 16 byte aligned memory */
align_buf = malloc( i_size );
#elif HAVE_MALLOC_H
align_buf = memalign( 16, i_size );
#if HAVE_MALLOC_H
align_buf = memalign( NATIVE_ALIGN, i_size );
#else
uint8_t *buf = malloc( i_size + 15 + sizeof(void **) );
uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
if( buf )
{
align_buf = buf + 15 + sizeof(void **);
align_buf -= (intptr_t) align_buf & 15;
align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **);
align_buf -= (intptr_t) align_buf & (NATIVE_ALIGN-1);
*( (void **) ( align_buf - sizeof(void **) ) ) = buf;
}
#endif
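Note on the fallback path above: when memalign() is unavailable, the patch over-allocates by NATIVE_ALIGN-1 bytes plus room for one pointer, rounds the address up, and stashes the original malloc() pointer just below the aligned block so x264_free() (next hunk) can recover it. A minimal self-contained C sketch of the same technique, with illustrative names that are not part of the patch:

#include <stdint.h>
#include <stdlib.h>

#define NATIVE_ALIGN 32   /* 32 on x86 so AVX2 (ymm) loads/stores are aligned, 16 elsewhere */

/* Illustrative sketch of the over-allocate-and-round-up technique. */
static void *aligned_malloc_sketch( size_t i_size )
{
    uint8_t *align_buf = NULL;
    uint8_t *buf = malloc( i_size + (NATIVE_ALIGN-1) + sizeof(void **) );
    if( buf )
    {
        align_buf = buf + (NATIVE_ALIGN-1) + sizeof(void **);
        align_buf -= (intptr_t)align_buf & (NATIVE_ALIGN-1);
        *( (void **)( align_buf - sizeof(void **) ) ) = buf;  /* saved for the free path */
    }
    return align_buf;
}

static void aligned_free_sketch( void *p )
{
    if( p )
        free( *( ((void **)p) - 1 ) );   /* recover the original malloc() pointer */
}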
@@ -1206,7 +1203,7 @@ void x264_free( void *p )
{
if( p )
{
#if HAVE_MALLOC_H || SYS_MACOSX || (SYS_WINDOWS && ARCH_X86_64)
#if HAVE_MALLOC_H
free( p );
#else
free( *( ( ( void **) p ) - 1 ) );
......
@@ -610,11 +610,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
ALIGNED_16( dctcoef luma16x16_dc[3][16] );
ALIGNED_N( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
ALIGNED_16( dctcoef luma8x8[12][64] );
ALIGNED_16( dctcoef luma4x4[16*3][16] );
ALIGNED_N( dctcoef luma8x8[12][64] );
ALIGNED_N( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
......
@@ -690,6 +690,17 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub8x8_dct = x264_sub8x8_dct_xop;
dctf->sub16x16_dct = x264_sub16x16_dct_xop;
}
if( cpu&X264_CPU_AVX2 )
{
dctf->add8x8_idct = x264_add8x8_idct_avx2;
dctf->add16x16_idct = x264_add16x16_idct_avx2;
dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
#if ARCH_X86_64
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
#endif
}
#endif //HAVE_MMX
#if HAVE_ALTIVEC
@@ -1024,6 +1035,12 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
}
if( cpu&X264_CPU_AVX2 )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
}
#endif // HIGH_BIT_DEPTH
#endif
}
@@ -72,7 +72,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines, luma_plane_count;
int i_padv = PADV << PARAM_INTERLACED;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
int align = 16;
#if ARCH_X86 || ARCH_X86_64
if( h->param.cpu&X264_CPU_CACHELINE_64 )
align = 64;
else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 )
align = 32;
#endif
#if ARCH_PPC
int disalign = 1<<9;
#else
......
@@ -387,7 +387,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
int scratch_size = 0;
if( !b_lookahead )
{
int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
int buf_hpel = (h->thread[0]->fdec->i_width[0]+48+32) * sizeof(int16_t);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
......
@@ -79,6 +79,7 @@
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
@@ -113,6 +114,17 @@
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
/* For AVX2 */
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN 32
#define ALIGNED_N ALIGNED_32
#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
#else
#define NATIVE_ALIGN 16
#define ALIGNED_N ALIGNED_16
#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
#endif
#define UNINIT(x) x=x
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
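Usage sketch for the new macros (hypothetical names, assumes x264's common headers for dctcoef and the macros above): ALIGNED_N is meant for struct members and statics, ALIGNED_ARRAY_N for stack arrays where the ABI may not guarantee a 32-byte-aligned stack, so AVX2 code can use aligned ymm accesses on x86 while other targets keep the old 16-byte requirement.

/* Hypothetical example, not from the patch. */
static ALIGNED_N( dctcoef dc_scratch[16] );        /* 32-byte aligned on x86, 16 elsewhere */

static void example( void )
{
    ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );  /* stack array, alignment emulated if needed */
    /* dct4x4 / dc_scratch can now be handed to AVX2 kernels that assume
     * NATIVE_ALIGN-aligned buffers */
    (void)dct4x4; (void)dc_scratch;
}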
......
@@ -1083,10 +1083,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse2 );
}
INIT4( hadamard_ac, _sse2 );
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
@@ -1136,9 +1133,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_SSSE3 )
{
INIT4( hadamard_ac, _ssse3 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _ssse3 );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_ssse3;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_ssse3;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_ssse3;
@@ -1155,10 +1152,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3_atom;
INIT6( satd_x3, _ssse3_atom );
INIT6( satd_x4, _ssse3_atom );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _ssse3_atom );
}
INIT4( hadamard_ac, _ssse3_atom );
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3_atom;
#endif
@@ -1190,6 +1184,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
else
{
pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_ssse3;
pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_ssse3;
pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_ssse3;
}
if( (cpu&X264_CPU_SLOW_ATOM) || (cpu&X264_CPU_SLOW_SHUFFLE) )
{
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
@@ -1201,9 +1201,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( satd, _sse4 );
INIT7( satd_x3, _sse4 );
INIT7( satd_x4, _sse4 );
INIT4( hadamard_ac, _sse4 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse4 );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_sse4;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_sse4;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_sse4;
@@ -1225,9 +1225,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
INIT_ADS( _avx );
INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
pixf->intra_sad_x9_4x4 = x264_intra_sad_x9_4x4_avx;
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx;
@@ -1255,9 +1255,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd, _xop );
INIT7( satd_x3, _xop );
INIT7( satd_x4, _xop );
INIT4( hadamard_ac, _xop );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _xop );
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
}
INIT5( ssd, _xop );
@@ -1271,6 +1271,25 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
}
if( cpu&X264_CPU_AVX2 )
{
INIT2( ssd, _avx2 );
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
INIT2( hadamard_ac, _avx2 );
INIT_ADS( _avx2 );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_avx2;
pixf->intra_sad_x9_8x8 = x264_intra_sad_x9_8x8_avx2;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
}
#endif //HAVE_MMX
......
@@ -456,11 +456,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_mmx;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
@@ -489,11 +484,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
}
pf->coeff_last8 = x264_coeff_last8_sse2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
@@ -523,11 +513,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
}
@@ -581,11 +566,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
@@ -625,11 +605,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->decimate_score64 = x264_decimate_score64_sse2;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
}
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
@@ -656,11 +631,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
if( cpu&X264_CPU_SLOW_CTZ )
{
pf->decimate_score15 = x264_decimate_score15_ssse3_slowctz;
pf->decimate_score16 = x264_decimate_score16_ssse3_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
}
@@ -693,6 +663,25 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_xop;
}
}
if( cpu&X264_CPU_AVX2 )
{
pf->quant_4x4 = x264_quant_4x4_avx2;
pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
pf->quant_8x8 = x264_quant_8x8_avx2;
pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->dequant_4x4 = x264_dequant_4x4_avx2;
pf->dequant_8x8 = x264_dequant_8x8_avx2;
if( h->param.i_cqm_preset == X264_CQM_FLAT )
{
pf->dequant_4x4 = x264_dequant_4x4_flat16_avx2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_avx2;
}
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
......
@@ -26,23 +26,27 @@
%include "x86inc.asm"
SECTION_RODATA
SECTION_RODATA 32
const pb_1, times 32 db 1
const hsub_mul, times 16 db 1, -1
const pw_1, times 16 dw 1
const pw_16, times 16 dw 16
const pw_32, times 16 dw 32
const pw_00ff, times 16 dw 0x00ff
const pd_1, times 8 dd 1
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_01, times 8 db 0,1
const pb_0, times 16 db 0
const pb_a1, times 16 db 0xa1
const pb_1, times 16 db 1
const pb_3, times 16 db 3
const hsub_mul, times 8 db 1, -1
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
const pw_1, times 8 dw 1
const pw_2, times 8 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_16, times 8 dw 16
const pw_32, times 8 dw 32
const pw_64, times 8 dw 64
const pw_32_0, times 4 dw 32,
times 4 dw 0
@@ -54,11 +58,9 @@ const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
const sw_64, dd 64
@@ -311,6 +311,42 @@ DCT_SUB8
INIT_XMM xop
DCT_SUB8
INIT_YMM avx2
cglobal sub16x16_dct8, 3,3,10
add r0, 128
add r2, 4*FDEC_STRIDE
call .sub16x8_dct8
add r0, 256
add r1, FENC_STRIDE*8
add r2, FDEC_STRIDE*8
call .sub16x8_dct8
RET
.sub16x8_dct8:
LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
mova [r0-0x80+0x00], xm0
vextracti128 [r0+0x00], m0, 1
mova [r0-0x80+0x10], xm1
vextracti128 [r0+0x10], m1, 1
mova [r0-0x80+0x20], xm2
vextracti128 [r0+0x20], m2, 1
mova [r0-0x80+0x30], xm3
vextracti128 [r0+0x30], m3, 1
mova [r0-0x80+0x40], xm4
vextracti128 [r0+0x40], m4, 1
mova [r0-0x80+0x50], xm5
vextracti128 [r0+0x50], m5, 1
mova [r0-0x80+0x60], xm6
vextracti128 [r0+0x60], m6, 1
mova [r0-0x80+0x70], xm7
vextracti128 [r0+0x70], m7, 1
ret
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
@@ -390,4 +426,5 @@ INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8
%endif ; !HIGH_BIT_DEPTH
@@ -83,6 +83,7 @@ cextern pd_1
cextern pd_32
cextern pw_ppppmmmm
cextern pw_pmpmpmpm
cextern deinterleave_shufd
%macro WALSH4_1D 6
SUMSUB_BADC %1, %5, %4, %3, %2, %6
@@ -377,6 +378,133 @@ INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4
%macro STOREx2_AVX2 9
movq xm%3, [r0+%5*FDEC_STRIDE]
vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
movq xm%4, [r0+%7*FDEC_STRIDE]
vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
punpcklbw m%3, m%9
punpcklbw m%4, m%9
psraw m%1, 6
psraw m%2, 6
paddsw m%1, m%3
paddsw m%2, m%4
packuswb m%1, m%2
vextracti128 xm%2, m%1, 1
movq [r0+%5*FDEC_STRIDE], xm%1
movq [r0+%6*FDEC_STRIDE], xm%2
movhps [r0+%7*FDEC_STRIDE], xm%1
movhps [r0+%8*FDEC_STRIDE], xm%2
%endmacro
INIT_YMM avx2
cglobal add8x8_idct, 2,3,8
add r0, 4*FDEC_STRIDE
pxor m7, m7
TAIL_CALL .skip_prologue, 0
global current_function %+ .skip_prologue
.skip_prologue:
mova m0, [r1+ 0]
mova m1, [r1+ 32]
mova m2, [r1+ 64]
mova m3, [r1+ 96]
; TRANSPOSE4x4Q
SBUTTERFLY qdq, 0, 1, 4
SBUTTERFLY qdq, 2, 3, 5
SBUTTERFLY dqqq, 0, 2, 4
SBUTTERFLY dqqq, 1, 3, 5
IDCT4_1D w,0,1,2,3,4,5
TRANSPOSE2x4x4W 0,1,2,3,4
paddw m0, [pw_32]
IDCT4_1D w,0,1,2,3,4,5
STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
ret
; 2xdst, 2xtmp, 4xsrcrow, 1xzero
%macro LOAD_DIFF8x2_AVX2 9
movq xm%1, [r1+%5*FENC_STRIDE]
movq xm%2, [r1+%6*FENC_STRIDE]
vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
punpcklbw m%1, m%9
punpcklbw m%2, m%9
movq xm%3, [r2+(%5-4)*FDEC_STRIDE]
movq xm%4, [r2+(%6-4)*FDEC_STRIDE]
vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
punpcklbw m%3, m%9
punpcklbw m%4, m%9
psubw m%1, m%3
psubw m%2, m%4
%endmacro
; 4x src, 1x tmp
%macro STORE8_DCT_AVX2 5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
mova [r0+ 0], xm%1
mova [r0+ 16], xm%3
mova [r0+ 32], xm%2
mova [r0+ 48], xm%4
vextracti128 [r0+ 64], m%1, 1
vextracti128 [r0+ 80], m%3, 1
vextracti128 [r0+ 96], m%2, 1
vextracti128 [r0+112], m%4, 1
%endmacro
%macro STORE16_DCT_AVX2 5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
mova [r0+ 0-128], xm%1
mova [r0+16-128], xm%3
mova [r0+32-128], xm%2
mova [r0+48-128], xm%4
vextracti128 [r0+ 0], m%1, 1
vextracti128 [r0+16], m%3, 1
vextracti128 [r0+32], m%2, 1
vextracti128 [r0+48], m%4, 1
%endmacro
INIT_YMM avx2
cglobal sub8x8_dct, 3,3,7
pxor m6, m6
add r2, 4*FDEC_STRIDE
LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
DCT4_1D 0, 1, 2, 3, 4
TRANSPOSE2x4x4W 0, 1, 2, 3, 4
DCT4_1D 0, 1, 2, 3, 4
STORE8_DCT_AVX2 0, 1, 2, 3, 4
RET
INIT_YMM avx2
cglobal sub16x16_dct, 3,3,6
add r0, 128
add r2, 4*FDEC_STRIDE
call .sub16x4_dct
add r0, 64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
call .sub16x4_dct
add r0, 256-64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
call .sub16x4_dct
add r0, 64
add r1, 4*FENC_STRIDE
add r2, 4*FDEC_STRIDE
call .sub16x4_dct
RET
.sub16x4_dct:
LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
DCT4_1D 0, 1, 2, 3, 4
TRANSPOSE2x4x4W 0, 1, 2, 3, 4
DCT4_1D 0, 1, 2, 3, 4
STORE16_DCT_AVX2 0, 1, 2, 3, 4
ret
%endif ; HIGH_BIT_DEPTH
INIT_MMX
@@ -422,7 +550,7 @@ cglobal %1, 2,2,%7
cglobal %1, 2,2,11
pxor m7, m7
%endif
%if mmsize==16 && %3!=256
%if mmsize>=16 && %3!=256
add r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
@@ -497,6 +625,9 @@ cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT sub16x16_dct8_sse2, sub8x8_dct8_sse2, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT sub16x16_dct8_avx, sub8x8_dct8_avx, 128, 8, 0, 0, 11
INIT_YMM
ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
@@ -1608,4 +1739,42 @@ INIT_XMM sse2
ZIGZAG_8x8_CAVLC
INIT_XMM avx
ZIGZAG_8x8_CAVLC
INIT_YMM avx2
cglobal zigzag_interleave_8x8_cavlc, 3,3,6
mova m0, [r1+ 0]
mova m1, [r1+32]
mova m2, [r1+64]
mova m3, [r1+96]
mova m5, [deinterleave_shufd]
SBUTTERFLY wd, 0, 1, 4
SBUTTERFLY wd, 2, 3, 4
SBUTTERFLY wd, 0, 1, 4
SBUTTERFLY wd, 2, 3, 4
vpermd m0, m5, m0
vpermd m1, m5, m1
vpermd m2, m5, m2
vpermd m3, m5, m3
mova [r0+ 0], xm0
mova [r0+ 16], xm2
vextracti128 [r0+ 32], m0, 1
vextracti128 [r0+ 48], m2, 1
mova [r0+ 64], xm1
mova [r0+ 80], xm3
vextracti128 [r0+ 96], m1, 1
vextracti128 [r0+112], m3, 1
packsswb m0, m2 ; nnz0, nnz1
packsswb m1, m3 ; nnz2, nnz3
packsswb m0, m1 ; {nnz0,nnz2}, {nnz1,nnz3}
vpermq m0, m0, q3120 ; {nnz0,nnz1}, {nnz2,nnz3}
pxor m5, m5
pcmpeqq m0, m5
pmovmskb r0d, m0
not r0d
and r0d, 0x01010101
mov [r2+0], r0w
shr r0d, 16
mov [r2+8], r0w
RET
%endif ; !HIGH_BIT_DEPTH
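A scalar model of what this routine computes may help: each of the four 4x4 sub-blocks of the zigzagged 8x8 block is gathered by taking every fourth coefficient, and a 0/1 non-zero flag is written per sub-block at nnz offsets 0, 1, 8 and 9, matching the word stores to [r2+0] and [r2+8] above. The sketch below mirrors x264's C fallback as far as I recall it; treat it as illustrative rather than authoritative.

#include <stdint.h>

/* Scalar sketch, int16_t coefficients as in the 8-bit-depth build. */
static void zigzag_interleave_8x8_cavlc_c( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    for( int i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( int j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];          /* every fourth coefficient belongs to block i */
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;  /* flags land at offsets 0, 1, 8, 9 */
    }
}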
@@ -40,6 +40,8 @@ void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
@@ -56,8 +58,10 @@ void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] );
void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] );
void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
@@ -82,6 +86,7 @@ void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix
void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
@@ -118,5 +123,6 @@ int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, u
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
#endif
@@ -34,7 +34,8 @@
SECTION_RODATA 32
ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
pw_512: times 16 dw 512
ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
times 8 db 2
times 8 db 4
@@ -53,6 +54,7 @@ cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
cextern pd_32
cextern deinterleave_shufd
;=============================================================================
; implicit weighted biprediction
@@ -141,8 +143,7 @@ cextern pd_32
movh m1, %2
punpcklbw m0, m1
pmaddubsw m0, m3
paddw m0, m4
psraw m0, 6
pmulhrsw m0, m4
%endmacro
%macro BIWEIGHT_START_SSSE3 0
@@ -151,9 +152,13 @@ cextern pd_32
sub t7d, t6d
shl t7d, 8
add t6d, t7d
movd m3, t6d
mova m4, [pw_32]
mova m4, [pw_512]
movd xm3, t6d
%if cpuflag(avx2)
vpbroadcastw m3, xm3
%else
SPLATW m3, m3 ; weight_dst,src
%endif
%endmacro
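The weighted-biprediction tweak above replaces the old "paddw m0, [pw_32]; psraw m0, 6" rounding with a single pmulhrsw against pw_512: pmulhrsw computes (a*b + 0x4000) >> 15 per lane, and with b = 512 that is exactly (a + 32) >> 6. A small self-contained check of the equivalence (illustrative, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Scalar model of one pmulhrsw lane. */
static int16_t pmulhrsw_scalar( int16_t a, int16_t b )
{
    return (int16_t)( ((int32_t)a * b + (1 << 14)) >> 15 );
}

int main( void )
{
    for( int32_t a = -32768; a <= 32767; a++ )
        assert( pmulhrsw_scalar( (int16_t)a, 512 ) == ((a + 32) >> 6) );  /* old paddw+psraw result */
    return 0;
}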
%if HIGH_BIT_DEPTH
@@ -244,6 +249,25 @@ AVG_WEIGHT 4
INIT_XMM ssse3
AVG_WEIGHT 8, 7
AVG_WEIGHT 16, 7
INIT_YMM avx2
cglobal pixel_avg_weight_w16
BIWEIGHT_START
AVG_START 5
.height_loop:
movu xm0, [t2]
movu xm1, [t4]
vinserti128 m0, m0, [t2+t3], 1
vinserti128 m1, m1, [t4+t5], 1