Commit abc2283e authored by Daniel Kang, committed by Fiona Glaser

Add AVX functions where 3+ arg commands are useful

parent 7f918d15
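
The title refers to the three-operand (non-destructive) instruction forms that AVX's VEX encoding adds: x264's x86inc macros accept a separate destination argument, which under INIT_AVX assembles to a single VEX instruction, whereas the SSE paths need an explicit register copy first. A minimal illustration of the pattern this patch removes (register names are illustrative, borrowed from the nal_escape hunk below):

    ; SSE2: two-operand pcmpeqb overwrites its first operand, so the
    ; input has to be saved with an extra mova before the compare
    mova    m3, m1
    pcmpeqb m3, m4          ; m3 = (m1 == m4), one copy + one compare

    ; AVX: the three-argument macro form maps to vpcmpeqb, which writes
    ; a separate destination and leaves m1 untouched -- no copy needed
    pcmpeqb m3, m1, m4      ; m3 = (m1 == m4), single instruction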
@@ -42,6 +42,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
#if HAVE_MMX
uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif
/****************************************************************************
@@ -93,5 +94,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->nal_escape = x264_nal_escape_mmxext;
if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
pf->nal_escape = x264_nal_escape_sse2;
if( cpu&X264_CPU_AVX )
pf->nal_escape = x264_nal_escape_avx;
#endif
}
@@ -439,6 +439,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
}
if( cpu&X264_CPU_AVX )
{
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@@ -494,6 +504,19 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_SSE4 )
dctf->add4x4_idct = x264_add4x4_idct_sse4;
if( cpu&X264_CPU_AVX )
{
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
dctf->sub8x8_dct = x264_sub8x8_dct_avx;
dctf->sub16x16_dct = x264_sub16x16_dct_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
}
#endif //HAVE_MMX
#if HAVE_ALTIVEC
@@ -738,6 +761,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
if( cpu&X264_CPU_SSE4 )
pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
if( cpu&X264_CPU_AVX )
pf->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#endif // HAVE_MMX
#else
#if HAVE_MMX
@@ -751,6 +776,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
}
if( cpu&X264_CPU_AVX )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
#if ARCH_X86_64
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_avx;
#endif
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
@@ -772,6 +804,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
}
#if ARCH_X86_64
if( cpu&X264_CPU_AVX )
{
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
}
#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
@@ -789,6 +828,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
}
if( cpu&X264_CPU_AVX )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
#if ARCH_X86_64
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
#endif
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
@@ -806,11 +854,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
#if HIGH_BIT_DEPTH
if( cpu&X264_CPU_SSE2 )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
if( cpu&X264_CPU_AVX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
#else
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
if( cpu&X264_CPU_AVX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
#endif // HIGH_BIT_DEPTH
#endif
}
@@ -437,13 +437,21 @@ void x264_macroblock_deblock( x264_t *h )
#if HAVE_MMX
void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
@@ -453,6 +461,9 @@ void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
#if ARCH_X86
void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
@@ -537,6 +548,21 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
}
if( cpu&X264_CPU_SSSE3 )
pf->deblock_strength = x264_deblock_strength_ssse3;
if( cpu&X264_CPU_AVX )
{
pf->deblock_strength = x264_deblock_strength_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
pf->deblock_chroma[0] = x264_deblock_h_chroma_avx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx;
}
}
}
#endif
@@ -488,6 +488,7 @@ SATD_X_DECL7( _mmxext )
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
#endif // !HIGH_BIT_DEPTH
#endif
@@ -1030,6 +1031,32 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
/* Slower on Conroe, so only enable under SSE4 */
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
if( cpu&X264_CPU_AVX )
{
INIT7( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
pixf->ads[PIXEL_16x8] = x264_pixel_ads2_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
}
INIT5( ssd, _avx );
#if ARCH_X86_64
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
}
#endif //HAVE_MMX
#if HAVE_ARMV6
@@ -477,6 +477,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
if( cpu&X264_CPU_AVX )
{
pf->dequant_4x4 = x264_dequant_4x4_avx;
pf->dequant_8x8 = x264_dequant_8x8_avx;
pf->dequant_4x4_dc = x264_dequant_4x4dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
#endif // HAVE_MMX
@@ -44,19 +44,17 @@ SECTION .text
jmp %1_continue
ALIGN 16
%1:
mova m3, m1
mova m2, m0
pcmpeqb m1, m4
pcmpeqb m0, m4
pmovmskb r3d, m1
%2 [r0+r1], m2
pmovmskb r4d, m0
pcmpeqb m3, m1, m4
pcmpeqb m2, m0, m4
pmovmskb r3d, m3
%2 [r0+r1], m0
pmovmskb r4d, m2
shl r3d, mmsize
mova m0, [r1+r2+2*mmsize]
or r4d, r3d
mova m1, [r1+r2+3*mmsize]
%2 [r0+r1+mmsize], m1
lea r3d, [r4+r4+1]
%2 [r0+r1+mmsize], m3
mova m1, [r1+r2+3*mmsize]
and r4d, r3d
jnz %1_escape
%1_continue:
@@ -129,3 +127,5 @@ INIT_MMX
NAL_ESCAPE mmxext
INIT_XMM
NAL_ESCAPE sse2
INIT_AVX
NAL_ESCAPE avx
@@ -39,48 +39,39 @@ cextern hsub_mul
; in: m0..m7
; out: 0,4,6 in mem, rest in regs
%macro DCT8_1D 9
SUMSUB_BA w, m%8, m%1 ; %8 = s07, %1 = d07
SUMSUB_BA w, m%7, m%2 ; %7 = s16, %2 = d16
SUMSUB_BA w, m%6, m%3 ; %6 = s25, %3 = d25
SUMSUB_BA w, m%5, m%4 ; %5 = s34, %4 = d34
SUMSUB_BA w, m%5, m%8 ; %5 = a0, %8 = a2
SUMSUB_BA w, m%6, m%7 ; %6 = a1, %7 = a3
SUMSUB_BA w, m%6, m%5 ; %6 = dst0, %5 = dst4
SUMSUB_BA w, %8, %1 ; %8 = s07, %1 = d07
SUMSUB_BA w, %7, %2 ; %7 = s16, %2 = d16
SUMSUB_BA w, %6, %3 ; %6 = s25, %3 = d25
SUMSUB_BA w, %5, %4 ; %5 = s34, %4 = d34
SUMSUB_BA w, %5, %8 ; %5 = a0, %8 = a2
SUMSUB_BA w, %6, %7 ; %6 = a1, %7 = a3
SUMSUB_BA w, %6, %5 ; %6 = dst0, %5 = dst4
mova [%9+0x00], m%6
mova [%9+0x40], m%5
mova m%6, m%7 ; a3
psraw m%6, 1 ; a3>>1
psraw m%6, m%7, 1 ; a3>>1
paddw m%6, m%8 ; a2 + (a3>>1)
psraw m%8, 1 ; a2>>1
psubw m%8, m%7 ; (a2>>1) - a3
mova [%9+0x60], m%8
mova m%5, m%3
psraw m%5, 1
psraw m%5, m%3, 1
paddw m%5, m%3 ; d25+(d25>>1)
mova m%7, m%1
psubw m%7, m%4 ; a5 = d07-d34-(d25+(d25>>1))
psubw m%7, m%1, m%4 ; a5 = d07-d34-(d25+(d25>>1))
psubw m%7, m%5
mova m%5, m%2
psraw m%5, 1
psraw m%5, m%2, 1
paddw m%5, m%2 ; d16+(d16>>1)
mova m%8, m%1
paddw m%8, m%4
paddw m%8, m%1, m%4
psubw m%8, m%5 ; a6 = d07+d34-(d16+(d16>>1))
mova m%5, m%1
psraw m%5, 1
psraw m%5, m%1, 1
paddw m%5, m%1 ; d07+(d07>>1)
paddw m%5, m%2
paddw m%5, m%3 ; a4 = d16+d25+(d07+(d07>>1))
mova m%1, m%4
psraw m%1, 1
psraw m%1, m%4, 1
paddw m%1, m%4 ; d34+(d34>>1)
paddw m%1, m%2
psubw m%1, m%3 ; a7 = d16-d25+(d34+(d34>>1))
mova m%4, m%1
psraw m%4, 2
psraw m%4, m%1, 2
paddw m%4, m%5 ; a4 + (a7>>2)
mova m%3, m%8
psraw m%3, 2
psraw m%3, m%8, 2
paddw m%3, m%7 ; a5 + (a6>>2)
psraw m%5, 2
psraw m%7, 2
@@ -92,22 +83,18 @@ cextern hsub_mul
; in: 0,4 in mem, rest in regs
; out: m0..m7
%macro IDCT8_1D 9
mova m%1, m%3
mova m%5, m%7
psraw m%3, 1
psraw m%7, 1
psubw m%3, m%5
paddw m%7, m%1
mova m%5, m%2
psraw m%5, 1
paddw m%5, m%2
paddw m%5, m%4
paddw m%5, m%6
mova m%1, m%6
psraw m%1, 1
paddw m%1, m%6
paddw m%1, m%8
psubw m%1, m%2
psraw m%1, m%3, 1
psraw m%5, m%7, 1
psubw m%1, m%7
paddw m%5, m%3
psraw m%7, m%2, 1
paddw m%7, m%2
paddw m%7, m%4
paddw m%7, m%6
psraw m%3, m%6, 1
paddw m%3, m%6
paddw m%3, m%8
psubw m%3, m%2
psubw m%2, m%4
psubw m%6, m%4
paddw m%2, m%8
@@ -116,25 +103,25 @@ INIT_XMM
psraw m%8, 1
psubw m%2, m%4
psubw m%6, m%8
mova m%4, m%5
mova m%8, m%1
psraw m%4, 2
psraw m%8, 2
psraw m%4, m%7, 2
psraw m%8, m%3, 2
paddw m%4, m%6
paddw m%8, m%2
psraw m%6, 2
psraw m%2, 2
psubw m%5, m%6
psubw m%2, m%1
mova m%1, [%9+0x00]
psubw m%7, m%6
psubw m%2, m%3
mova m%3, [%9+0x00]
mova m%6, [%9+0x40]
SUMSUB_BA w, m%6, m%1
SUMSUB_BA w, m%7, m%6
SUMSUB_BA w, m%3, m%1
SUMSUB_BA w, m%5, m%7
SUMSUB_BA w, m%2, m%3
SUMSUB_BA w, m%8, m%1
SUMSUB_BA w, m%4, m%6
SUMSUB_BA w, %6, %3
SUMSUB_BA w, %5, %6
SUMSUB_BA w, %1, %3
SUMSUB_BA w, %7, %5
SUMSUB_BA w, %2, %1
SUMSUB_BA w, %8, %3
SUMSUB_BA w, %4, %6
SWAP %1, %3
SWAP %5, %7
SWAP %1, %5, %6
SWAP %3, %8, %7
%endmacro
@@ -246,9 +233,8 @@ idct8_mmx:
%macro ADD_STORE_ROW 3
movq m1, [r0+%1*FDEC_STRIDE]
movq m2, m1
punpckhbw m2, m1, m0
punpcklbw m1, m0
punpckhbw m2, m0
paddw m1, %2
paddw m2, %3
packuswb m1, m2
@@ -344,7 +330,6 @@ global add8x8_idct8_mmx.skip_prologue
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
INIT_XMM
%macro DCT_SUB8 1
cglobal sub8x8_dct_%1, 3,3
add r2, 4*FDEC_STRIDE
@@ -411,6 +396,7 @@ global sub8x8_dct8_%1.skip_prologue
ret
%endmacro
INIT_XMM
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps
%define punpcklqdq movlhps
@@ -419,13 +405,16 @@ DCT_SUB8 sse2
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
INIT_AVX
DCT_SUB8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct_sse2, 2,2
%macro ADD8x8 1
cglobal add8x8_idct_%1, 2,2
add r0, 4*FDEC_STRIDE
global add8x8_idct_sse2.skip_prologue
global add8x8_idct_%1.skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
@@ -456,13 +445,20 @@ global add8x8_idct_sse2.skip_prologue
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
%endmacro ; ADD8x8
INIT_XMM
ADD8x8 sse2
INIT_AVX
ADD8x8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct8_sse2, 2,2
%macro ADD8x8_IDCT8 1
cglobal add8x8_idct8_%1, 2,2
add r0, 4*FDEC_STRIDE
global add8x8_idct8_sse2.skip_prologue
global add8x8_idct8_%1.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D 0,1,2,3,4,5,6,7,r1
@@ -480,4 +476,10 @@ global add8x8_idct8_sse2.skip_prologue
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM
ADD8x8_IDCT8 sse2
INIT_AVX
ADD8x8_IDCT8 avx
%endif ; !HIGH_BIT_DEPTH
@@ -37,27 +37,25 @@ cextern hsub_mul
INIT_XMM
%macro DCT8_1D 10
SUMSUB_BA w, m%5, m%4 ; %5=s34, %4=d34
SUMSUB_BA w, m%6, m%3 ; %6=s25, %3=d25
SUMSUB_BA w, m%7, m%2 ; %7=s16, %2=d16
SUMSUB_BA w, m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA w, %5, %4 ; %5=s34, %4=d34
SUMSUB_BA w, %6, %3 ; %6=s25, %3=d25
SUMSUB_BA w, %7, %2 ; %7=s16, %2=d16
SUMSUB_BA w, %8, %1 ; %8=s07, %1=d07
SUMSUB_BA w, m%6, m%7, m%10 ; %6=a1, %7=a3
SUMSUB_BA w, m%5, m%8, m%10 ; %5=a0, %8=a2
SUMSUB_BA w, %6, %7, %10 ; %6=a1, %7=a3
SUMSUB_BA w, %5, %8, %10 ; %5=a0, %8=a2
movdqa m%9, m%1
psraw m%9, 1
psraw m%9, m%1, 1
paddw m%9, m%1
paddw m%9, m%2
paddw m%9, m%3 ; %9=a4
movdqa m%10, m%4
psraw m%10, 1
psraw m%10, m%4, 1
paddw m%10, m%4
paddw m%10, m%2
psubw m%10, m%3 ; %10=a7
SUMSUB_BA w, m%4, m%1
SUMSUB_BA w, %4, %1
psubw m%1, m%3
psubw m%4, m%2
psraw m%3, 1
@@ -65,22 +63,19 @@ INIT_XMM
psubw m%1, m%3 ; %1=a5
psubw m%4, m%2 ; %4=a6
movdqa m%2, m%10
psraw m%2, 2
psraw m%2, m%10, 2
paddw m%2, m%9 ; %2=b1
psraw m%9, 2
psubw m%9, m%10 ; %9=b7
SUMSUB_BA w, m%6, m%5, m%10 ; %6=b0, %5=b4
SUMSUB_BA w, %6, %5, %10 ; %6=b0, %5=b4
movdqa m%3, m%7
psraw m%3, 1
psraw m%3, m%7, 1
paddw m%3, m%8 ; %3=b2
psraw m%8, 1
psubw m%8, m%7 ; %8=b6
movdqa m%7, m%4
psraw m%7, 2
psraw m%7, m%4, 2
paddw m%7, m%1 ; %7=b3
psraw m%1, 2
psubw m%4, m%1 ; %4=b5
@@ -89,25 +84,22 @@ INIT_XMM
%endmacro
%macro IDCT8_1D 10
SUMSUB_BA w, m%5, m%1, m%9 ; %5=a0, %1=a2
SUMSUB_BA w, %5, %1, %9 ; %5=a0, %1=a2
movdqa m%9, m%2
psraw m%9, 1
psraw m%9, m%2, 1
paddw m%9, m%2
paddw m%9, m%4
paddw m%9, m%6 ; %9=a7
paddw m%9, m%6 ; %9=a7
movdqa m%10, m%3
psraw m%3, 1
psubw m%3, m%7 ; %3=a4
psraw m%10, m%3, 1
psubw m%10, m%7 ; %10=a4
psraw m%7, 1
paddw m%7, m%10 ; %7=a6
paddw m%7, m%3 ; %7=a6
movdqa m%10, m%6
psraw m%10, 1
paddw m%10, m%6
paddw m%10, m%8
psubw m%10, m%2 ; %10=a5
psraw m%3, m%6, 1
paddw m%3, m%6
paddw m%3, m%8
psubw m%3, m%2 ; %3=a5
psubw m%2, m%4
psubw m%6, m%4
@@ -115,31 +107,30 @@ INIT_XMM
psubw m%6, m%8
psraw m%4, 1
psraw m%8, 1
psubw m%2, m%4 ; %2=a3
psubw m%6, m%8 ; %6=a1
psubw m%2, m%4 ; %2=a3
psubw m%6, m%8 ; %6=a1
movdqa m%4, m%9
psraw m%4, 2
paddw m%4, m%6 ; %4=b1
psraw m%4, m%9, 2
paddw m%4, m%6 ; %4=b1
psraw m%6, 2
psubw m%9, m%6 ; %9=b7
psubw m%9, m%6 ; %9=b7
SUMSUB_BA w, m%7, m%5, m%6 ; %7=b0, %5=b6
SUMSUB_BA w, m%3, m%1, m%6 ; %3=b2, %1=b4
SUMSUB_BA w, %7, %5, %6 ; %7=b0, %5=b6
SUMSUB_BA w, %10, %1, %6 ; %10=b2, %1=b4
movdqa m%8, m%10
psraw m%8, 2
psraw m%8, m%3, 2
paddw m%8, m%2 ; %8=b3
psraw m%2, 2
psubw m%2, m%10 ; %2=b5
psubw m%2, m%3 ; %2=b5
SUMSUB_BA w, m%9, m%7, m%6 ; %9=c0, %7=c7
SUMSUB_BA w, m%2, m%3, m%6 ; %2=c1, %3=c6
SUMSUB_BA w, m%8, m%1, m%6 ; %8=c2, %1=c5
SUMSUB_BA w, m%4, m%5, m%6 ; %4=c3, %5=c4
SUMSUB_BA w, %9, %7, %6 ; %9=c0, %7=c7
SUMSUB_BA w, %2, %10, %6 ; %2=c1, %10=c6
SUMSUB_BA w, %8, %1, %6 ; %8=c2, %1=c5
SUMSUB_BA w, %4, %5, %6 ; %4=c3, %5=c4
SWAP %1, %9, %6
SWAP %3, %8, %7
SWAP %10, %3
SWAP %1, %9, %6
SWAP %3, %8, %7
%endmacro
%macro DCT_SUB8 1
@@ -206,18 +197,21 @@ DCT_SUB8 sse2
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
INIT_AVX
DCT_SUB8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct8_sse2, 2,2,11
%macro ADD8x8_IDCT8 1
cglobal add8x8_idct8_%1, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global add8x8_idct8_sse2.skip_prologue
global add8x8_idct8_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
@@ -238,18 +232,25 @@ global add8x8_idct8_sse2.skip_prologue
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM
ADD8x8_IDCT8 sse2
INIT_AVX
ADD8x8_IDCT8 avx
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal add8x8_idct_sse2, 2,2,11
%macro ADD8x8 1
cglobal add8x8_idct_%1, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global add8x8_idct_sse2.skip_prologue
global add8x8_idct_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
@@ -278,4 +279,10 @@ global add8x8_idct_sse2.skip_prologue
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
%endmacro ; ADD8x8
INIT_XMM
ADD8x8 sse2
INIT_AVX
ADD8x8 avx
%endif ; !HIGH_BIT_DEPTH
@@ -60,8 +60,8 @@ cextern pd_1
cextern pd_32
%macro WALSH4_1D 6
SUMSUB_BADC %1, m%5, m%4, m%3, m%2, m%6
SUMSUB_BADC %1, m%5, m%3, m%4, m%2, m%6
SUMSUB_BADC %1, %5, %4, %3, %2, %6
SUMSUB_BADC %1, %5, %3, %4, %2, %6
SWAP %2, %5, %4
%endmacro
@@ -86,11 +86,11 @@ cextern pd_32
%endmacro
%ifdef HIGH_BIT_DEPTH
INIT_XMM
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
cglobal dct4x4dc_sse2, 1,1,5
%macro DCT4x4_DC 1
cglobal dct4x4dc_%1, 1,1,5
mova m0, [r0+ 0]
mova m1, [r0+16]
mova m2, [r0+32]
@@ -108,6 +108,12 @@ cglobal dct4x4dc_sse2, 1,1,5
mova [r0+32], m2
mova [r0+48], m3
RET
%endmacro ; DCT4x4_DC
INIT_XMM
DCT4x4_DC sse2
INIT_AVX
DCT4x4_DC avx
%else
INIT_MMX
@@ -119,7 +125,7 @@ cglobal dct4x4dc_mmx, 1,1
movq m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
WALSH4_1D w, 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
SUMSUB_BADC w, m1, m0, m3, m2, m4
SUMSUB_BADC w, 1, 0, 3, 2, 4
SWAP 0, 1