Commit b63a73da authored by Fiona Glaser

Some more 4:2:2 x86 asm

coeff_last8, coeff_level_run8, var2_8x16, predict_8x16c_dc, satd_4x16, intra_mbcmp_8x16c_x3, deblock_h_chroma_422
parent 50aaf8d8
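
For reference, here is roughly the scalar behavior that the new coeff_last8/coeff_level_run8 routines accelerate, as a hedged sketch modeled on x264's generic C fallbacks rather than the literal code (the 8-coefficient case exists because 4:2:2 chroma DC is a 2x4 transform):

    #include <stdint.h>
    typedef int16_t dctcoef;                                 /* 8-bit depth */
    typedef struct { int last; dctcoef level[16]; } x264_run_level_t; /* simplified for illustration */

    /* index of the last nonzero coefficient, or -1 if all are zero */
    static int coeff_last8( dctcoef *l )
    {
        int i_last = 7;
        while( i_last >= 0 && !l[i_last] )
            i_last--;
        return i_last;
    }

    /* walk back from the last coefficient collecting nonzero levels;
     * assumes at least one nonzero coefficient is present */
    static int coeff_level_run8( dctcoef *dct, x264_run_level_t *runlevel )
    {
        int i_last = runlevel->last = coeff_last8( dct );
        int i_total = 0;
        do
        {
            runlevel->level[i_total++] = dct[i_last];
            while( --i_last >= 0 && !dct[i_last] )
                ;
        } while( i_last >= 0 );
        return i_total;
    }
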
@@ -647,6 +647,9 @@ void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
@@ -736,6 +739,9 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_mmx2;
#endif
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
@@ -745,12 +751,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_sse2;
#endif
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
@@ -762,12 +771,15 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
if( cpu&X264_CPU_AVX )
{
pf->deblock_strength = x264_deblock_strength_avx;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
#if !HIGH_BIT_DEPTH
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_avx;
#endif
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
@@ -547,7 +547,8 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
#if HIGH_BIT_DEPTH && HAVE_MMX
#if HAVE_MMX
#if HIGH_BIT_DEPTH
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
@@ -559,6 +560,17 @@ INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
#else
#define x264_predict_8x16c_v_mmx2 x264_predict_8x16c_v_mmx
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse2, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _ssse3, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _sse4, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 )
#endif
#endif
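
Schematically, the INTRA_MBCMP instantiations above generate the new intra_mbcmp_8x16c_x3 helpers from the commit message: predict each of the three 8x16 chroma modes into fdec, then score it against fenc. A sketch of the expansion, relying on x264's pixel/FDEC_STRIDE definitions and hedged on exact argument order, so treat it as illustrative rather than the exact preprocessor output:

    static void x264_intra_sad_x3_8x16c_mmx2( pixel *fenc, pixel *fdec, int res[3] )
    {
        x264_predict_8x16c_dc_mmx2( fdec );
        res[0] = x264_pixel_sad_8x16_mmx2( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
        x264_predict_8x16c_h_mmx2( fdec );
        res[1] = x264_pixel_sad_8x16_mmx2( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
        x264_predict_8x16c_v_mmx2( fdec );
        res[2] = x264_pixel_sad_8x16_mmx2( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
    }
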
// No C implementation of intra_satd_x9. See checkasm for its behavior,
@@ -820,17 +832,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
INIT7( satd, _mmx2 );
INIT8( satd, _mmx2 );
INIT7( satd_x3, _mmx2 );
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT7( ssd, _mmx2 );
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
@@ -856,6 +871,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
@@ -941,7 +957,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8_NAME( sad_aligned, sad, _mmx2 );
INIT7( sad_x3, _mmx2 );
INIT7( sad_x4, _mmx2 );
INIT7( satd, _mmx2 );
INIT8( satd, _mmx2 );
INIT7( satd_x3, _mmx2 );
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
@@ -956,6 +972,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
@@ -984,6 +1001,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_mmx2;
pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_mmx2;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
@@ -1005,6 +1024,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
pixf->vsad = x264_pixel_vsad_sse2;
}
@@ -1014,6 +1034,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT6( satd, _sse2 );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1024,6 +1045,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse2;
pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( ssd, _sse2); /* faster for width 16 on p4 */
@@ -1083,15 +1106,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _ssse3 );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
INIT7( satd, _ssse3 );
INIT8( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
}
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
@@ -1106,7 +1131,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_SSE4 )
{
INIT7( satd, _sse4 );
INIT8( satd, _sse4 );
INIT7( satd_x3, _sse4 );
INIT7( satd_x4, _sse4 );
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1121,11 +1146,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
}
if( cpu&X264_CPU_AVX )
{
INIT7( satd, _avx );
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
INIT_ADS( _avx );
@@ -1142,6 +1168,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( ssd, _avx );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_avx;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx;
@@ -1163,10 +1190,12 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( ssd, _xop );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
}
#endif //HAVE_MMX
@@ -435,13 +435,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
@@ -464,17 +466,21 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_sse2_slowctz;
pf->decimate_score16 = x264_decimate_score16_sse2_slowctz;
}
pf->coeff_last8 = x264_coeff_last8_sse2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
@@ -555,11 +561,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
}
}
@@ -1881,6 +1881,48 @@ INIT_MMX mmx2
DEBLOCK_CHROMA
%endif
%macro DEBLOCK_H_CHROMA_422 0
cglobal deblock_h_chroma_422, 5,7,8
%ifdef ARCH_X86_64
%define cntr r11
%else
%define cntr dword r0m
%endif
dec r2d
dec r3d
sub r0, 4
lea t6, [r1*3]
mov t5, r0
add r0, t6
mov cntr, 32/mmsize
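; 4:2:2 chroma blocks are 8x16, so this filters a 16-row edge: the pointer
; advance of r1*(mmsize/2) per pass times 32/mmsize iterations covers 16 rows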
.skip_prologue:
TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
%if mmsize == 16
punpcklbw m6, m6
punpcklbw m6, m6
%else
pshufw m6, m6, q0000
%endif
pand m7, m6
DEBLOCK_P0_Q0
TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
lea r0, [r0+r1*(mmsize/2)]
lea t5, [t5+r1*(mmsize/2)]
add r4, mmsize/8
dec cntr
jg .skip_prologue
REP_RET
%endmacro
INIT_MMX mmx2
DEBLOCK_H_CHROMA_422
INIT_XMM sse2
DEBLOCK_H_CHROMA_422
INIT_XMM avx
DEBLOCK_H_CHROMA_422
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
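; e.g. p1=70, p0=60, q1=62: p0' = (60 + 62 + 2*70 + 2) >> 2 = 66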
@@ -213,6 +213,7 @@ cglobal pixel_ssd_%1x%2, 4,5
INIT_MMX mmx2
SSD_ONE 4, 4
SSD_ONE 4, 8
SSD_ONE 4, 16
SSD_ONE 8, 4
SSD_ONE 8, 8
SSD_ONE 8, 16
@@ -806,12 +807,12 @@ INIT_XMM xop
VAR
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_END 0
%macro VAR2_END 1
HADDW m5, m7
movd r1d, m5
imul r1d, r1d
HADDD m6, m1
shr r1d, 6
shr r1d, %1
movd eax, m6
mov [r4], eax
sub eax, r1d ; sqr - (sum * sum >> shift)
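
The shift becomes a parameter because var2 returns sum_sq - sum^2/N over the block of differences: N=64 for 8x8 gives shift 6, and N=128 for 8x16 gives shift 7. A scalar sketch of the semantics, inferred from the asm above (hypothetical helper, 8-bit case):

    #include <stdint.h>
    static int var2_8xh( const uint8_t *a, int a_stride,
                         const uint8_t *b, int b_stride, int *ssd, int h )
    {
        int sum = 0, sqr = 0;
        for( int y = 0; y < h; y++, a += a_stride, b += b_stride )
            for( int x = 0; x < 8; x++ )
            {
                int d = a[x] - b[x];
                sum += d;
                sqr += d*d;
            }
        *ssd = sqr;                                /* stored via [r4] above */
        return sqr - ((sum * sum) >> (h == 8 ? 6 : 7));
    }
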
@@ -821,11 +822,11 @@ VAR
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var2_8x8, 5,6
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, 8
mov r5d, %1
.loop:
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
@@ -854,13 +855,19 @@ cglobal pixel_var2_8x8, 5,6
add r2, r3
dec r5d
jg .loop
VAR2_END
RET
VAR2_END %2
%endmacro
INIT_XMM sse2
cglobal pixel_var2_8x8, 5,6,8
%ifndef ARCH_X86_64
INIT_MMX mmx2
VAR2_8x8_MMX 8, 6
VAR2_8x8_MMX 16, 7
%endif
%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
VAR_START 1
mov r5d, 4
mov r5d, %1/2
.loop:
%ifdef HIGH_BIT_DEPTH
mova m0, [r0]
@@ -886,16 +893,20 @@ cglobal pixel_var2_8x8, 5,6,8
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
VAR2_END
RET
VAR2_END %2
%endmacro
INIT_XMM sse2
VAR2_8x8_SSE2 8, 6
VAR2_8x8_SSE2 16, 7
%ifndef HIGH_BIT_DEPTH
%macro VAR2_8x8 0
cglobal pixel_var2_8x8, 5,6,8
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
mov r5d, 2
mov r5d, %1/4
.loop:
movq m0, [r0]
movq m2, [r2]
@@ -931,14 +942,15 @@ cglobal pixel_var2_8x8, 5,6,8
lea r2, [r2+r3*2]
dec r5d
jg .loop
VAR2_END
RET
VAR2_END %2
%endmacro
INIT_XMM ssse3
VAR2_8x8
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
INIT_XMM xop
VAR2_8x8
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
%endif ; !HIGH_BIT_DEPTH
@@ -1215,6 +1227,17 @@ cglobal pixel_satd_8x4, 4,6
call pixel_satd_8x4_internal_mmx2
SATD_END_MMX
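; 4x16: four vertically stacked 4x4 SATDs accumulated in m0 (judging from
; its usage here, the final SATD_4x4_MMX argument advances the source
; pointers between tiles, so the last call passes 0)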
cglobal pixel_satd_4x16, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
SATD_4x4_MMX m1, 0, 1
paddw m0, m1
SATD_4x4_MMX m1, 0, 1
paddw m0, m1
SATD_4x4_MMX m1, 0, 0
paddw m0, m1
SATD_END_MMX
cglobal pixel_satd_4x8, 4,6
SATD_START_MMX
SATD_4x4_MMX m0, 0, 1
@@ -1261,32 +1284,7 @@ cglobal pixel_satd_4x4, 4,6
%endif
%endmacro
;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%if cpuflag(ssse3)
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
HADAMARD 0, sumsub, 0, 1, 2, 3
HADAMARD 4, sumsub, 0, 1, 2, 3
HADAMARD 1, amax, 0, 1, 2, 3
HADDW m0, m1
movd eax, m0
RET
%endif
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
%if cpuflag(ssse3)
mova m7, [hmul_4p]
%endif
%macro SATD_4x8_SSE 2
movd m4, [r2]
movd m5, [r2+r3]
movd m6, [r2+2*r3]
@@ -1303,7 +1301,12 @@ cglobal pixel_satd_4x8, 4, 6, 8
JDUP m5, m3
movd m3, [r0+2*r1]
JDUP m1, m3
%if cpuflag(ssse3) && %1==1
mova m3, [hmul_4p]
DIFFOP 0, 4, 1, 5, 3
%else
DIFFOP 0, 4, 1, 5, 7
%endif
movd m5, [r2]
add r2, r5
movd m3, [r0]
@@ -1316,10 +1319,57 @@ cglobal pixel_satd_4x8, 4, 6, 8
JDUP m5, m4
movd m4, [r0+r1]
JDUP m3, m4
%if cpuflag(ssse3) && %1==1
mova m4, [hmul_4p]
DIFFOP 2, 6, 3, 5, 4
%else
DIFFOP 2, 6, 3, 5, 7
SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6, swap
HADDW m6, m1
movd eax, m6
%endif
SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
%endmacro
;-----------------------------------------------------------------------------
; int pixel_satd_8x4( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SATDS_SSE2 0
%if cpuflag(ssse3)
cglobal pixel_satd_4x4, 4, 6, 6
SATD_START_MMX
mova m4, [hmul_4p]
LOAD_DUP_2x4P m2, m5, [r2], [r2+r3]
LOAD_DUP_2x4P m3, m5, [r2+2*r3], [r2+r5]
LOAD_DUP_2x4P m0, m5, [r0], [r0+r1]
LOAD_DUP_2x4P m1, m5, [r0+2*r1], [r0+r4]
DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
HADAMARD 0, sumsub, 0, 1, 2, 3
HADAMARD 4, sumsub, 0, 1, 2, 3
HADAMARD 1, amax, 0, 1, 2, 3
HADDW m0, m1
movd eax, m0
RET
%endif
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
%if cpuflag(ssse3)
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE 0, swap
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_4x16, 4, 6, 8
SATD_START_MMX
%if cpuflag(ssse3)
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE 0, swap
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
SATD_4x8_SSE 1, add
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_8x8_internal
@@ -147,6 +147,10 @@ int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x16_mmx2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x16_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x16_ssse3( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x16_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
@@ -1675,6 +1675,16 @@ PREDICT_C_H 16
; void predict_8x8c_dc( pixel *src )
;-----------------------------------------------------------------------------
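; LOAD_LEFT %1: sum into r1d the four left-neighbour pixels of rows
; %1-4 .. %1-1 relative to r0 (which the caller points 4 rows into the block)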
%macro LOAD_LEFT 1
movzx r1d, pixel [r0+FDEC_STRIDEB*(%1-4)-SIZEOF_PIXEL]
movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-3)-SIZEOF_PIXEL]
add r1d, r2d
movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-2)-SIZEOF_PIXEL]
add r1d, r2d
movzx r2d, pixel [r0+FDEC_STRIDEB*(%1-1)-SIZEOF_PIXEL]
add r1d, r2d
%endmacro
%macro PREDICT_8x8C_DC 0
cglobal predict_8x8c_dc, 1,3
pxor m7, m7
@@ -1691,23 +1701,10 @@ cglobal predict_8x8c_dc, 1,3
%endif
add r0, FDEC_STRIDEB*4
movzx r1d, pixel [r0-FDEC_STRIDEB*4-SIZEOF_PIXEL]
movzx r2d, pixel [r0-FDEC_STRIDEB*3-SIZEOF_PIXEL]
add r1d, r2d
movzx r2d, pixel [r0-FDEC_STRIDEB*2-SIZEOF_PIXEL]
add r1d, r2d
movzx r2d, pixel [r0-FDEC_STRIDEB*1-SIZEOF_PIXEL]
add r1d, r2d
movd m2, r1d ; s2
movzx r1d, pixel [r0+FDEC_STRIDEB*0-SIZEOF_PIXEL]
movzx r2d, pixel [r0+FDEC_STRIDEB*1-SIZEOF_PIXEL]
add r1d, r2d
movzx r2d, pixel [r0+FDEC_STRIDEB*2-SIZEOF_PIXEL]
add r1d, r2d
movzx r2d, pixel [r0+FDEC_STRIDEB*3-SIZEOF_PIXEL]
add r1d, r2d
movd m3, r1d ; s3
LOAD_LEFT 0 ; s2
movd m2, r1d
LOAD_LEFT 4 ; s3
movd m3, r1d
punpcklwd m0, m1