Commit 4d84a45d authored by Fiona Glaser

Add merged SAD for i16x16 analysis

Roughly 30% faster i16x16 analysis under subme=1
parent 2bff5070
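A note on what "merged SAD" means here: instead of generating the V, H, and DC 16x16 predictions one at a time and scoring each with a separate SAD call, the new primitive scores all three modes against the source block in a single pass. A minimal C sketch of the behavior (illustrative only; the commit adds asm implementations, and per the pixel.h comment below the pointer may be NULL, in which case the encoder keeps the unmerged pred+cmp fallback):

```c
#include <stdint.h>
#include <stdlib.h>

#define FENC_STRIDE 16  /* x264's encode-plane stride */
#define FDEC_STRIDE 32  /* x264's decode-plane stride */

/* Illustrative model of intra_sad_x3_16x16: fdec points at the current
 * block in the decoded plane, so its top neighbors live at -FDEC_STRIDE
 * and its left neighbors at offset -1 of each row. */
static void intra_sad_x3_16x16_ref( uint8_t *fenc, uint8_t *fdec, int res[3] )
{
    int dc = 0;
    for( int i = 0; i < 16; i++ )
        dc += fdec[i-FDEC_STRIDE] + fdec[-1+i*FDEC_STRIDE];
    dc = (dc + 16) >> 5;  /* rounded mean of top row + left column */

    res[0] = res[1] = res[2] = 0;
    for( int y = 0; y < 16; y++ )
        for( int x = 0; x < 16; x++ )
        {
            int pix = fenc[x + y*FENC_STRIDE];
            res[0] += abs( pix - fdec[x - FDEC_STRIDE] );    /* V: copy top row */
            res[1] += abs( pix - fdec[-1 + y*FDEC_STRIDE] ); /* H: copy left pixel */
            res[2] += abs( pix - dc );                       /* DC: flat block */
        }
}
```

The result order matches x264's I_PRED_16x16_* numbering (V = 0, H = 1, DC = 2); planar is still predicted and scored separately, as the analyse.c hunk below shows.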
@@ -619,6 +619,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
 #endif
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
         pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
         pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
     }
@@ -630,7 +631,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x4, _sse2 );
         INIT_ADS( _sse2 );
         pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
 #ifdef ARCH_X86
         if( cpu&X264_CPU_CACHELINE_64 )
         {
@@ -673,6 +674,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
         pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
         pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
         pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
 #ifdef ARCH_X86_64
......
@@ -91,12 +91,14 @@ typedef struct
     int (*ads[7])( int enc_dc[4], uint16_t *sums, int delta,
                    uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );

-    /* calculate satd of V, H, and DC modes.
+    /* calculate satd or sad of V, H, and DC modes.
      * may be NULL, in which case just use pred+satd instead. */
-    void (*intra_satd_x3_16x16)( uint8_t *fenc, uint8_t *fdec, int res[3] );
-    void (*intra_satd_x3_8x8c)( uint8_t *fenc, uint8_t *fdec, int res[3] );
-    void (*intra_satd_x3_4x4)( uint8_t *fenc, uint8_t *fdec, int res[3] );
-    void (*intra_sa8d_x3_8x8)( uint8_t *fenc, uint8_t edge[33], int res[3] );
+    void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec   , int res[3] );
+    void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec   , int res[3] );
+    void (*intra_sad_x3_16x16)  ( uint8_t *fenc, uint8_t *fdec   , int res[3] );
+    void (*intra_satd_x3_8x8c)  ( uint8_t *fenc, uint8_t *fdec   , int res[3] );
+    void (*intra_satd_x3_4x4)   ( uint8_t *fenc, uint8_t *fdec   , int res[3] );
+    void (*intra_sa8d_x3_8x8)   ( uint8_t *fenc, uint8_t edge[33], int res[3] );
 } x264_pixel_function_t;

 void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
......
@@ -74,18 +74,21 @@ int x264_pixel_var_16x16_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad );
 int x264_pixel_var_8x8_mmxext ( uint8_t *pix, int i_stride, uint32_t *sad );
 int x264_pixel_var_8x8_sse2   ( uint8_t *pix, int i_stride, uint32_t *sad );

-void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_8x8c_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_mmxext  ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_4x4_ssse3   ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_8x8c_ssse3  ( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_mmxext( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_sse2( uint8_t *, uint8_t *, int * );
-void x264_intra_sa8d_x3_8x8_ssse3( uint8_t *, uint8_t *, int * );
+void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_sse2   ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_ssse3  ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_mmxext  ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_sse2    ( uint8_t *, uint8_t *, int * );
+void x264_intra_sa8d_x3_8x8_ssse3   ( uint8_t *, uint8_t *, int * );
 void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_sse2( uint8_t *, int16_t [2][8], int * );
-void x264_intra_sa8d_x3_8x8_core_ssse3( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_sse2  ( uint8_t *, int16_t [2][8], int * );
+void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
 void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
                                         const uint8_t *pix2, int stride2, int sums[2][4] );
......
@@ -23,8 +23,10 @@
 ;*****************************************************************************
 %include "x86inc.asm"
 %include "x86util.asm"

 SECTION_RODATA
+pb_3: times 16 db 3
+
 sw_64: dd 64

 SECTION .text
@@ -221,6 +223,96 @@ SAD_W16 sse2_aligned

+;-----------------------------------------------------------------------------
+; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+;xmm7: DC prediction    xmm6: H prediction    xmm5: V prediction
+;xmm4: DC pred score    xmm3: H pred score    xmm2: V pred score
+%macro INTRA_SAD16 1
+cglobal x264_intra_sad_x3_16x16_%1,3,5
+    pxor    mm0, mm0
+    pxor    mm1, mm1
+    psadbw  mm0, [r1-FDEC_STRIDE+0]
+    psadbw  mm1, [r1-FDEC_STRIDE+8]
+    paddw   mm0, mm1
+    movd    r3d, mm0
+%ifidn %1, ssse3
+    mova    m1, [pb_3 GLOBAL]
+%endif
+%assign n 0
+%rep 16
+    movzx   r4d, byte [r1-1+FDEC_STRIDE*n]
+    add     r3d, r4d
+%assign n n+1
+%endrep
+    add     r3d, 16
+    shr     r3d, 5
+    imul    r3d, 0x01010101
+    movd    m7, r3d
+    mova    m5, [r1-FDEC_STRIDE]
+%if mmsize==16
+    pshufd  m7, m7, 0
+%else
+    mova    m1, [r1-FDEC_STRIDE+8]
+    punpckldq m7, m7
+%endif
+    pxor    m4, m4
+    pxor    m3, m3
+    pxor    m2, m2
+    mov     r3d, 15*FENC_STRIDE
+.vloop:
+    SPLATB  m6, r1+r3*2-1, m1
+    mova    m0, [r0+r3]
+    psadbw  m0, m7
+    paddw   m4, m0
+    mova    m0, [r0+r3]
+    psadbw  m0, m5
+    paddw   m2, m0
+%if mmsize==8
+    mova    m0, [r0+r3]
+    psadbw  m0, m6
+    paddw   m3, m0
+    mova    m0, [r0+r3+8]
+    psadbw  m0, m7
+    paddw   m4, m0
+    mova    m0, [r0+r3+8]
+    psadbw  m0, m1
+    paddw   m2, m0
+    psadbw  m6, [r0+r3+8]
+    paddw   m3, m6
+%else
+    psadbw  m6, [r0+r3]
+    paddw   m3, m6
+%endif
+    add     r3d, -FENC_STRIDE
+    jge .vloop
+%if mmsize==16
+    pslldq  m3, 4
+    por     m3, m2
+    movhlps m1, m3
+    paddw   m3, m1
+    movq    [r2+0], m3
+    movhlps m1, m4
+    paddw   m4, m1
+%else
+    movd    [r2+0], m2
+    movd    [r2+4], m3
+%endif
+    movd    [r2+8], m4
+    RET
+%endmacro
+
+INIT_MMX
+%define SPLATB SPLATB_MMX
+INTRA_SAD16 mmxext
+INIT_XMM
+INTRA_SAD16 sse2
+%define SPLATB SPLATB_SSSE3
+INTRA_SAD16 ssse3
+
 ;=============================================================================
 ; SAD x3/x4 MMX
 ;=============================================================================
......
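Two details of the macro above worth spelling out: the DC predictor is the rounded mean of the 16 top and 16 left neighbors (psadbw against zero sums the top row, the movzx/add %rep unroll sums the left column, +16 and >>5 round), and the resulting byte is replicated across a GPR with one imul by 0x01010101 before movd/pshufd (or punpckldq) broadcasts it to the whole vector. The row loop runs r3 from 15*FENC_STRIDE down to 0; because FDEC_STRIDE is twice FENC_STRIDE, r1+r3*2-1 lands on the left-neighbor pixel of the matching fdec row. A sketch of the replication trick (plain arithmetic, not code from the commit):

```c
#include <stdint.h>

/* Replicate an 8-bit value into all four bytes of a 32-bit word with a
 * single multiply, as `imul r3d, 0x01010101` does in the asm above. */
static uint32_t splat_byte( uint32_t dc ) /* dc in [0,255] */
{
    return dc * 0x01010101u; /* 0x0000004A -> 0x4A4A4A4A */
}
```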
@@ -131,6 +131,22 @@
     ABS2 %3, %4, %5, %6
 %endmacro

+%macro SPLATB_MMX 3
+    movd      %1, [%2-3] ;to avoid crossing a cacheline
+    punpcklbw %1, %1
+%if mmsize==16
+    pshuflw   %1, %1, 0xff
+    movlhps   %1, %1
+%else
+    pshufw    %1, %1, 0xff
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3
+    movd      %1, [%2-3]
+    pshufb    %1, %3
+%endmacro
+
 %macro PALIGNR_MMX 4
 %ifnidn %4, %2
     mova    %4, %2
@@ -221,3 +237,4 @@
     packuswb %1, %1
     movh     %4, %1
 %endmacro
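The SPLATB macros added above broadcast a single byte (here, the left-neighbor pixel) to every lane. The load is movd from [%2-3]: the four loaded bytes end at the target byte, so when %2 sits just left of an aligned block the load stays entirely below the block's cacheline boundary rather than straddling it, and the shuffles (or pshufb with the pb_3 mask, which selects byte index 3 in every lane) extract that last byte. A rough intrinsics equivalent of SPLATB_SSSE3 (illustrative; the function name is mine, x264 itself stays in asm):

```c
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h> /* SSSE3 */

/* Broadcast the byte at p to all 16 lanes without reading past p:
 * load bytes [p-3, p], then shuffle lane 3 everywhere (pb_3 mask). */
static __m128i splatb_ssse3( const uint8_t *p )
{
    int32_t d;
    memcpy( &d, p - 3, sizeof(d) );  /* wanted byte ends up in lane 3 */
    __m128i v = _mm_cvtsi32_si128( d );
    return _mm_shuffle_epi8( v, _mm_set1_epi8( 3 ) );
}
```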
@@ -544,7 +544,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     int i, idx;
     int i_max;
     int predict_mode[9];
-    int b_merged_satd = h->pixf.intra_satd_x3_16x16 && h->pixf.mbcmp[0] == h->pixf.satd[0];
+    int b_merged_satd = !!h->pixf.intra_mbcmp_x3_16x16;

     /*---------------- Try all mode and calculate their score ---------------*/
@@ -553,7 +553,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
     if( b_merged_satd && i_max == 4 )
     {
-        h->pixf.intra_satd_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
+        h->pixf.intra_mbcmp_x3_16x16( p_src, p_dst, a->i_satd_i16x16_dir );
         h->predict_16x16[I_PRED_16x16_P]( p_dst );
         a->i_satd_i16x16_dir[I_PRED_16x16_P] =
             h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
......
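The condition change above is the point of the new indirection: b_merged_satd used to require that mbcmp actually be SATD, because only a merged-SATD primitive existed; intra_mbcmp_x3_16x16 is now pre-resolved to whichever variant matches mbcmp, so a non-NULL pointer is the whole test. When it is NULL, analysis keeps the unmerged path, roughly as below (a simplified sketch with hypothetical helper names; three passes over the block instead of one):

```c
#include <stdint.h>

#define FENC_STRIDE 16
#define FDEC_STRIDE 32

typedef void (*pred16_fn)( uint8_t *dst );
typedef int (*cmp_fn)( uint8_t *a, int a_stride, uint8_t *b, int b_stride );

/* Unmerged fallback: predict, then score, one mode at a time. */
static void score_i16x16_unmerged( pred16_fn predict[3], cmp_fn mbcmp,
                                   uint8_t *p_src, uint8_t *p_dst, int dir[3] )
{
    for( int i = 0; i < 3; i++ ) /* V, H, DC */
    {
        predict[i]( p_dst );
        dir[i] = mbcmp( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
    }
}
```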
@@ -585,6 +585,7 @@ static void mbcmp_init( x264_t *h )
     int satd = !h->mb.b_lossless && h->param.analyse.i_subpel_refine > 1;
     memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
     memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
+    h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
     satd &= h->param.analyse.i_me_method == X264_ME_TESA;
     memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
     memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
......
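mbcmp_init() already flips the whole block-comparison table between SATD and SAD based on subme (and losslessness); the added line resolves the merged intra primitive by the same rule, which is exactly what lets the cheaper SAD kernels kick in at subme=1. Callers then go through the pointer without caring which metric backs it (usage sketch; helper name is mine):

```c
#include <stdint.h>

typedef void (*intra_x3_fn)( uint8_t *fenc, uint8_t *fdec, int res[3] );

/* After mbcmp_init(), the pointer hides the SATD/SAD choice entirely. */
static void score_merged( intra_x3_fn intra_mbcmp_x3_16x16,
                          uint8_t *p_src, uint8_t *p_dst, int res[3] )
{
    if( intra_mbcmp_x3_16x16 ) /* may be NULL, per the pixel.h comment */
        intra_mbcmp_x3_16x16( p_src, p_dst, res ); /* res = {V, H, DC} */
}
```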
@@ -324,7 +324,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL_VAR( PIXEL_8x8 );
     report( "pixel var :" );

-#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
+#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \
         int res_c[3], res_asm[3]; \
@@ -333,10 +333,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
         memcpy( buf3, buf2, 1024 ); \
         for( i=0; i<3; i++ ) \
         { \
-            pred[i]( buf3+40, ##__VA_ARGS__ ); \
-            res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
+            pred[i]( buf3+48, ##__VA_ARGS__ ); \
+            res_c[i] = pixel_c.satd( buf1+48, 16, buf3+48, 32 ); \
         } \
-        call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
+        call_a( pixel_asm.name, buf1+48, i8x8 ? edge : buf3+48, res_asm ); \
         if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
         { \
             ok = 0; \
@@ -347,11 +347,13 @@ static int check_pixel( int cpu_ref, int cpu_new )
     }

     ok = 1; used_asm = 0;
-    TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
-    TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
-    TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
-    TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
+    TEST_INTRA_MBCMP( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
+    TEST_INTRA_MBCMP( intra_satd_x3_8x8c , predict_8x8c , satd[PIXEL_8x8]  , 0 );
+    TEST_INTRA_MBCMP( intra_satd_x3_4x4  , predict_4x4  , satd[PIXEL_4x4]  , 0 );
+    TEST_INTRA_MBCMP( intra_sa8d_x3_8x8  , predict_8x8  , sa8d[PIXEL_8x8]  , 1, edge );
     report( "intra satd_x3 :" );
+    TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+    report( "intra sad_x3 :" );

     if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
         pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
......
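Why the test offsets moved from +40 to +48 (my reading; the commit does not say): the new kernels use aligned 16-byte loads (mova) on both test blocks, and 40 mod 16 = 8 while 48 mod 16 = 0, so the old offset would fault under SSE2 given 16-byte-aligned base buffers. A quick check of that assumption:

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical helper: both blocks must be 16-byte aligned for mova. */
static void check_offsets( const uint8_t *buf1, const uint8_t *buf3 )
{
    assert( ((uintptr_t)(buf1 + 48) & 15) == 0 ); /* fenc block */
    assert( ((uintptr_t)(buf3 + 48) & 15) == 0 ); /* fdec block; +40 gives 8 */
}
```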