Commit e184ff26, authored by Loren Merritt and committed by Fiona Glaser
Browse files

Faster intra_mbcmp_x3 for versions without dedicated asm

Select asm subroutines more intelligently in the wrapper functions.
parent d94edd73
......@@ -29,6 +29,7 @@
#if HAVE_MMX
# include "x86/pixel.h"
# include "x86/predict.h"
#endif
#if ARCH_PPC
# include "ppc/pixel.h"
......@@ -498,57 +499,57 @@ SATD_X_DECL7( _neon )
#endif
#endif // !HIGH_BIT_DEPTH
/*
 * INTRA_MBCMP_8x8: generator for x264_intra_<mbcmp>_x3_8x8<cpu>().
 *
 * NOTE(review): this text is a rendered diff, so the pre-change and
 * post-change forms of every modified line sit next to each other. The
 * two-argument #define and the predict_8x8_*_c calls look like the removed
 * lines; the three-argument #define and the ##cpu2 calls look like their
 * replacements. Confirm against the real file before compiling.
 *
 * The generated function declares a scratch block `pix` of 8*FDEC_STRIDE
 * pixels via ALIGNED_ARRAY_16, fills it in turn with the 8x8 V, H and DC
 * predictions built from `edge`, and after each prediction stores
 * x264_pixel_<mbcmp>_8x8<cpu>( pix, FDEC_STRIDE, fenc, FENC_STRIDE ) into
 * res[0], res[1] and res[2] respectively.
 *
 *   mbcmp - metric name pasted into the pixel function name
 *   cpu   - suffix of the generated function and of the metric it calls
 *   cpu2  - suffix of the predict_8x8 v/h/dc functions it calls
 */
#define INTRA_MBCMP_8x8( mbcmp, cpu )\
#define INTRA_MBCMP_8x8( mbcmp, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\
{\
ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\
x264_predict_8x8_v_c( pix, edge );\
x264_predict_8x8_v##cpu2( pix, edge );\
res[0] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_8x8_h_c( pix, edge );\
x264_predict_8x8_h##cpu2( pix, edge );\
res[1] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_8x8_dc_c( pix, edge );\
x264_predict_8x8_dc##cpu2( pix, edge );\
res[2] = x264_pixel_##mbcmp##_8x8##cpu( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}
/*
 * Instantiations of INTRA_MBCMP_8x8.
 *
 * NOTE(review): rendered diff -- the two-argument invocations look like the
 * removed lines and the three-argument invocations like their replacements.
 *
 * In the three-argument form the second argument is the suffix of the
 * generated function and its metric (empty for the plain versions), and the
 * third is the suffix of the predictors used: _c for the plain and _mmx2
 * versions, _sse2 for the _sse2 and _ssse3 versions.
 */
INTRA_MBCMP_8x8( sad, )
INTRA_MBCMP_8x8(sa8d, )
INTRA_MBCMP_8x8( sad,, _c )
INTRA_MBCMP_8x8(sa8d,, _c )
/* The suffixed wrappers are only generated for HIGH_BIT_DEPTH builds with MMX support. */
#if HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP_8x8( sad, _mmx2 )
INTRA_MBCMP_8x8( sad, _sse2 )
INTRA_MBCMP_8x8( sad, _ssse3 )
INTRA_MBCMP_8x8(sa8d, _sse2 )
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8( sad, _sse2, _sse2 )
INTRA_MBCMP_8x8( sad, _ssse3, _sse2 )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
/*
 * INTRA_MBCMP: generator for the x3 intra wrappers that predict in place.
 *
 * NOTE(review): this text is a rendered diff, so both forms of the macro are
 * interleaved. The seven-argument #define, the size##x##size name pasting
 * and the predictor calls ending in _c look like the removed lines; the
 * eight-argument #define, the plain ##size## pasting and the ##cpu2
 * predictor calls look like their replacements. Confirm against the real
 * file before compiling.
 *
 * The generated function runs predictors pred1, pred2 and pred3 directly on
 * `fdec`, and after each one stores
 * x264_pixel_<mbcmp>_<size><cpu>( fdec, FDEC_STRIDE, fenc, FENC_STRIDE )
 * into res[0], res[1] and res[2] respectively.
 *
 *   mbcmp       - metric name pasted into the pixel function name
 *   size        - block size token pasted into all names
 *   pred1..3    - predictor names, in the order their results fill res[]
 *   chroma      - pasted after the size in the wrapper and predictor names
 *                 only; the pixel metric name does not include it
 *   cpu         - suffix of the generated function and of the metric
 *   cpu2        - suffix of the predictor functions
 */
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu )\
void x264_intra_##mbcmp##_x3_##size##x##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
{\
x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
res[0] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
res[1] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
res[2] = x264_pixel_##mbcmp##_##size##x##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##chroma##_##pred1##cpu2( fdec );\
res[0] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##chroma##_##pred2##cpu2( fdec );\
res[1] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
x264_predict_##size##chroma##_##pred3##cpu2( fdec );\
res[2] = x264_pixel_##mbcmp##_##size##cpu( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}
/*
 * Instantiations of INTRA_MBCMP.
 *
 * NOTE(review): rendered diff -- the invocations whose size is a bare number
 * (4, 8, 16) look like the removed lines; the ones whose size is spelled
 * 4x4, 8x8 or 16x16 and that carry a trailing predictor suffix look like
 * their replacements.
 *
 * res[] is filled in the order of the three predictor arguments: v, h, dc
 * for 4x4 and 16x16, but dc, h, v for the 8x8 invocations that pass `c` as
 * the chroma argument.
 *
 * Predictor suffix chosen per wrapper in the eight-argument form:
 *   plain wrappers          -> _c
 *   _mmx2  4x4 and 8x8c     -> _c
 *   _mmx2  16x16            -> _mmx2
 *   _sse2  8x8c and 16x16   -> _sse2
 *   _ssse3 4x4              -> _c
 *   _ssse3 8x8c and 16x16   -> _sse2
 */
INTRA_MBCMP( sad, 4, v, h, dc, , )
INTRA_MBCMP(satd, 4, v, h, dc, , )
INTRA_MBCMP( sad, 8, dc, h, v, c, )
INTRA_MBCMP(satd, 8, dc, h, v, c, )
INTRA_MBCMP( sad, 16, v, h, dc, , )
INTRA_MBCMP(satd, 16, v, h, dc, , )
INTRA_MBCMP( sad, 4x4, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, ,, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c,, _c )
INTRA_MBCMP(satd, 8x8, dc, h, v, c,, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
/* The suffixed wrappers are only generated for HIGH_BIT_DEPTH builds with MMX support. */
#if HIGH_BIT_DEPTH && HAVE_MMX
INTRA_MBCMP( sad, 4, v, h, dc, , _mmx2 )
INTRA_MBCMP(satd, 4, v, h, dc, , _mmx2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _mmx2 )
INTRA_MBCMP(satd, 8, dc, h, v, c, _mmx2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
INTRA_MBCMP( sad, 4, v, h, dc, , _ssse3 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP(satd, 4x4, v, h, dc, , _mmx2, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP(satd, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
#endif
/****************************************************************************
......
......@@ -1744,7 +1744,7 @@ cglobal predict_8x8c_dc_top_mmx2, 1,1
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx, 1,2
cglobal predict_16x16_v_mmx2, 1,2
mova m0, [r0 - FDEC_STRIDEB+ 0]
mova m1, [r0 - FDEC_STRIDEB+ 8]
mova m2, [r0 - FDEC_STRIDEB+16]
......@@ -1759,7 +1759,7 @@ cglobal predict_16x16_v_sse2, 2,2
REP_RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX
cglobal predict_16x16_v_mmx, 1,2
cglobal predict_16x16_v_mmx2, 1,2
movq m0, [r0 - FDEC_STRIDE + 0]
movq m1, [r0 - FDEC_STRIDE + 8]
STORE16x16 m0, m1
......
......@@ -29,94 +29,8 @@
#include "predict.h"
#include "pixel.h"
/*
 * Prototypes for the x86 intra-prediction routines.
 *
 * NOTE(review): the hunk header for this file (-29,94 +29,8) shrinks it, so
 * these prototypes look like the lines being removed here. A near-identical
 * list appears in the header hunk further down, where the first entry is
 * named x264_predict_16x16_v_mmx2 and x264_predict_16x16_dc_mmx2 /
 * x264_predict_16x16_dc_sse2 are added. Confirm against the real files.
 *
 * Parameter types are a mix of pixel, uint8_t and uint16_t, exactly as
 * written below.
 */

/* 16x16 predictors; the *_core variants take extra int arguments. */
void x264_predict_16x16_v_mmx( pixel *src );
void x264_predict_16x16_v_sse2( pixel *src );
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_top_mmx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
/* 8x8c predictors. */
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
void x264_predict_8x8c_v_sse2( uint16_t *src );
void x264_predict_8x8c_h_mmx2( uint8_t *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
/* 8x8 predictors; each takes the block pointer plus a 36-entry edge array. */
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
/* 8x8 edge filters; these additionally take i_neighbor and i_filters. */
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
/* 4x4 predictors. */
void x264_predict_4x4_ddl_mmx2( pixel *src );
void x264_predict_4x4_ddl_sse2( uint16_t *src );
void x264_predict_4x4_ddl_avx( uint16_t *src );
void x264_predict_4x4_ddr_mmx2( pixel *src );
void x264_predict_4x4_vl_mmx2( pixel *src );
void x264_predict_4x4_vl_sse2( uint16_t *src );
void x264_predict_4x4_vl_avx( uint16_t *src );
void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
void x264_predict_4x4_hd_ssse3( pixel *src );
void x264_predict_4x4_hd_avx( uint16_t *src );
void x264_predict_4x4_dc_mmx2( pixel *src );
void x264_predict_4x4_ddr_sse2( uint16_t *src );
void x264_predict_4x4_ddr_ssse3( pixel *src );
void x264_predict_4x4_ddr_avx( uint16_t *src );
void x264_predict_4x4_hu_mmx2( pixel *src );
#define PREDICT_16x16_DC(name)\
static void x264_predict_16x16_dc_##name( pixel *src )\
void x264_predict_16x16_dc_##name( pixel *src )\
{\
uint32_t dc = 16;\
for( int i = 0; i < 16; i += 2 )\
......@@ -362,16 +276,13 @@ static void x264_predict_8x8c_dc_left( uint8_t *src )
****************************************************************************/
void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
{
if( !(cpu&X264_CPU_MMX) )
if( !(cpu&X264_CPU_MMX2) )
return;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx;
if( cpu&X264_CPU_MMX2 )
{
pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmx2;
pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmx2;
pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
}
pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_mmx2;
pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_mmx2;
pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_mmx2;
pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2;
pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
return;
......
......@@ -31,4 +31,93 @@ void x264_predict_16x16_init_mmx ( int cpu, x264_predict_t pf[7] );
/* Initialisers: each takes the cpu flags and the predictor table to fill in. */
void x264_predict_8x8c_init_mmx ( int cpu, x264_predict_t pf[7] );
void x264_predict_4x4_init_mmx ( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_mmx ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
/*
 * Prototypes for the individual x86 intra-prediction routines.
 *
 * NOTE(review): the hunk header for this file (-31,4 +31,93) grows it, so
 * everything from here down looks like newly added lines. Confirm against
 * the real header.
 *
 * Parameter types are a mix of pixel, uint8_t and uint16_t, exactly as
 * written below.
 */

/* 16x16 predictors; the *_core variants take extra int arguments. */
void x264_predict_16x16_v_mmx2( pixel *src );
void x264_predict_16x16_v_sse2( pixel *src );
void x264_predict_16x16_h_mmx2( pixel *src );
void x264_predict_16x16_h_sse2( uint16_t *src );
void x264_predict_16x16_h_ssse3( uint8_t *src );
void x264_predict_16x16_dc_mmx2( pixel *src );
void x264_predict_16x16_dc_sse2( pixel *src );
void x264_predict_16x16_dc_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_core_sse2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_mmx2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_left_core_sse2( pixel *src, int i_dc_left );
void x264_predict_16x16_dc_top_mmx2( pixel *src );
void x264_predict_16x16_dc_top_sse2( pixel *src );
void x264_predict_16x16_dc_top_ssse3( uint16_t *src );
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
/* 8x8c predictors. */
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
void x264_predict_8x8c_v_mmx( pixel *src );
void x264_predict_8x8c_v_sse2( uint16_t *src );
void x264_predict_8x8c_h_mmx2( uint8_t *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
/* 8x8 predictors; each takes the block pointer plus a 36-entry edge array. */
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
/* 8x8 edge filters; these additionally take i_neighbor and i_filters. */
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
/* 4x4 predictors. */
void x264_predict_4x4_ddl_mmx2( pixel *src );
void x264_predict_4x4_ddl_sse2( uint16_t *src );
void x264_predict_4x4_ddl_avx( uint16_t *src );
void x264_predict_4x4_ddr_mmx2( pixel *src );
void x264_predict_4x4_vl_mmx2( pixel *src );
void x264_predict_4x4_vl_sse2( uint16_t *src );
void x264_predict_4x4_vl_avx( uint16_t *src );
void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
void x264_predict_4x4_hd_ssse3( pixel *src );
void x264_predict_4x4_hd_avx( uint16_t *src );
void x264_predict_4x4_dc_mmx2( pixel *src );
void x264_predict_4x4_ddr_sse2( uint16_t *src );
void x264_predict_4x4_ddr_ssse3( pixel *src );
void x264_predict_4x4_ddr_avx( uint16_t *src );
void x264_predict_4x4_hu_mmx2( pixel *src );
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment