Commit 2a7dd58c authored by Fiona Glaser

Add more inline asm and a runtime check for MMXEXT support

x264 will now terminate gracefully rather than crash with SIGILL when run on a machine with no MMXEXT support.
A configure option (--disable-asm) is now available to build x264 without assembly, so that it can run on old CPUs such as the Pentium 2, K6, etc.
parent 56108cb6
......@@ -141,10 +141,6 @@ static inline int x264_predictor_difference( int16_t (*mvc)[2], intptr_t i_mvc )
return sum;
}
#ifdef HAVE_MMX
#include "x86/util.h"
#endif
/****************************************************************************
*
****************************************************************************/
......@@ -595,5 +591,9 @@ struct x264_t
// included at the end because it needs x264_t
#include "macroblock.h"
#ifdef HAVE_MMX
#include "x86/util.h"
#endif
#endif
......@@ -356,6 +356,38 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
int8_t *cache = &h->mb.cache.intra4x4_pred_mode[X264_SCAN8_0+x+8*y];
cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
}
#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
#define array_non_zero_int array_non_zero_int_c
/* Plain C test for "does this buffer contain any nonzero data?".
 *
 * v       : start of the buffer; it is read as an array of uint64_t
 *           (assumes v is suitably aligned for that -- TODO confirm).
 * i_count : size of the buffer in BYTES.
 *
 * Returns 1 if any examined 64-bit word is nonzero, 0 otherwise.
 * The 8, 16 and 32 byte sizes are spelled out without a loop; every other
 * size is scanned word by word.  Only whole 64-bit words are looked at, so
 * trailing bytes of a size that is not a multiple of 8 are ignored. */
static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count )
{
    uint64_t *words = v;
    int n_words;
    int k;

    switch( i_count )
    {
        case 8:
            return words[0] != 0;
        case 16:
            return ( words[0] | words[1] ) != 0;
        case 32:
            return ( words[0] | words[1] | words[2] | words[3] ) != 0;
        default:
            break;
    }

    /* generic size: stop at the first nonzero word */
    n_words = i_count / sizeof(uint64_t);
    for( k = 0; k < n_words; k++ )
    {
        if( words[k] )
            return 1;
    }
    return 0;
}
/* Returns how many of the int16_t values v[0]..v[15] are nonzero.
 * This function and its MMX version only work on arrays of size 16:
 * exactly 16 elements are always read. */
static ALWAYS_INLINE int array_non_zero_count( int16_t *v )
{
    int nz = 0;
    int idx;
    for( idx = 0; idx < 16; idx++ )
        nz += ( v[idx] != 0 );
    return nz;
}
#endif
......@@ -65,11 +65,67 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
"paddusw %%mm0, %%mm4 \n"
"jg 1b \n"
"movq %%mm4, %0 \n"
:"=m"(output), "+r"(i_mvc), "+r"(mvc)
:"=m"(output), "+r"(i_mvc)
:"r"(mvc)
);
sum += output[0] + output[1] + output[2] + output[3];
return sum;
}
#define array_non_zero_count array_non_zero_count_mmx
/* MMX version of array_non_zero_count(): returns how many of the 16 int16_t
 * values v[0]..v[15] are nonzero.  Exactly 16 elements (32 bytes) are read.
 *
 * How the asm works:
 *   - packsswb squeezes the 16 words into 16 bytes, 8 per register; signed
 *     saturation never turns a nonzero word into a zero byte;
 *   - pcmpeqb against zero yields 0xFF (-1) for every zero coefficient;
 *   - adding the two halves gives, per byte, minus the number of zeros (0..-2);
 *   - adding 2 to every byte gives, per byte, the number of nonzeros (0..2);
 *   - psadbw against zero sums the 8 bytes into the final count.
 *
 * psadbw belongs to the MMXEXT/SSE extension rather than baseline MMX.
 * NOTE(review): no emms is issued here -- presumably the x87/MMX state is
 * cleaned up elsewhere; confirm against the callers. */
static inline int array_non_zero_count_mmx( int16_t *v )
{
    /* the value 2 replicated in each of the 8 bytes */
    static const uint64_t pb_2 = 0x0202020202020202ULL;
    int count;
    __asm__(
        "pxor     %%mm7, %%mm7 \n"
        "movq      (%1), %%mm0 \n"
        "movq    16(%1), %%mm1 \n"
        "packsswb 8(%1), %%mm0 \n"
        "packsswb 24(%1), %%mm1 \n"
        "pcmpeqb  %%mm7, %%mm0 \n"
        "pcmpeqb  %%mm7, %%mm1 \n"
        "paddb    %%mm0, %%mm1 \n"
        "paddb       %2, %%mm1 \n"
        "psadbw   %%mm7, %%mm1 \n"
        "movd     %%mm1, %0    \n"
        :"=r"(count)
        :"r"(v), "m"(pb_2),
         /* Not referenced by the template: this operand tells the compiler
          * that the asm reads all 16 coefficients, so stores to v[] made
          * just before the call cannot be reordered past it or dropped. */
         "m"(*(const int16_t (*)[16])v)
        /* registers overwritten by the template */
        :"mm0", "mm1", "mm7"
    );
    return count;
}
#undef array_non_zero_int
#define array_non_zero_int array_non_zero_int_mmx
/* MMX-accelerated "is anything in this buffer nonzero?" test.
 *
 * v       : start of the buffer.
 * i_count : size of the buffer in BYTES.
 *
 * Returns 1 if nonzero data is found, 0 otherwise.  Only the 128-byte case
 * has an MMX path: all sixteen 64-bit words are OR-ed together in mm0, the
 * four resulting words are saturated to bytes (saturation keeps any nonzero
 * word nonzero) and the low 32 bits are tested.  Every other size is handed
 * to array_non_zero_int_c().
 *
 * NOTE(review): no emms is issued here -- presumably the x87/MMX state is
 * cleaned up elsewhere; confirm against the callers. */
static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
{
    if( i_count == 128 )
    {
        int nonzero;
        __asm__(
            "movq     (%1), %%mm0 \n"
            "por     8(%1), %%mm0 \n"
            "por    16(%1), %%mm0 \n"
            "por    24(%1), %%mm0 \n"
            "por    32(%1), %%mm0 \n"
            "por    40(%1), %%mm0 \n"
            "por    48(%1), %%mm0 \n"
            "por    56(%1), %%mm0 \n"
            "por    64(%1), %%mm0 \n"
            "por    72(%1), %%mm0 \n"
            "por    80(%1), %%mm0 \n"
            "por    88(%1), %%mm0 \n"
            "por    96(%1), %%mm0 \n"
            "por   104(%1), %%mm0 \n"
            "por   112(%1), %%mm0 \n"
            "por   120(%1), %%mm0 \n"
            "packsswb %%mm0, %%mm0 \n"
            "movd     %%mm0, %0    \n"
            :"=r"(nonzero)
            :"r"(v),
             /* Not referenced by the template: this operand tells the
              * compiler that the asm reads the whole 128-byte buffer, so
              * earlier stores to it cannot be reordered past the asm or
              * dropped. */
             "m"(*(const uint8_t (*)[128])v)
            /* register overwritten by the template */
            :"mm0"
        );
        return !!nonzero;
    }
    return array_non_zero_int_c( v, i_count );
}
#endif
#endif
......@@ -7,10 +7,11 @@ echo ""
echo "available options:"
echo ""
echo " --help print this message"
echo " --enable-avis-input enables avisynth input (win32 only)"
echo " --enable-mp4-output enables mp4 output (using gpac)"
echo " --disable-avis-input disables avisynth input (win32 only)"
echo " --disable-mp4-output disables mp4 output (using gpac)"
echo " --disable-pthread disables multithreaded encoding"
echo " --disable-asm disables assembly optimizations on x86"
echo " --enable-gtk build GTK+ interface"
echo " --enable-pthread enables multithreaded encoding"
echo " --enable-debug adds -g, doesn't strip"
echo " --enable-gprof adds -pg, doesn't strip"
echo " --enable-visualize enables visualization (X11 only)"
......@@ -53,6 +54,7 @@ DEVNULL='/dev/null'
avis_input="auto"
mp4_output="auto"
pthread="auto"
asm="yes"
debug="no"
gprof="no"
pic="no"
......@@ -102,6 +104,12 @@ for opt do
--includedir=*)
includedir="$optarg"
;;
--enable-asm)
asm="yes"
;;
--disable-asm)
asm="no"
;;
--enable-avis-input)
avis_input="yes"
;;
......@@ -300,7 +308,7 @@ if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" \)
pic="yes"
fi
if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
if [ $ARCH = X86 -a $pic = yes -a x$AS = xyasm -a\
"`yasm --version 2>$DEVNULL | head -n 1`" "<" "yasm 0.6.2" ] ; then
echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
......@@ -309,10 +317,12 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
if as_check "pabsw xmm0, xmm0" ; then
CFLAGS="$CFLAGS -DHAVE_MMX"
else
echo "No suitable assembler found. x264 will be several times slower."
echo "Please install 'yasm' to get MMX/SSE optimized code."
AS=""
echo "No suitable assembler found. Install 'yasm' to get MMX/SSE optimized code."
echo "If you really want to compile without asm, configure with --disable-asm."
exit 1
fi
else
AS=""
fi
CFLAGS="$CFLAGS -DARCH_$ARCH -DSYS_$SYS"
......@@ -482,6 +492,7 @@ EOF
echo "Platform: $ARCH"
echo "System: $SYS"
echo "asm: $asm"
echo "avis input: $avis_input"
echo "mp4 output: $mp4_output"
echo "pthread: $pthread"
......
......@@ -301,7 +301,7 @@ static inline void x264_macroblock_luma_write_cavlc( x264_t *h, bs_t *s, int i8s
if( h->mb.i_cbp_luma & (1 << i8) )
for( i4 = 0; i4 < 4; i4++ )
{
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] = array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, s, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
}
......@@ -657,7 +657,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
if( h->mb.i_cbp_luma != 0 )
for( i = 0; i < 16; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
}
}
......@@ -674,7 +674,7 @@ void x264_macroblock_write_cavlc( x264_t *h, bs_t *s )
if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
for( i = 16; i < 24; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, s, i, h->dct.luma4x4[i]+1, 15 );
}
}
......@@ -741,9 +741,9 @@ int x264_partition_size_cavlc( x264_t *h, int i8, int i_pixel )
for( j = (i_pixel < PIXEL_8x8); j >= 0; j-- )
{
x264_macroblock_luma_write_cavlc( h, &s, i8, i8 );
h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[16+i8]] = array_non_zero_count( h->dct.luma4x4[16+i8] );
block_residual_write_cavlc( h, &s, 16+i8, h->dct.luma4x4[16+i8]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[20+i8]] = array_non_zero_count( h->dct.luma4x4[20+i8] );
block_residual_write_cavlc( h, &s, 20+i8, h->dct.luma4x4[20+i8]+1, 15 );
i8 += x264_pixel_size[i_pixel].h >> 3;
}
......@@ -768,7 +768,7 @@ static int x264_partition_i8x8_size_cavlc( x264_t *h, int i8, int i_mode )
for( i = 0; i < 16; i++ )
h->dct.luma4x4[i4+i8*4][i] = h->dct.luma8x8[i8][i4+i*4];
h->mb.cache.non_zero_count[x264_scan8[i4+i8*4]] =
array_non_zero_count( h->dct.luma4x4[i4+i8*4], 16 );
array_non_zero_count( h->dct.luma4x4[i4+i8*4] );
block_residual_write_cavlc( h, &h->out.bs, i4+i8*4, h->dct.luma4x4[i4+i8*4], 16 );
}
return h->out.bs.i_bits_encoded;
......@@ -794,7 +794,7 @@ static int x264_i8x8_chroma_size_cavlc( x264_t *h )
int i;
for( i = 16; i < 24; i++ )
{
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i]+1, 15 );
h->mb.cache.non_zero_count[x264_scan8[i]] = array_non_zero_count( h->dct.luma4x4[i] );
block_residual_write_cavlc( h, &h->out.bs, i, h->dct.luma4x4[i]+1, 15 );
}
}
......
......@@ -300,6 +300,14 @@ static void x264_slice_header_write( bs_t *s, x264_slice_header_t *sh, int i_nal
static int x264_validate_parameters( x264_t *h )
{
#ifdef HAVE_MMX
if( !(x264_cpu_detect() & X264_CPU_MMXEXT) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support MMXEXT, but x264 was compiled with asm support\n");
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm support (configure --disable-asm)\n");
return -1;
}
#endif
if( h->param.i_width <= 0 || h->param.i_height <= 0 )
{
x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
......
......@@ -54,29 +54,5 @@ void x264_quant_8x8_trellis( x264_t *h, int16_t dct[8][8], int i_quant_cat,
void x264_noise_reduction_update( x264_t *h );
void x264_denoise_dct( x264_t *h, int16_t *dct );
#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
/* Tests whether a buffer holds any nonzero data.
 *
 * v       : start of the buffer; it is read as an array of uint64_t
 *           (assumes v is suitably aligned for that -- TODO confirm).
 * i_count : size of the buffer in BYTES.
 *
 * Returns 1 as soon as a nonzero 64-bit word is found, 0 if there is none.
 * Only whole 64-bit words are examined, so trailing bytes of a size that is
 * not a multiple of 8 are ignored. */
static inline int array_non_zero_int( void *v, int i_count )
{
    uint64_t *word = v;
    int remaining = i_count / sizeof(uint64_t);
    for( ; remaining > 0; remaining-- )
    {
        if( *word++ )
            return 1;
    }
    return 0;
}
/* Returns how many of the int16_t values v[0]..v[i_count-1] are nonzero.
 * A count of zero or less reads nothing and yields 0. */
static inline int array_non_zero_count( int16_t *v, int i_count )
{
    int nz = 0;
    int left;
    for( left = i_count; left > 0; left-- )
        nz += ( v[left - 1] != 0 );
    return nz;
}
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment