Commit fb62734c authored by David Conrad, committed by Fiona Glaser

GSOC merge part 1: Framework for ARM assembly optimizations

x264 now detects which ARM architecture it is building for, builds the NEON assembly only when the target is ARMv6 or above, and then enables NEON at runtime.
parent 8368e151
Makefile
@@ -55,6 +55,14 @@ SRCS += $(ALTIVECSRC)
$(ALTIVECSRC:%.c=%.o): CFLAGS += $(ALTIVECFLAGS)
endif
# NEON optims
ifeq ($(ARCH),ARM)
ifneq ($(AS),)
ASMSRC += common/arm/cpu-a.S
OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
# VIS optims
ifeq ($(ARCH),UltraSparc)
ASMSRC += common/sparc/pixel.asm
@@ -88,6 +96,10 @@ checkasm: tools/checkasm.o libx264.a
%.o: %.asm
$(AS) $(ASFLAGS) -o $@ $<
%.o: %.S
$(AS) $(ASFLAGS) -o $@ $<
# delete local/anonymous symbols, so they don't show up in oprofile
-@ $(STRIP) -x $@
common/cpu.c
@@ -61,9 +61,30 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"ARMv6", X264_CPU_ARMV6},
{"NEON", X264_CPU_NEON},
{"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
{"", 0},
};
#if (defined(ARCH_PPC) && defined(SYS_LINUX)) || (defined(ARCH_ARM) && !defined(HAVE_NEON))
#include <signal.h>
#include <setjmp.h>
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;
static void sigill_handler( int sig )
{
if( !canjump )
{
signal( sig, SIG_DFL );
raise( sig );
}
canjump = 0;
siglongjmp( jmpbuf, 1 );
}
#endif
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
@@ -224,22 +245,6 @@ uint32_t x264_cpu_detect( void )
}
#elif defined( SYS_LINUX )
-#include <signal.h>
-#include <setjmp.h>
-static sigjmp_buf jmpbuf;
-static volatile sig_atomic_t canjump = 0;
-static void sigill_handler( int sig )
-{
-if( !canjump )
-{
-signal( sig, SIG_DFL );
-raise( sig );
-}
-canjump = 0;
-siglongjmp( jmpbuf, 1 );
-}
uint32_t x264_cpu_detect( void )
{
@@ -265,6 +270,48 @@ uint32_t x264_cpu_detect( void )
}
#endif
#elif defined( ARCH_ARM )
void x264_cpu_neon_test();
int x264_cpu_fast_neon_mrc_test();
uint32_t x264_cpu_detect( void )
{
int flags = 0;
#ifdef HAVE_ARMV6
flags |= X264_CPU_ARMV6;
// don't do this hack if compiled with -mfpu=neon
#ifndef HAVE_NEON
static void (* oldsig)( int );
oldsig = signal( SIGILL, sigill_handler );
if( sigsetjmp( jmpbuf, 1 ) )
{
signal( SIGILL, oldsig );
return flags;
}
canjump = 1;
x264_cpu_neon_test();
canjump = 0;
signal( SIGILL, oldsig );
#endif
flags |= X264_CPU_NEON;
// fast neon -> arm (Cortex-A9) detection relies on user access to the
// cycle counter; this assumes ARMv7 performance counters.
// NEON requires at least ARMv7, ARMv8 may require changes here, but
// hopefully this hacky detection method will have been replaced by then.
// Note that there is potential for a race condition if another program or
// x264 instance disables or reinits the counters while x264 is using them,
// which may result in incorrect detection and the counters stuck enabled.
flags |= x264_cpu_fast_neon_mrc_test() ? X264_CPU_FAST_NEON_MRC : 0;
// TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
#endif
return flags;
}
#else
uint32_t x264_cpu_detect( void )
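The two probes declared in this hunk, x264_cpu_neon_test() and x264_cpu_fast_neon_mrc_test(), are implemented in the new common/arm/cpu-a.S that the Makefile hunk above adds to ASMSRC; their bodies are not part of the hunks shown here. A rough C sketch of equivalent probes, assuming GCC inline assembly on an ARMv7 target (the names, transfer count and cycle threshold below are illustrative only, not x264's):

#include <stdint.h>

/* Hypothetical stand-ins for the assembly probes in common/arm/cpu-a.S. */
void neon_probe( void )
{
    /* Execute one NEON instruction; on a core without NEON this raises
     * SIGILL, which the sigill_handler()/sigsetjmp() pair above catches. */
    asm volatile( "vadd.i16 q0, q0, q0" ::: "d0", "d1" );
}

int fast_mrc_probe( void )
{
    /* Time a burst of NEON->ARM register transfers against the ARMv7
     * cycle counter (PMCCNTR, read with MRC p15); a Cortex-A9-class core
     * finishes them in far fewer cycles than a Cortex-A8. */
    uint32_t t0, t1, x = 0;
    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(t0) );
    for( int i = 0; i < 32; i++ )
        asm volatile( "vmov.32 %0, d0[0]" : "=r"(x) );
    asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(t1) );
    (void)x;
    return t1 - t0 < 32 * 12;   /* assumed cutoff, not the real heuristic */
}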
common/osdep.h
@@ -163,6 +163,13 @@ static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
asm("bswap %0":"+r"(x));
return x;
}
#elif defined(__GNUC__) && defined(HAVE_ARMV6)
static ALWAYS_INLINE intptr_t endian_fix( intptr_t x )
{
asm("rev %0, %0":"+r"(x));
return x;
}
#define endian_fix32 endian_fix
#else
static ALWAYS_INLINE uint32_t endian_fix32( uint32_t x )
{
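On the 32-bit ARM targets this patch supports, intptr_t is 32 bits wide, so the rev-based routine also serves as endian_fix32, which is what the #define above expresses. For comparison, a portable C equivalent of what rev computes on a 32-bit word (a reference sketch, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Byte-reverse a 32-bit word, i.e. what the ARMv6 "rev" instruction does. */
static uint32_t bswap32_ref( uint32_t x )
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00) |
           ((x << 8) & 0x00ff0000) | (x << 24);
}

int main( void )
{
    assert( bswap32_ref( 0x11223344 ) == 0x44332211 );
    return 0;
}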
configure
@@ -10,7 +10,7 @@ echo " --help print this message"
echo " --disable-avis-input disables avisynth input (win32 only)"
echo " --disable-mp4-output disables mp4 output (using gpac)"
echo " --disable-pthread disables multithreaded encoding"
echo " --disable-asm disables assembly optimizations on x86"
echo " --disable-asm disables assembly optimizations on x86 and arm"
echo " --enable-debug adds -g, doesn't strip"
echo " --enable-gprof adds -pg, doesn't strip"
echo " --enable-visualize enables visualization (X11 only)"
@@ -157,7 +157,6 @@ CC="${CC-${cross_prefix}gcc}"
AR="${AR-${cross_prefix}ar}"
RANLIB="${RANLIB-${cross_prefix}ranlib}"
STRIP="${STRIP-${cross_prefix}strip}"
-AS=""
if [ "x$host" = x ]; then
host=`./config.guess`
@@ -286,6 +285,7 @@ case $host_cpu in
;;
arm*)
ARCH="ARM"
AS="${AS-${cross_prefix}gcc}"
;;
s390|s390x)
ARCH="S390"
@@ -324,6 +324,17 @@ if [ $asm = yes -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then
fi
CFLAGS="$CFLAGS -DHAVE_MMX"
fi
if [ $asm = yes -a $ARCH = ARM ] ; then
if cc_check '' '' 'asm("rev r0, r0");' ; then CFLAGS="$CFLAGS -DHAVE_ARMV6"
cc_check '' '' 'asm("movt r0, #0");' && CFLAGS="$CFLAGS -DHAVE_ARMV6T2"
cc_check '' '' 'asm("vadd.i16 q0, q0, q0");' && CFLAGS="$CFLAGS -DHAVE_NEON"
ASFLAGS="$ASFLAGS $CFLAGS -c"
else
asm="no"
fi
fi
[ $asm = no ] && AS=""
[ "x$AS" = x ] && asm="no"
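cc_check wraps each quoted asm statement in a minimal test program and only defines HAVE_ARMV6, HAVE_ARMV6T2 and HAVE_NEON when the target compiler accepts the corresponding ARMv6, ARMv6T2 and NEON instruction. Roughly, the three probes amount to compiling something like the following (a sketch; cc_check's real wrapper lives in configure and compiles each probe separately):

/* Compile-only probes: if the assembler rejects an instruction, the
 * corresponding HAVE_* define is simply left unset. */
int main( void )
{
    asm( "rev r0, r0" );           /* ARMv6 byte reverse        -> -DHAVE_ARMV6   */
    asm( "movt r0, #0" );          /* ARMv6T2 16-bit immediate  -> -DHAVE_ARMV6T2 */
    asm( "vadd.i16 q0, q0, q0" );  /* NEON vector add           -> -DHAVE_NEON    */
    return 0;
}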
tools/checkasm.c
@@ -30,6 +30,12 @@
#include "common/common.h"
#include "common/cpu.h"
// GCC doesn't align stack variables on ARM, so use .bss
#ifdef ARCH_ARM
#undef DECLARE_ALIGNED_16
#define DECLARE_ALIGNED_16( var ) DECLARE_ALIGNED( static var, 16 )
#endif
/* buf1, buf2: initialised to random data and shouldn't write into them */
uint8_t * buf1, * buf2;
/* buf3, buf4: used to store output */
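For GCC, common/osdep.h defines DECLARE_ALIGNED( var, n ) as essentially var __attribute__((aligned(n))), so prepending static in the override above moves checkasm's scratch buffers off the stack and into .bss, where GCC's ARM backend actually honours the requested 16-byte alignment. A small sketch of the effect, under that assumed macro definition:

#include <stdint.h>

/* Assumed GCC form of the base macro from common/osdep.h: */
#define DECLARE_ALIGNED( var, n )  var __attribute__((aligned(n)))
/* The ARM override from the hunk above: */
#define DECLARE_ALIGNED_16( var )  DECLARE_ALIGNED( static var, 16 )

void fill( void )
{
    /* Expands to: static uint8_t buf[64] __attribute__((aligned(16)));
     * the array lives in .bss rather than on the (insufficiently aligned) stack. */
    DECLARE_ALIGNED_16( uint8_t buf[64] );
    for( int i = 0; i < 64; i++ )
        buf[i] = 0;
}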
@@ -76,17 +82,15 @@ static const char **intra_predict_8x8_names = intra_predict_4x4_names;
static inline uint32_t read_time(void)
{
+uint32_t a = 0;
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
-uint32_t a;
asm volatile( "rdtsc" :"=a"(a) ::"edx" );
-return a;
#elif defined(ARCH_PPC)
-uint32_t a;
asm volatile( "mftb %0" : "=r" (a) );
-return a;
-#else
-return 0;
+#elif defined(ARCH_ARM) // ARMv7 only
+asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) );
#endif
+return a;
}
static bench_t* get_bench( const char *name, int cpu )
@@ -158,11 +162,14 @@ static void print_bench(void)
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" :
b->cpu&X264_CPU_ALTIVEC ? "altivec" : "c",
b->cpu&X264_CPU_ALTIVEC ? "altivec" :
b->cpu&X264_CPU_NEON ? "neon" :
b->cpu&X264_CPU_ARMV6 ? "armv6" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -1580,6 +1587,13 @@ static int check_all_flags( void )
fprintf( stderr, "x264: ALTIVEC against C\n" );
ret = check_all_funcs( 0, X264_CPU_ALTIVEC );
}
#elif ARCH_ARM
if( x264_cpu_detect() & X264_CPU_ARMV6 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV6, "ARMv6" );
if( x264_cpu_detect() & X264_CPU_NEON )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" );
if( x264_cpu_detect() & X264_CPU_FAST_NEON_MRC )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FAST_NEON_MRC, "Fast NEON MRC" );
#endif
return ret;
}
@@ -1591,7 +1605,7 @@ int main(int argc, char *argv[])
if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) )
{
-#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC)
+#if !defined(ARCH_X86) && !defined(ARCH_X86_64) && !defined(ARCH_PPC) && !defined(ARCH_ARM)
fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" );
return 1;
#endif
x264.h
@@ -63,6 +63,9 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
#define X264_CPU_ARMV6 0x020000
#define X264_CPU_NEON 0x040000 /* ARM NEON */
#define X264_CPU_FAST_NEON_MRC 0x080000 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
/* Analyse flags
*/
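The three new bits extend the public X264_CPU_* flag set, so they can be masked against the value returned by x264_cpu_detect() (or stored in x264_param_t.cpu) in the usual way. A minimal usage sketch, assuming the internal x264_cpu_detect() prototype from common/cpu.h is visible to the caller:

#include <stdint.h>
#include <stdio.h>
#include "x264.h"        /* X264_CPU_ARMV6 / X264_CPU_NEON / X264_CPU_FAST_NEON_MRC */

uint32_t x264_cpu_detect( void );   /* internal API, declared in common/cpu.h */

int main( void )
{
    uint32_t cpu = x264_cpu_detect();
    printf( "ARMv6:%d NEON:%d fast MRC:%d\n",
            !!( cpu & X264_CPU_ARMV6 ),
            !!( cpu & X264_CPU_NEON ),
            !!( cpu & X264_CPU_FAST_NEON_MRC ) );
    return 0;
}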