Commit ad6c91f0 authored by Loren Merritt

drop support for pre-SSE3 assemblers

parent 27ae7576
@@ -84,12 +84,10 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
     if( edx&0x04000000 )
         cpu |= X264_CPU_SSE2;
-#ifdef HAVE_SSE3
     if( ecx&0x00000001 )
         cpu |= X264_CPU_SSE3;
     if( ecx&0x00000200 )
         cpu |= X264_CPU_SSSE3;
-#endif
     x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
     max_extended_cap = eax;
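The feature bits tested in this hunk come from CPUID leaf 1: EDX bit 26 reports SSE2, ECX bit 0 reports SSE3, and ECX bit 9 reports SSSE3. A minimal standalone sketch of the same probe, using GCC/clang's <cpuid.h> helper rather than x264's own x264_cpu_cpuid asm:

    #include <cpuid.h>  /* GCC/clang helper; an assumption here, x264 uses its own asm */
    #include <stdio.h>

    int main( void )
    {
        unsigned eax, ebx, ecx, edx;
        if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return 1;
        printf( "SSE2:  %d\n", !!(edx & 0x04000000) ); /* EDX bit 26 */
        printf( "SSE3:  %d\n", !!(ecx & 0x00000001) ); /* ECX bit 0  */
        printf( "SSSE3: %d\n", !!(ecx & 0x00000200) ); /* ECX bit 9  */
        return 0;
    }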
@@ -580,7 +580,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->scan_8x8 = zigzag_scan_8x8_frame;
         pf->scan_4x4 = zigzag_scan_4x4_frame;
         pf->sub_4x4 = zigzag_sub_4x4_frame;
-#ifdef HAVE_SSE3
+#ifdef HAVE_MMX
         if( cpu&X264_CPU_SSSE3 )
             pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
 #endif
@@ -359,10 +359,8 @@ SATD_X_DECL7()
 #ifdef HAVE_MMX
 SATD_X_DECL7( _mmxext )
 SATD_X_DECL5( _sse2 )
-#ifdef HAVE_SSE3
 SATD_X_DECL7( _ssse3 )
-#endif
 #endif
 /****************************************************************************
  * structural similarity metric
@@ -623,7 +621,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #endif
     }
-#ifdef HAVE_SSE3
     if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_SPLIT) )
     {
         INIT2( sad, _sse3 );
@@ -652,7 +649,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT2( sad_x4, _cache64_ssse3 );
         }
     }
-#endif //HAVE_SSE3
 #endif //HAVE_MMX
 #ifdef ARCH_PPC
@@ -240,16 +240,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
         }
     }
 #endif
-#ifdef HAVE_SSE3
     if( cpu&X264_CPU_SSSE3 )
     {
         pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
         pf->quant_4x4 = x264_quant_4x4_ssse3;
         pf->quant_8x8 = x264_quant_8x8_ssse3;
     }
-#endif
 #endif // HAVE_MMX
 #ifdef ARCH_PPC
     if( cpu&X264_CPU_ALTIVEC ) {
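The ssse3 quant kernels enabled above are where pabsw and psignw pay off: quantization scales the magnitude and then restores the sign. A scalar sketch of the arithmetic those kernels vectorize; parameter names here are illustrative, not copied from quant.c:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sketch of the quantization the SSSE3 kernels vectorize:
     * level = sign(coef) * ((abs(coef)*mf + bias) >> 16).
     * pabsw takes the absolute value and psignw reapplies the sign in
     * one instruction each, which is what SSSE3 buys here. */
    static void quant_4x4_ref( int16_t dct[16], const uint16_t mf[16], const uint16_t bias[16] )
    {
        for( int i = 0; i < 16; i++ )
        {
            int v = ( abs( dct[i] ) * mf[i] + bias[i] ) >> 16;
            dct[i] = dct[i] >= 0 ? v : -v;
        }
    }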
@@ -325,7 +325,6 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
     mov [r0+12], r2d
     RET
-%ifdef HAVE_SSE3
 ;-----------------------------------------------------------------------------
 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
 ;-----------------------------------------------------------------------------
@@ -364,4 +363,3 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
     movdqa [r0], xmm0
     movdqa [r0+16], xmm1
     RET
-%endif
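x264_zigzag_sub_4x4_frame_ssse3, now assembled unconditionally, fuses the residual subtraction with the 4x4 frame-order zigzag scan; the coefficient reordering is done with pshufb, which is the SSSE3 dependency. A scalar sketch of the scan-and-subtract core, with the separate FENC/FDEC strides of the real function simplified away:

    #include <stdint.h>

    /* H.264 4x4 frame zigzag order, as used by zigzag_sub_4x4_frame. */
    static const uint8_t zigzag4x4[16] =
        { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

    /* level[i] receives the i-th coefficient of (src - dst) in scan order.
     * Stride handling is omitted; treat this as a reference sketch only. */
    static void zigzag_sub_4x4_ref( int16_t level[16], const uint8_t *src, const uint8_t *dst )
    {
        for( int i = 0; i < 16; i++ )
            level[i] = src[zigzag4x4[i]] - dst[zigzag4x4[i]];
    }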
@@ -275,11 +275,9 @@ cglobal x264_pixel_avg2_w20_%1, 6,7
 %endmacro
 PIXEL_AVG_SSE sse2
-%ifdef HAVE_SSE3
 %define movdqu lddqu
 PIXEL_AVG_SSE sse3
 %undef movdqu
-%endif
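The %define movdqu lddqu / %undef movdqu pair is how one macro body yields two functions: the sse3 instantiation assembles every unaligned 16-byte load as lddqu, which on the Pentium 4 (the main SSE3-but-not-SSSE3 target) avoids movdqu's cacheline-split penalty; on Core 2 lddqu behaves like movdqu again, hence the runtime guard later in this commit. The same trick in C intrinsics, as a sketch rather than x264 code:

    #include <stdint.h>
    #include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_avg_epu8 */
    #include <pmmintrin.h>  /* SSE3: _mm_lddqu_si128 */

    /* One kernel body, two instantiations differing only in the unaligned
     * load, mirroring %define movdqu lddqu. avg2_row is a toy stand-in for
     * a pixel_avg2 row, not x264's actual kernel. */
    #define DECL_AVG2_ROW( name, LOADU ) \
        static void name( uint8_t *dst, const uint8_t *a, const uint8_t *b ) \
        { \
            __m128i va = LOADU( (const __m128i*)a ); \
            __m128i vb = LOADU( (const __m128i*)b ); \
            _mm_storeu_si128( (__m128i*)dst, _mm_avg_epu8( va, vb ) ); \
        }
    DECL_AVG2_ROW( avg2_row_sse2, _mm_loadu_si128 )
    DECL_AVG2_ROW( avg2_row_sse3, _mm_lddqu_si128 )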
 ; Cacheline split code for processors with high latencies for loads
 ; split over cache lines. See sad-a.asm for a more detailed explanation.
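For reference, a 16-byte load is "split over cache lines" when it straddles a 64-byte boundary; that is the slow case these dedicated paths exist to avoid. As a one-line sketch:

    #include <stdint.h>

    /* True if a 16-byte load at p would cross a 64-byte cacheline boundary. */
    static int load16_is_cacheline_split( const void *p )
    {
        return ( (uintptr_t)p & 63 ) > 64 - 16;
    }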
@@ -481,9 +479,7 @@ cglobal %1, 5,7
 COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
 ; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
 ; but with SSE3 the overhead is zero, so there's no reason not to include it.
-%ifdef HAVE_SSE3
 COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
-%endif
 COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
@@ -309,10 +309,8 @@ cglobal x264_hpel_filter_h_sse2, 3,3,1
 %define PALIGNR PALIGNR_SSE2
 HPEL_V sse2
 HPEL_C sse2
-%ifdef HAVE_SSE3
 %define PALIGNR PALIGNR_SSSE3
 HPEL_C ssse3
-%endif
 cglobal x264_sfence
     sfence
@@ -102,9 +102,7 @@ PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_m
 PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
 PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
 PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
-#ifdef HAVE_SSE3
 PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
-#endif
 #define MC_COPY_WTAB(instr, name1, name2, name3)\
 static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -118,9 +116,7 @@ static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, i
 MC_COPY_WTAB(mmx,mmx,mmx,mmx)
 MC_COPY_WTAB(sse2,mmx,mmx,sse2)
-#ifdef HAVE_SSE3
 MC_COPY_WTAB(sse3,mmx,mmx,sse3)
-#endif
 static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
 static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
@@ -155,9 +151,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
 #endif
 MC_LUMA(sse2,sse2,sse2)
 MC_LUMA(cache64_sse2,cache64_sse2,sse2)
-#ifdef HAVE_SSE3
 MC_LUMA(cache64_sse3,cache64_sse3,sse3)
-#endif
 #define GET_REF(name)\
 uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
@@ -190,9 +184,7 @@ GET_REF(cache64_mmxext)
 #endif
 GET_REF(sse2)
 GET_REF(cache64_sse2)
-#ifdef HAVE_SSE3
 GET_REF(cache64_sse3)
-#endif
 #define HPEL(align, cpu, cpuv, cpuc, cpuh)\
 void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -227,9 +219,7 @@ void x264_hpel_filter_##cpu( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_
 HPEL(8, mmxext, mmxext, mmxext, mmxext)
 HPEL(16, sse2_amd, mmxext, mmxext, sse2)
 HPEL(16, sse2, sse2, sse2, sse2)
-#ifdef HAVE_SSE3
 HPEL(16, ssse3, sse2, ssse3, sse2)
-#endif
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
@@ -305,20 +295,16 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     {
         pf->mc_luma = mc_luma_cache64_sse2;
         pf->get_ref = get_ref_cache64_sse2;
-#ifdef HAVE_SSE3
         /* lddqu doesn't work on Core2 */
         if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
         {
             pf->mc_luma = mc_luma_cache64_sse3;
             pf->get_ref = get_ref_cache64_sse3;
         }
-#endif
     }
     if( !(cpu&X264_CPU_SSSE3) )
         return;
-#ifdef HAVE_SSE3
     pf->hpel_filter = x264_hpel_filter_ssse3;
-#endif
 }
@@ -1272,7 +1272,6 @@ SATDS_SSE2 sse2
 SA8D_16x16_32 sse2
 INTRA_SA8D_SSE2 sse2
 INTRA_SATDS_MMX mmxext
-%ifdef HAVE_SSE3
 %define ABS1 ABS1_SSSE3
 %define ABS2 ABS2_SSSE3
 SATDS_SSE2 ssse3
@@ -1280,7 +1279,6 @@ SA8D_16x16_32 ssse3
 INTRA_SA8D_SSE2 ssse3
 INTRA_SATDS_MMX ssse3
 SATD_W4 ssse3 ; mmx, but uses pabsw from ssse3.
-%endif
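ABS1_SSSE3/ABS2_SSSE3 replace the three-instruction SSE2 absolute-value idiom (pxor, psubw, pmaxsw) with a single pabsw; that one instruction is also why the SATD_W4 variant above is "mmx, but uses pabsw". The equivalent in intrinsics, as a sketch:

    #include <emmintrin.h>   /* SSE2 */
    #include <tmmintrin.h>   /* SSSE3: _mm_abs_epi16, i.e. pabsw */

    /* SSE2 has no packed abs: negate into a scratch value and take the
     * signed max, mirroring x264's ABS1 macro. */
    static __m128i abs16_sse2( __m128i x )
    {
        return _mm_max_epi16( x, _mm_sub_epi16( _mm_setzero_si128(), x ) );
    }

    /* SSSE3 does it in one instruction. Both versions map INT16_MIN to
     * itself, since +32768 is unrepresentable. */
    static __m128i abs16_ssse3( __m128i x )
    {
        return _mm_abs_epi16( x );
    }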
@@ -1655,10 +1653,8 @@ cglobal x264_pixel_ads1_%1, 4,7
 %endmacro
 ADS_SSE2 sse2
-%ifdef HAVE_SSE3
 %define ABS1 ABS1_SSSE3
 ADS_SSE2 ssse3
-%endif
 ; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
 ; {
@@ -483,9 +483,7 @@ void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )
 #ifdef ARCH_X86_64
 INTRA_SA8D_X3(sse2)
-#ifdef HAVE_SSE3
 INTRA_SA8D_X3(ssse3)
-#endif
 #else
 INTRA_SA8D_X3(mmxext)
 #endif
@@ -145,11 +145,9 @@ QUANT_DC x264_quant_4x4_dc_sse2, QUANT_MMX, 2, 16
 QUANT_AC x264_quant_4x4_sse2, QUANT_MMX, 2, 16
 QUANT_AC x264_quant_8x8_sse2, QUANT_MMX, 8, 16
-%ifdef HAVE_SSE3
 QUANT_DC x264_quant_4x4_dc_ssse3, QUANT_SSSE3, 2, 16
 QUANT_AC x264_quant_4x4_ssse3, QUANT_SSSE3, 2, 16
 QUANT_AC x264_quant_8x8_ssse3, QUANT_SSSE3, 8, 16
-%endif
@@ -25,7 +25,7 @@
 %include "x86inc.asm"
 SECTION_RODATA
-sw_64: dq 64
+sw_64: dd 64
 SECTION .text
@@ -213,11 +213,9 @@ cglobal x264_pixel_sad_16x8_%1, 4,4
 %endmacro
 SAD_W16 sse2
-%ifdef HAVE_SSE3
 %define movdqu lddqu
 SAD_W16 sse3
 %undef movdqu
-%endif
@@ -613,14 +611,12 @@ SAD_X_SSE2 3, 16, 8, sse2
 SAD_X_SSE2 4, 16, 16, sse2
 SAD_X_SSE2 4, 16, 8, sse2
-%ifdef HAVE_SSE3
 %define movdqu lddqu
 SAD_X_SSE2 3, 16, 16, sse3
 SAD_X_SSE2 3, 16, 8, sse3
 SAD_X_SSE2 4, 16, 16, sse3
 SAD_X_SSE2 4, 16, 8, sse3
 %undef movdqu
-%endif
@@ -961,7 +957,6 @@ SADX34_CACHELINE_FUNC 16, 16, 64, sse2, sse2
 SADX34_CACHELINE_FUNC 16,  8, 64, sse2, sse2
 %endif ; !ARCH_X86_64
-%ifdef HAVE_SSE3
 SAD16_CACHELINE_FUNC ssse3, 8
 SAD16_CACHELINE_FUNC ssse3, 16
 %assign i 1
@@ -971,4 +966,3 @@ SAD16_CACHELINE_LOOP_SSSE3 i
 %endrep
 SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3
 SADX34_CACHELINE_FUNC 16,  8, 64, sse2, ssse3
-%endif ; HAVE_SSE3
@@ -321,12 +321,8 @@ if [ $ARCH = X86 -o $ARCH = X86_64 ] ; then
         echo "yasm prior to 0.6.2 miscompiles PIC. trying nasm instead..."
         AS=nasm
     fi
-    if as_check ; then
+    if as_check "pabsw xmm0, xmm0" ; then
         CFLAGS="$CFLAGS -DHAVE_MMX"
-        if as_check "pabsw xmm0, xmm0" ; then
-            ASFLAGS="$ASFLAGS -DHAVE_SSE3"
-            CFLAGS="$CFLAGS -DHAVE_SSE3"
-        fi
     else
         echo "No suitable assembler found. x264 will be several times slower."
         echo "Please install 'yasm' to get MMX/SSE optimized code."