Commit 80ea99c0 authored by Fiona Glaser

Phenom CPU optimizations

Faster hpel_filter by using unaligned loads instead of emulated PALIGNR
Faster hpel_filter on 64-bit by using the 32-bit version (the cost of emulated PALIGNR is high enough that the savings from caching intermediate values are not worth it).
Add support for misaligned_mask on Phenom (sketched in C below): ~2% faster hpel_filter, ~4% faster width16 multisad, 7% faster width20 get_ref.
Replace width12 mmx with width16 sse on Phenom and Nehalem: 32% faster width12 get_ref on Phenom.
Merge cpu-32.asm and cpu-64.asm
Thanks to Easy123 for contributing a Phenom box for a weekend so I could write these optimizations.
parent 7df060be
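
As background for the diff below: SSE4a (CPUID leaf 0x80000001, ECX bit 6) identifies CPUs such as Phenom that support AMD's misaligned SSE mode, which is enabled by setting the MM bit (bit 17) of MXCSR; that is exactly what the changes to x264_cpu_detect() and the new x264_cpu_mask_misalign_sse do. Here is a minimal C sketch of the same idea, assuming GCC/Clang on x86; the helpers cpu_has_sse4a and enable_misaligned_sse are hypothetical names, not part of x264:

#include <cpuid.h>        /* __get_cpuid (GCC/Clang) */
#include <xmmintrin.h>    /* _mm_getcsr / _mm_setcsr */

/* Hypothetical helper mirroring the SSE4a test in x264_cpu_detect():
 * CPUID leaf 0x80000001 reports SSE4a in ECX bit 6. */
static int cpu_has_sse4a( void )
{
    unsigned int eax, ebx, ecx, edx;
    if( !__get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
        return 0;
    return (ecx >> 6) & 1;
}

/* C equivalent of x264_cpu_mask_misalign_sse: set MXCSR bit 17 (the MM,
 * "misaligned exception mask" bit) so SSE instructions accept unaligned
 * memory operands instead of faulting. Only safe on CPUs with misaligned
 * SSE support; elsewhere bit 17 is reserved and writing it faults. */
static void enable_misaligned_sse( void )
{
    _mm_setcsr( _mm_getcsr() | (1u << 17) );
}

int main( void )
{
    if( cpu_has_sse4a() )
        enable_misaligned_sse();
    return 0;
}

With MM set, instructions such as psadbw, pavgb and paddw may take unaligned memory operands directly, which is what the new _misalign assembly paths in this commit rely on.
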
......@@ -23,7 +23,7 @@ endif
ifneq ($(AS),)
X86SRC0 = cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-32.asm dct-32.asm
cpu-a.asm dct-32.asm
X86SRC = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
......
......@@ -52,12 +52,15 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"", 0},
};
#ifdef HAVE_MMX
extern int x264_cpu_cpuid_test( void );
extern void x264_cpu_mask_misalign_sse( void );
extern uint32_t x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
uint32_t x264_cpu_detect( void )
......@@ -111,7 +114,11 @@ uint32_t x264_cpu_detect( void )
if( cpu & X264_CPU_SSE2 )
{
if( ecx&0x00000040 ) /* SSE4a */
{
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
else
cpu |= X264_CPU_SSE2_IS_SLOW;
}
......
......@@ -689,6 +689,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _cache64_sse2 );
}
#endif
if( cpu&X264_CPU_SSE_MISALIGN )
{
INIT2( sad_x3, _sse2_misalign );
INIT2( sad_x4, _sse2_misalign );
}
}
if( cpu&X264_CPU_SSE2 )
{
......
;*****************************************************************************
;* cpu-64.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
SECTION .text
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid
push rbx
mov r10, r3
mov r11, r2
mov r9, r1
mov eax, r0d
cpuid
mov [r9], eax
mov [r11], ebx
mov [r10], ecx
mov [r8], edx
pop rbx
ret
;-----------------------------------------------------------------------------
; void x264_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
emms
ret
......@@ -25,6 +25,26 @@
SECTION .text
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
cglobal x264_cpu_cpuid
push rbx
mov r10, r3
mov r11, r2
mov r9, r1
mov eax, r0d
cpuid
mov [r9], eax
mov [r11], ebx
mov [r10], ecx
mov [r8], edx
pop rbx
ret
%else
;-----------------------------------------------------------------------------
; int x264_cpu_cpuid_test( void )
; return 0 if unsupported
......@@ -67,13 +87,6 @@ cglobal x264_cpu_cpuid, 0,6
mov [esi], edx
RET
;-----------------------------------------------------------------------------
; void x264_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
emms
ret
;-----------------------------------------------------------------------------
; void x264_stack_align( void (*func)(void*), void *arg );
;-----------------------------------------------------------------------------
......@@ -88,4 +101,22 @@ cglobal x264_stack_align
call ecx
leave
ret
%endif
;-----------------------------------------------------------------------------
; void x264_emms( void )
;-----------------------------------------------------------------------------
cglobal x264_emms
emms
ret
;-----------------------------------------------------------------------------
; void x264_cpu_mask_misalign_sse(void)
;-----------------------------------------------------------------------------
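; Sets bit 17 of MXCSR (the MM, misaligned exception mask, bit); with it set,
; SSE instructions on CPUs that support AMD's misaligned SSE mode (Phenom)
; accept unaligned memory operands instead of faulting.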
cglobal x264_cpu_mask_misalign_sse
sub rsp, 4
stmxcsr [rsp]
or dword [rsp], 1<<17
ldmxcsr [rsp]
add rsp, 4
ret
......@@ -386,18 +386,24 @@ cglobal x264_pixel_avg2_w16_sse2, 6,7
jg .height_loop
REP_RET
cglobal x264_pixel_avg2_w20_sse2, 6,7
%macro AVG2_W20 1
cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
movdqu xmm0, [r2]
movdqu xmm2, [r2+r3]
movdqu xmm1, [r2+r4]
movdqu xmm3, [r2+r6]
movd mm4, [r2+16]
movd mm5, [r2+r3+16]
%ifidn %1, sse2_misalign
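; misaligned-SSE path: pavgb reads its second operand straight from (possibly
; unaligned) memory, saving the two movdqu loads used in the other branch;
; legal only with MXCSR.MM set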
pavgb xmm0, [r2+r4]
pavgb xmm2, [r2+r6]
%else
movdqu xmm1, [r2+r4]
movdqu xmm3, [r2+r6]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
%endif
pavgb mm4, [r2+r4+16]
pavgb mm5, [r2+r6+16]
movdqa [r0], xmm0
......@@ -409,6 +415,10 @@ cglobal x264_pixel_avg2_w20_sse2, 6,7
sub r5d, 2
jg .height_loop
REP_RET
%endmacro
AVG2_W20 sse2
AVG2_W20 sse2_misalign
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
......
......@@ -249,6 +249,14 @@ cglobal x264_hpel_filter_c_%1, 3,3
%define tpw_32 [pw_32 GLOBAL]
%endif
.loop:
%ifidn %1,sse2_misalign
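; misaligned-SSE path: unaligned movu loads plus paddw with unaligned memory
; operands (requires MXCSR.MM) replace the aligned-load + emulated-PALIGNR
; approach used otherwise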
movu m0, [src-4]
movu m1, [src-2]
mova m2, [src]
paddw m0, [src+6]
paddw m1, [src+4]
paddw m2, [src+2]
%else
mova m6, [src-16]
mova m2, [src]
mova m3, [src+16]
......@@ -264,6 +272,7 @@ cglobal x264_hpel_filter_c_%1, 3,3
paddw m2, m3
paddw m1, m4
paddw m0, m5
%endif
FILT_H m0, m1, m2
paddw m0, tpw_32
psraw m0, 6
......@@ -322,6 +331,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3
jl .loop
REP_RET
%ifndef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_hpel_filter_h_ssse3( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
......@@ -387,11 +397,14 @@ cglobal x264_hpel_filter_h_ssse3, 3,3
jl .loop
REP_RET
%endif
%define PALIGNR PALIGNR_MMX
HPEL_V sse2
%ifndef ARCH_X86_64
HPEL_C sse2
%endif
HPEL_V sse2
HPEL_C sse2_misalign
%define PALIGNR PALIGNR_SSSE3
HPEL_C ssse3
......
......@@ -83,6 +83,7 @@ PIXEL_AVG_WALL(cache32_mmxext)
PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
......@@ -98,13 +99,15 @@ static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *,
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
#define x264_pixel_avg2_w12_sse2 x264_pixel_avg2_w16_sse2
PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
#ifdef ARCH_X86
PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
#endif
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
......@@ -184,6 +187,7 @@ GET_REF(cache32_mmxext)
GET_REF(cache64_mmxext)
#endif
GET_REF(sse2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
......@@ -225,6 +229,7 @@ void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, sse2, ssse3, ssse3)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
......@@ -293,6 +298,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_SSE_MISALIGN )
pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
......@@ -305,6 +312,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
}
if( cpu&X264_CPU_SSE_MISALIGN )
pf->get_ref = get_ref_sse2_misalign;
}
if( !(cpu&X264_CPU_SSSE3) )
......
......@@ -42,6 +42,7 @@
DECL_X1( sad, mmxext )
DECL_X1( sad, sse2 )
DECL_X4( sad, sse2_misalign )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X4( sad, mmxext )
......
......@@ -830,6 +830,80 @@ SAD_X 4, 4, 4
RET
%endmacro
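; The _misalign SAD variants rely on MXCSR.MM being set (see
; x264_cpu_mask_misalign_sse): psadbw takes one unaligned reference row
; directly as a memory operand (e.g. psadbw xmm2, [r3]) instead of needing
; a separate load or cacheline-split workaround.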
%macro SAD_X3_START_1x16P_SSE2_MISALIGN 0
movdqa xmm2, [r0]
movdqu xmm0, [r1]
movdqu xmm1, [r2]
psadbw xmm0, xmm2
psadbw xmm1, xmm2
psadbw xmm2, [r3]
%endmacro
%macro SAD_X3_1x16P_SSE2_MISALIGN 2
movdqa xmm3, [r0+%1]
movdqu xmm4, [r1+%2]
movdqu xmm5, [r2+%2]
psadbw xmm4, xmm3
psadbw xmm5, xmm3
psadbw xmm3, [r3+%2]
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm3
%endmacro
%macro SAD_X4_START_1x16P_SSE2_MISALIGN 0
movdqa xmm3, [r0]
movdqu xmm0, [r1]
movdqu xmm1, [r2]
movdqu xmm2, [r3]
psadbw xmm0, xmm3
psadbw xmm1, xmm3
psadbw xmm2, xmm3
psadbw xmm3, [r4]
%endmacro
%macro SAD_X4_1x16P_SSE2_MISALIGN 2
movdqa xmm7, [r0+%1]
movdqu xmm4, [r1+%2]
movdqu xmm5, [r2+%2]
movdqu xmm6, [r3+%2]
psadbw xmm4, xmm7
psadbw xmm5, xmm7
psadbw xmm6, xmm7
psadbw xmm7, [r4+%2]
paddw xmm0, xmm4
paddw xmm1, xmm5
paddw xmm2, xmm6
paddw xmm3, xmm7
%endmacro
%macro SAD_X3_2x16P_SSE2_MISALIGN 1
%if %1
SAD_X3_START_1x16P_SSE2_MISALIGN
%else
SAD_X3_1x16P_SSE2_MISALIGN 0, 0
%endif
SAD_X3_1x16P_SSE2_MISALIGN FENC_STRIDE, r4
add r0, 2*FENC_STRIDE
lea r1, [r1+2*r4]
lea r2, [r2+2*r4]
lea r3, [r3+2*r4]
%endmacro
%macro SAD_X4_2x16P_SSE2_MISALIGN 1
%if %1
SAD_X4_START_1x16P_SSE2_MISALIGN
%else
SAD_X4_1x16P_SSE2_MISALIGN 0, 0
%endif
SAD_X4_1x16P_SSE2_MISALIGN FENC_STRIDE, r5
add r0, 2*FENC_STRIDE
lea r1, [r1+2*r5]
lea r2, [r2+2*r5]
lea r3, [r3+2*r5]
lea r4, [r4+2*r5]
%endmacro
;-----------------------------------------------------------------------------
; void x264_pixel_sad_x3_16x16_sse2( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1,
; uint8_t *pix2, int i_stride, int scores[3] )
......@@ -843,6 +917,15 @@ cglobal x264_pixel_sad_x%1_%2x%3_%4, 2+%1,2+%1
SAD_X%1_END_SSE2
%endmacro
%macro SAD_X_SSE2_MISALIGN 4
cglobal x264_pixel_sad_x%1_%2x%3_%4_misalign, 2+%1,2+%1
SAD_X%1_2x%2P_SSE2_MISALIGN 1
%rep %3/2-1
SAD_X%1_2x%2P_SSE2_MISALIGN 0
%endrep
SAD_X%1_END_SSE2
%endmacro
SAD_X_SSE2 3, 16, 16, sse2
SAD_X_SSE2 3, 16, 8, sse2
SAD_X_SSE2 3, 8, 16, sse2
......@@ -854,6 +937,11 @@ SAD_X_SSE2 4, 8, 16, sse2
SAD_X_SSE2 4, 8, 8, sse2
SAD_X_SSE2 4, 8, 4, sse2
SAD_X_SSE2_MISALIGN 3, 16, 16, sse2
SAD_X_SSE2_MISALIGN 3, 16, 8, sse2
SAD_X_SSE2_MISALIGN 4, 16, 16, sse2
SAD_X_SSE2_MISALIGN 4, 16, 8, sse2
%define movdqu lddqu
SAD_X_SSE2 3, 16, 16, sse3
SAD_X_SSE2 3, 16, 8, sse3
......@@ -869,8 +957,8 @@ SAD_X_SSE2 4, 16, 8, sse3
; Core2 (Conroe) can load unaligned data just as quickly as aligned data...
; unless the unaligned data spans the border between 2 cachelines, in which
; case it's really slow. The exact numbers may differ, but all Intel cpus
; have a large penalty for cacheline splits.
; case it's really slow. The exact numbers may differ, but all Intel cpus prior
; to Nehalem have a large penalty for cacheline splits.
; (8-byte alignment exactly half way between two cachelines is ok though.)
; LDDQU was supposed to fix this, but it only works on Pentium 4.
; So in the split case we load aligned data and explicitly perform the
......
......@@ -154,7 +154,8 @@ static void print_bench(void)
b->cpu&X264_CPU_SSE2 ? "sse2" :
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : "",
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
......@@ -1262,6 +1263,12 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE2_IS_FAST, "SSE2Fast" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSE2Fast Cache64" );
}
if( x264_cpu_detect() & X264_CPU_SSE_MISALIGN )
{
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
if( x264_cpu_detect() & X264_CPU_SSE3 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )
......
......@@ -61,6 +61,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_STACK_MOD4 0x001000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
/* Analyse flags
*/
......