Commit d94edd73 authored by Loren Merritt, committed by Fiona Glaser

Optimize x86 intra_predict_4x4 and 8x8

High bit depth Penryn, Sandybridge cycles:
4x4_ddl: 11->10,  9-> 8
4x4_ddr: 15->13, 12->11
4x4_hd:        , 15->12
4x4_hu:        , 14->13
4x4_vr:  15->14, 14->12
8x8_ddl: 32->19, 19->14
8x8_ddr: 42->19, 21->14
8x8_hd:        , 15->13
8x8_hu:  21->17, 16->12
8x8_vr:  33->19,

8-bit Penryn, Sandybridge cycles:
4x4_ddr: 24->15,
4x4_hd:  24->16,
4x4_hu:  23->15,
4x4_vr:  23->16,
4x4_vl:  10-> 9,
8x8_ddl: 23->15,
8x8_hd:        , 17->14
8x8_hu:        , 15->14
8x8_vr:  20->16, 17->13
parent 37b2d963
common/osdep.h
@@ -94,24 +94,27 @@
// - armcc can't either, but is nice enough to actually tell you so
// - Apple gcc only maintains 4 byte alignment
// - llvm can align the stack, but only in svn and (unrelated) it exposes bugs in all released GNU binutils...
#define ALIGNED_ARRAY_EMU( mask, type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + mask]; \
    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)
#if ARCH_ARM && SYS_MACOSX
#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 7]; \
    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+7) & ~7)
#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
    ALIGNED_8( type name sub1 __VA_ARGS__ )
#endif
#if ARCH_ARM
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
    uint8_t name##_u [sizeof(type sub1 __VA_ARGS__) + 15];\
    type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+15) & ~15)
#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
    ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif
#define ALIGNED_ARRAY_32( ... ) ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ )
#define UNINIT(x) x=x
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
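
The emulation macros above use the classic over-allocate-and-mask trick: reserve mask extra bytes, then round the pointer up to the next (mask+1)-byte boundary. A minimal standalone sketch of the same arithmetic (names here are illustrative, not x264's):

/* Over-allocate by mask bytes, then align: adding mask and clearing the low
 * bits rounds the pointer up to the next 32-byte boundary while staying
 * inside the buffer. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const intptr_t mask = 31;            /* 31 -> 32-byte alignment, as in ALIGNED_ARRAY_32 */
    uint8_t buf_u[64 + 31];              /* raw storage, alignment unknown */
    uint8_t *buf = (uint8_t*)((intptr_t)(buf_u + mask) & ~mask);
    printf( "32-byte aligned: %d\n", (int)(((intptr_t)buf & mask) == 0) );
    return 0;                            /* buf has at least 64 usable bytes */
}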
common/x86/const-a.asm
@@ -59,5 +59,4 @@ const pd_ffff, times 4 dd 0xffff
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
const pb_reverse, db 7, 6, 5, 4, 3, 2, 1, 0
const sw_64, dd 64
common/x86/predict-a.asm
(diff collapsed)
common/x86/predict.h
@@ -69,15 +69,19 @@
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
@@ -99,6 +103,7 @@
void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );
common/x86/predict-c.c
@@ -466,13 +471,19 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
    *predict_8x8_filter = x264_predict_8x8_filter_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3;
    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
    if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3_cache64;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3_cache64;
    }
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
@@ -501,7 +512,8 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
    if( !(cpu&X264_CPU_AVX) )
@@ -532,21 +544,26 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_sse2;
    pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx;
#if ARCH_X86_64
    pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx;
#endif
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx;
#else
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2;
#endif // HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
    if( cpu&X264_CPU_CACHELINE_64 )
        pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64;
#endif // HIGH_BIT_DEPTH
}
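
All of the init functions above follow the same dispatch idiom: assign the baseline implementation first, bail out as soon as a required CPU flag is missing, and let each later tier overwrite the function pointer, so the strongest supported version wins. A reduced sketch of the pattern (names are illustrative, not x264's):

/* Sketch of the function-pointer dispatch used by the init functions:
 * later tiers overwrite earlier ones, and an early return stops the
 * upgrade chain at the first missing CPU feature. */
enum { CPU_MMX2 = 1, CPU_SSSE3 = 2, CPU_AVX = 4 };
typedef void (*predict_fn)( unsigned char *src );

static void predict_mmx2( unsigned char *src )  { (void)src; }
static void predict_ssse3( unsigned char *src ) { (void)src; }
static void predict_avx( unsigned char *src )   { (void)src; }

static void init_predict( int cpu, predict_fn *pf )
{
    if( !(cpu & CPU_MMX2) )
        return;
    *pf = predict_mmx2;          /* baseline */
    if( !(cpu & CPU_SSSE3) )
        return;
    *pf = predict_ssse3;         /* overwrites the mmx2 pointer */
    if( !(cpu & CPU_AVX) )
        return;
    *pf = predict_avx;           /* last writer wins */
}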
common/x86/x86util.asm
@@ -322,6 +322,22 @@
%endif
%endmacro
; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
; values shifted in are undefined
; faster if dst==src
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift
%macro PSXLPIX 5
%if mmsize == 8
    %if %5&1
        ps%1lq %3, %4, %5*8
    %else
        pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff
    %endif
%else
    ps%1ldq %3, %4, %5*2
%endif
%endmacro
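
The pshufw path above handles the even shifts: q3210 (0xE4) is the identity word selector, and since each word costs 2 selector bits, shifting the selector by n bits reorders the register contents by n bytes. The immediate expression can be checked in plain C (an illustrative model, not x264 code):

/* Model of pshufw: dst word i = src word (imm >> 2*i) & 3.  Verifies the
 * immediates PSXLPIX computes for even byte shifts; shifted-in words are
 * undefined, matching the macro's contract. */
#include <assert.h>
#include <stdint.h>

static void pshufw_model( uint16_t dst[4], const uint16_t src[4], unsigned imm )
{
    uint16_t tmp[4];
    for( int i = 0; i < 4; i++ )
        tmp[i] = src[(imm >> 2*i) & 3];
    for( int i = 0; i < 4; i++ )
        dst[i] = tmp[i];
}

int main(void)
{
    const unsigned q3210 = 0xE4;           /* words in order 3,2,1,0: identity */
    uint16_t src[4] = { 10, 20, 30, 40 }, dst[4];

    /* PSRLPIX by 2 bytes (dir %2 = +1): imm = (q3210<<8 >> (8+2)) & 0xff */
    pshufw_model( dst, src, (q3210 << 8 >> 10) & 0xff );
    assert( dst[0] == 20 && dst[1] == 30 && dst[2] == 40 ); /* dst[3] undefined */

    /* PSLLPIX by 4 bytes (dir %2 = -1): imm = (q3210<<8 >> (8-4)) & 0xff */
    pshufw_model( dst, src, (q3210 << 8 >> 4) & 0xff );
    assert( dst[2] == 10 && dst[3] == 20 ); /* dst[0..1] undefined */
    return 0;
}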
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
encoder/analyse.c
@@ -834,7 +834,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
    /* 8x8 prediction selection */
    if( flags & X264_ANALYSE_I8x8 )
    {
        ALIGNED_ARRAY_16( pixel, edge,[36] );
        ALIGNED_ARRAY_32( pixel, edge,[36] );
        x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
        int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
@@ -1189,7 +1189,7 @@ static void x264_intra_rd_refine( x264_t *h, x264_mb_analysis_t *a )
    }
    else if( h->mb.i_type == I_8x8 )
    {
        ALIGNED_ARRAY_16( pixel, edge,[3],[48] );
        ALIGNED_ARRAY_32( pixel, edge,[4],[32] ); // really [3][36], but they can overlap
        pixel4 pels_h[3][2] = {{0}};
        pixel pels_v[3][7] = {{0}};
        uint16_t nnz[3][2] = {{0}}; //shut up gcc
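
The [4][32] shape is worth spelling out: three 36-pixel edges are stored at a 32-pixel stride, so they start at offsets 0, 32 and 64 and the last one ends at entry 100, inside the 128 entries allocated. Neighbouring edges overlap by 4 entries, which the in-code comment notes is acceptable here, and the power-of-two stride keeps every row as aligned as the base array. A quick check of that arithmetic (illustrative, not x264 code):

/* Three 36-entry edges at a 32-entry stride fit in a [4][32] array:
 * the last edge starts at 2*32 = 64 and ends at 64+36 = 100 <= 128. */
#include <assert.h>

int main(void)
{
    enum { ROWS = 4, STRIDE = 32, EDGES = 3, EDGE_LEN = 36 };
    assert( (EDGES - 1) * STRIDE + EDGE_LEN <= ROWS * STRIDE );
    /* Neighbouring edges share EDGE_LEN - STRIDE = 4 entries; per the
     * in-code comment, that overlap is acceptable in this use. */
    assert( EDGE_LEN - STRIDE == 4 );
    return 0;
}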
encoder/macroblock.c
@@ -171,7 +171,7 @@ void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel
    pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
    pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
    ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
    ALIGNED_ARRAY_16( pixel, edge_buf,[36] );
    ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
    if( !edge )
    {
encoder/rdo.c
@@ -273,7 +273,7 @@ uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i4, int i_pixel )
    return (i_ssd<<8) + i_bits;
}

static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[3][48] )
static uint64_t x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode, pixel edge[4][32] )
{
    uint64_t i_ssd, i_bits;
    int plane_count = CHROMA444 ? 3 : 1;
tools/checkasm.c
@@ -1784,8 +1784,8 @@ static int check_quant( int cpu_ref, int cpu_new )
static int check_intra( int cpu_ref, int cpu_new )
{
    int ret = 0, ok = 1, used_asm = 0;
    ALIGNED_16( pixel edge[36] );
    ALIGNED_16( pixel edge2[36] );
    ALIGNED_ARRAY_32( pixel, edge,[36] );
    ALIGNED_ARRAY_32( pixel, edge2,[36] );
    ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
    struct
    {
@@ -1871,8 +1871,8 @@ static int check_intra( int cpu_ref, int cpu_new )
        if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) )
            continue;
        int neighbor = (i&24)>>1;
        memset( edge, 0, 36*sizeof(pixel) );
        memset( edge2, 0, 36*sizeof(pixel) );
        call_c( ip_c.predict_8x8_filter, pbuf1+48, edge, neighbor, i&7 );
        call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 );
        if( !(neighbor&MB_TOPLEFT) )
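
The memset size change above is not cosmetic: once edge is declared with ALIGNED_ARRAY_32, the name expands (via ALIGNED_ARRAY_EMU) to a pointer into an over-allocated byte buffer, so sizeof(edge) yields the size of a pointer rather than 36 pixels. A minimal model of the pitfall (hand-expanded, illustrative names):

/* Hand-expanded equivalent of ALIGNED_ARRAY_32( pixel, edge,[36] ) in the
 * emulated case: edge is a pointer, so sizeof(edge) is sizeof(void*). */
#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t pixel;

int main(void)
{
    uint8_t edge_u[sizeof(pixel [36]) + 31];
    pixel *edge = (void*)((intptr_t)(edge_u + 31) & ~(intptr_t)31);

    assert( sizeof(edge) == sizeof(void*) );  /* not 36*sizeof(pixel)! */
    memset( edge, 0, 36*sizeof(pixel) );      /* hence the explicit size */
    return 0;
}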