Commit 30da25a9 authored by Loren Merritt

reduce zigzag arrays from int to int16_t

parent 7a125e4a
......@@ -337,14 +337,14 @@ struct x264_t
/* Current MB DCT coeffs */
/* NOTE(review): this span is rendered diff text, so every array is listed
 * twice: once with element type `int` and once with `int16_t`. Given the
 * commit subject ("reduce zigzag arrays from int to int16_t") the `int` lines
 * appear to be the removed ones -- confirm against the raw patch.
 * The last DECLARE_ALIGNED argument is presumably the alignment in bytes;
 * the macro is defined outside this view. */
struct
{
DECLARE_ALIGNED( int, luma16x16_dc[16], 16 );
DECLARE_ALIGNED( int, chroma_dc[2][4], 16 );
DECLARE_ALIGNED( int16_t, luma16x16_dc[16], 16 );
DECLARE_ALIGNED( int16_t, chroma_dc[2][4], 16 );
// FIXME merge with union
DECLARE_ALIGNED( int, luma8x8[4][64], 16 );
DECLARE_ALIGNED( int16_t, luma8x8[4][64], 16 );
/* residual_ac (15 entries) and luma4x4 (16 entries) share the same storage
 * in each of the 16+8 = 24 block slots. */
union
{
DECLARE_ALIGNED( int, residual_ac[15], 16 );
DECLARE_ALIGNED( int, luma4x4[16], 16 );
DECLARE_ALIGNED( int16_t, residual_ac[15], 16 );
DECLARE_ALIGNED( int16_t, luma4x4[16], 16 );
} block[16+8];
} dct;
......@@ -441,8 +441,8 @@ struct x264_t
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
DECLARE_ALIGNED( uint8_t, i4x4_fdec_buf[16*16], 16 );
DECLARE_ALIGNED( uint8_t, i8x8_fdec_buf[16*16], 16 );
DECLARE_ALIGNED( int, i8x8_dct_buf[3][64], 16 );
DECLARE_ALIGNED( int, i4x4_dct_buf[15][16], 16 );
DECLARE_ALIGNED( int16_t, i8x8_dct_buf[3][64], 16 );
DECLARE_ALIGNED( int16_t, i4x4_dct_buf[15][16], 16 );
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
......
......@@ -458,9 +458,10 @@ void x264_dct_init_weights( void )
}
#define ZIG(i,y,x) level[i] = dct[x][y];
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
static void zigzag_scan_8x8_frame( int level[64], int16_t dct[8][8] )
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
......@@ -480,7 +481,7 @@ static void zigzag_scan_8x8_frame( int level[64], int16_t dct[8][8] )
ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
}
static void zigzag_scan_8x8_field( int level[64], int16_t dct[8][8] )
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)
ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)
......@@ -500,7 +501,10 @@ static void zigzag_scan_8x8_field( int level[64], int16_t dct[8][8] )
ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
}
static void zigzag_scan_4x4_frame( int level[16], int16_t dct[4][4] )
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
......@@ -508,15 +512,16 @@ static void zigzag_scan_4x4_frame( int level[16], int16_t dct[4][4] )
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
}
/*
 * Field-order scan of one 4x4 block: fills level[0..15] from dct.
 * With the ZIG macro in effect here (level[i] = dct[0][x*4+y]) the resulting
 * order, in flat dct indices, is 0,1,4,2,3,5,6,7,8,...,15.
 *
 * NOTE(review): this span is rendered diff text. The `int level[16]` signature
 * and the four rows of sixteen ZIGs appear to be the removed version; the
 * `int16_t level[16]` signature and the chunked-copy body appear to be the
 * added version -- confirm against the raw patch.
 */
static void zigzag_scan_4x4_field( int level[16], int16_t dct[4][4] )
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
{
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
/* Entries 0..1 are already in memory order: copy both as one 32-bit value. */
*(uint32_t*)level = *(uint32_t*)dct;
/* Entries 2..5 are the only ones that get permuted (flat dct 4,2,3,5). */
ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
/* Entries 6..7 (32 bits) and 8..11, 12..15 (64 bits each) are in memory
 * order, so they are copied as wide chunks.
 * NOTE(review): these casts access int16_t storage through uint32_t/uint64_t
 * lvalues and rely on level and dct being suitably aligned -- confirm the
 * build's aliasing settings and that all callers pass aligned buffers. */
*(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
*(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
*(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
}
static void zigzag_scan_4x4ac_frame( int level[15], int16_t dct[4][4] )
static void zigzag_scan_4x4ac_frame( int16_t level[15], int16_t dct[4][4] )
{
ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
......@@ -524,7 +529,7 @@ static void zigzag_scan_4x4ac_frame( int level[15], int16_t dct[4][4] )
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
}
static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] )
static void zigzag_scan_4x4ac_field( int16_t level[15], int16_t dct[4][4] )
{
ZIG( 0,1,0) ZIG( 1,0,1) ZIG( 2,2,0)
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1)
......@@ -533,7 +538,6 @@ static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] )
}
#undef ZIG
#define ZIG(i,y,x) {\
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
......@@ -541,7 +545,7 @@ static void zigzag_scan_4x4ac_field( int level[15], int16_t dct[4][4] )
p_dst[od] = p_src[oe];\
}
static void zigzag_sub_4x4_frame( int level[16], const uint8_t *p_src, uint8_t *p_dst )
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
......@@ -549,7 +553,7 @@ static void zigzag_sub_4x4_frame( int level[16], const uint8_t *p_src, uint8_t *
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
}
static void zigzag_sub_4x4_field( int level[16], const uint8_t *p_src, uint8_t *p_dst )
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
......@@ -557,7 +561,7 @@ static void zigzag_sub_4x4_field( int level[16], const uint8_t *p_src, uint8_t *
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
}
static void zigzag_sub_4x4ac_frame( int level[15], const uint8_t *p_src, uint8_t *p_dst )
static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,0,1) ZIG( 1,1,0) ZIG( 2,2,0)
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
......@@ -565,7 +569,7 @@ static void zigzag_sub_4x4ac_frame( int level[15], const uint8_t *p_src, uint8_t
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
}
static void zigzag_sub_4x4ac_field( int level[15], const uint8_t *p_src, uint8_t *p_dst )
static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
{
ZIG( 0,1,0) ZIG( 1,0,1) ZIG( 2,2,0)
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1)
......@@ -585,12 +589,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_4x4 = zigzag_sub_4x4_field;
pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#ifdef HAVE_MMX
#ifdef ARCH_X86
if( cpu&X264_CPU_MMX )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmx;
#endif
if( cpu&X264_CPU_SSE2 )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
if( cpu&X264_CPU_MMXEXT )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
#endif
#ifdef ARCH_PPC
......
......@@ -108,11 +108,11 @@ typedef struct
/*
 * Table of zigzag function pointers; x264_zigzag_init() assigns the entries
 * (e.g. pf->sub_4x4, pf->scan_4x4).
 *
 * NOTE(review): this span is rendered diff text, so each member is listed
 * twice: once with `int level[]` and once with `int16_t level[]`. The `int`
 * prototypes appear to be the removed ones -- confirm against the raw patch.
 */
typedef struct
{
void (*scan_8x8)( int level[64], int16_t dct[8][8] );
void (*scan_4x4)( int level[16], int16_t dct[4][4] );
void (*scan_4x4ac)( int level[15], int16_t dct[4][4] );
void (*sub_4x4)( int level[16], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4ac)( int level[15], const uint8_t *p_src, uint8_t *p_dst );
/* scan_*: write the coefficients of dct into level in scan order
 * (64, 16 or 15 outputs). */
void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*scan_4x4ac)( int16_t level[15], int16_t dct[4][4] );
/* sub_*: take source and destination pixel pointers instead of a dct block.
 * The visible part of their ZIG macro copies p_src into p_dst; how level is
 * computed is not shown in this view (presumably the pixel difference --
 * TODO confirm). */
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4ac)( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst );
} x264_zigzag_function_t;
......
......@@ -526,35 +526,3 @@ ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] )
;
; Writes the 16 coefficients of dct to level in 4x4 field-scan order, widening
; each 16-bit value to a sign-extended 32-bit value.
; The stack slot [esp+8] is used as the source pointer (dct) and [esp+4] as the
; destination pointer (level).
; Resulting order, in flat dct indices: 0,1,4,2,3,5,6,7,8,...,15.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_field_mmx
mov edx, [esp+8]
mov ecx, [esp+4]
; punpck{l,h}wd interleaves so that each word loaded from memory lands in the
; upper half of a dword; the lower half holds stale register contents that the
; shifts below discard.
punpcklwd mm0, [edx]
punpckhwd mm1, [edx]
punpcklwd mm2, [edx+8]
punpckhwd mm3, [edx+8]
punpcklwd mm4, [edx+16]
punpckhwd mm5, [edx+16]
punpcklwd mm6, [edx+24]
punpckhwd mm7, [edx+24]
; Arithmetic shift right by 16 sign-extends each word into its dword:
; mm0..mm7 now hold dct[0..15] as dwords, two per register.
psrad mm0, 16
psrad mm1, 16
psrad mm2, 16
psrad mm3, 16
psrad mm4, 16
psrad mm5, 16
psrad mm6, 16
psrad mm7, 16
; Straight stores first: level[0..1] and level[4..15] in memory order.
movq [ecx ], mm0
movq [ecx+16], mm2
movq [ecx+24], mm3
movq [ecx+32], mm4
movq [ecx+40], mm5
movq [ecx+48], mm6
movq [ecx+56], mm7
; Order matters from here: these two stores overlap the ones above.
; level[3..4] = dct[2..3] (replaces the dct[4] just written to level[4]).
movq [ecx+12], mm1
; level[2] = dct[4] (low dword of mm2).
movd [ecx+ 8], mm2
ret
......@@ -272,24 +272,21 @@ ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
%endif
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
;
; NOTE(review): this span is rendered diff text holding two routines. The
; _sse2 routine (32-bit output, xmm registers) appears to be the removed
; version and the _mmxext routine (16-bit output, mm registers) its
; replacement; the final RET is shared context -- confirm against the raw patch.
; In both, r1 is read as the source (dct) and r0 is written as the destination
; (level). Output order in flat dct indices: 0,1,4,2,3,5,6,7,8,...,15.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_field_sse2, 2,2
; Each loaded word lands in the upper half of a dword; psrad then sign-extends
; it, leaving xmm0..xmm3 = dct[0..3], dct[4..7], dct[8..11], dct[12..15] as
; dwords.
punpcklwd xmm0, [r1]
punpckhwd xmm1, [r1]
punpcklwd xmm2, [r1+16]
punpckhwd xmm3, [r1+16]
psrad xmm0, 16
psrad xmm1, 16
psrad xmm2, 16
psrad xmm3, 16
; level[0..1] = dct[0..1]; level[4..11] stored in memory order.
movq [r0 ], xmm0
movdqa [r0+16], xmm1
movdqa [r0+32], xmm2
; Bring dct[2..3] down into the low half of xmm0 for the overlapping store.
movhlps xmm0, xmm0
movdqa [r0+48], xmm3
; Order matters: level[3..4] = dct[2..3] replaces the dct[4] written to
; level[4] above, then level[2] = dct[4].
movq [r0+12], xmm0
movd [r0+ 8], xmm1
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
; Load dct[2..5] and shuffle: imm 0xd2 picks source words 2,0,1,3 (low to
; high), giving dct[4],dct[2],dct[3],dct[5] for level[2..5].
pshufw mm0, [r1+4], 0xd2
; dct[8..11] and dct[12..15] are already in scan order.
movq mm1, [r1+16]
movq mm2, [r1+24]
movq [r0+4], mm0
movq [r0+16], mm1
movq [r0+24], mm2
; level[0..1] and level[6..7] are also in memory order: copy them as 32-bit
; values through the scratch register r2d.
mov r2d, [r1]
mov [r0], r2d
mov r2d, [r1+12]
mov [r0+12], r2d
RET
......@@ -46,7 +46,6 @@ void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2
void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
void x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmx( int level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
#endif
......@@ -707,7 +707,7 @@ static const int last_coeff_flag_offset_8x8[63] = {
static const int identity[16] =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int *l, int i_count )
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int i_ctxBlockCat, int i_idx, int16_t *l, int i_count )
{
const int i_ctx_sig = significant_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
const int i_ctx_last = last_coeff_flag_offset[h->mb.b_interlaced][i_ctxBlockCat];
......
......@@ -63,7 +63,7 @@ static inline void bs_write_vlc( bs_t *s, vlc_t v )
/****************************************************************************
* block_residual_write_cavlc:
****************************************************************************/
static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int *l, int i_count )
static void block_residual_write_cavlc( x264_t *h, bs_t *s, int i_idx, int16_t *l, int i_count )
{
int level[16], run[16];
int i_total, i_trailing;
......
......@@ -26,7 +26,7 @@
#define ZIG(i,y,x) level[i] = dct[x][y];
static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] )
static inline void zigzag_scan_2x2_dc( int16_t level[4], int16_t dct[2][2] )
{
ZIG(0,0,0)
ZIG(1,0,1)
......@@ -43,7 +43,7 @@ static inline void zigzag_scan_2x2_dc( int level[4], int16_t dct[2][2] )
* for the complete mb: if score < 6 -> null
* chroma: for the complete mb: if score < 7 -> null
*/
static int x264_mb_decimate_score( int *dct, int i_max )
static int x264_mb_decimate_score( int16_t *dct, int i_max )
{
static const int i_ds_table4[16] = {
3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0 };
......@@ -618,7 +618,7 @@ int x264_macroblock_probe_skip( x264_t *h, const int b_bidir )
{
DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
DECLARE_ALIGNED( int16_t, dct2x2[2][2], 16 );
DECLARE_ALIGNED( int, dctscan[16], 16 );
DECLARE_ALIGNED( int16_t, dctscan[16], 16 );
int i_qp = h->mb.i_qp;
int mvp[2];
......
......@@ -65,7 +65,7 @@ static inline int array_non_zero_int( void *v, int i_count )
return 0;
}
static inline int array_non_zero_count( int *v, int i_count )
static inline int array_non_zero_count( int16_t *v, int i_count )
{
int i;
int i_nz;
......
......@@ -347,8 +347,8 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_function_t zigzag_ref;
x264_zigzag_function_t zigzag_asm;
int32_t level1[64] __attribute__((aligned(16)));
int32_t level2[64] __attribute__((aligned(16)));
int16_t level1[64] __attribute__((aligned(16)));
int16_t level2[64] __attribute__((aligned(16)));
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \
......@@ -356,7 +356,7 @@ static int check_dct( int cpu_ref, int cpu_new )
used_asm = 1; \
call_c( zigzag_c.name, t1, dct ); \
call_a( zigzag_asm.name, t2, dct ); \
if( memcmp( t1, t2, size ) ) \
if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
......@@ -371,7 +371,7 @@ static int check_dct( int cpu_ref, int cpu_new )
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
call_c( zigzag_c.name, t1, buf2, buf3 ); \
call_a( zigzag_asm.name, t2, buf2, buf4 ); \
if( memcmp( t1, t2, size )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
......@@ -383,11 +383,11 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_init( cpu_new, &zigzag_asm, 0 );
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 );
TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 );
TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 );
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15 );
report( "zigzag_frame :" );
x264_zigzag_init( 0, &zigzag_c, 1 );
......@@ -395,11 +395,11 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_init( cpu_new, &zigzag_asm, 1 );
ok = 1; used_asm = 0;
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 );
TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 );
TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 );
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15 );
report( "zigzag_field :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment