Commit 1fbba0ca authored by Loren Merritt

change all dct arrays to 1d.

The C standard doesn't allow you to iterate one-dimensionally over 2-D arrays, and nothing other than the DSP functions themselves cares about the 2-D-ness of dct.
This fixes a miscompilation in x264_mb_optimize_chroma_dc.
parent 507c8342
......@@ -23,27 +23,27 @@
/* Prototypes for the *_neon DCT, IDCT and zigzag routines.
 *
 * NOTE(review): this listing is diff output whose +/- markers were stripped.
 * Most declarations therefore appear twice: first with the old
 * multi-dimensional dct parameter (e.g. d[4][4], dct[4][4][4], dct[8][8]) and
 * then with the flattened replacement (d[16], dct[4][16], dct[64]).  The two
 * *_dct_dc_neon prototypes taking dct[2][2] appear only once here. */
#ifndef X264_ARM_DCT_H
#define X264_ARM_DCT_H
/* 4x4 DC transforms, operating in place on d. */
void x264_dct4x4dc_neon( int16_t d[4][4] );
void x264_idct4x4dc_neon( int16_t d[4][4] );
void x264_dct4x4dc_neon( int16_t d[16] );
void x264_idct4x4dc_neon( int16_t d[16] );
/* sub*_dct: take two pixel pointers and fill dct. */
void x264_sub4x4_dct_neon( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_neon( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_neon( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
/* add*_idct: take a destination pixel pointer and the coefficients. */
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][4][4] );
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
/* DC-only variants. */
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_sub8x8_dct_dc_neon( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 );
/* 8x8 transform variants. */
void x264_sub8x8_dct8_neon( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[8][8] );
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][8][8] );
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
/* Zigzag scan from dct into level[16]. */
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
#endif
......@@ -63,7 +63,7 @@ pmovmskb_byte:
bx lr
.endm
// quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
function x264_quant_2x2_dc_neon, export=1
vld1.64 {d0}, [r0,:64]
vabs.s16 d3, d0
......@@ -79,7 +79,7 @@ function x264_quant_2x2_dc_neon, export=1
QUANT_END d3
.endfunc
// quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
function x264_quant_4x4_dc_neon, export=1
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
......@@ -91,7 +91,7 @@ function x264_quant_4x4_dc_neon, export=1
QUANT_END d0
.endfunc
// quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
function x264_quant_4x4_neon, export=1
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
......@@ -103,7 +103,7 @@ function x264_quant_4x4_neon, export=1
QUANT_END d0
.endfunc
// quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
function x264_quant_8x8_neon, export=1
vld1.64 {d28-d31}, [r0,:128]
vabs.s16 q8, q14
......@@ -137,7 +137,7 @@ function x264_quant_8x8_neon, export=1
subs r3, r3, #\offset // 6 for 8x8
.endm
// dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
.macro DEQUANT size bits
function x264_dequant_\size\()_neon, export=1
DEQUANT_START \bits+2, \bits
......@@ -219,7 +219,7 @@ dequant_\size\()_rshift_loop:
DEQUANT 4x4, 4
DEQUANT 8x8, 6
// dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
function x264_dequant_4x4_dc_neon, export=1
DEQUANT_START 6, 6, yes
blt dequant_4x4_dc_rshift
......
......@@ -23,16 +23,16 @@
#ifndef X264_ARM_QUANT_H
#define X264_ARM_QUANT_H
int x264_quant_2x2_dc_armv6( int16_t dct[2][2], int mf, int bias );
int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
int x264_quant_2x2_dc_neon( int16_t dct[2][2], int mf, int bias );
int x264_quant_4x4_dc_neon( int16_t dct[4][4], int mf, int bias );
int x264_quant_4x4_neon( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_neon( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
void x264_dequant_4x4_dc_neon( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_4x4_neon( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
int x264_coeff_last4_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
......
......@@ -341,9 +341,9 @@ struct x264_t
x264_pps_t *pps;
int i_idr_pic_id;
/* quantization matrix for decoding, [cqm][qp%6][coef_y][coef_x] */
int (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
int (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
/* quantization matrix for decoding, [cqm][qp%6][coef] */
int (*dequant4_mf[4])[16]; /* [4][6][16] */
int (*dequant8_mf[2])[64]; /* [2][6][64] */
/* quantization matrix for trellis, [cqm][qp][coef] */
int (*unquant4_mf[4])[16]; /* [4][52][16] */
int (*unquant8_mf[2])[64]; /* [2][52][64] */
......
......@@ -35,75 +35,71 @@
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/*
* XXX For all dct dc : input could be equal to output so ...
*/
/* In-place 4x4 DC transform of d.
 *
 * NOTE(review): this listing is diff output without +/- markers.  Every line
 * using 2-D indexing (d[i][j], tmp[i][j]) is the removed version; the flat
 * d[i*4+j] / tmp[i*4+j] line group that follows it is the replacement. */
static void dct4x4dc( int16_t d[4][4] )
static void dct4x4dc( int16_t d[16] )
{
int16_t tmp[4][4];
int16_t tmp[16];
int s01, s23;
int d01, d23;
int i;
/* Pass 1: sum/difference butterfly over the four entries of row i of d;
 * the results are stored transposed, i.e. into column i of tmp. */
for( i = 0; i < 4; i++ )
{
s01 = d[i][0] + d[i][1];
d01 = d[i][0] - d[i][1];
s23 = d[i][2] + d[i][3];
d23 = d[i][2] - d[i][3];
tmp[0][i] = s01 + s23;
tmp[1][i] = s01 - s23;
tmp[2][i] = d01 - d23;
tmp[3][i] = d01 + d23;
s01 = d[i*4+0] + d[i*4+1];
d01 = d[i*4+0] - d[i*4+1];
s23 = d[i*4+2] + d[i*4+3];
d23 = d[i*4+2] - d[i*4+3];
tmp[0*4+i] = s01 + s23;
tmp[1*4+i] = s01 - s23;
tmp[2*4+i] = d01 - d23;
tmp[3*4+i] = d01 + d23;
}
/* Pass 2: the same butterfly over row i of tmp, written back to row i of d
 * with a rounding halving, ( x + 1 ) >> 1. */
for( i = 0; i < 4; i++ )
{
s01 = tmp[i][0] + tmp[i][1];
d01 = tmp[i][0] - tmp[i][1];
s23 = tmp[i][2] + tmp[i][3];
d23 = tmp[i][2] - tmp[i][3];
d[i][0] = ( s01 + s23 + 1 ) >> 1;
d[i][1] = ( s01 - s23 + 1 ) >> 1;
d[i][2] = ( d01 - d23 + 1 ) >> 1;
d[i][3] = ( d01 + d23 + 1 ) >> 1;
s01 = tmp[i*4+0] + tmp[i*4+1];
d01 = tmp[i*4+0] - tmp[i*4+1];
s23 = tmp[i*4+2] + tmp[i*4+3];
d23 = tmp[i*4+2] - tmp[i*4+3];
d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
}
}
/* In-place inverse 4x4 DC transform of d.  The two passes use the same
 * butterflies as dct4x4dc, but the second pass stores the results without
 * the rounding shift.
 *
 * NOTE(review): this listing is diff output without +/- markers.  Lines with
 * 2-D indexing are the removed version; the flat-indexed group that follows
 * each of them is the replacement. */
static void idct4x4dc( int16_t d[4][4] )
static void idct4x4dc( int16_t d[16] )
{
int16_t tmp[4][4];
int16_t tmp[16];
int s01, s23;
int d01, d23;
int i;
/* Pass 1: butterfly over row i of d, stored into column i of tmp. */
for( i = 0; i < 4; i++ )
{
s01 = d[i][0] + d[i][1];
d01 = d[i][0] - d[i][1];
s23 = d[i][2] + d[i][3];
d23 = d[i][2] - d[i][3];
tmp[0][i] = s01 + s23;
tmp[1][i] = s01 - s23;
tmp[2][i] = d01 - d23;
tmp[3][i] = d01 + d23;
s01 = d[i*4+0] + d[i*4+1];
d01 = d[i*4+0] - d[i*4+1];
s23 = d[i*4+2] + d[i*4+3];
d23 = d[i*4+2] - d[i*4+3];
tmp[0*4+i] = s01 + s23;
tmp[1*4+i] = s01 - s23;
tmp[2*4+i] = d01 - d23;
tmp[3*4+i] = d01 + d23;
}
/* Pass 2: butterfly over row i of tmp, written back to row i of d. */
for( i = 0; i < 4; i++ )
{
s01 = tmp[i][0] + tmp[i][1];
d01 = tmp[i][0] - tmp[i][1];
s23 = tmp[i][2] + tmp[i][3];
d23 = tmp[i][2] - tmp[i][3];
d[i][0] = s01 + s23;
d[i][1] = s01 - s23;
d[i][2] = d01 - d23;
d[i][3] = d01 + d23;
s01 = tmp[i*4+0] + tmp[i*4+1];
d01 = tmp[i*4+0] - tmp[i*4+1];
s23 = tmp[i*4+2] + tmp[i*4+3];
d23 = tmp[i*4+2] - tmp[i*4+3];
d[i*4+0] = s01 + s23;
d[i*4+1] = s01 - s23;
d[i*4+2] = d01 - d23;
d[i*4+3] = d01 + d23;
}
}
......@@ -122,42 +118,42 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
}
}
/* Fills dct with a two-pass 4x4 transform of the 4x4 block that
 * pixel_sub_wxh() writes into d from pix1 (stride FENC_STRIDE) and pix2
 * (stride FDEC_STRIDE) -- presumably the per-pixel difference; the helper's
 * body is not part of this listing.
 *
 * NOTE(review): this listing is diff output without +/- markers.  Lines with
 * 2-D indexing (and the (int16_t*) cast on d) are the removed version; the
 * flat-indexed lines that follow them are the replacement. */
static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
{
int16_t d[4][4];
int16_t tmp[4][4];
int16_t d[16];
int16_t tmp[16];
int i;
pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
/* Pass 1: butterfly over row i of d (outer pair 0/3, inner pair 1/2),
 * stored transposed into column i of tmp. */
for( i = 0; i < 4; i++ )
{
const int s03 = d[i][0] + d[i][3];
const int s12 = d[i][1] + d[i][2];
const int d03 = d[i][0] - d[i][3];
const int d12 = d[i][1] - d[i][2];
tmp[0][i] = s03 + s12;
tmp[1][i] = 2*d03 + d12;
tmp[2][i] = s03 - s12;
tmp[3][i] = d03 - 2*d12;
const int s03 = d[i*4+0] + d[i*4+3];
const int s12 = d[i*4+1] + d[i*4+2];
const int d03 = d[i*4+0] - d[i*4+3];
const int d12 = d[i*4+1] - d[i*4+2];
tmp[0*4+i] = s03 + s12;
tmp[1*4+i] = 2*d03 + d12;
tmp[2*4+i] = s03 - s12;
tmp[3*4+i] = d03 - 2*d12;
}
/* Pass 2: the same butterfly over row i of tmp, stored into row i of dct. */
for( i = 0; i < 4; i++ )
{
const int s03 = tmp[i][0] + tmp[i][3];
const int s12 = tmp[i][1] + tmp[i][2];
const int d03 = tmp[i][0] - tmp[i][3];
const int d12 = tmp[i][1] - tmp[i][2];
dct[i][0] = s03 + s12;
dct[i][1] = 2*d03 + d12;
dct[i][2] = s03 - s12;
dct[i][3] = d03 - 2*d12;
const int s03 = tmp[i*4+0] + tmp[i*4+3];
const int s12 = tmp[i*4+1] + tmp[i*4+2];
const int d03 = tmp[i*4+0] - tmp[i*4+3];
const int d12 = tmp[i*4+1] - tmp[i*4+2];
dct[i*4+0] = s03 + s12;
dct[i*4+1] = 2*d03 + d12;
dct[i*4+2] = s03 - s12;
dct[i*4+3] = d03 - 2*d12;
}
}
static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
{
sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
......@@ -165,7 +161,7 @@ static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
{
sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
......@@ -175,72 +171,68 @@ static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
/* Returns the sum of all 16 values that pixel_sub_wxh() writes into d for
 * the 4x4 block at pix1 (stride FENC_STRIDE) and pix2 (stride FDEC_STRIDE).
 *
 * NOTE(review): this listing is diff output without +/- markers.  The 2-D
 * declaration, the cast call and the four d[r][c] sum lines are the removed
 * version; the flat declaration, the uncast call and the two d[0..15] sum
 * lines are the replacement. */
static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
{
int16_t d[4][4];
int16_t d[16];
int sum = 0;
pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
return sum;
}
/* Stores one sub4x4_dct_dc() result per 4x4 quadrant of an 8x8 block into
 * dct, in the order top-left, top-right, bottom-left, bottom-right.
 *
 * NOTE(review): this listing is diff output without +/- markers.  The
 * dct[r][c] lines are the removed version; the dct[0..3] lines are the
 * replacement and keep the same quadrant order. */
static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
{
dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
/* Runs a two-pass inverse 4x4 transform on dct and adds the result to the
 * 4x4 pixel block at p_dst (row stride FDEC_STRIDE), clipping each sum with
 * x264_clip_uint8().
 *
 * NOTE(review): this listing is diff output without +/- markers.  Lines with
 * 2-D indexing are the removed version; the flat-indexed group that follows
 * each of them is the replacement.  In the final loop the braces around the
 * d[y][x] statement also belong to the removed version. */
static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
{
int16_t d[4][4];
int16_t tmp[4][4];
int16_t d[16];
int16_t tmp[16];
int x, y;
int i;
/* Pass 1: butterfly down column i of dct (odd entries also used halved via
 * >>1), stored into row i of tmp. */
for( i = 0; i < 4; i++ )
{
const int s02 = dct[0][i] + dct[2][i];
const int d02 = dct[0][i] - dct[2][i];
const int s13 = dct[1][i] + (dct[3][i]>>1);
const int d13 = (dct[1][i]>>1) - dct[3][i];
tmp[i][0] = s02 + s13;
tmp[i][1] = d02 + d13;
tmp[i][2] = d02 - d13;
tmp[i][3] = s02 - s13;
const int s02 = dct[0*4+i] + dct[2*4+i];
const int d02 = dct[0*4+i] - dct[2*4+i];
const int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
const int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
tmp[i*4+0] = s02 + s13;
tmp[i*4+1] = d02 + d13;
tmp[i*4+2] = d02 - d13;
tmp[i*4+3] = s02 - s13;
}
/* Pass 2: the same butterfly down column i of tmp, stored into column i of
 * d after rounding with ( x + 32 ) >> 6. */
for( i = 0; i < 4; i++ )
{
const int s02 = tmp[0][i] + tmp[2][i];
const int d02 = tmp[0][i] - tmp[2][i];
const int s13 = tmp[1][i] + (tmp[3][i]>>1);
const int d13 = (tmp[1][i]>>1) - tmp[3][i];
d[0][i] = ( s02 + s13 + 32 ) >> 6;
d[1][i] = ( d02 + d13 + 32 ) >> 6;
d[2][i] = ( d02 - d13 + 32 ) >> 6;
d[3][i] = ( s02 - s13 + 32 ) >> 6;
const int s02 = tmp[0*4+i] + tmp[2*4+i];
const int d02 = tmp[0*4+i] - tmp[2*4+i];
const int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
const int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
}
/* Add row y of d to the destination row, then step p_dst to the next row. */
for( y = 0; y < 4; y++ )
{
for( x = 0; x < 4; x++ )
{
p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
}
p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
p_dst += FDEC_STRIDE;
}
}
static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
{
add4x4_idct( &p_dst[0], dct[0] );
add4x4_idct( &p_dst[4], dct[1] );
......@@ -248,7 +240,7 @@ static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
{
add8x8_idct( &p_dst[0], &dct[0] );
add8x8_idct( &p_dst[8], &dct[4] );
......@@ -287,29 +279,29 @@ static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
DST(7) = (a4>>2) - a7 ;\
}
/* Fills dct with the result of applying the DCT8_1D macro twice to the 8x8
 * block that pixel_sub_wxh() writes into tmp from pix1 (stride FENC_STRIDE)
 * and pix2 (stride FDEC_STRIDE).  DCT8_1D reads through SRC(x) and writes
 * through DST(x); most of its body lies outside this listing.
 *
 * NOTE(review): this listing is diff output without +/- markers.  The
 * tmp[8][8] declaration, the cast call and the 2-D SRC/DST definitions are
 * the removed version; the flat ones that follow are the replacement. */
static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
{
int i;
int16_t tmp[8][8];
int16_t tmp[64];
pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
/* Pass 1: in place on tmp, down column i (element x of the pass is row x). */
#define SRC(x) tmp[x][i]
#define DST(x) tmp[x][i]
#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
#undef DST
/* Pass 2: read along row i of tmp, write down column i of dct. */
#define SRC(x) tmp[i][x]
#define DST(x) dct[x][i]
#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
#undef DST
}
static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
{
sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
......@@ -344,20 +336,20 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
DST(7, b0 - b7);\
}
static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
{
int i;
dct[0][0] += 32; // rounding for the >>6 at the end
dct[0] += 32; // rounding for the >>6 at the end
#define SRC(x) dct[x][i]
#define DST(x,rhs) dct[x][i] = (rhs)
#define SRC(x) dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
#define SRC(x) dct[i][x]
#define SRC(x) dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
for( i = 0; i < 8; i++ )
IDCT8_1D
......@@ -365,7 +357,7 @@ static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
#undef DST
}
static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
{
add8x8_idct8( &dst[0], dct[0] );
add8x8_idct8( &dst[8], dct[1] );
......@@ -386,23 +378,23 @@ static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
}
}
/* Calls add4x4_idct_dc() once per 4x4 quadrant of the 8x8 block at p_dst,
 * passing the four dct values in the order top-left, top-right, bottom-left,
 * bottom-right.
 *
 * NOTE(review): this listing is diff output without +/- markers.  The
 * dct[r][c] lines are the removed version; the dct[0..3] lines are the
 * replacement. */
static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
{
add4x4_idct_dc( &p_dst[0], dct[0][0] );
add4x4_idct_dc( &p_dst[4], dct[0][1] );
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
add4x4_idct_dc( &p_dst[0], dct[0] );
add4x4_idct_dc( &p_dst[4], dct[1] );
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
/* Calls add4x4_idct_dc() for each of the sixteen 4x4 blocks of the 16x16
 * area at p_dst: four iterations, each covering x offsets 0, 4, 8 and 12 and
 * then moving p_dst down by 4*FDEC_STRIDE.
 *
 * NOTE(review): this listing is diff output without +/- markers.  The first
 * for-header and the dct[i][c] calls are the removed version.  The
 * replacement advances the dct pointer by 4 per iteration and indexes
 * dct[0..3] instead. */
static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
{
int i;
for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
for( i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
{
add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
add4x4_idct_dc( &p_dst[12], dct[i][3] );
add4x4_idct_dc( &p_dst[ 0], dct[0] );
add4x4_idct_dc( &p_dst[ 4], dct[1] );
add4x4_idct_dc( &p_dst[ 8], dct[2] );
add4x4_idct_dc( &p_dst[12], dct[3] );
}
}
......@@ -545,8 +537,7 @@ void x264_dct_init_weights( void )
}
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
......@@ -595,32 +586,32 @@ void x264_dct_init_weights( void )
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
/* Copies dct into level[64] in the order listed by the ZIGZAG8_FRAME macro,
 * which expands to a sequence of ZIG(i,y,x) assignments to level[i].
 *
 * NOTE(review): this listing is diff output without +/- markers.  The first
 * signature (dct[8][8]) is the removed version; the second (dct[64]) is the
 * replacement. */
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
{
ZIGZAG8_FRAME
}
/* Field-order counterpart of zigzag_scan_8x8_frame: expands ZIGZAG8_FIELD.
 * That macro's definition is not visible in this listing; presumably it is a
 * ZIG(i,y,x) table like ZIGZAG8_FRAME -- TODO confirm.
 *
 * NOTE(review): this listing is diff output without +/- markers.  The first
 * signature (dct[8][8]) is the removed version; the second (dct[64]) is the
 * replacement. */
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
{
ZIGZAG8_FIELD
}
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
/* Copies dct into level[16] by expanding ZIGZAG4_FRAME, with ZIG(i,y,x)
 * defined just above as an assignment to level[i] from dct at offset x*4+y.
 * The ZIGZAG4_FRAME table itself is not visible in this listing.
 *
 * NOTE(review): this listing is diff output without +/- markers.  The first
 * signature (dct[4][4]) is the removed version; the second (dct[16]) is the
 * replacement. */
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
{
ZIGZAG4_FRAME
}
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
{
*(uint32_t*)level = *(uint32_t*)dct;
ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
*(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);