Commit 46a48729 authored by Loren Merritt

slightly faster 8x8 dct


git-svn-id: svn://svn.videolan.org/x264/trunk@249 df754926-b1dd-0310-bc7b-ec298dee348c
parent 398a6bf0
......@@ -260,60 +260,56 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
* 8x8 transform:
****************************************************************************/
/* One 1-D pass of the 8x8 forward transform, written out of place.
 * For each i it reads the eight values src[i][0..7] and stores the eight
 * results into dst[0..7][i], so the output is transposed relative to the input.
 * NOTE(review): this listing looks like a diff shown without +/- markers; this
 * function appears to be the version that the DCT8_1D macro replaces - confirm. */
static inline void dct8_1d( int16_t src[8][8], int16_t dst[8][8] )
{
int i;
for( i = 0; i < 8; i++ )
{
/* sums of the mirrored input pairs 0/7, 1/6, 2/5, 3/4 */
const int s07 = src[i][0] + src[i][7];
const int s16 = src[i][1] + src[i][6];
const int s25 = src[i][2] + src[i][5];
const int s34 = src[i][3] + src[i][4];
const int a0 = s07 + s34;
const int a1 = s16 + s25;
const int a2 = s07 - s34;
const int a3 = s16 - s25;
/* differences of the same mirrored pairs */
const int d07 = src[i][0] - src[i][7];
const int d16 = src[i][1] - src[i][6];
const int d25 = src[i][2] - src[i][5];
const int d34 = src[i][3] - src[i][4];
/* x + (x>>1) scales a term by roughly 1.5 using only an add and a shift */
const int a4 = d16 + d25 + (d07 + (d07>>1));
const int a5 = d07 - d34 - (d25 + (d25>>1));
const int a6 = d07 + d34 - (d16 + (d16>>1));
const int a7 = d16 - d25 + (d34 + (d34>>1));
/* even-numbered outputs come from the sums (a0..a3),
 * odd-numbered outputs from the differences (a4..a7) */
dst[0][i] = a0 + a1;
dst[1][i] = a4 + (a7>>2);
dst[2][i] = a2 + (a3>>1);
dst[3][i] = a5 + (a6>>2);
dst[4][i] = a0 - a1;
dst[5][i] = a6 - (a5>>2);
dst[6][i] = (a2>>1) - a3;
dst[7][i] = (a4>>2) - a7;
}
/* DCT8_1D: one in-place 1-D pass of the 8x8 forward transform.
 * The expansion site must define SRC(x) as an lvalue naming element x (0..7)
 * of the row or column being processed. All eight SRC() inputs are read into
 * locals before any SRC() is assigned, so reading and writing the same
 * storage is safe.
 * The expansion is a complete braced block, so it is used as a statement
 * (e.g. a for-loop body) without a trailing semicolon. */
#define DCT8_1D {\
const int s07 = SRC(0) + SRC(7);\
const int s16 = SRC(1) + SRC(6);\
const int s25 = SRC(2) + SRC(5);\
const int s34 = SRC(3) + SRC(4);\
const int a0 = s07 + s34;\
const int a1 = s16 + s25;\
const int a2 = s07 - s34;\
const int a3 = s16 - s25;\
const int d07 = SRC(0) - SRC(7);\
const int d16 = SRC(1) - SRC(6);\
const int d25 = SRC(2) - SRC(5);\
const int d34 = SRC(3) - SRC(4);\
const int a4 = d16 + d25 + (d07 + (d07>>1));\
const int a5 = d07 - d34 - (d25 + (d25>>1));\
const int a6 = d07 + d34 - (d16 + (d16>>1));\
const int a7 = d16 - d25 + (d34 + (d34>>1));\
SRC(0) = a0 + a1 ;\
SRC(1) = a4 + (a7>>2);\
SRC(2) = a2 + (a3>>1);\
SRC(3) = a5 + (a6>>2);\
SRC(4) = a0 - a1 ;\
SRC(5) = a6 - (a5>>2);\
SRC(6) = (a2>>1) - a3 ;\
SRC(7) = (a4>>2) - a7 ;\
}
/* Computes the 8x8 transform of the pixel difference pix1 - pix2 into dct.
 * i_pix1 / i_pix2 are the strides added to pix1 / pix2 after each row of
 * eight pixels.
 * NOTE(review): this listing appears to interleave the pre- and post-change
 * lines of a diff without +/- markers (y/x declared twice, both d[][] and
 * dct[][] filled, dct8_1d() calls next to DCT8_1D expansions) - confirm
 * which lines are current before relying on it. */
static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
int16_t d[8][8];
int16_t tmp[8][8];
int y, x;
int y, x, i;
/* residual: per-pixel difference of the two 8x8 sources */
for( y = 0; y < 8; y++ )
{
for( x = 0; x < 8; x++ )
{
d[y][x] = pix1[x] - pix2[x];
dct[y][x] = pix1[x] - pix2[x];
}
pix1 += i_pix1;
pix2 += i_pix2;
}
/* two out-of-place passes through a temporary; each pass stores its
 * result transposed */
dct8_1d( d, tmp );
dct8_1d( tmp, dct );
/* in-place pass over each row dct[i][0..7] */
#define SRC(x) dct[i][x]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
/* in-place pass over each column dct[0..7][i] */
#define SRC(x) dct[x][i]
for( i = 0; i < 8; i++ )
DCT8_1D
#undef SRC
}
static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
......@@ -324,67 +320,60 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint
sub8x8_dct8( dct[3], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
}
/* One 1-D pass of the 8x8 inverse transform, written out of place.
 * For each i it reads the eight values src[i][0..7] and stores the eight
 * results into dst[0..7][i], so the output is transposed relative to the input.
 * NOTE(review): this listing looks like a diff shown without +/- markers; this
 * function appears to be the version that the IDCT8_1D macro replaces - confirm. */
static inline void idct8_1d( int16_t src[8][8], int16_t dst[8][8] )
{
int i;
for( i = 0; i < 8; i++ )
{
/* even part: built from inputs 0, 2, 4 and 6 only */
const int a0 = src[i][0] + src[i][4];
const int a2 = src[i][0] - src[i][4];
const int a4 = (src[i][2]>>1) - src[i][6];
const int a6 = (src[i][6]>>1) + src[i][2];
const int b0 = a0 + a6;
const int b2 = a2 + a4;
const int b4 = a2 - a4;
const int b6 = a0 - a6;
/* odd part: built from inputs 1, 3, 5 and 7 only */
const int a1 = -src[i][3] + src[i][5] - src[i][7] - (src[i][7]>>1);
const int a3 = src[i][1] + src[i][7] - src[i][3] - (src[i][3]>>1);
const int a5 = -src[i][1] + src[i][7] + src[i][5] + (src[i][5]>>1);
const int a7 = src[i][3] + src[i][5] + src[i][1] + (src[i][1]>>1);
const int b1 = (a7>>2) + a1;
const int b3 = a3 + (a5>>2);
const int b5 = (a3>>2) - a5;
const int b7 = a7 - (a1>>2);
/* outputs k and 7-k are the sum and the difference of the same
 * even/odd pair */
dst[0][i] = b0 + b7;
dst[7][i] = b0 - b7;
dst[1][i] = b2 + b5;
dst[6][i] = b2 - b5;
dst[2][i] = b4 + b3;
dst[5][i] = b4 - b3;
dst[3][i] = b6 + b1;
dst[4][i] = b6 - b1;
}
/* IDCT8_1D: one 1-D pass of the 8x8 inverse transform.
 * The expansion site must define:
 *   SRC(x)      - expression yielding input element x (0..7);
 *   DST(x,rhs)  - statement that stores result rhs for output element x.
 * All SRC() inputs are read into locals before the first DST() is issued,
 * so DST may write to the same storage that SRC reads.
 * The expansion is a complete braced block, so it is used as a statement
 * (e.g. a for-loop body) without a trailing semicolon. */
#define IDCT8_1D {\
const int a0 = SRC(0) + SRC(4);\
const int a2 = SRC(0) - SRC(4);\
const int a4 = (SRC(2)>>1) - SRC(6);\
const int a6 = (SRC(6)>>1) + SRC(2);\
const int b0 = a0 + a6;\
const int b2 = a2 + a4;\
const int b4 = a2 - a4;\
const int b6 = a0 - a6;\
const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
const int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
const int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
const int b1 = (a7>>2) + a1;\
const int b3 = a3 + (a5>>2);\
const int b5 = (a3>>2) - a5;\
const int b7 = a7 - (a1>>2);\
DST(0, b0 + b7);\
DST(1, b2 + b5);\
DST(2, b4 + b3);\
DST(3, b6 + b1);\
DST(4, b6 - b1);\
DST(5, b4 - b3);\
DST(6, b2 - b5);\
DST(7, b0 - b7);\
}
/* Applies the 8x8 inverse transform to dct and adds the result, shifted
 * right by 6, onto the 8x8 pixel block at the destination pointer; each sum
 * is passed through clip_uint8() before being stored. i_dst is the
 * destination row stride.
 * The macro-based lines modify dct in place (rounding bias and first pass).
 * NOTE(review): this listing appears to interleave the pre- and post-change
 * lines of a diff without +/- markers (two signatures, p_dst vs dst,
 * idct8_1d() calls next to IDCT8_1D expansions) - confirm which lines are
 * current before relying on it. */
static void add8x8_idct8( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
{
int16_t d[8][8];
int16_t tmp[8][8];
int y, x;
int i;
/* two out-of-place passes through a temporary */
idct8_1d( dct, tmp );
idct8_1d( tmp, d );
dct[0][0] += 32; // rounding for the >>6 at the end
/* add the rounded, shifted result onto the destination, row by row */
for( y = 0; y < 8; y++ )
{
for( x = 0; x < 8; x++ )
{
p_dst[x] = clip_uint8( p_dst[x] + ((d[y][x] + 32) >> 6) );
}
p_dst += i_dst;
}
/* first pass: each row dct[i][0..7], results written back in place */
#define SRC(x) dct[i][x]
#define DST(x,rhs) dct[i][x] = (rhs)
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
/* second pass: each column dct[0..7][i]; output x goes to destination
 * row x, column i, added to the existing pixel after the >>6 */
#define SRC(x) dct[x][i]
#define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
for( i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
}
/* Runs add8x8_idct8() on the four 8x8 quadrants of a 16x16 destination:
 * dct[0] at offset 0, dct[1] at +8, dct[2] at +8*i_dst and dct[3] at
 * +8*i_dst+8, all with row stride i_dst.
 * NOTE(review): this listing appears to show both the pre- and post-change
 * lines of a diff without +/- markers (two signatures, p_dst vs dst calls) -
 * confirm which set is current. */
static void add16x16_idct8( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] )
static void add16x16_idct8( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
{
add8x8_idct8( &p_dst[0], i_dst, dct[0] );
add8x8_idct8( &p_dst[8], i_dst, dct[1] );
add8x8_idct8( &p_dst[8*i_dst], i_dst, dct[2] );
add8x8_idct8( &p_dst[8*i_dst+8], i_dst, dct[3] );
add8x8_idct8( &dst[0], i_dst, dct[0] );
add8x8_idct8( &dst[8], i_dst, dct[1] );
add8x8_idct8( &dst[8*i_dst], i_dst, dct[2] );
add8x8_idct8( &dst[8*i_dst+8], i_dst, dct[3] );
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment