Commit 17a04af4 authored by Oskar Arvidsson, committed by Fiona Glaser

Convert to a unified "pixel" type for pixel data

Necessary for future high bit-depth support.
Various macros and extra types have been introduced to make operations on variable-size pixels more convenient.
parent 7adf25b1
@@ -100,6 +100,14 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
#define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src)
typedef uint8_t pixel;
typedef uint32_t pixel4;
#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
#define MPIXEL_X4(src) M32(src)
#define CPPIXEL_X4(dst,src) CP32(dst,src)
#define CPPIXEL_X8(dst,src) CP64(dst,src)
#define X264_SCAN8_SIZE (6*8)
#define X264_SCAN8_LUMA_SIZE (5*8)
#define X264_SCAN8_0 (4+1*8)
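
The new macros are the core of the abstraction: callers now speak in groups of four or eight pixels instead of byte counts, and only common.h knows how wide a pixel is. At 8-bit depth they are thin aliases for the existing byte operations, so code generation is unchanged. A minimal sketch of a caller, plus the 16-bit definitions a high bit-depth build would presumably use (an assumption for illustration; this commit only defines the 8-bit forms):

/* Fill one 4-pixel row with a single value, independent of pixel width.
 * With pixel == uint8_t, PIXEL_SPLAT_X4(0x80) is 0x80*0x01010101U =
 * 0x80808080, and MPIXEL_X4 stores it as one 32-bit write. */
static ALWAYS_INLINE void fill_row4( pixel *dst, pixel v )
{
    MPIXEL_X4( dst ) = PIXEL_SPLAT_X4( v );
}

/* Presumed high bit-depth variants (not part of this commit):
 * typedef uint16_t pixel;
 * typedef uint64_t pixel4;
 * #define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
 * #define MPIXEL_X4(src) M64(src)
 * #define CPPIXEL_X4(dst,src) CP64(dst,src)
 * #define CPPIXEL_X8(dst,src) CP128(dst,src) */
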
@@ -172,7 +180,7 @@ void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
void x264_reduce_fraction( uint32_t *n, uint32_t *d );
void x264_init_vlc_tables();
static ALWAYS_INLINE uint8_t x264_clip_uint8( int x )
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
return x&(~255) ? (-x)>>31 : x;
}
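
The rename from x264_clip_uint8 to x264_clip_pixel gives callers a depth-neutral name, but the body still hard-codes the 8-bit range: x&(~255) is nonzero exactly when x lies outside [0,255], and (-x)>>31 (an arithmetic shift) is -1, i.e. all ones and hence 255 in the low byte, when x > 255, and 0 when x < 0. Worked example: x = 300 gives x&~255 = 256, so the function returns (-300)>>31 = -1, which truncates to 255. A depth-aware version would presumably clip against a maximum derived from the bit depth; a sketch under that assumption (PIXEL_MAX is hypothetical, not defined in this commit):

static ALWAYS_INLINE pixel x264_clip_pixel_hbd( int x )
{
    /* PIXEL_MAX would be (1 << BIT_DEPTH) - 1; both names are assumed. */
    return x < 0 ? 0 : x > PIXEL_MAX ? PIXEL_MAX : x;
}
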
@@ -580,7 +588,7 @@ struct x264_t
* NOTE: this will fail on resolutions above 2^16 MBs... */
/* buffer for weighted versions of the reference frames */
uint8_t *p_weight_buf[16];
pixel *p_weight_buf[16];
/* current value */
int i_type;
@@ -611,12 +619,12 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_16( uint8_t fenc_buf[24*FENC_STRIDE] );
ALIGNED_16( uint8_t fdec_buf[27*FDEC_STRIDE] );
ALIGNED_16( pixel fenc_buf[24*FENC_STRIDE] );
ALIGNED_16( pixel fdec_buf[27*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( uint8_t i4x4_fdec_buf[16*16] );
ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
@@ -633,17 +641,17 @@ struct x264_t
ALIGNED_16( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
uint8_t *p_fenc[3];
pixel *p_fenc[3];
/* pointer to the actual source frame, not a block copy */
uint8_t *p_fenc_plane[3];
pixel *p_fenc_plane[3];
/* pointer over mb of the frame to be reconstructed */
uint8_t *p_fdec[3];
pixel *p_fdec[3];
/* pointer over mb of the references */
int i_fref[2];
uint8_t *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
uint8_t *p_fref_w[32]; /* weighted fullpel luma */
pixel *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
pixel *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
/* fref stride */
@@ -778,7 +786,7 @@ struct x264_t
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
uint8_t *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
pixel *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*deblock_strength[2])[2][4][4];
/* CPU functions dependents */
@@ -98,7 +98,7 @@ static void idct4x4dc( int16_t d[16] )
}
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
for( int y = 0; y < i_size; y++ )
{
@@ -109,7 +109,7 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
}
}
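
The diff viewer elides the loop body of pixel_sub_wxh; for reference, it computes per-sample differences into a contiguous int16_t block, and only its pointer types change in this commit. The full function, reconstructed from the surrounding source (the visible hunks show only its first and last lines):

static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int y = 0; y < i_size; y++ )
    {
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x]; /* fits int16_t at any supported depth */
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
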
static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
static void sub4x4_dct( int16_t dct[16], pixel *pix1, pixel *pix2 )
{
int16_t d[16];
int16_t tmp[16];
@@ -143,7 +143,7 @@ static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
}
}
static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
static void sub8x8_dct( int16_t dct[4][16], pixel *pix1, pixel *pix2 )
{
sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
@@ -151,7 +151,7 @@ static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
static void sub16x16_dct( int16_t dct[16][16], pixel *pix1, pixel *pix2 )
{
sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
@@ -159,7 +159,7 @@ static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
int16_t d[16];
int sum = 0;
@@ -172,7 +172,7 @@ static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
return sum;
}
static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
static void sub8x8_dct_dc( int16_t dct[4], pixel *pix1, pixel *pix2 )
{
dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
@@ -190,7 +190,7 @@ static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
dct[3] = d2 - d3;
}
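
The elided middle of sub8x8_dct_dc is a 2x2 Hadamard transform over the four 4x4 DC sums; only its last line survives in the hunk above. Reconstructed for context (a sketch from the surrounding source; the assignment order is not visible in this diff):

int d0 = dct[0] + dct[1];
int d1 = dct[2] + dct[3];
int d2 = dct[0] - dct[1];
int d3 = dct[2] - dct[3];
dct[0] = d0 + d1;
dct[1] = d0 - d1;
dct[2] = d2 + d3;
dct[3] = d2 - d3;
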
static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
static void add4x4_idct( pixel *p_dst, int16_t dct[16] )
{
int16_t d[16];
int16_t tmp[16];
@@ -225,12 +225,12 @@ static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
for( int y = 0; y < 4; y++ )
{
for( int x = 0; x < 4; x++ )
p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
p_dst += FDEC_STRIDE;
}
}
static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
static void add8x8_idct( pixel *p_dst, int16_t dct[4][16] )
{
add4x4_idct( &p_dst[0], dct[0] );
add4x4_idct( &p_dst[4], dct[1] );
@@ -238,7 +238,7 @@ static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
static void add16x16_idct( pixel *p_dst, int16_t dct[16][16] )
{
add8x8_idct( &p_dst[0], &dct[0] );
add8x8_idct( &p_dst[8], &dct[4] );
@@ -277,7 +277,7 @@ static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
DST(7) = (a4>>2) - a7 ;\
}
static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
static void sub8x8_dct8( int16_t dct[64], pixel *pix1, pixel *pix2 )
{
int16_t tmp[64];
@@ -298,7 +298,7 @@ static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
#undef DST
}
static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
static void sub16x16_dct8( int16_t dct[4][64], pixel *pix1, pixel *pix2 )
{
sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
@@ -333,7 +333,7 @@ static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
DST(7, b0 - b7);\
}
static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
static void add8x8_idct8( pixel *dst, int16_t dct[64] )
{
dct[0] += 32; // rounding for the >>6 at the end
@@ -345,14 +345,14 @@ static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
#undef DST
#define SRC(x) dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
for( int i = 0; i < 8; i++ )
IDCT8_1D
#undef SRC
#undef DST
}
static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
static void add16x16_idct8( pixel *dst, int16_t dct[4][64] )
{
add8x8_idct8( &dst[0], dct[0] );
add8x8_idct8( &dst[8], dct[1] );
@@ -360,19 +360,19 @@ static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
static void inline add4x4_idct_dc( pixel *p_dst, int16_t dc )
{
dc = (dc + 32) >> 6;
for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
{
p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
}
}
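
The (dc + 32) >> 6 line is round-to-nearest for the final 6-bit descale: dc = 100 becomes (100 + 32) >> 6 = 2, where plain truncation would give 1. The four per-sample adds then saturate through the renamed x264_clip_pixel, so this function is depth-ready as soon as the clip range is widened.
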
static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
static void add8x8_idct_dc( pixel *p_dst, int16_t dct[4] )
{
add4x4_idct_dc( &p_dst[0], dct[0] );
add4x4_idct_dc( &p_dst[4], dct[1] );
@@ -380,7 +380,7 @@ static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
static void add16x16_idct_dc( pixel *p_dst, int16_t dct[16] )
{
for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
{
@@ -614,21 +614,21 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
nz |= level[i];\
}
#define COPY4x4\
CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define COPY8x8\
CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
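
COPY4x4 and COPY8x8 now copy four and eight pixels per row rather than four and eight bytes. With the 8-bit definitions in common.h, CPPIXEL_X4 and CPPIXEL_X8 expand to exactly the CP32 and CP64 calls they replace, so these hunks are behavior-neutral; the point is that the width decision now lives in one place.
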
static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
static int zigzag_sub_4x4_frame( int16_t level[16], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG4_FRAME
@@ -636,7 +636,7 @@ static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_
return !!nz;
}
static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
static int zigzag_sub_4x4_field( int16_t level[16], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG4_FIELD
@@ -652,7 +652,7 @@ static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_
level[0] = 0;\
}
static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
static int zigzag_sub_4x4ac_frame( int16_t level[16], const pixel *p_src, pixel *p_dst, int16_t *dc )
{
int nz = 0;
ZIGZAG4_FRAME
@@ -660,7 +660,7 @@ static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint
return !!nz;
}
static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
static int zigzag_sub_4x4ac_field( int16_t level[16], const pixel *p_src, pixel *p_dst, int16_t *dc )
{
int nz = 0;
ZIGZAG4_FIELD
@@ -668,14 +668,14 @@ static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint
return !!nz;
}
static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
static int zigzag_sub_8x8_frame( int16_t level[64], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG8_FRAME
COPY8x8
return !!nz;
}
static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
static int zigzag_sub_8x8_field( int16_t level[64], const pixel *p_src, pixel *p_dst )
{
int nz = 0;
ZIGZAG8_FIELD
@@ -91,23 +91,23 @@ typedef struct
// pix1 stride = FENC_STRIDE
// pix2 stride = FDEC_STRIDE
// p_dst stride = FDEC_STRIDE
void (*sub4x4_dct) ( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
void (*add4x4_idct) ( uint8_t *p_dst, int16_t dct[16] );
void (*sub4x4_dct) ( int16_t dct[16], pixel *pix1, pixel *pix2 );
void (*add4x4_idct) ( pixel *p_dst, int16_t dct[16] );
void (*sub8x8_dct) ( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
void (*sub8x8_dct_dc)( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct) ( uint8_t *p_dst, int16_t dct[4][16] );
void (*add8x8_idct_dc) ( uint8_t *p_dst, int16_t dct[4] );
void (*sub8x8_dct) ( int16_t dct[4][16], pixel *pix1, pixel *pix2 );
void (*sub8x8_dct_dc)( int16_t dct[4], pixel *pix1, pixel *pix2 );
void (*add8x8_idct) ( pixel *p_dst, int16_t dct[4][16] );
void (*add8x8_idct_dc) ( pixel *p_dst, int16_t dct[4] );
void (*sub16x16_dct) ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][16] );
void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[16] );
void (*sub16x16_dct) ( int16_t dct[16][16], pixel *pix1, pixel *pix2 );
void (*add16x16_idct)( pixel *p_dst, int16_t dct[16][16] );
void (*add16x16_idct_dc) ( pixel *p_dst, int16_t dct[16] );
void (*sub8x8_dct8) ( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[64] );
void (*sub8x8_dct8) ( int16_t dct[64], pixel *pix1, pixel *pix2 );
void (*add8x8_idct8) ( pixel *p_dst, int16_t dct[64] );
void (*sub16x16_dct8) ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][64] );
void (*sub16x16_dct8) ( int16_t dct[4][64], pixel *pix1, pixel *pix2 );
void (*add16x16_idct8)( pixel *p_dst, int16_t dct[4][64] );
void (*dct4x4dc) ( int16_t d[16] );
void (*idct4x4dc)( int16_t d[16] );
@@ -118,9 +118,9 @@ typedef struct
{
void (*scan_8x8)( int16_t level[64], int16_t dct[64] );
void (*scan_4x4)( int16_t level[16], int16_t dct[16] );
int (*sub_8x8) ( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
int (*sub_4x4) ( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
int (*sub_4x4ac)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc );
int (*sub_8x8) ( int16_t level[64], const pixel *p_src, pixel *p_dst );
int (*sub_4x4) ( int16_t level[16], const pixel *p_src, pixel *p_dst );
int (*sub_4x4ac)( int16_t level[16], const pixel *p_src, pixel *p_dst, int16_t *dc );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t;
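
Because every prototype in these dispatch tables is spelled in terms of pixel, the table types need no per-depth variants: the depth is fixed once, where pixel is typedef'd. A simplified sketch of how the C fallbacks above land in the table (the real x264_dct_init body is not shown in this diff; this is an illustrative reduction of it):

void x264_dct_init_sketch( x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct  = sub4x4_dct;   /* C reference versions, now pixel-based */
    dctf->add4x4_idct = add4x4_idct;
    /* ...assembly implementations overwrite these entries per CPU flags... */
}
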
@@ -68,7 +68,7 @@ static const int8_t i_tc0_table[52+12*2][4] =
#define tc0_table(x) i_tc0_table[(x)+12]
/* From ffmpeg */
static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
@@ -104,23 +104,23 @@ static inline void deblock_luma_c( uint8_t *pix, int xstride, int ystride, int a
}
delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
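
A worked pass through the loop's math, with illustrative numbers: p1 = 60, p0 = 70, q0 = 90, q1 = 100, tc = 4. The raw step is (((90 - 70) << 2) + (60 - 100) + 4) >> 3 = (80 - 40 + 4) >> 3 = 5, clipped to tc = 4; the filter then writes p0' = clip(70 + 4) = 74 and q0' = clip(90 - 4) = 86, pulling the two sides of the edge toward each other by at most tc. Only the final clamp changes in this commit: x264_clip_pixel in place of x264_clip_uint8.
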
static void deblock_v_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_luma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
@@ -140,23 +140,23 @@ static inline void deblock_chroma_c( uint8_t *pix, int xstride, int ystride, int
if( abs( p0 - q0 ) < alpha && abs( p1 - p0 ) < beta && abs( q1 - q0 ) < beta )
{
int delta = x264_clip3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
pix[-1*xstride] = x264_clip_uint8( p0 + delta ); /* p0' */
pix[ 0*xstride] = x264_clip_uint8( q0 - delta ); /* q0' */
pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
pix += ystride;
}
}
}
static void deblock_v_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
}
static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
{
for( int d = 0; d < 16; d++ )
{
@@ -199,16 +199,16 @@ static inline void deblock_luma_intra_c( uint8_t *pix, int xstride, int ystride,
pix += ystride;
}
}
static void deblock_v_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_luma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystride, int alpha, int beta )
static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
{
for( int d = 0; d < 8; d++ )
{
@@ -225,11 +225,11 @@ static inline void deblock_chroma_intra_c( uint8_t *pix, int xstride, int ystrid
pix += ystride;
}
}
static void deblock_v_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_chroma_intra_c( uint8_t *pix, int stride, int alpha, int beta )
static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
}
@@ -263,7 +263,7 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
}
}
static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp + h->sh.i_alpha_c0_offset;
int alpha = alpha_table(index_a);
@@ -283,7 +283,7 @@ static inline void deblock_edge( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_
pf_inter( pix2, i_stride, alpha, beta, tc );
}
static inline void deblock_edge_intra( x264_t *h, uint8_t *pix1, uint8_t *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
{
int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
int beta = beta_table(i_qp + h->sh.i_beta_offset);
@@ -315,9 +315,9 @@ void x264_frame_deblock_row( x264_t *h, int mb_y )
int intra_cur = IS_INTRA( h->mb.type[mb_xy] );
uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&b_interlaced][mb_x];
uint8_t *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
uint8_t *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
uint8_t *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
pixel *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
pixel *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
if( mb_y & b_interlaced )
{
pixy -= 15*stridey;
@@ -56,7 +56,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
for( int i = 1; i < 3; i++ )
{
CHECKED_MALLOC( frame->buffer[i], chroma_plane_size );
CHECKED_MALLOC( frame->buffer[i], chroma_plane_size * sizeof(pixel) );
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
@@ -87,14 +87,14 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
* requires them to be in-phase wrt cacheline alignment. */
if( h->param.analyse.i_subpel_refine && b_fdec )
{
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size );
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size * sizeof(pixel) );
for( int i = 0; i < 4; i++ )
frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
frame->plane[0] = frame->filtered[0];
}
else
{
CHECKED_MALLOC( frame->buffer[0], luma_plane_size );
CHECKED_MALLOC( frame->buffer[0], luma_plane_size * sizeof(pixel) );
frame->filtered[0] = frame->plane[0] = frame->buffer[0] + frame->i_stride[0] * i_padv + PADH;
}
@@ -136,7 +136,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
for( int i = 0; i < 4; i++ )
frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * PADV + PADH) + i * luma_plane_size;
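
The allocation changes in this file all follow one rule: strides and line counts stay in pixel units, and bytes are formed only at the CHECKED_MALLOC boundary by multiplying by sizeof(pixel). At 8-bit depth sizeof(pixel) is 1, so every hunk here is a no-op today. The pattern, as a sketch using this file's own variables:

/* Pixel-unit geometry, converted to bytes only at the allocation site. */
size_t plane_bytes = (size_t)frame->i_stride[0]
                   * (frame->i_lines[0] + 2*i_padv) * sizeof(pixel);
CHECKED_MALLOC( frame->buffer[0], plane_bytes );
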
@@ -253,26 +253,30 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
return 0;
}
static void ALWAYS_INLINE pixel_memset( pixel *dst, int value, int size )
{
for( int i = 0; i < size; i++ )
dst[i] = value;
}
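
pixel_memset exists because memset writes bytes: once pixel is wider than one byte, a sample value such as 10-bit mid-gray (512) spans two distinct bytes and cannot be splatted by memset, which would reduce it to 512 & 0xFF = 0. A self-contained contrast (the 10-bit case is illustrative; this commit still builds only at 8-bit):

#include <string.h>
#include <stdint.h>

static void fill_contrast( void )
{
    uint8_t dst8[16];
    memset( dst8, 0x80, sizeof(dst8) );  /* fine: one byte per sample */

    uint16_t dst10[16];                  /* 10-bit samples in 16-bit storage */
    /* memset( dst10, 512, sizeof(dst10) ) would store the byte 0x00 in
     * every position; a per-sample loop keeps the full value, which is
     * exactly what pixel_memset does: */
    for( int i = 0; i < 16; i++ )
        dst10[i] = 512;
}
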
static void plane_expand_border( uint8_t *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
for( int y = 0; y < i_height; y++ )
{
/* left band */
memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
/* right band */
memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
}
/* upper band */
if( b_pad_top )
for( int y = 0; y < i_padv; y++ )
memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), i_width+2*i_padh );
memcpy( PPIXEL(-i_padh, -y-1), PPIXEL(-i_padh, 0), (i_width+2*i_padh) * sizeof(pixel) );
/* lower band */
if( b_pad_bottom )
for( int y = 0; y < i_padv; y++ )
memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), i_width+2*i_padh );
memcpy( PPIXEL(-i_padh, i_height+y), PPIXEL(-i_padh, i_height-1), (i_width+2*i_padh) * sizeof(pixel) );
#undef PPIXEL
}
@@ -289,7 +293,7 @@ void x264_frame_expand_border( x264_t *h, x264_frame_t *frame, int mb_y, int b_e
int padh = PADH >> !!i;
int padv = PADV >> !!i;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
uint8_t *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
if( b_end && !b_start )
height += 4 >> (!!i + h->sh.b_mbaff);
if( h->sh.b_mbaff )
@@ -318,7 +322,7 @@ void x264_frame_expand_border_filtered( x264_t *h, x264_frame_t *frame, int mb_y
for( int i = 1; i < 4; i++ )
{
// buffer: 8 luma, to match the hpel filter
uint8_t *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
if( h->sh.b_mbaff )
{
plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
@@ -348,16 +352,17 @@ void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
if( i_padx )
{
for( int y = 0; y < i_height; y++ )
memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
frame->plane[i][y*frame->i_stride[i] + i_width - 1],
i_padx );
{
pixel value = frame->plane[i][y*frame->i_stride[i] + i_width - 1];
pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], value, i_padx );
}
}
if( i_pady )
{
for( int y = i_height; y < i_height + i_pady; y++ )
memcpy( &frame->plane[i][y*frame->i_stride[i]],
&frame->plane[i][(i_height-(~y&h->param.b_interlaced)-1)*frame->i_stride[i]],
i_width + i_padx );
(i_width + i_padx) * sizeof(pixel) );
}
}
}
@@ -489,7 +494,7 @@ void x264_frame_sort( x264_frame_t **list, int b_dts )
} while( !b_ok );
}
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
int i_width, int i_height, x264_weight_t *w )
{
/* Weight horizontal strips of height 16. This was found to be the optimal height
@@ -64,18 +64,18 @@ typedef struct x264_frame
int i_stride_lowres;
int i_width_lowres;
int i_lines_lowres;
uint8_t *plane[3];
uint8_t *filtered[4]; /* plane[0], H, V, HV */
uint8_t *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
pixel *plane[3];
pixel *filtered[4]; /* plane[0], H, V, HV */
pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
uint16_t *integral;
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
uint8_t *buffer[4];
uint8_t *buffer_lowres[4];
pixel *buffer[4];
pixel *buffer_lowres[4];
x264_weight_t weight[16][3]; /* [ref_index][plane] */
uint8_t *weighted[16]; /* plane[0] weighted of the reference frames */
pixel *weighted[16]; /* plane[0] weighted of the reference frames */
int b_duplicate;
struct x264_frame *orig;
@@ -156,8 +156,8 @@ typedef struct
x264_pthread_cond_t cv_empty; /* event signaling that the list became emptier */
} x264_synch_frame_list_t;
typedef void (*x264_deblock_inter_t)( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
typedef void (*x264_deblock_intra_t)( uint8_t *pix, int stride, int alpha, int beta );
typedef void (*x264_deblock_inter_t)( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
typedef void (*x264_deblock_intra_t)( pixel *pix, int stride, int alpha, int beta );
typedef struct
{
x264_deblock_inter_t deblock_luma[2];
@@ -196,7 +196,7 @@ x264_frame_t *x264_frame_shift( x264_frame_t **list );
void x264_frame_push_unused( x264_t *h, x264_frame_t *frame );
void x264_frame_push_blank_unused( x264_t *h, x264_frame_t *frame );
x264_frame_t *x264_frame_pop_blank_unused( x264_t *h );
void x264_weight_scale_plane( x264_t *h, uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride,
void x264_weight_scale_plane( x264_t *h, pixel *dst, int i_dst_stride, pixel *src, int i_src_stride,
int i_width, int i_height, x264_weight_t *w );
x264_frame_t *x264_frame_pop_unused( x264_t *h, int b_fdec );
void x264_frame_sort( x264_frame_t **list, int b_dts );
@@ -94,9 +94,9 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
ALIGNED_ARRAY_16( uint8_t, tmp0,[16*16] );
ALIGNED_ARRAY_16( uint8_t, tmp1,[16*16] );
uint8_t *src0, *src1;
ALIGNED_ARRAY_16( pixel, tmp0,[16*16] );
ALIGNED_ARRAY_16( pixel, tmp1,[16*16] );
pixel *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
mvx0, mvy0, 4*width, 4*height, weight_none );
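
tmp0 and tmp1 hold the two motion-compensated predictions before bi-prediction averaging. Declared as pixel, their stack footprint doubles to 512 bytes each in a hypothetical 16-bit build, while ALIGNED_ARRAY_16 keeps the 16-byte alignment SIMD loads require in either case, so the declaration site needs no further change.
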
@@ -290,7 +290,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
}
for( int i = 0; i < numweightbuf; i++ )
CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size );
CHECKED_MALLOC( h->mb.p_weight_buf[i], luma_plane_size * sizeof(pixel) );
#undef ALIGN
}
@@ -329,7 +329,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
for( int j = 0; j < 3; j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32)>>!!j );
CHECKED_MALLOCZERO( h->intra_border_backup[i][j], ((h->sps->i_mb_width*16+32)>>!!j) * sizeof(pixel) );
h->intra_border_backup[i][j] += 8;
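
The trailing += 8 is the quiet payoff of the conversion: it is pointer arithmetic on a pixel *, so the 8-pixel offset would become 16 bytes automatically if pixel widened to uint16_t. Throughout the patch, offsets, strides, and indices stay in pixel units; only allocation sizes and raw memcpy/memset calls need explicit sizeof(pixel) scaling.
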