Commit c91f43a4, authored by Oskar Arvidsson, committed by Fiona Glaser

Support for 9 and 10-bit encoding

Output bit depth is specified at compile time via --bit-depth.
There is currently almost no assembly code available for high-bit-depth modes, so encoding will be very slow.
Input is still 8-bit only; this will change in the future.

Note that very few H.264 decoders support >8 bit depth currently.
Also note that the quantizer scale differs for higher bit depth.  For example, for 10-bit, the quantizer (and crf) ranges from 0 to 63 instead of 0 to 51.
parent b7789b1f
......@@ -64,6 +64,19 @@ MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
#if !X264_HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
if( w->i_scale == 1<<w->i_denom )
......@@ -85,14 +98,6 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
w->weightfn = x264_mc_wtab_neon;
}
void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
{
NULL,
......@@ -174,10 +179,6 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
}
}
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int stride, int width, int height, int16_t *buf )
{
......@@ -198,18 +199,22 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
src += stride;
}
}
#endif // !X264_HIGH_BIT_DEPTH
void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_ARMV6) )
return;
#if !X264_HIGH_BIT_DEPTH
pf->prefetch_fenc = x264_prefetch_fenc_arm;
pf->prefetch_ref = x264_prefetch_ref_arm;
#endif // !X264_HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_NEON) )
return;
#if !X264_HIGH_BIT_DEPTH
pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
......@@ -229,15 +234,16 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->offsetsub = x264_mc_offsetsub_wtab_neon;
pf->weight_cache = x264_weight_cache_neon;
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
pf->memzero_aligned = x264_memzero_aligned_neon;
pf->mc_chroma = x264_mc_chroma_neon;
pf->mc_luma = mc_luma_neon;
pf->get_ref = get_ref_neon;
pf->hpel_filter = hpel_filter_neon;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
#endif // !X264_HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
pf->memzero_aligned = x264_memzero_aligned_neon;
}
......@@ -51,6 +51,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
if (!(cpu&X264_CPU_ARMV6))
return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6;
pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6;
pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
......@@ -59,6 +60,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
return;
pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
#endif // !X264_HIGH_BIT_DEPTH
}
void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
......@@ -66,12 +68,14 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
if (!(cpu&X264_CPU_NEON))
return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon;
pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
#endif // !X264_HIGH_BIT_DEPTH
}
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
......@@ -79,8 +83,10 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_
if (!(cpu&X264_CPU_NEON))
return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
#endif // !X264_HIGH_BIT_DEPTH
}
void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
......@@ -88,10 +94,12 @@ void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
if (!(cpu&X264_CPU_NEON))
return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon;
pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon;
pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon;
pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon;
#endif // !X264_HIGH_BIT_DEPTH
}
......@@ -53,7 +53,7 @@ typedef struct bs_s
/* Run/level storage for up to 16 coefficients.
 * `level` uses dctcoef so its element width follows the coefficient type
 * selected for the configured bit depth; `run` holds one byte per entry.
 * NOTE(review): `last` is presumably the index of the last non-zero
 * coefficient -- confirm against the code that fills this struct. */
typedef struct
{
    int last;
    dctcoef level[16];
    uint8_t run[16];
} x264_run_level_t;
......
......@@ -91,10 +91,10 @@ void x264_param_default( x264_param_t *param )
param->rc.i_vbv_max_bitrate = 0;
param->rc.i_vbv_buffer_size = 0;
param->rc.f_vbv_buffer_init = 0.9;
param->rc.i_qp_constant = 23;
param->rc.f_rf_constant = 23;
param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
param->rc.f_rf_constant = 23 + QP_BD_OFFSET;
param->rc.i_qp_min = 10;
param->rc.i_qp_max = 51;
param->rc.i_qp_max = QP_MAX;
param->rc.i_qp_step = 4;
param->rc.f_ip_factor = 1.4;
param->rc.f_pb_factor = 1.3;
......@@ -418,6 +418,15 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
if( !profile )
return 0;
#if BIT_DEPTH > 8
if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) ||
!strcasecmp( profile, "high" ) )
{
x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH );
return -1;
}
#endif
if( !strcasecmp( profile, "baseline" ) )
{
param->analyse.b_transform_8x8 = 0;
......@@ -441,7 +450,7 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
param->analyse.b_transform_8x8 = 0;
param->i_cqm_preset = X264_CQM_FLAT;
}
else if( !strcasecmp( profile, "high" ) )
else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
{
/* Default */
}
......
......@@ -52,10 +52,15 @@ do {\
#define X264_BFRAME_MAX 16
#define X264_THREAD_MAX 128
#define X264_PCM_COST (386*8)
#define X264_PCM_COST (384*BIT_DEPTH+16)
#define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
#define QP_MAX (51+QP_BD_OFFSET)
#define QP_MAX_MAX (51+2*6)
#define LAMBDA_MAX (91 << (BIT_DEPTH-8))
#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
// arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP 12
#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
// number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
......@@ -101,17 +106,23 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
#define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src)
typedef uint8_t pixel;
typedef uint32_t pixel4;
typedef int16_t dctcoef;
#if X264_HIGH_BIT_DEPTH
typedef uint16_t pixel;
typedef uint64_t pixel4;
typedef int32_t dctcoef;
#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
#define MPIXEL_X4(src) M32(src)
#define CPPIXEL_X4(dst,src) CP32(dst,src)
#define CPPIXEL_X8(dst,src) CP64(dst,src)
#define MDCT_X2(dct) M32(dct)
#define CPDCT_X2(dst,src) CP32(dst,src)
#define CPDCT_X4(dst,src) CP64(dst,src)
# define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
# define MPIXEL_X4(src) M64(src)
#else
typedef uint8_t pixel;
typedef uint32_t pixel4;
typedef int16_t dctcoef;
# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
# define MPIXEL_X4(src) M32(src)
#endif
#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
#define X264_SCAN8_SIZE (6*8)
#define X264_SCAN8_LUMA_SIZE (5*8)
......@@ -189,7 +200,7 @@ void x264_init_vlc_tables();
/* Clamp x to the valid sample range [0, PIXEL_MAX].
 *
 * PIXEL_MAX has all of its low BIT_DEPTH bits set, so (x & ~PIXEL_MAX) is
 * non-zero exactly when x is negative or greater than PIXEL_MAX.  In that
 * case (-x)>>31 yields 0 for negative x and all ones for too-large x, which
 * the mask turns into PIXEL_MAX.
 *
 * NOTE: this relies on right-shifting a negative int being an arithmetic
 * shift and on int being 32 bits wide; both are implementation-defined in C. */
static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{
    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
}
static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
......@@ -449,8 +460,8 @@ struct x264_t
/* mv/ref cost arrays. Indexed by lambda instead of
* qp because, due to rounding, some quantizers share
* lambdas. This saves memory. */
uint16_t *cost_mv[92];
uint16_t *cost_mv_fpel[92][4];
uint16_t *cost_mv[LAMBDA_MAX+1];
uint16_t *cost_mv_fpel[LAMBDA_MAX+1][4];
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
......
......@@ -418,6 +418,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->dct4x4dc = dct4x4dc;
dctf->idct4x4dc = idct4x4dc;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -515,6 +516,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
}
void x264_dct_init_weights( void )
......@@ -599,11 +601,9 @@ static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
/* Field-mode zigzag scan of a 4x4 block from dct[] into level[].
 * Entries 0-1 and 6-15 keep their positions, so they are copied as
 * contiguous runs; entries 2-5 are placed individually through ZIG.
 * The copies are sized with sizeof(dctcoef) so they stay correct whichever
 * width dctcoef has. */
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}
#undef ZIG
......@@ -618,6 +618,7 @@ static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
......@@ -709,6 +710,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_8x8 = zigzag_sub_8x8_field;
pf->sub_4x4 = zigzag_sub_4x4_field;
pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
{
......@@ -726,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_ALTIVEC )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
#endif
#endif // !X264_HIGH_BIT_DEPTH
}
else
{
......@@ -734,6 +737,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_8x8 = zigzag_sub_8x8_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame;
pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
......@@ -759,13 +763,16 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_NEON )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif
#endif // !X264_HIGH_BIT_DEPTH
}
pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
#endif
#endif // !X264_HIGH_BIT_DEPTH
}
......@@ -25,8 +25,9 @@
#include "common.h"
/* Deblocking filter */
static const uint8_t i_alpha_table[52+12*2] =
static const uint8_t i_alpha_table[52+12*3] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
......@@ -36,8 +37,9 @@ static const uint8_t i_alpha_table[52+12*2] =
255,255,
255,255,255,255,255,255,255,255,255,255,255,255,
};
static const uint8_t i_beta_table[52+12*2] =
static const uint8_t i_beta_table[52+12*3] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
......@@ -47,12 +49,14 @@ static const uint8_t i_beta_table[52+12*2] =
18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
};
static const int8_t i_tc0_table[52+12*2][4] =
static const int8_t i_tc0_table[52+12*3][4] =
{
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
{-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
{-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
......@@ -63,9 +67,9 @@ static const int8_t i_tc0_table[52+12*2][4] =
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
};
#define alpha_table(x) i_alpha_table[(x)+12]
#define beta_table(x) i_beta_table[(x)+12]
#define tc0_table(x) i_tc0_table[(x)+12]
#define alpha_table(x) i_alpha_table[(x)+24]
#define beta_table(x) i_beta_table[(x)+24]
#define tc0_table(x) i_tc0_table[(x)+24]
/* From ffmpeg */
static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
......@@ -265,18 +269,19 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp + h->sh.i_alpha_c0_offset;
int alpha = alpha_table(index_a);
int beta = beta_table(i_qp + h->sh.i_beta_offset);
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
int beta = beta_table(index_b) << (BIT_DEPTH-8);
int8_t tc[4];
if( !M32(bS) || !alpha || !beta )
return;
tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma;
tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma;
tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;
pf_inter( pix1, i_stride, alpha, beta, tc );
if( b_chroma )
......@@ -285,8 +290,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri
static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
{
int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
int beta = beta_table(i_qp + h->sh.i_beta_offset);
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
int beta = beta_table(index_b) << (BIT_DEPTH-8);
if( !alpha || !beta )
return;
......@@ -450,6 +457,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
#if HAVE_MMX
if( cpu&X264_CPU_MMXEXT )
{
#if !X264_HIGH_BIT_DEPTH
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
......@@ -460,10 +468,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
#endif
#endif // !X264_HIGH_BIT_DEPTH
pf->deblock_strength = x264_deblock_strength_mmxext;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
#if !X264_HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
......@@ -471,12 +481,14 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
}
#endif // !X264_HIGH_BIT_DEPTH
}
if( cpu&X264_CPU_SSSE3 )
pf->deblock_strength = x264_deblock_strength_ssse3;
}
#endif
#if !X264_HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
{
......@@ -494,4 +506,5 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
}
......@@ -337,7 +337,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
int scratch_size = 0;
if( !b_lookahead )
{
int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(dctcoef);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
......
......@@ -238,17 +238,30 @@ static const uint16_t block_idx_xy_fdec[16] =
2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
};
static const uint8_t i_chroma_qp_table[52+12*2] =
#define QP(qP) ( (qP)+QP_BD_OFFSET )
static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
39, 39,
39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0,
#if BIT_DEPTH > 9
QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
#endif
#if BIT_DEPTH > 8
QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
#endif
QP(0), QP(1), QP(2), QP(3), QP(4), QP(5),
QP(6), QP(7), QP(8), QP(9), QP(10), QP(11),
QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
QP(39), QP(39), QP(39), QP(39),
QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
};
#undef QP
enum cabac_ctx_block_cat_e
{
......@@ -340,26 +353,31 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
return (a&0xFFFF) + (b<<16);
#endif
}
/* Combine two 32-bit values into one 64-bit word.
 * With WORDS_BIGENDIAN defined, `a` lands in the upper 32 bits and `b` in
 * the lower; otherwise `a` is the lower half and `b` the upper. */
static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
{
#ifdef WORDS_BIGENDIAN
    uint64_t upper = a;
    uint64_t lower = b;
#else
    uint64_t upper = b;
    uint64_t lower = a;
#endif
    return lower + (upper << 32);
}
#define pack_pixel_1to2 pack8to16
#define pack_pixel_2to4 pack16to32
#if X264_HIGH_BIT_DEPTH
# define pack_pixel_1to2 pack16to32
# define pack_pixel_2to4 pack32to64
#else
# define pack_pixel_1to2 pack8to16
# define pack_pixel_2to4 pack16to32
#endif
/* array_non_zero(a): true if any element of the dctcoef array `a` is
 * non-zero.  `a` must be a real array (not a pointer), because the element
 * count is derived with sizeof. */
#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef))
#define array_non_zero_int array_non_zero_int
/* Return 1 if any of the first i_count coefficients in v is non-zero,
 * otherwise 0.  i_count is a number of elements, not bytes; the scan is
 * per element so it does not depend on the width of dctcoef. */
static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count )
{
    for( int i = 0; i < i_count; i++ )
        if( v[i] )
            return 1;
    return 0;
}
static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
{
......
......@@ -117,11 +117,14 @@ static void x264_weight_cache( x264_t *h, x264_weight_t *w )
{
w->weightfn = h->mc.weight;
}
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * weight->i_scale + weight->i_offset )
static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
{
if( weight->i_denom >= 1 )
int offset = weight->i_offset << (BIT_DEPTH-8);
int scale = weight->i_scale;
int denom = weight->i_denom;
if( denom >= 1 )
{
for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
for( int x = 0; x < i_width; x++ )
......@@ -135,21 +138,10 @@ static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_sr
}
}
#define MC_WEIGHT_C( name, lx ) \
#define MC_WEIGHT_C( name, width ) \
static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \
{ \
if( weight->i_denom >= 1 ) \
{ \
for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
for( int x = 0; x < lx; x++ ) \
opscale( x ); \
} \
else \
{ \
for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
for( int x = 0; x < lx; x++ ) \
opscale_noden( x ); \
} \
mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
}
MC_WEIGHT_C( mc_weight_w20, 20 )
......@@ -182,7 +174,7 @@ static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride,
#define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
int stride, int width, int height, int16_t *buf )
int stride, int width, int height, dctcoef *buf )
{
for( int y = 0; y < height; y++ )
{
......@@ -301,7 +293,12 @@ void x264_plane_copy_c( pixel *dst, int i_dst,
{
while( h-- )
{
#if X264_HIGH_BIT_DEPTH
for( int i = 0; i < w; i++ )
dst[i] = src[i] << (BIT_DEPTH-8);
#else
memcpy( dst, src, w );
#endif
dst += i_dst;
src += i_src;
}
......
......@@ -82,7 +82,7 @@ typedef struct
uint8_t *src, int i_src, int w, int h);
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
int i_stride, int i_width, int i_height, int16_t *buf );
int i_stride, int i_width, int i_height, dctcoef *buf );
/* prefetch the next few macroblocks of fenc or fdec */
void (*prefetch_fenc)( pixel *pix_y, int stride_y,
......
......@@ -177,7 +177,7 @@ static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride
pix2 += i_stride2;
}
sum = abs(sum);
var = sqr - (sum * sum >> 6);
var = sqr - ((uint64_t)sum * sum >> 6);
*ssd = sqr;
return var;
}
......@@ -406,12 +406,14 @@ SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )
#if !X264_HIGH_BIT_DEPTH
#if ARCH_UltraSparc
SAD_X( 16x16_vis )
SAD_X( 16x8_vis )
SAD_X( 8x16_vis )
SAD_X( 8x8_vis )
#endif
#endif // !X264_HIGH_BIT_DEPTH
/****************************************************************************
* pixel_satd_x4
......@@ -444,6 +446,7 @@ SATD_X_DECL6( cpu )\
SATD_X( 4x4, cpu )
SATD_X_DECL7()
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
SATD_X_DECL7( _mmxext )
SATD_X_DECL6( _sse2 )
......@@ -454,6 +457,7 @@ SATD_X_DECL7( _sse4 )
#if HAVE_ARMV6
SATD_X_DECL7( _neon )
#endif
#endif // !X264_HIGH_BIT_DEPTH
#define INTRA_MBCMP_8x8( mbcmp )\
void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\
......@@ -520,8 +524,8 @@ static void ssim_4x4x2_core( const pixel *pix1, int stride1,
static float ssim_end1( int s1, int s2, int ss, int s12 )
{
static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
int vars = ss*64 - s1*s1 - s2*s2;
int covar = s12*64 - s1*s2;
return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
......@@ -678,6 +682,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -903,17 +908,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
{
x264_pixel_altivec_init( pixf );
}
#endif
#if !X264_HIGH_BIT_DEPTH
#if ARCH_UltraSparc