Commit c91f43a4, authored by Oskar Arvidsson, committed by Fiona Glaser

Support for 9 and 10-bit encoding

Output bit depth is specified at compile time via --bit-depth.
There is currently almost no assembly code available for high-bit-depth modes, so encoding will be very slow.
Input is still 8-bit only; this will change in the future.

Note that very few H.264 decoders currently support bit depths above 8.
Also note that the quantizer scale differs for higher bit depth.  For example, for 10-bit, the quantizer (and crf) ranges from 0 to 63 instead of 0 to 51.
parent b7789b1f
...@@ -64,6 +64,19 @@ MC_WEIGHT(_nodenom) ...@@ -64,6 +64,19 @@ MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd) MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub) MC_WEIGHT(_offsetsub)
void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
#if !X264_HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{ {
if( w->i_scale == 1<<w->i_denom ) if( w->i_scale == 1<<w->i_denom )
...@@ -85,14 +98,6 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) ...@@ -85,14 +98,6 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
w->weightfn = x264_mc_wtab_neon; w->weightfn = x264_mc_wtab_neon;
} }
void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) = static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
{ {
NULL, NULL,
...@@ -174,10 +179,6 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride, ...@@ -174,10 +179,6 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
} }
} }
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int stride, int width, int height, int16_t *buf ) int stride, int width, int height, int16_t *buf )
{ {
...@@ -198,18 +199,22 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8 ...@@ -198,18 +199,22 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
src += stride; src += stride;
} }
} }
#endif // !X264_HIGH_BIT_DEPTH
void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
{ {
if( !(cpu&X264_CPU_ARMV6) ) if( !(cpu&X264_CPU_ARMV6) )
return; return;
#if !X264_HIGH_BIT_DEPTH
pf->prefetch_fenc = x264_prefetch_fenc_arm; pf->prefetch_fenc = x264_prefetch_fenc_arm;
pf->prefetch_ref = x264_prefetch_ref_arm; pf->prefetch_ref = x264_prefetch_ref_arm;
#endif // !X264_HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_NEON) ) if( !(cpu&X264_CPU_NEON) )
return; return;
#if !X264_HIGH_BIT_DEPTH
pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
...@@ -229,15 +234,16 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf ) ...@@ -229,15 +234,16 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->offsetsub = x264_mc_offsetsub_wtab_neon; pf->offsetsub = x264_mc_offsetsub_wtab_neon;
pf->weight_cache = x264_weight_cache_neon; pf->weight_cache = x264_weight_cache_neon;
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
pf->memzero_aligned = x264_memzero_aligned_neon;
pf->mc_chroma = x264_mc_chroma_neon; pf->mc_chroma = x264_mc_chroma_neon;
pf->mc_luma = mc_luma_neon; pf->mc_luma = mc_luma_neon;
pf->get_ref = get_ref_neon; pf->get_ref = get_ref_neon;
pf->hpel_filter = hpel_filter_neon; pf->hpel_filter = hpel_filter_neon;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
#endif // !X264_HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
#ifndef SYS_MACOSX
pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
pf->memzero_aligned = x264_memzero_aligned_neon;
} }
...@@ -51,6 +51,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) ...@@ -51,6 +51,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
if (!(cpu&X264_CPU_ARMV6)) if (!(cpu&X264_CPU_ARMV6))
return; return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6; pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6;
pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6;
pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
...@@ -59,6 +60,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) ...@@ -59,6 +60,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
return; return;
pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon; pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
#endif // !X264_HIGH_BIT_DEPTH
} }
void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] ) void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
...@@ -66,12 +68,14 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] ) ...@@ -66,12 +68,14 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
if (!(cpu&X264_CPU_NEON)) if (!(cpu&X264_CPU_NEON))
return; return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon; pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon;
pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon; pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon;
pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon; pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon; pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon; pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon; pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
#endif // !X264_HIGH_BIT_DEPTH
} }
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
...@@ -79,8 +83,10 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_ ...@@ -79,8 +83,10 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_
if (!(cpu&X264_CPU_NEON)) if (!(cpu&X264_CPU_NEON))
return; return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon; pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon; pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
#endif // !X264_HIGH_BIT_DEPTH
} }
void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] ) void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
...@@ -88,10 +94,12 @@ void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] ) ...@@ -88,10 +94,12 @@ void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
if (!(cpu&X264_CPU_NEON)) if (!(cpu&X264_CPU_NEON))
return; return;
#if !X264_HIGH_BIT_DEPTH
pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon; pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon;
pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon; pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon; pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon; pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon;
pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon; pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon;
pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon; pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon;
#endif // !X264_HIGH_BIT_DEPTH
} }
...@@ -53,7 +53,7 @@ typedef struct bs_s ...@@ -53,7 +53,7 @@ typedef struct bs_s
typedef struct typedef struct
{ {
int last; int last;
int16_t level[16]; dctcoef level[16];
uint8_t run[16]; uint8_t run[16];
} x264_run_level_t; } x264_run_level_t;
......
...@@ -91,10 +91,10 @@ void x264_param_default( x264_param_t *param ) ...@@ -91,10 +91,10 @@ void x264_param_default( x264_param_t *param )
param->rc.i_vbv_max_bitrate = 0; param->rc.i_vbv_max_bitrate = 0;
param->rc.i_vbv_buffer_size = 0; param->rc.i_vbv_buffer_size = 0;
param->rc.f_vbv_buffer_init = 0.9; param->rc.f_vbv_buffer_init = 0.9;
param->rc.i_qp_constant = 23; param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
param->rc.f_rf_constant = 23; param->rc.f_rf_constant = 23 + QP_BD_OFFSET;
param->rc.i_qp_min = 10; param->rc.i_qp_min = 10;
param->rc.i_qp_max = 51; param->rc.i_qp_max = QP_MAX;
param->rc.i_qp_step = 4; param->rc.i_qp_step = 4;
param->rc.f_ip_factor = 1.4; param->rc.f_ip_factor = 1.4;
param->rc.f_pb_factor = 1.3; param->rc.f_pb_factor = 1.3;
...@@ -418,6 +418,15 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) ...@@ -418,6 +418,15 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
if( !profile ) if( !profile )
return 0; return 0;
#if BIT_DEPTH > 8
if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) ||
!strcasecmp( profile, "high" ) )
{
x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH );
return -1;
}
#endif
if( !strcasecmp( profile, "baseline" ) ) if( !strcasecmp( profile, "baseline" ) )
{ {
param->analyse.b_transform_8x8 = 0; param->analyse.b_transform_8x8 = 0;
...@@ -441,7 +450,7 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile ) ...@@ -441,7 +450,7 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
param->analyse.b_transform_8x8 = 0; param->analyse.b_transform_8x8 = 0;
param->i_cqm_preset = X264_CQM_FLAT; param->i_cqm_preset = X264_CQM_FLAT;
} }
else if( !strcasecmp( profile, "high" ) ) else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
{ {
/* Default */ /* Default */
} }
......
...@@ -52,10 +52,15 @@ do {\ ...@@ -52,10 +52,15 @@ do {\
#define X264_BFRAME_MAX 16 #define X264_BFRAME_MAX 16
#define X264_THREAD_MAX 128 #define X264_THREAD_MAX 128
#define X264_PCM_COST (386*8) #define X264_PCM_COST (384*BIT_DEPTH+16)
#define X264_LOOKAHEAD_MAX 250 #define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
#define QP_MAX (51+QP_BD_OFFSET)
#define QP_MAX_MAX (51+2*6)
#define LAMBDA_MAX (91 << (BIT_DEPTH-8))
#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
// arbitrary, but low because SATD scores are 1/4 normal // arbitrary, but low because SATD scores are 1/4 normal
#define X264_LOOKAHEAD_QP 12 #define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
// number of pixels (per thread) in progress at any given time. // number of pixels (per thread) in progress at any given time.
// 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
...@@ -101,17 +106,23 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u ...@@ -101,17 +106,23 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
#define CP64(dst,src) M64(dst) = M64(src) #define CP64(dst,src) M64(dst) = M64(src)
#define CP128(dst,src) M128(dst) = M128(src) #define CP128(dst,src) M128(dst) = M128(src)
typedef uint8_t pixel; #if X264_HIGH_BIT_DEPTH
typedef uint32_t pixel4; typedef uint16_t pixel;
typedef int16_t dctcoef; typedef uint64_t pixel4;
typedef int32_t dctcoef;
#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U) # define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
#define MPIXEL_X4(src) M32(src) # define MPIXEL_X4(src) M64(src)
#define CPPIXEL_X4(dst,src) CP32(dst,src) #else
#define CPPIXEL_X8(dst,src) CP64(dst,src) typedef uint8_t pixel;
#define MDCT_X2(dct) M32(dct) typedef uint32_t pixel4;
#define CPDCT_X2(dst,src) CP32(dst,src) typedef int16_t dctcoef;
#define CPDCT_X4(dst,src) CP64(dst,src)
# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
# define MPIXEL_X4(src) M32(src)
#endif
#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
#define X264_SCAN8_SIZE (6*8) #define X264_SCAN8_SIZE (6*8)
#define X264_SCAN8_LUMA_SIZE (5*8) #define X264_SCAN8_LUMA_SIZE (5*8)
...@@ -189,7 +200,7 @@ void x264_init_vlc_tables(); ...@@ -189,7 +200,7 @@ void x264_init_vlc_tables();
static ALWAYS_INLINE pixel x264_clip_pixel( int x ) static ALWAYS_INLINE pixel x264_clip_pixel( int x )
{ {
return x&(~255) ? (-x)>>31 : x; return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
} }
static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max ) static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
...@@ -449,8 +460,8 @@ struct x264_t ...@@ -449,8 +460,8 @@ struct x264_t
/* mv/ref cost arrays. Indexed by lambda instead of /* mv/ref cost arrays. Indexed by lambda instead of
* qp because, due to rounding, some quantizers share * qp because, due to rounding, some quantizers share
* lambdas. This saves memory. */ * lambdas. This saves memory. */
uint16_t *cost_mv[92]; uint16_t *cost_mv[LAMBDA_MAX+1];
uint16_t *cost_mv_fpel[92][4]; uint16_t *cost_mv_fpel[LAMBDA_MAX+1][4];
const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */ const uint8_t *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
......
...@@ -418,6 +418,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) ...@@ -418,6 +418,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->dct4x4dc = dct4x4dc; dctf->dct4x4dc = dct4x4dc;
dctf->idct4x4dc = idct4x4dc; dctf->idct4x4dc = idct4x4dc;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX #if HAVE_MMX
if( cpu&X264_CPU_MMX ) if( cpu&X264_CPU_MMX )
{ {
...@@ -515,6 +516,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) ...@@ -515,6 +516,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon; dctf->add16x16_idct8= x264_add16x16_idct8_neon;
} }
#endif #endif
#endif // !X264_HIGH_BIT_DEPTH
} }
void x264_dct_init_weights( void ) void x264_dct_init_weights( void )
...@@ -599,11 +601,9 @@ static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] ) ...@@ -599,11 +601,9 @@ static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] ) static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{ {
CPDCT_X2( level, dct ); memcpy( level, dct, 2 * sizeof(dctcoef) );
ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1) ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
CPDCT_X2( level+6, dct+6 ); memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
CPDCT_X4( level+8, dct+8 );
CPDCT_X4( level+12, dct+12 );
} }
#undef ZIG #undef ZIG
...@@ -618,6 +618,7 @@ static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] ) ...@@ -618,6 +618,7 @@ static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\ CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE ); CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\ #define COPY8x8\
CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\ CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\ CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
...@@ -709,6 +710,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -709,6 +710,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_8x8 = zigzag_sub_8x8_field; pf->sub_8x8 = zigzag_sub_8x8_field;
pf->sub_4x4 = zigzag_sub_4x4_field; pf->sub_4x4 = zigzag_sub_4x4_field;
pf->sub_4x4ac = zigzag_sub_4x4ac_field; pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX #if HAVE_MMX
if( cpu&X264_CPU_MMXEXT ) if( cpu&X264_CPU_MMXEXT )
{ {
...@@ -726,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -726,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_ALTIVEC ) if( cpu&X264_CPU_ALTIVEC )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec; pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
#endif #endif
#endif // !X264_HIGH_BIT_DEPTH
} }
else else
{ {
...@@ -734,6 +737,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -734,6 +737,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_8x8 = zigzag_sub_8x8_frame; pf->sub_8x8 = zigzag_sub_8x8_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame; pf->sub_4x4 = zigzag_sub_4x4_frame;
pf->sub_4x4ac = zigzag_sub_4x4ac_frame; pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX #if HAVE_MMX
if( cpu&X264_CPU_MMX ) if( cpu&X264_CPU_MMX )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
...@@ -759,13 +763,16 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -759,13 +763,16 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_NEON ) if( cpu&X264_CPU_NEON )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif #endif
#endif // !X264_HIGH_BIT_DEPTH
} }
pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc; pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if !X264_HIGH_BIT_DEPTH
#if HAVE_MMX #if HAVE_MMX
if( cpu&X264_CPU_MMX ) if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx; pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( cpu&X264_CPU_SHUFFLE_IS_FAST ) if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2; pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
#endif #endif
#endif // !X264_HIGH_BIT_DEPTH
} }
...@@ -25,8 +25,9 @@ ...@@ -25,8 +25,9 @@
#include "common.h" #include "common.h"
/* Deblocking filter */ /* Deblocking filter */
static const uint8_t i_alpha_table[52+12*2] = static const uint8_t i_alpha_table[52+12*3] =
{ {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 4, 4, 5, 6, 0, 0, 0, 0, 0, 0, 4, 4, 5, 6,
...@@ -36,8 +37,9 @@ static const uint8_t i_alpha_table[52+12*2] = ...@@ -36,8 +37,9 @@ static const uint8_t i_alpha_table[52+12*2] =
255,255, 255,255,
255,255,255,255,255,255,255,255,255,255,255,255, 255,255,255,255,255,255,255,255,255,255,255,255,
}; };
static const uint8_t i_beta_table[52+12*2] = static const uint8_t i_beta_table[52+12*3] =
{ {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 2, 2, 2, 3, 0, 0, 0, 0, 0, 0, 2, 2, 2, 3,
...@@ -47,12 +49,14 @@ static const uint8_t i_beta_table[52+12*2] = ...@@ -47,12 +49,14 @@ static const uint8_t i_beta_table[52+12*2] =
18, 18, 18, 18,
18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
}; };
static const int8_t i_tc0_table[52+12*2][4] = static const int8_t i_tc0_table[52+12*3][4] =
{ {
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
{-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
{-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
{-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
...@@ -63,9 +67,9 @@ static const int8_t i_tc0_table[52+12*2][4] = ...@@ -63,9 +67,9 @@ static const int8_t i_tc0_table[52+12*2][4] =
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
{-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
}; };
#define alpha_table(x) i_alpha_table[(x)+12] #define alpha_table(x) i_alpha_table[(x)+24]
#define beta_table(x) i_beta_table[(x)+12] #define beta_table(x) i_beta_table[(x)+24]
#define tc0_table(x) i_tc0_table[(x)+12] #define tc0_table(x) i_tc0_table[(x)+24]
/* From ffmpeg */ /* From ffmpeg */
static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 ) static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
...@@ -265,18 +269,19 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264 ...@@ -265,18 +269,19 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter ) static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{ {
int index_a = i_qp + h->sh.i_alpha_c0_offset; int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int alpha = alpha_table(index_a); int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
int beta = beta_table(i_qp + h->sh.i_beta_offset); int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
int beta = beta_table(index_b) << (BIT_DEPTH-8);
int8_t tc[4]; int8_t tc[4];
if( !M32(bS) || !alpha || !beta ) if( !M32(bS) || !alpha || !beta )
return; return;
tc[0] = tc0_table(index_a)[bS[0]] + b_chroma; tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma;
tc[1] = tc0_table(index_a)[bS[1]] + b_chroma; tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma;
tc[2] = tc0_table(index_a)[bS[2]] + b_chroma; tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
tc[3] = tc0_table(index_a)[bS[3]] + b_chroma; tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;
pf_inter( pix1, i_stride, alpha, beta, tc ); pf_inter( pix1, i_stride, alpha, beta, tc );
if( b_chroma ) if( b_chroma )
...@@ -285,8 +290,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri ...@@ -285,8 +290,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri
static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra ) static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
{ {
int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset); int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int beta = beta_table(i_qp + h->sh.i_beta_offset); int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
int beta = beta_table(index_b) << (BIT_DEPTH-8);
if( !alpha || !beta ) if( !alpha || !beta )
return; return;
...@@ -450,6 +457,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) ...@@ -450,6 +457,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
#if HAVE_MMX #if HAVE_MMX
if( cpu&X264_CPU_MMXEXT ) if( cpu&X264_CPU_MMXEXT )
{ {
#if !X264_HIGH_BIT_DEPTH
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext; pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext; pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext; pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
...@@ -460,10 +468,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) ...@@ -460,10 +468,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
#endif #endif
#endif // !X264_HIGH_BIT_DEPTH
pf->deblock_strength = x264_deblock_strength_mmxext; pf->deblock_strength = x264_deblock_strength_mmxext;
if( cpu&X264_CPU_SSE2 ) if( cpu&X264_CPU_SSE2 )
{ {
pf->deblock_strength = x264_deblock_strength_sse2; pf->deblock_strength = x264_deblock_strength_sse2;
#if !X264_HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_STACK_MOD4) ) if( !(cpu&X264_CPU_STACK_MOD4) )
{ {
pf->deblock_luma[1] = x264_deblock_v_luma_sse2; pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
...@@ -471,12 +481,14 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) ...@@ -471,12 +481,14 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2; pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2; pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
} }
#endif // !X264_HIGH_BIT_DEPTH
} }
if( cpu&X264_CPU_SSSE3 ) if( cpu&X264_CPU_SSSE3 )
pf->deblock_strength = x264_deblock_strength_ssse3; pf->deblock_strength = x264_deblock_strength_ssse3;
} }
#endif #endif
#if !X264_HIGH_BIT_DEPTH
#if HAVE_ALTIVEC #if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC ) if( cpu&X264_CPU_ALTIVEC )
{ {
...@@ -494,4 +506,5 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf ) ...@@ -494,4 +506,5 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
pf->deblock_chroma[0] = x264_deblock_h_chroma_neon; pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
} }
#endif #endif
#endif // !X264_HIGH_BIT_DEPTH
} }
...@@ -337,7 +337,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) ...@@ -337,7 +337,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
int scratch_size = 0; int scratch_size = 0;
if( !b_lookahead ) if( !b_lookahead )
{ {
int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t); int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(dctcoef);
int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int); int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range); int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) * int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
......
...@@ -238,17 +238,30 @@ static const uint16_t block_idx_xy_fdec[16] = ...@@ -238,17 +238,30 @@ static const uint16_t block_idx_xy_fdec[16] =
2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE 2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE