Commit 41227fa2 authored by James Weaver's avatar James Weaver Committed by Fiona Glaser
Browse files

v210 input support

Assembly based on code by Henrik Gramner and Loren Merritt.
parent e2a96627
......@@ -1144,7 +1144,7 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
};
int csp = i_csp & X264_CSP_MASK;
if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 )
return -1;
x264_picture_init( pic );
pic->img.i_csp = i_csp;
......
......@@ -53,6 +53,7 @@ static int x264_frame_internal_csp( int external_csp )
case X264_CSP_NV16:
case X264_CSP_I422:
case X264_CSP_YV16:
case X264_CSP_V210:
return X264_CSP_NV16;
case X264_CSP_I444:
case X264_CSP_YV24:
......@@ -380,6 +381,12 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
}
#endif
if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
{
x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
return -1;
}
dst->i_type = src->i_type;
dst->i_qpplus1 = src->i_qpplus1;
dst->i_pts = dst->i_reordered_pts = src->i_pts;
......@@ -392,7 +399,16 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
uint8_t *pix[3];
int stride[3];
if ( i_csp >= X264_CSP_BGR )
if( i_csp == X264_CSP_V210 )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
dst->plane[1], dst->i_stride[1],
(uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height );
}
else if( i_csp >= X264_CSP_BGR )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
......
......@@ -336,6 +336,34 @@ static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
}
}
void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h )
{
for( int l = 0; l < h; l++ )
{
pixel *dsty0 = dsty;
pixel *dstc0 = dstc;
uint32_t *src0 = src;
for( int n = 0; n < w; n += 3 )
{
*(dstc0++) = *src0 & 0x03FF;
*(dsty0++) = ( *src0 >> 10 ) & 0x03FF;
*(dstc0++) = ( *src0 >> 20 ) & 0x03FF;
src0++;
*(dsty0++) = *src0 & 0x03FF;
*(dstc0++) = ( *src0 >> 10 ) & 0x03FF;
*(dsty0++) = ( *src0 >> 20 ) & 0x03FF;
src0++;
}
dsty += i_dsty;
dstc += i_dstc;
src += i_src;
}
}
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
......@@ -507,6 +535,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
pf->hpel_filter = hpel_filter;
......
......@@ -93,6 +93,9 @@ typedef struct
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h );
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
intptr_t i_stride, int i_width, int i_height, int16_t *buf );
......
......@@ -38,6 +38,13 @@ filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
v210_mask: times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
......@@ -1195,6 +1202,64 @@ cglobal load_deinterleave_chroma_fdec, 4,4
RET
%endmacro ; PLANE_DEINTERLEAVE
%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
; uint16_t *dstc, intptr_t i_dstc,
; uint32_t *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
%if ARCH_X86_64
cglobal plane_copy_deinterleave_v210, 8,10,7
%define src r8
%define org_w r9
%define h r7d
%else
cglobal plane_copy_deinterleave_v210, 7,7,7
%define src r4m
%define org_w r6m
%define h dword r7m
%endif
FIX_STRIDES r1, r3, r6d
shl r5, 2
add r0, r6
add r2, r6
neg r6
mov src, r4
mov org_w, r6
mova m2, [v210_mask]
mova m3, [v210_luma_shuf]
mova m4, [v210_chroma_shuf]
mova m5, [v210_mult] ; also functions as vpermd index for avx2
pshufd m6, m5, q1102
ALIGN 16
.loop:
movu m1, [r4]
pandn m0, m2, m1
pand m1, m2
pshufb m0, m3
pshufb m1, m4
pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%if mmsize == 32
vpermd m0, m5, m0
vpermd m1, m5, m1
%endif
movu [r0+r6], m0
movu [r2+r6], m1
add r4, mmsize
add r6, 3*mmsize/4
jl .loop
add r0, r1
add r2, r3
add src, r5
mov r4, src
mov r6, org_w
dec h
jg .loop
RET
%endmacro ; PLANE_DEINTERLEAVE_V210
%if HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
......@@ -1203,9 +1268,14 @@ PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM ssse3
PLANE_DEINTERLEAVE_V210
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
PLANE_DEINTERLEAVE_V210
%else
INIT_MMX mmx2
PLANE_INTERLEAVE
......
......@@ -116,6 +116,15 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint16_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
......@@ -627,6 +636,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
......@@ -639,6 +649,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
......@@ -649,7 +660,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
if( cpu&X264_CPU_AVX2 )
{
pf->mc_luma = mc_luma_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
......
......@@ -467,7 +467,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
return -1;
}
else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 )
else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 )
{
x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
return -1;
......
......@@ -94,9 +94,12 @@ static void help( int longhelp )
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
if( x264_cli_csps[i].name )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
}
}
printf( "\n"
" - depth: 8 or 16 bits per pixel [keep current]\n"
......@@ -243,8 +246,11 @@ static int handle_opts( const char **optlist, char **opts, video_info_t *info, r
if( strlen( str_csp ) == 0 )
csp = info->csp & X264_CSP_MASK;
else
for( csp = X264_CSP_CLI_MAX-1; x264_cli_csps[csp].name && strcasecmp( x264_cli_csps[csp].name, str_csp ); )
csp--;
for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- )
{
if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) )
break;
}
FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp );
h->dst_csp = csp;
if( depth == 16 )
......
......@@ -42,7 +42,8 @@ const x264_cli_csp_t x264_cli_csps[] = {
int x264_cli_csp_is_invalid( int csp )
{
int csp_mask = csp & X264_CSP_MASK;
return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || csp & X264_CSP_OTHER;
return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX ||
csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER;
}
int x264_cli_csp_depth_factor( int csp )
......
......@@ -55,8 +55,11 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" )
if( opt->colorspace )
{
for( info->csp = X264_CSP_CLI_MAX-1; x264_cli_csps[info->csp].name && strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ); )
info->csp--;
for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- )
{
if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) )
break;
}
FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace );
}
else /* default */
......
......@@ -1453,6 +1453,33 @@ static int check_mc( int cpu_ref, int cpu_new )
}
report( "plane_copy :" );
if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
{
set_func_name( "plane_copy_deinterleave_v210" );
used_asm = 1;
for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
intptr_t dst_stride = ALIGN( w, 16 );
intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
intptr_t offv = dst_stride*h + 32;
memset( pbuf3, 0, 0x1000 );
memset( pbuf4, 0, 0x1000 );
call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
for( int y = 0; y < h; y++ )
if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(uint16_t) ) ||
memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
{
ok = 0;
fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
break;
}
}
}
report( "v210 :" );
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
pixel *srchpel = pbuf1+8+2*64;
......
......@@ -420,9 +420,12 @@ static void print_csp_names( int longhelp )
printf( INDENT );
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
if( x264_cli_csps[i].name )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
}
}
#if HAVE_LAVF
printf( "\n" );
......@@ -1282,7 +1285,7 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
int csp = info->csp & X264_CSP_MASK;
if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
param->i_csp = X264_CSP_I420;
else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) )
param->i_csp = X264_CSP_I422;
else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
param->i_csp = X264_CSP_I444;
......
......@@ -41,7 +41,7 @@
#include "x264_config.h"
#define X264_BUILD 140
#define X264_BUILD 141
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
......@@ -215,12 +215,13 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */
#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */
#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */
#define X264_CSP_I444 0x0007 /* yuv 4:4:4 planar */
#define X264_CSP_YV24 0x0008 /* yvu 4:4:4 planar */
#define X264_CSP_BGR 0x0009 /* packed bgr 24bits */
#define X264_CSP_BGRA 0x000a /* packed bgr 32bits */
#define X264_CSP_RGB 0x000b /* packed rgb 24bits */
#define X264_CSP_MAX 0x000c /* end of list */
#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2 packed in 32 */
#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */
#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */
#define X264_CSP_BGR 0x000a /* packed bgr 24bits */
#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */
#define X264_CSP_RGB 0x000c /* packed rgb 24bits */
#define X264_CSP_MAX 0x000d /* end of list */
#define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment