Commit 41227fa2, authored by James Weaver, committed by Fiona Glaser

v210 input support

Assembly based on code by Henrik Gramner and Loren Merritt.
parent e2a96627
......@@ -1144,7 +1144,7 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
};
int csp = i_csp & X264_CSP_MASK;
if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 )
return -1;
x264_picture_init( pic );
pic->img.i_csp = i_csp;
......
......@@ -53,6 +53,7 @@ static int x264_frame_internal_csp( int external_csp )
case X264_CSP_NV16:
case X264_CSP_I422:
case X264_CSP_YV16:
case X264_CSP_V210:
return X264_CSP_NV16;
case X264_CSP_I444:
case X264_CSP_YV24:
......@@ -380,6 +381,12 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
}
#endif
if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
{
x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
return -1;
}
dst->i_type = src->i_type;
dst->i_qpplus1 = src->i_qpplus1;
dst->i_pts = dst->i_reordered_pts = src->i_pts;
......@@ -392,7 +399,16 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
uint8_t *pix[3];
int stride[3];
if ( i_csp >= X264_CSP_BGR )
if( i_csp == X264_CSP_V210 )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
dst->plane[1], dst->i_stride[1],
(uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height );
}
else if( i_csp >= X264_CSP_BGR )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
......
......@@ -336,6 +336,34 @@ static void x264_plane_copy_deinterleave_rgb_c( pixel *dsta, intptr_t i_dsta,
}
}
/* Unpack v210 input into a luma plane and an interleaved chroma plane.
 *
 * Every 32-bit source word carries three 10-bit fields (bits 0-9, 10-19 and
 * 20-29). Two consecutive words are consumed per step and yield three luma
 * samples and three chroma samples, written in source order:
 *   word 0: chroma, luma, chroma
 *   word 1: luma, chroma, luma
 *
 * dsty/i_dsty: luma destination and its row stride, in pixels.
 * dstc/i_dstc: chroma destination and its row stride, in pixels.
 * src/i_src:   packed source and its row stride, in 32-bit words.
 * w, h:        luma width and number of rows to convert.
 *
 * The inner loop advances three luma samples at a time, so a row whose width
 * is not a multiple of 3 is read and written up to the next multiple of 3. */
void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
                                          pixel *dstc, intptr_t i_dstc,
                                          uint32_t *src, intptr_t i_src, int w, int h )
{
    for( int row = 0; row < h; row++, dsty += i_dsty, dstc += i_dstc, src += i_src )
    {
        pixel *luma = dsty;
        pixel *chroma = dstc;
        uint32_t *in = src;

        for( int x = 0; x < w; x += 3, in += 2, luma += 3, chroma += 3 )
        {
            /* First word of the pair: chroma | luma | chroma. */
            uint32_t word = in[0];
            chroma[0] = word & 0x03FF;
            luma[0]   = ( word >> 10 ) & 0x03FF;
            chroma[1] = ( word >> 20 ) & 0x03FF;

            /* Second word of the pair: luma | chroma | luma. */
            word = in[1];
            luma[1]   = word & 0x03FF;
            chroma[2] = ( word >> 10 ) & 0x03FF;
            luma[2]   = ( word >> 20 ) & 0x03FF;
        }
    }
}
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
......@@ -507,6 +535,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
pf->hpel_filter = hpel_filter;
......
......@@ -93,6 +93,9 @@ typedef struct
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h );
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
intptr_t i_stride, int i_width, int i_height, int16_t *buf );
......
......@@ -38,6 +38,13 @@ filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
v210_mask: times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
......@@ -1195,6 +1202,64 @@ cglobal load_deinterleave_chroma_fdec, 4,4
RET
%endmacro ; PLANE_DEINTERLEAVE
; Emits plane_copy_deinterleave_v210 for the currently selected instruction
; set (the macro is expanded once per INIT_XMM/INIT_YMM). Each loop iteration
; loads mmsize bytes of packed v210 data and stores the separated luma and
; interleaved chroma samples to the two destination planes.
%macro PLANE_DEINTERLEAVE_V210 0
;-----------------------------------------------------------------------------
; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
; uint16_t *dstc, intptr_t i_dstc,
; uint32_t *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
; Arguments map to r0..r7 in prototype order (x86inc convention):
; r0=dsty r1=i_dsty r2=dstc r3=i_dstc r4=src r5=i_src r6=w r7=h
%if ARCH_X86_64
cglobal plane_copy_deinterleave_v210, 8,10,7
; Two extra GPRs hold the row-start source pointer and the per-row counter.
%define src r8
%define org_w r9
%define h r7d
%else
cglobal plane_copy_deinterleave_v210, 7,7,7
; Only seven GPRs here: reuse the r4m/r6m/r7m argument slots for the saved
; row-start pointer, the saved counter and the row count.
; NOTE(review): rNm are presumably the stack homes of the arguments - confirm in x86inc.
%define src r4m
%define org_w r6m
%define h dword r7m
%endif
; NOTE(review): FIX_STRIDES is defined elsewhere; presumably scales pixel
; counts to byte counts for 16-bit pixels - confirm.
FIX_STRIDES r1, r3, r6d
; i_src is given in 32-bit words; convert to bytes.
shl r5, 2
; Point both destinations at the end of the row and index them with a
; negative offset that counts up towards zero.
add r0, r6
add r2, r6
neg r6
; Remember the row-start source pointer and the negative row width so they
; can be restored for each following row.
mov src, r4
mov org_w, r6
mova m2, [v210_mask]
mova m3, [v210_luma_shuf]
mova m4, [v210_chroma_shuf]
mova m5, [v210_mult] ; also functions as vpermd index for avx2
; Chroma multipliers are a dword permutation of the luma multipliers.
pshufd m6, m5, q1102
ALIGN 16
.loop:
movu m1, [r4]
; Split the packed words: m0 keeps the bits outside v210_mask (luma fields),
; m1 keeps the bits inside it (chroma fields).
pandn m0, m2, m1
pand m1, m2
; Gather the bytes of each field into consecutive 16-bit lanes.
pshufb m0, m3
pshufb m1, m4
; Per-lane multiply; effectively shifts each 10-bit field down to bit 0.
pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
%if mmsize == 32
; AVX2: compact the valid dwords of both 128-bit lanes to the front, using
; the indices embedded in the low bits of v210_mult.
vpermd m0, m5, m0
vpermd m1, m5, m1
%endif
; Full-register stores; only the first 3*mmsize/4 bytes are valid output, the
; rest is overwritten by the next iteration's store.
movu [r0+r6], m0
movu [r2+r6], m1
add r4, mmsize
add r6, 3*mmsize/4
; Keep going while the negative offset has not yet reached zero.
jl .loop
; Row done: step both destinations and the saved source pointer by one
; stride, then reload the source pointer and counter for the next row.
add r0, r1
add r2, r3
add src, r5
mov r4, src
mov r6, org_w
dec h
jg .loop
RET
%endmacro ; PLANE_DEINTERLEAVE_V210
%if HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
......@@ -1203,9 +1268,14 @@ PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM ssse3
PLANE_DEINTERLEAVE_V210
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
PLANE_DEINTERLEAVE_V210
%else
INIT_MMX mmx2
PLANE_INTERLEAVE
......
......@@ -116,6 +116,15 @@ void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint16_t *src, intptr_t i_src, int w, int h );
/* SIMD v210 unpackers (SSSE3 / AVX / AVX2 variants).
 * dsty/i_dsty: luma destination plane and its stride.
 * dstc/i_dstc: interleaved chroma destination plane and its stride.
 * src/i_src:   packed v210 source and its stride in 32-bit words.
 * w, h:        luma width and number of rows.
 * Parameter names match the C reference x264_plane_copy_deinterleave_v210_c:
 * the outputs are a luma plane and a chroma plane, not separate U and V. */
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dsty, intptr_t i_dsty,
                                              uint16_t *dstc, intptr_t i_dstc,
                                              uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx  ( uint16_t *dsty, intptr_t i_dsty,
                                              uint16_t *dstc, intptr_t i_dstc,
                                              uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dsty, intptr_t i_dsty,
                                              uint16_t *dstc, intptr_t i_dstc,
                                              uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
......@@ -627,6 +636,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
......@@ -639,6 +649,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
......@@ -649,7 +660,10 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
if( cpu&X264_CPU_AVX2 )
{
pf->mc_luma = mc_luma_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
......
......@@ -467,7 +467,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
return -1;
}
else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 )
else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 )
{
x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
return -1;
......
......@@ -94,9 +94,12 @@ static void help( int longhelp )
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
if( x264_cli_csps[i].name )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
}
}
printf( "\n"
" - depth: 8 or 16 bits per pixel [keep current]\n"
......@@ -243,8 +246,11 @@ static int handle_opts( const char **optlist, char **opts, video_info_t *info, r
if( strlen( str_csp ) == 0 )
csp = info->csp & X264_CSP_MASK;
else
for( csp = X264_CSP_CLI_MAX-1; x264_cli_csps[csp].name && strcasecmp( x264_cli_csps[csp].name, str_csp ); )
csp--;
for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- )
{
if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) )
break;
}
FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp );
h->dst_csp = csp;
if( depth == 16 )
......
......@@ -42,7 +42,8 @@ const x264_cli_csp_t x264_cli_csps[] = {
/* Returns nonzero when `csp` is not a colorspace the CLI can work with.
 *
 * A colorspace is rejected when its base value (flags masked off with
 * X264_CSP_MASK) falls outside the open range (X264_CSP_NONE,
 * X264_CSP_CLI_MAX), when it is X264_CSP_V210, or when the X264_CSP_OTHER
 * flag is set.
 *
 * The body previously had an earlier unconditional return without the V210
 * test, which made the V210-rejecting return unreachable; only the intended
 * check is kept. */
int x264_cli_csp_is_invalid( int csp )
{
    int csp_mask = csp & X264_CSP_MASK;
    return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX ||
           csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER;
}
int x264_cli_csp_depth_factor( int csp )
......
......@@ -55,8 +55,11 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" )
if( opt->colorspace )
{
for( info->csp = X264_CSP_CLI_MAX-1; x264_cli_csps[info->csp].name && strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ); )
info->csp--;
for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- )
{
if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) )
break;
}
FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace );
}
else /* default */
......
......@@ -1453,6 +1453,33 @@ static int check_mc( int cpu_ref, int cpu_new )
}
report( "plane_copy :" );
if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
{
set_func_name( "plane_copy_deinterleave_v210" );
used_asm = 1;
for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
intptr_t dst_stride = ALIGN( w, 16 );
intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
intptr_t offv = dst_stride*h + 32;
memset( pbuf3, 0, 0x1000 );
memset( pbuf4, 0, 0x1000 );
call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
for( int y = 0; y < h; y++ )
if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(uint16_t) ) ||
memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
{
ok = 0;
fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
break;
}
}
}
report( "v210 :" );
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
pixel *srchpel = pbuf1+8+2*64;
......
......@@ -420,9 +420,12 @@ static void print_csp_names( int longhelp )
printf( INDENT );
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
if( x264_cli_csps[i].name )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
}
}
#if HAVE_LAVF
printf( "\n" );
......@@ -1282,7 +1285,7 @@ static int init_vid_filters( char *sequence, hnd_t *handle, video_info_t *info,
int csp = info->csp & X264_CSP_MASK;
if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
param->i_csp = X264_CSP_I420;
else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) )
param->i_csp = X264_CSP_I422;
else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
param->i_csp = X264_CSP_I444;
......
......@@ -41,7 +41,7 @@
#include "x264_config.h"
#define X264_BUILD 140
#define X264_BUILD 141
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
......@@ -215,12 +215,13 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */
#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */
#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */
#define X264_CSP_I444 0x0007 /* yuv 4:4:4 planar */
#define X264_CSP_YV24 0x0008 /* yvu 4:4:4 planar */
#define X264_CSP_BGR 0x0009 /* packed bgr 24bits */
#define X264_CSP_BGRA 0x000a /* packed bgr 32bits */
#define X264_CSP_RGB 0x000b /* packed rgb 24bits */
#define X264_CSP_MAX 0x000c /* end of list */
#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2 packed in 32 */
#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */
#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */
#define X264_CSP_BGR 0x000a /* packed bgr 24bits */
#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */
#define X264_CSP_RGB 0x000c /* packed rgb 24bits */
#define X264_CSP_MAX 0x000d /* end of list */
#define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment