Commit 6589ad6d authored by Fiona Glaser

x86 assembly code for NAL escaping

Up to ~10x faster than C depending on CPU.
Helps the most at very high bitrates (e.g. lossless).
Also make the C code faster and simpler.
parent 9056470d
......@@ -8,7 +8,7 @@ SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/frame.c common/dct.c common/cpu.c common/cabac.c \
common/common.c common/mdate.c common/rectangle.c \
common/set.c common/quant.c common/deblock.c common/vlc.c \
common/mvpred.c \
common/mvpred.c common/bitstream.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/lookahead.c
......@@ -52,7 +52,7 @@ endif
ifneq ($(AS),)
X86SRC0 = const-a.asm cabac-a.asm dct-a.asm deblock-a.asm mc-a.asm \
mc-a2.asm pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
cpu-a.asm dct-32.asm
cpu-a.asm dct-32.asm bitstream-a.asm
X86SRC = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
......
/*****************************************************************************
* bitstream.c: h264 encoder library
*****************************************************************************
* Copyright (C) 2010 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*****************************************************************************/
#include "common.h"
/* Portable C version of the NAL escaper.
 * Copies the bytes in [src, end) to dst.  The first two bytes are copied
 * untouched; from then on a 0x03 byte is written ahead of any source byte
 * that is <= 0x03 whenever the two bytes most recently written to dst are
 * both zero.
 * Returns a pointer one past the last byte written to dst. */
static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
{
    /* No escape check is done for the first two bytes. */
    for( int n = 0; n < 2 && src < end; n++ )
        *dst++ = *src++;

    for( ; src < end; src++ )
    {
        /* dst[-2] and dst[-1] are valid here: two bytes were written above. */
        if( *src <= 0x03 && !(dst[-2] | dst[-1]) )
            *dst++ = 0x03;
        *dst++ = *src;
    }
    return dst;
}
/* Assembly versions of nal_escape.  They share the signature of
 * x264_nal_escape_c and are picked by x264_bitstream_init() according to
 * the CPU flags it is given. */
#ifdef HAVE_MMX
uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif
/****************************************************************************
 * x264_nal_encode:
 *   Writes one NAL unit to dst: a start code (Annex B) or a 4-byte
 *   big-endian size field, then the one-byte NAL header, then the payload
 *   passed through h->bsf.nal_escape.
 *   Returns the number of bytes written to dst.
 ****************************************************************************/
int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode )
{
    uint8_t *const out_start = dst;
    uint8_t *payload = nal->p_payload;
    uint8_t *payload_end = payload + nal->i_payload;
    const int b_annexb = h->param.b_annexb;

    if( !b_annexb )
    {
        /* Leave a 4-byte hole; the size is only known once escaping is done. */
        dst += 4;
    }
    else
    {
        /* 00 00 01, preceded by one more 00 for the long form. */
        if( b_long_startcode )
            *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x01;
    }

    /* nal header */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

    dst = h->bsf.nal_escape( dst, payload, payload_end );

    int size = (dst - out_start) - 4;

    /* Write the size header for mp4/etc */
    if( !b_annexb )
    {
        /* Size doesn't include the size of the header we're writing now.
         * Stored most significant byte first. */
        for( int i = 0; i < 4; i++ )
            out_start[i] = size >> (24 - 8*i);
    }

    return size+4;
}
/* Fill *pf with the nal_escape implementation to use for the given CPU
 * flags.  The C version is the default; when HAVE_MMX is defined the MMXEXT
 * version replaces it if that flag is set, and the SSE2 version replaces
 * either when both X264_CPU_SSE2 and X264_CPU_SSE2_IS_FAST are set. */
void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
{
    uint8_t *(*escape)( uint8_t *, uint8_t *, uint8_t * ) = x264_nal_escape_c;

#ifdef HAVE_MMX
    if( cpu & X264_CPU_MMXEXT )
        escape = x264_nal_escape_mmxext;
    if( cpu & X264_CPU_SSE2 )
    {
        if( cpu & X264_CPU_SSE2_IS_FAST )
            escape = x264_nal_escape_sse2;
    }
#endif

    pf->nal_escape = escape;
}
/*****************************************************************************
* bs.h :
* bitstream.h: h264 encoder library
*****************************************************************************
* Copyright (C) 2003-2008 x264 project
*
......@@ -63,6 +63,14 @@ extern const vlc_t x264_total_zeros[15][16];
extern const vlc_t x264_total_zeros_dc[3][4];
extern const vlc_t x264_run_before[7][16];
/* Table of bitstream function pointers; x264_bitstream_init() fills it in. */
typedef struct
{
/* Copies [src, end) to dst, inserting escape bytes as it goes, and returns
 * a pointer one past the last byte written to dst. */
uint8_t *(*nal_escape) ( uint8_t *dst, uint8_t *src, uint8_t *end );
} x264_bitstream_function_t;
/* Writes nal to dst (start code or 4-byte size field, NAL header, escaped
 * payload) and returns the number of bytes written.  h provides the
 * b_annexb setting and the nal_escape routine. */
int x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal, int b_long_startcode );
/* Selects the nal_escape implementation for the given CPU flags and stores
 * it in *pf. */
void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
/* A larger level table size theoretically could help a bit at extremely
* high bitrates, but the cost in cache is usually too high for it to be
* useful.
......
......@@ -1026,60 +1026,6 @@ void x264_picture_clean( x264_picture_t *pic )
memset( pic, 0, sizeof( x264_picture_t ) );
}
/****************************************************************************
 * x264_nal_encode:
 *   Writes one NAL unit to dst: a start code (Annex B) or a 4-byte
 *   big-endian size field, then the one-byte NAL header, then the payload
 *   with a 0x03 byte inserted after every two consecutive zero payload
 *   bytes that are followed by a byte <= 0x03.
 *   Returns the number of bytes written to dst.
 ****************************************************************************/
int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode )
{
    uint8_t *const out_start = dst;
    uint8_t *const payload_end = nal->p_payload + nal->i_payload;
    int zero_run = 0; /* zero bytes copied in a row since the last reset */

    if( !b_annexb )
    {
        /* save room for size later */
        dst += 4;
    }
    else
    {
        if( b_long_startcode )
            *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x00;
        *dst++ = 0x01;
    }

    /* nal header */
    *dst++ = ( 0x00 << 7 ) | ( nal->i_ref_idc << 5 ) | nal->i_type;

    for( uint8_t *p = nal->p_payload; p < payload_end; p++ )
    {
        if( zero_run == 2 && *p <= 0x03 )
        {
            *dst++ = 0x03;
            zero_run = 0;
        }
        zero_run = *p ? 0 : zero_run + 1;
        *dst++ = *p;
    }

    int size = (dst - out_start) - 4;

    /* Write the size header for mp4/etc */
    if( !b_annexb )
    {
        /* Size doesn't include the size of the header we're writing now.
         * Stored most significant byte first. */
        for( int i = 0; i < 4; i++ )
            out_start[i] = size >> (24 - 8*i);
    }

    return size+4;
}
/****************************************************************************
* x264_malloc:
****************************************************************************/
......
......@@ -137,7 +137,7 @@ static const int x264_scan8[16+2*4+3] =
*/
#include "x264.h"
#include "bs.h"
#include "bitstream.h"
#include "set.h"
#include "predict.h"
#include "pixel.h"
......@@ -166,8 +166,6 @@ int64_t x264_mdate( void );
* the encoding options */
char *x264_param2string( x264_param_t *p, int b_res );
int x264_nal_encode( uint8_t *dst, x264_nal_t *nal, int b_annexb, int b_long_startcode );
/* log */
void x264_log( x264_t *h, int i_level, const char *psz_fmt, ... );
......@@ -796,6 +794,7 @@ struct x264_t
x264_zigzag_function_t zigzagf;
x264_quant_function_t quantf;
x264_deblock_function_t loopf;
x264_bitstream_function_t bsf;
#ifdef HAVE_VISUALIZE
struct visualize_t *visualize;
......
;*****************************************************************************
;* bitstream-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2010 x264 project
;*
;* Authors: Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
;-----------------------------------------------------------------------------
; NAL_LOOP label, store_insn
; Vector copy loop used by NAL_ESCAPE.
;   %1 = label placed at the top of the loop
;   %2 = instruction used for the store to dst (NAL_ESCAPE passes mova or movu)
; Each pass loads mmsize bytes from [r1+r2], stores them unchanged to
; [r0+r1], and ORs every byte with the byte before it in the register.
; A zero result byte means that byte and its predecessor are both zero
; (for the lowest byte, the shifted-in zero stands in for the predecessor),
; in which case control goes to .escape without advancing r1, so the scalar
; code reprocesses the same bytes.  m2 is compared against as the all-zero
; register; NAL_ESCAPE clears it with pxor.
; NOTE(review): the load and store always cover a full mmsize bytes, so the
; last pass can touch memory beyond the end of src/dst -- callers presumably
; leave slack after their buffers; confirm.
%macro NAL_LOOP 2
ALIGN 16
%1:
mova m0, [r1+r2]
mova m1, m0
; shift left by one byte so each byte lines up with the one before it
%if mmsize == 8
psllq m0, 8
%else
pslldq m0, 1
%endif
; m1 still holds the unshifted source bytes
%2 [r0+r1], m1
por m1, m0
pcmpeqb m1, m2
pmovmskb r3d, m1
; any bit set => possible run of zeros, fall back to the scalar path
test r3d, r3d
jnz .escape
; r1 is a negative offset from the end of src; keep going while it is < 0
add r1, mmsize
jl %1
%endmacro
; NAL_ESCAPE suffix
; Defines nal_escape_<suffix>( dst, src, end ) with r0 = dst, r1 = src,
; r2 = end on entry (see the prototype comment above).
; Bytes are copied by the NAL_LOOP vector loops until a possible run of zero
; bytes is seen; the scalar .escape_loop then copies bytes one at a time,
; writing a 3 ahead of any byte <= 3 whose two preceding output bytes are
; zero, before going back to the vector loops.  The return value is r0, the
; end of the written output.
; NOTE(review): the first source byte is read and stored before any
; comparison against end, so an empty [src, end) range still touches one
; byte of src and dst -- confirm callers allow this.
%macro NAL_ESCAPE 1
cglobal nal_escape_%1, 3,5
; m2 = 0, used by NAL_LOOP for the zero-byte comparison
pxor m2, m2
sub r1, r2 ; r1 = offset of current src pointer from end of src
sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
; copy the first byte unconditionally
mov r3b, [r1+r2]
mov [r0+r1], r3b
inc r1
jge .ret
; Start off by jumping into the escape loop in
; case there's an escape at the start.
; And do a few more in scalar until src is aligned again.
; r4d = mmsize - (src address & (mmsize-1)): scalar bytes left before the
; source address is a multiple of mmsize
lea r4d, [r1+r2]
or r4d, -mmsize
neg r4d
jmp .first_escape
NAL_LOOP .loop_aligned, mova
; NOTE(review): when mmsize==16 nothing jumps over the second loop, so
; .loop_aligned falls through into one pass of .loop_unaligned before
; reaching .ret -- confirm this extra pass is intended.
%if mmsize==16
NAL_LOOP .loop_unaligned, movu
%endif
.ret:
; r0 only ever moves when an escape byte is inserted, so it is the end of dst
movifnidn rax, r0
RET
ALIGN 16
.escape:
; entered from the vector loops: handle the next mmsize bytes in scalar code
mov r4d, mmsize
.first_escape:
; this byte is stored by .escape_loop without an escape check
mov r3b, [r1+r2]
.escape_loop:
mov [r0+r1], r3b
inc r1
jge .ret
; fetch the next byte; only values <= 3 can require an escape
mov r3b, [r1+r2]
cmp r3b, 3
jna .escape_check
.no_escape:
dec r4d
jg .escape_loop
; scalar quota used up: pick the vector loop matching the alignment of the
; current dst address (inserted escape bytes shift dst relative to src)
%if mmsize==16
lea r4d, [r0+r1]
test r4d, mmsize-1
jnz .loop_unaligned
%endif
jmp .loop_aligned
.escape_check:
; escape only if the two bytes already written before this position are zero
cmp word [r0+r1-2], 0
jnz .no_escape
; write the escape byte and push the rest of the output along by one
mov byte [r0+r1], 3
inc r0
jmp .no_escape
%endmacro
; Expand NAL_ESCAPE once after INIT_MMX (nal_escape_mmxext) and once after
; INIT_XMM (nal_escape_sse2); mmsize inside the macro differs between the two.
INIT_MMX
NAL_ESCAPE mmxext
INIT_XMM
NAL_ESCAPE sse2
......@@ -4,6 +4,7 @@
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
......
......@@ -987,6 +987,7 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_mc_init( h->param.cpu, &h->mc );
x264_quant_init( h, h->param.cpu, &h->quantf );
x264_deblock_init( h->param.cpu, &h->loopf );
x264_bitstream_init( h->param.cpu, &h->bsf );
x264_dct_init_weights();
mbcmp_init( h );
......@@ -1273,7 +1274,7 @@ static int x264_encoder_encapsulate_nals( x264_t *h, int start )
for( int i = start; i < h->out.i_nal; i++ )
{
int long_startcode = !i || h->out.nal[i].i_type == NAL_SPS || h->out.nal[i].i_type == NAL_PPS;
int size = x264_nal_encode( nal_buffer, &h->out.nal[i], h->param.b_annexb, long_startcode );
int size = x264_nal_encode( h, nal_buffer, &h->out.nal[i], long_startcode );
h->out.nal[i].i_payload = size;
h->out.nal[i].p_payload = nal_buffer;
nal_buffer += size;
......
......@@ -1661,6 +1661,56 @@ static int check_cabac( int cpu_ref, int cpu_new )
return ret;
}
/* Check the nal_escape implementation selected for cpu_new against the C
 * version, then benchmark both.  Nothing is run when cpu_new selects the same
 * function as cpu_ref.
 * Returns ret as left by the report() macro. */
static int check_bitstream( int cpu_ref, int cpu_new )
{
    x264_bitstream_function_t bs_c;
    x264_bitstream_function_t bs_ref;
    x264_bitstream_function_t bs_a;

    int ret = 0, ok = 1, used_asm = 0;

    x264_bitstream_init( 0, &bs_c );
    x264_bitstream_init( cpu_ref, &bs_ref );
    x264_bitstream_init( cpu_new, &bs_a );
    if( bs_a.nal_escape != bs_ref.nal_escape )
    {
        int size = 0x4000;
        /* Extra input bytes initialised past the tested length: the vector
         * loops load whole registers and can read beyond `end`.  The largest
         * test_size (0x3fff) plus this pad still fits in size+100. */
        int pad = 32;
        uint8_t *input = malloc(size+100);
        uint8_t *output1 = malloc(size*2);
        uint8_t *output2 = malloc(size*2);
        used_asm = 1;
        set_func_name( "nal_escape" );
        if( !input || !output1 || !output2 )
        {
            /* Report the allocation failure as a failed check. */
            fprintf( stderr, "nal_escape : [FAILED] malloc failed\n" );
            ok = 0;
        }
        else
        {
            for( int i = 0; i < 100; i++ )
            {
                /* Test corner-case sizes */
                int test_size = i < 10 ? i+1 : rand() & 0x3fff;
                /* Test 8 different probability distributions of zeros */
                unsigned mask = (1u << ((i&7)+1)) - 1;
                for( int j = 0; j < test_size+pad; j++ )
                {
                    /* Multiply as unsigned: the int product could overflow.
                     * The low byte that gets stored is the same. */
                    input[j] = (uint8_t)( ((unsigned)rand() & mask) * (unsigned)rand() );
                }
                uint8_t *end_c = (uint8_t*)call_c1( bs_c.nal_escape, output1, input, input+test_size );
                uint8_t *end_a = (uint8_t*)call_a1( bs_a.nal_escape, output2, input, input+test_size );
                int size_c = end_c-output1;
                int size_a = end_a-output2;
                if( size_c != size_a || memcmp( output1, output2, size_c ) )
                {
                    fprintf( stderr, "nal_escape : [FAILED] %d %d\n", size_c, size_a );
                    ok = 0;
                    break;
                }
            }
            /* Benchmark pass: fill the whole buffer plus the pad. */
            for( int j = 0; j < size+pad; j++ )
                input[j] = rand();
            call_c2( bs_c.nal_escape, output1, input, input+size );
            call_a2( bs_a.nal_escape, output2, input, input+size );
        }
        free(input);
        free(output1);
        free(output2);
    }
    report( "nal escape:" );

    return ret;
}
static int check_all_funcs( int cpu_ref, int cpu_new )
{
return check_pixel( cpu_ref, cpu_new )
......@@ -1669,7 +1719,8 @@ static int check_all_funcs( int cpu_ref, int cpu_new )
+ check_intra( cpu_ref, cpu_new )
+ check_deblock( cpu_ref, cpu_new )
+ check_quant( cpu_ref, cpu_new )
+ check_cabac( cpu_ref, cpu_new );
+ check_cabac( cpu_ref, cpu_new )
+ check_bitstream( cpu_ref, cpu_new );
}
static int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment