Commit a3f5c732 authored by Fiona Glaser
Browse files

x86-64: cabac_block_residual assembly

RDO: ~20% faster than C
Bitstream: ~50% faster than C
1-2% faster overall, highest on preset superfast/fast/medium.
parent f49a1b2e
......@@ -39,11 +39,19 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
return dst;
}
/* Assembly implementations of nal_escape, one per instruction-set level. */
#if HAVE_MMX
uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
#endif
/* Assembly CABAC residual coders. All variants share one signature:
 *   l             - coefficient array of the block
 *   b_interlaced  - selects the interlaced context-offset tables
 *   ctx_block_cat - block category (intptr_t so it can be used directly as a table index)
 *   cb            - CABAC encoder state
 * The "_rd_" variants only accumulate a bit-cost estimate into cb; the 4x4 and
 * 8x8 cases are separate functions. */
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
/* Real bitstream writers (they call the CABAC decision encoder instead of
 * estimating size); one function handles both the 4x4 and the 8x8 case. */
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
/****************************************************************************
* x264_nal_encode:
......@@ -88,12 +96,43 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal )
void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
{
memset( pf, 0, sizeof(*pf) );
pf->nal_escape = x264_nal_escape_c;
#if HAVE_MMX
#if ARCH_X86_64
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
#endif
if( cpu&X264_CPU_MMX2 )
pf->nal_escape = x264_nal_escape_mmx2;
if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
pf->nal_escape = x264_nal_escape_sse2;
if( cpu&X264_CPU_SSE2 )
{
#if ARCH_X86_64
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
}
#endif
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
#if ARCH_X86_64
if( cpu&X264_CPU_SSSE3 )
{
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3;
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt;
}
}
#endif
if( cpu&X264_CPU_AVX )
pf->nal_escape = x264_nal_escape_avx;
#endif
......
......@@ -69,6 +69,12 @@ extern const vlc_t x264_total_zeros_2x4_dc[7][8];
/* Dispatch table for the bitstream-writing primitives.
 * x264_bitstream_init() zeroes the table and then fills in the best
 * implementation for the given CPU flags, so an entry may be NULL when no
 * implementation was selected for it. */
typedef struct
{
    /* Copies NAL payload bytes from [src, end) to dst with escaping applied;
     * returns the write position in dst after the last byte written. */
    uint8_t *(*nal_escape)( uint8_t *dst, uint8_t *src, uint8_t *end );

    /* Writes one residual block to the CABAC bitstream. */
    void (*cabac_block_residual_internal)( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );

    /* RD (size-estimation) counterpart for non-8x8 blocks. */
    void (*cabac_block_residual_rd_internal)( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );

    /* RD (size-estimation) counterpart for 8x8 blocks. */
    void (*cabac_block_residual_8x8_rd_internal)( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
} x264_bitstream_function_t;

/* Populates *pf according to the X264_CPU_* bits set in cpu. */
void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf );
......
......@@ -209,6 +209,7 @@ static const uint8_t x264_scan8[16*3 + 3] =
};
#include "x264.h"
#include "cabac.h"
#include "bitstream.h"
#include "set.h"
#include "predict.h"
......@@ -216,7 +217,6 @@ static const uint8_t x264_scan8[16*3 + 3] =
#include "mc.h"
#include "frame.h"
#include "dct.h"
#include "cabac.h"
#include "quant.h"
#include "cpu.h"
#include "threadpool.h"
......
......@@ -26,12 +26,57 @@
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
; Lookup tables for coefficient-level coding, all indexed by node_ctx (0-7).
; coeff_abs_level1_ctx[node_ctx]: context offset (relative to ctx_level) of the
; first "coeff_abs > 1" decision.
coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0
; coeff_abs_levelgt1_ctx[node_ctx]: context offset (relative to ctx_level) of the
; remaining level bins once coeff_abs > 1.
coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9
; coeff_abs_level_transition[gt1][node_ctx]: next node_ctx. Two rows of 8 bytes:
; row 0 is used after a coefficient with coeff_abs <= 1, row 1 (offset +8) after
; a coefficient with coeff_abs > 1.
coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
db 4, 4, 4, 4, 5, 6, 7, 7
; Tables of coeff_last function pointers, indexed by ctx_block_cat.
; Only built on x86-64, where the residual coders that use them exist.
%if ARCH_X86_64
; COEFF_LAST_TABLE cpu4, cpuN, size x14
; %1      = cpu suffix of the function used for the 4-coefficient entry
; %2      = cpu suffix of the function used for every other entry
; %3..%16 = coefficient count (4/15/16/64) of each of the 14 table entries
; Emits 14 qword pointers to x264_coeff_last<size>_<cpu>.
%macro COEFF_LAST_TABLE 16
; capture the two suffixes by name now: %1/%2 change meaning once %rotate runs
%define funccpu1 %1
%define funccpu2 %2
%rep 14
%ifidn %3, 4
dq mangle(x264_coeff_last%3_ %+ funccpu1)
%else
dq mangle(x264_coeff_last%3_ %+ funccpu2)
%endif
; shift the parameter list left so that %3 names the next entry's size
%rotate 1
%endrep
%endmacro
cextern coeff_last4_mmx2
cextern coeff_last4_mmx2_lzcnt
cextern coeff_last15_sse2
cextern coeff_last15_sse2_lzcnt
cextern coeff_last16_sse2
cextern coeff_last16_sse2_lzcnt
cextern coeff_last64_sse2
cextern coeff_last64_sse2_lzcnt
; one table without and one with lzcnt-based coeff_last functions
coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%endif
SECTION .text
cextern cabac_range_lps
cextern cabac_transition
cextern cabac_renorm_shift
cextern cabac_entropy
cextern cabac_size_unary
cextern cabac_transition_unary
cextern significant_coeff_flag_offset
cextern significant_coeff_flag_offset_8x8
cextern last_coeff_flag_offset
cextern last_coeff_flag_offset_8x8
cextern coeff_abs_level_m1_offset
cextern count_cat_m1
cextern cabac_encode_ue_bypass
; t3 must be ecx, since it's used for shift.
%if WIN64
......@@ -185,3 +230,485 @@ cabac_putbyte:
inc t5d
mov [t0+cb.bytes_outstanding], t5d
jmp mangle(x264_cabac_encode_decision_asm.update_queue_low)
; COEFF_ABS_LEVEL_GT1: add the estimated cost of coding one coefficient's
; absolute level to the running bit count and update the CABAC context states
; (size-estimation path; nothing is written to the bitstream).
; %1 = label name (prefix that keeps the local labels unique per expansion)
; %2 = node_ctx init? (1: node_ctx is taken to be 0, so the constant contexts
;      1 and 5 are used and r2 is only written; 0: node_ctx is read from r2)
; In:  r0d = running bit count (f8_bits_encoded), r1d = coeff_abs,
;      r2  = node_ctx (read only when %2 == 0), r8 = base of the ctx_level states
; Out: r0d updated, r2d = next node_ctx, state bytes at [r8+ctx] updated
; Clobbers: r1d, r9, r10, r11 (r11 only when %2 == 0), flags
%macro COEFF_ABS_LEVEL_GT1 2
%if %2
; node_ctx == 0: coeff_abs_level1_ctx[0] == 1
%define ctx 1
%else
movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
%define ctx r11
%endif
; r9 = current state of the "coeff_abs > 1" context
movzx r9d, byte [r8+ctx]
; if( coeff_abs > 1 )
cmp r1d, 1
jg .%1_gt1
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
; cabac_transition is indexed [state*2 + bin], cabac_entropy by [state ^ bin]
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
; the extra 256 is one whole bit in the 8-bit fixed-point bit count
; NOTE(review): presumably the bypass-coded sign bit - confirm against the C version
lea r0d, [r0+r9+256]
mov [r8+ctx], r10b
%if %2
; coeff_abs_level_transition[0][0]
mov r2d, 1
%else
movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
%endif
jmp .%1_end
.%1_gt1:
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
xor r9d, 1
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r8+ctx], r10b
add r0d, r9d
; switch ctx to the context used for the remaining level bins
%if %2
; node_ctx == 0: coeff_abs_levelgt1_ctx[0] == 5
%define ctx 5
%else
movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
%define ctx r11
%endif
; if( coeff_abs < 15 )
cmp r1d, 15
jge .%1_escape
; r1d = coeff_abs*128: row offset into the unary tables; the -128 (bytes) and
; -256 (words) displacements below turn that into row coeff_abs-1
shl r1d, 7
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
movzx r9d, byte [r8+ctx]
add r9d, r1d
movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
mov [r8+ctx], r10b
add r0d, r9d
jmp .%1_gt1_end
.%1_escape:
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
movzx r9d, byte [r8+ctx]
movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
add r0d, r9d
mov [r8+ctx], r10b
; r1d = coeff_abs-14 (>= 1 here); r9d = ilog2(r1d)
sub r1d, 14
%if cpuflag(lzcnt)
; 31 - lzcnt(x) gives the same bit index as bsr(x) for nonzero x
lzcnt r9d, r1d
xor r9d, 0x1f
%else
bsr r9d, r1d
%endif
; bs_size_ue_big(coeff_abs-15)<<8
shl r9d, 9
; (ilog2(coeff_abs-14)+1) << 8
lea r0d, [r0+r9+256]
.%1_gt1_end:
%if %2
; coeff_abs_level_transition[1][0]
mov r2d, 4
%else
movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
%endif
.%1_end:
%endmacro
; LOAD_DCTCOEF dst
; Fetch coefficient number r6 from the array addressed by `dct` into the
; 32-bit register %1. The element width follows the bit-depth configuration.
%macro LOAD_DCTCOEF 1
%if HIGH_BIT_DEPTH == 0
    movzx   %1, word [dct+r6*2]     ; 16-bit coefficients, zero-extended
%else
    mov     %1, [dct+r6*4]          ; 32-bit coefficients
%endif
%endmacro
; ABS_DCTCOEFS src, count
; Store the absolute values of `count` coefficients read from [%1] into the
; buffer at [rsp], 16 coefficients per unrolled iteration.
; %1 = register holding the source pointer
; %2 = number of coefficients (the loop runs %2/16 times)
; Uses m0-m5 in high bit depth and m0-m3 otherwise; the last operand of
; ABSD/ABSW is passed as a temporary.
; NOTE(review): mova implies [rsp] is vector-aligned here - confirm against the
; callers' stack padding.
%macro ABS_DCTCOEFS 2
%assign i 0
%rep %2/16
%if HIGH_BIT_DEPTH
; 4-byte coefficients: 16 of them span 64 bytes
ABSD m0, [%1+ 0+i*64], m4
ABSD m1, [%1+16+i*64], m5
ABSD m2, [%1+32+i*64], m4
ABSD m3, [%1+48+i*64], m5
mova [rsp+ 0+i*64], m0
mova [rsp+16+i*64], m1
mova [rsp+32+i*64], m2
mova [rsp+48+i*64], m3
%else
; 2-byte coefficients: 16 of them span 32 bytes
ABSW m0, [%1+ 0+i*32], m2
ABSW m1, [%1+16+i*32], m3
mova [rsp+ 0+i*32], m0
mova [rsp+16+i*32], m1
%endif
%assign i i+1
%endrep
%endmacro
; SIG_OFFSET is_8x8
; In 8x8 mode, load into r11d the significance-context offset of coefficient
; r6 from the byte table addressed by r4. Expands to nothing otherwise.
%macro SIG_OFFSET 1
%if %1 != 0
    movzx   r11d, byte [r4+r6]
%endif
%endmacro
; LAST_OFFSET is_8x8
; In 8x8 mode, load into r11d the last-coefficient context offset of
; coefficient r6 from last_coeff_flag_offset_8x8. Expands to nothing otherwise.
%macro LAST_OFFSET 1
%if %1 != 0
    movzx   r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
%endif
%endmacro
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
; intptr_t ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------
; Size-estimation (RD) residual coder: walks the block from the last nonzero
; coefficient down to index 0, updates the CABAC context states and adds the
; estimated cost to cb.bits_encoded. Nothing is written to the bitstream.
;%1 = 8x8 mode
;%2 = table of coeff_last function pointers, indexed by ctx_block_cat
; Register roles after setup:
;   r0d = running bit count        r5 = &state[ctx_sig]
;   r1d = current coeff_abs        r7 = &state[ctx_last]
;   r2  = node_ctx                 r8 = &state[ctx_level]
;   r3  = &cb.state[0]             r6 = coefficient index (starts at last)
;   r4  = dct pointer (4x4) or sig offset 8x8 table (8x8)
;   r9-r11 = scratch
%macro CABAC_RESIDUAL_RD 2
%if %1
%define func cabac_block_residual_8x8_rd_internal
%define maxcoeffs 64
; 8x8: the abs()'d copy on the stack is indexed directly
%define dct rsp
%else
%define func cabac_block_residual_rd_internal
%define maxcoeffs 16
%define dct r4
%endif
%ifdef PIC
; PIC: keep the section base in r12 and address every table relative to it
cglobal func, 4,13
lea r12, [$$]
%define GLOBAL +r12-$$
%else
cglobal func, 4,12
%define GLOBAL
%endif
; stack space for the abs()'d copy of the coefficients
%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
SUB rsp, pad
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r4 = sig offset 8x8
%endif
; r1 = b_interlaced*16 + ctx_block_cat
add r1d, r2d
movzx r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig
movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last
movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level
; abs() all the coefficients; copy them to the stack to avoid
; changing the originals.
; overreading is okay; it's all valid aligned data anyways.
%if %1
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
; r6 = l with the SIZEOF_DCTCOEF address bit cleared, i.e. the block start
; when l points one coefficient past it
mov r6, ~SIZEOF_DCTCOEF
and r6, r4 ; handle AC coefficient case
ABS_DCTCOEFS r6, 16
sub r4, r6 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif
mov r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
call r1 ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
; from here on r6 = last (index of the last nonzero coefficient)
mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded
; pre-add some values to simplify addressing
add r3, cb.state
add r5, r3
add r7, r3
add r8, r3 ; precalculate cabac state pointers
; if( last != count_cat_m1[ctx_block_cat] )
%if %1
cmp r6b, 63
%else
cmp r6b, [count_cat_m1+r2 GLOBAL]
%endif
je .skip_last_sigmap
; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
; so we'll use r11 for this.
%if %1
%define siglast_ctx r11
%else
%define siglast_ctx r6
%endif
; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
SIG_OFFSET %1
movzx r1d, byte [r5+siglast_ctx]
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
xor r1d, 1
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
mov [r5+siglast_ctx], r9b
add r0d, r1d
LAST_OFFSET %1
movzx r1d, byte [r7+siglast_ctx]
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
xor r1d, 1
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
mov [r7+siglast_ctx], r9b
add r0d, r1d
.skip_last_sigmap:
; level of the last coefficient; node_ctx starts at 0, so the "init" form of
; the macro is used and r2 (ctx_block_cat until now) becomes node_ctx
LOAD_DCTCOEF r1d
COEFF_ABS_LEVEL_GT1 last, 1
; for( int i = last-1 ; i >= 0; i-- )
dec r6d
jl .end
.coeff_loop:
LOAD_DCTCOEF r1d
; if( l[i] )
SIG_OFFSET %1
movzx r9d, byte [r5+siglast_ctx]
test r1d, r1d
jnz .coeff_nonzero
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r5+siglast_ctx], r10b
add r0d, r9d
dec r6d
jge .coeff_loop
jmp .end
.coeff_nonzero:
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
xor r9d, 1
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r5+siglast_ctx], r10b
add r0d, r9d
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
LAST_OFFSET %1
movzx r9d, byte [r7+siglast_ctx]
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r7+siglast_ctx], r10b
add r0d, r9d
COEFF_ABS_LEVEL_GT1 coeff, 0
dec r6d
jge .coeff_loop
.end:
; r3 was advanced by cb.state above, hence the compensated offset
mov [r3+cb.bits_encoded-cb.state], r0d
ADD rsp, pad
RET
%endmacro
; Instantiate the RD residual coders: {4x4, 8x8} x {sse2, ssse3} x {no lzcnt, lzcnt}.
; The ssse3 builds reuse the sse2 coeff_last tables; only the code expanded from
; the macro body changes with the active instruction set.
; NOTE(review): presumably the ssse3 gain comes from ABSW/ABSD - confirm in x86util.asm
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
%endif
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
; intptr_t ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------
; CALL_CABAC
; Encode one CABAC decision through cabac_encode_decision_asm.
; On WIN64 the cabac pointer is copied from r3 back into r0 after the call so
; that r0 is valid for the next call.
; NOTE(review): presumably the callee clobbers r0 there because r0 is rcx on
; WIN64 and is used as its shift register - confirm.
%macro CALL_CABAC 0
    call    cabac_encode_decision_asm
%if WIN64 != 0
    mov     r0, r3                  ; move cabac back
%endif
%endmacro
; SIGMAP_LOOP: code the significance map of one block (bitstream path) and
; collect the nonzero coefficients into the coeffs[] array for the level pass.
; %1 = 8x8 mode
; %2 = dct register (scratch register that receives each coefficient l[i])
; %3 = countcat (count_m1: index of the final coefficient position)
; %4 = name (optional label infix, so the macro can be expanded twice in one function)
; Relies on the caller's defines: dct, coeffs, coeffidxd/q, sigoffd, lastoffd,
; sigoff_8x8, lastm and GLOBAL. r10 is the coefficient index i; r1d and r2d
; carry the context index and the bin value for each CALL_CABAC.
%macro SIGMAP_LOOP 3-4
.sigmap_%4loop:
; %2 = l[i]
%if HIGH_BIT_DEPTH
mov %2, [dct+r10*4]
%else
movsx %2, word [dct+r10*2]
%endif
; r1d = ctx_sig + sig_off (table lookup in 8x8 mode, i itself otherwise)
%if %1
movzx r1d, byte [sigoff_8x8 + r10]
add r1d, sigoffd
%else
lea r1d, [sigoffd + r10d]
%endif
test %2, %2
jz .sigmap_%4zero ; if( l[i] )
inc coeffidxd
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i];
mov r2d, 1
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
; r1d = ctx_last + last_off
%if %1
movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
add r1d, lastoffd
%else
lea r1d, [lastoffd + r10d]
%endif
cmp r10d, lastm ; if( i == last )
je .sigmap_%4last
xor r2d, r2d
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
jmp .sigmap_%4loop_endcheck
.sigmap_%4zero:
xor r2d, r2d
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
.sigmap_%4loop_endcheck:
inc r10d
cmp r10d, %3
jne .sigmap_%4loop ; if( ++i == count_m1 )
; reached the final position without meeting `last`: store that coefficient
; without coding any sig/last decision for it
%if HIGH_BIT_DEPTH
mov %2, [dct+r10*4]
%else
movsx %2, word [dct+r10*2]
%endif
inc coeffidxd
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]
jmp .sigmap_%4end
.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
mov r2d, 1
CALL_CABAC
.sigmap_%4end:
; the non-8x8 expansion must jump to the level loop explicitly; the 8x8
; expansion emits no jump and simply continues with whatever follows it
%if %1==0
jmp .level_loop_start
%endif
%endmacro
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
%define lastm [rsp+4*1]
%define GLOBAL +r7-$$
%else
%define lastm r7d
%define GLOBAL
%endif
%assign pad gprsize+4*2+4*64-(stack_offset&15)
SUB rsp, pad
shl r1d, 4
%define sigoffq r8
%define sigoffd r8d
%define lastoffq r9
%define lastoffd r9d
%define leveloffq r10
%define leveloffd r10d
%define leveloffm [rsp+4*0]
%define countcatd r11d
%define sigoff_8x8 r12
%define coeffidxq r13
%define coeffidxd r13d
%define dct r14
%define coeffs rsp+4*2
lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
add r1d, r2d
movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
mov coeffidxd, -1
mov dct, r0
mov leveloffm, leveloffd
mov r1, [%1+gprsize*r2 GLOBAL]
call r1
mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
mov r0, r3
xor r10d, r10d
cmp countcatd, 63
je .sigmap_8x8
SIGMAP_LOOP 0, r12d, countcatd,
.sigmap_8x8:
SIGMAP_LOOP 1, r11d, 63, _8x8
.level_loop_start:
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
%define nodectxq r8
%define nodectxd r8d
mov leveloffd, leveloffm
xor nodectxd, nodectxd
.level_loop:
mov r9d, [coeffs+coeffidxq*4]
mov r11d, r9d
sar r11d, 31
add r9d, r11d
movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
xor r9d, r11d
add r1d, leveloffd
cmp r9d, 1
jg .level_gt1
xor r2d, r2d
CALL_CABAC
movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
jmp .level_sign
.level_gt1:
mov r2d, 1
CALL_CABAC
movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
add r14d, leveloffd
cmp r9d, 15
mov r12d, 15
cmovl r12d, r9d
sub r12d, 2
jz .level_eq2
.level_gt1_loop:
mov r1d, r14d
mov r2d, 1
CALL_CABAC
dec r12d
jg .level_gt1_loop
cmp r9d, 15
jge .level_bypass
.level_eq2:
mov r1d, r14d
xor r2d, r2d
CALL_CABAC
jmp .level_gt1_end
.level_bypass:
lea r2d, [r9d-15]
xor r1d, r1d
push r0
; we could avoid this if we implemented it in asm, but I don't feel like that
; right now.
%if UNIX64
push r7
push r8
%else
sub rsp, 32 ; shadow space
%endif
call cabac_encode_ue_bypass
%if UNIX64
pop r8
pop r7