Commit a3e11cbf, authored by Fiona Glaser, committed by Loren Merritt

rearrange cabac struct to reduce code size

parent 9289e806
......@@ -26,11 +26,6 @@
typedef struct
{
/* context */
DECLARE_ALIGNED_16( uint8_t state[460] );
int f8_bits_encoded; // only if using x264_cabac_size_decision()
/* state */
int i_low;
int i_range;
......@@ -43,6 +38,11 @@ typedef struct
uint8_t *p;
uint8_t *p_end;
/* aligned for aligned_memcpy starting here */
DECLARE_ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[460];
} x264_cabac_t;
extern const uint8_t x264_cabac_transition[128][2];
......
......@@ -40,20 +40,25 @@ cextern x264_cabac_renorm_shift
; t3 must be ecx, since it's used for shift.
%ifdef ARCH_X86_64
DEF_TMP 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,10
%define pointer 8
%define pointer resq
%else
DEF_TMP 0,1,2,3,4,5,6,7, 0,3,2,1,4,5,6,3
%define pointer 4
%define pointer resd
%endif
%define cb.state r0+0
%define cb.low r0+464
%define cb.range r0+468
%define cb.queue r0+472
%define cb.bytes_outstanding r0+476
%define cb.p r0+480+pointer
%define cb.end r0+480+pointer*2
; Field offsets of the cabac context structure. The code below addresses the
; fields as [r0+cb.<field>], with r0 holding the pointer to the structure.
; NOTE(review): this layout presumably has to stay in sync with the C-side
; x264_cabac_t (state fields first, then the 16-byte-aligned f8_bits_encoded
; followed by state[460]) -- confirm against the header when changing either.
struc cb
    .low:               resd 1      ; offset 0
    .range:             resd 1      ; offset 4
    .queue:             resd 1      ; offset 8
    .bytes_outstanding: resd 1      ; offset 12
    ; "pointer" is resq on ARCH_X86_64 and resd otherwise, so the three
    ; fields below are each one native pointer wide.
    .start:             pointer 1
    .p:                 pointer 1
    .end:               pointer 1
    ; Round the offset up to a multiple of 16. Inside a struc the padding
    ; must be reserved space: a bare "align 16" pads with NOP instructions
    ; by default, and code cannot be assembled in the absolute space that a
    ; struc definition uses. Passing "resb 1" as the fill makes it reserve
    ; bytes instead.
    align 16, resb 1
    .bits_encoded:      resd 1      ; first field of the 16-byte-aligned tail
    .state:             resb 460    ; one byte per context, 460 contexts
endstruc
%macro LOAD_GLOBAL 4
%ifdef PIC64
; this would be faster if the arrays were declared in asm, so that I didn't have to duplicate the lea
......@@ -78,8 +83,8 @@ cglobal x264_cabac_encode_decision, 0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
picgetgot t2
mov t5d, [cb.range]
movzx t3d, byte [cb.state+t1]
mov t5d, [r0+cb.range]
movzx t3d, byte [r0+cb.state+t1]
mov t4d, t5d
shr t5d, 6
and t5d, 3
......@@ -93,7 +98,7 @@ cglobal x264_cabac_encode_decision, 0,7
movifnidn t2d, r2m
cmp t6d, t2d
%endif
mov t6d, [cb.low]
mov t6d, [r0+cb.low]
lea t7, [t6+t4]
cmovne t4d, t5d
cmovne t6d, t7d
......@@ -103,18 +108,18 @@ cglobal x264_cabac_encode_decision, 0,7
%else
LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
%endif
if32 mov t1d, r1m
mov [cb.state+t1], t3b
movifnidn t1d, r1m
mov [r0+cb.state+t1], t3b
.renorm:
mov t3d, t4d
shr t3d, 3
LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
shl t4d, t3b
shl t6d, t3b
add t3d, [cb.queue]
mov [cb.range], t4d
mov [cb.low], t6d
mov [cb.queue], t3d
add t3d, [r0+cb.queue]
mov [r0+cb.range], t4d
mov [r0+cb.low], t6d
mov [r0+cb.queue], t3d
cmp t3d, 8
jge .putbyte
.ret:
......@@ -130,15 +135,15 @@ cglobal x264_cabac_encode_decision, 0,7
sub t3d, 10
and t6d, t1d
cmp t2b, 0xff ; FIXME is a 32bit op faster?
mov [cb.queue], t3d
mov [cb.low], t6d
mov [r0+cb.queue], t3d
mov [r0+cb.low], t6d
mov t1d, t2d
mov t4, [cb.p]
mov t4, [r0+cb.p]
je .postpone
mov t5d, [cb.bytes_outstanding]
mov t5d, [r0+cb.bytes_outstanding]
shr t1d, 8 ; carry
lea t6, [t4+t5+1]
cmp t6, [cb.end]
cmp t6, [r0+cb.end]
jge .ret
add [t4-1], t1b
test t5d, t5d
......@@ -152,10 +157,10 @@ cglobal x264_cabac_encode_decision, 0,7
.no_outstanding:
mov [t4], t2b
inc t4
mov [cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [cb.p], t4
mov [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
mov [r0+cb.p], t4
RET
.postpone:
inc dword [cb.bytes_outstanding]
inc dword [r0+cb.bytes_outstanding]
RET
......@@ -49,7 +49,9 @@ static int cabac_prefix_size[15][128];
#define x264_macroblock_write_cabac x264_macroblock_size_cabac
#include "cabac.c"
/* Copy the tail of h->cabac into the local cabac_tmp via h->mc.memcpy_aligned:
 * everything from f8_bits_encoded through the end of x264_cabac_t. Fields that
 * precede f8_bits_encoded in the struct are not copied.
 * Expands to a single expression (no trailing semicolon) and requires both
 * `h` and a local `x264_cabac_t cabac_tmp` to be in scope at the use site. */
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
static int ssd_mb( x264_t *h )
{
return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE,
......@@ -83,7 +85,7 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
else if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
COPY_CABAC;
x264_macroblock_size_cabac( h, &cabac_tmp );
i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
}
......@@ -125,7 +127,7 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
COPY_CABAC;
x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
}
......@@ -147,7 +149,7 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
COPY_CABAC;
x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
}
......@@ -169,7 +171,7 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
COPY_CABAC;
x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
}
......@@ -195,7 +197,7 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
if( h->param.b_cabac )
{
x264_cabac_t cabac_tmp;
h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
COPY_CABAC;
x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 32768 ) >> 16;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment