Commit e394bd60 authored by Fiona Glaser

Massive overhaul of nnz/cbp calculation

Modify quantization to also calculate array_non_zero.
PPC assembly changes by gpoirior.
New quant asm includes some small tweaks to quant and SSE4 versions that use ptest for the array_non_zero check.
Use this new feature of quant to merge nnz/cbp calculation directly with encoding and avoid many unnecessary calls to dequant/zigzag/decimate/etc.
Also add new i16x16 DC-only iDCT with asm.
Since intra encoding now directly calculates nnz, skip_intra now backs up nnz/cbp as well.
Output should be equivalent except when using p4x4+RDO because of a subtlety involving old nnz values lying around.
Performance increase in macroblock_encode: ~18% with dct-decimate, 30% without, at CRF 25.
Overall performance increase 0-6% depending on encoding settings.
parent 9c555215
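
The core of the change: the quant functions now report whether any coefficient survived quantization, so the encoder can record nnz and accumulate cbp while encoding instead of running array_non_zero over the output afterwards, and can skip dequant/zigzag/decimate entirely for all-zero blocks. A self-contained C sketch of that pattern (illustrative only, not x264 code; the scalar quant mirrors the reference quant_4x4 further down in this diff, and the structures are simplified stand-ins):

#include <stdint.h>

/* Scalar 4x4 quant in the style of the reference quant_4x4 in this commit:
 * returns 1 if any quantized coefficient is nonzero, else 0. */
static int quant_4x4( int16_t dct[16], const uint16_t mf[16], const uint16_t bias[16] )
{
    int i, nz = 0;
    for( i = 0; i < 16; i++ )
    {
        if( dct[i] > 0 )
            dct[i] = (bias[i] + dct[i]) * mf[i] >> 16;
        else
            dct[i] = -((bias[i] - dct[i]) * mf[i] >> 16);
        nz |= dct[i];
    }
    return !!nz;
}

/* Toy encode loop for one 8x8 luma group: nnz and the group's cbp bit come
 * straight from the quant return value; all-zero blocks do no further work. */
static int encode_luma_8x8_group( int16_t dct[4][16], const uint16_t mf[16],
                                  const uint16_t bias[16], uint8_t nnz[4] )
{
    int i, cbp_bit = 0;
    for( i = 0; i < 4; i++ )
    {
        int nz = quant_4x4( dct[i], mf, bias );
        nnz[i] = nz;     /* nnz recorded directly, no array_non_zero() pass */
        cbp_bit |= nz;   /* this 8x8 group's bit of the luma cbp */
        if( nz )
        {
            /* only now do dequant, zigzag scan, decimate check and add_idct */
        }
    }
    return cbp_bit;
}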
@@ -471,6 +471,10 @@ struct x264_t
DECLARE_ALIGNED_16( uint8_t i8x8_fdec_buf[16*16] );
DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] );
DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
/* Psy trellis DCT data */
DECLARE_ALIGNED_16( int16_t fenc_dct8[4][64] );
......
@@ -369,6 +369,18 @@ static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
}
static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
{
int i;
for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
{
add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
add4x4_idct_dc( &p_dst[12], dct[i][3] );
}
}
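
The add4x4_idct_dc helper that this loop relies on is not part of the hunk; judging from the rounding and clipping the asm versions below perform ((dc + 32) >> 6, then a clipped add to every pixel of the 4x4 block), it presumably looks roughly like the following sketch. FDEC_STRIDE and x264_clip_uint8 are existing x264 definitions; this is a reconstruction, not the committed code.

static inline void add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
{
    int i;
    dc = (dc + 32) >> 6;  /* same rounding as the asm: add 32, shift right by 6 */
    for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
        p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
        p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
        p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
    }
}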
/****************************************************************************
* x264_dct_init:
@@ -384,6 +396,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
dctf->add16x16_idct_dc = add16x16_idct_dc;
dctf->sub8x8_dct8 = sub8x8_dct8;
dctf->add8x8_idct8 = add8x8_idct8;
@@ -400,6 +413,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
@@ -427,10 +441,14 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
}
if( cpu&X264_CPU_SSSE3 )
{
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
#endif //HAVE_MMX
#ifdef ARCH_PPC
......
@@ -100,6 +100,7 @@ typedef struct
void (*sub16x16_dct) ( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 );
void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );
void (*add16x16_idct_dc) ( uint8_t *p_dst, int16_t dct[4][4] );
void (*sub8x8_dct8) ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
void (*add8x8_idct8) ( uint8_t *p_dst, int16_t dct[8][8] );
......
@@ -30,10 +30,10 @@ mfvA = vec_ld((idx0), mf); \
mfvB = vec_ld((idx1), mf); \
biasvA = vec_ld((idx0), bias); \
biasvB = vec_ld((idx1), bias); \
mskA = vec_cmplt(temp1v, zerov); \
mskB = vec_cmplt(temp2v, zerov); \
coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \
coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
mskA = vec_cmplt(temp1v, zero_s16v); \
mskB = vec_cmplt(temp2v, zero_s16v); \
coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \
coefvA = vec_adds(coefvA, biasvA); \
coefvB = vec_adds(coefvB, biasvB); \
multEvenvA = vec_mule(coefvA, mfvA); \
@@ -51,17 +51,20 @@ temp2v = vec_xor(temp2v, mskB); \
temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (idx0), (int16_t*)dct); \
temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
nz = vec_or(nz, vec_or(temp1v, temp2v)); \
vec_st(temp2v, (idx1), (int16_t*)dct);
void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
vec_s16_t zerov, one;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
vector bool short mskB;
vec_u16_t coefvB;
@@ -75,20 +78,18 @@ void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[1
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
QUANT_16_U( 0, 16 );
return vec_any_ne(nz, zero_s16v);
}
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
#define QUANT_16_U_DC( idx0, idx1 ) \
temp1v = vec_ld((idx0), *dct); \
temp2v = vec_ld((idx1), *dct); \
mskA = vec_cmplt(temp1v, zerov); \
mskB = vec_cmplt(temp2v, zerov); \
coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
mskA = vec_cmplt(temp1v, zero_s16v); \
mskB = vec_cmplt(temp2v, zero_s16v); \
coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
coefvA = vec_add(coefvA, biasv); \
coefvB = vec_add(coefvB, biasv); \
multEvenvA = vec_mule(coefvA, mfv); \
@@ -106,15 +107,18 @@ temp2v = vec_xor(temp2v, mskB); \
temp1v = vec_add(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (idx0), (int16_t*)dct); \
temp2v = vec_add(temp2v, vec_and(mskB, one)); \
nz = vec_or(nz, vec_or(temp1v, temp2v)); \
vec_st(temp2v, (idx1), (int16_t*)dct);
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
{
LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_s16_t zerov, one;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
vector bool short mskB;
vec_u16_t coefvB;
@@ -137,18 +141,16 @@ void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
bias_u.s[0]=bias;
biasv = vec_splat(bias_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
QUANT_16_U_DC( 0, 16 );
return vec_any_ne(nz, zero_s16v);
}
// DC quant of a whole 2x2 block
#define QUANT_4_U_DC( idx0 ) \
const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
temp1v = vec_ld((idx0), *dct); \
mskA = vec_cmplt(temp1v, zerov); \
coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
mskA = vec_cmplt(temp1v, zero_s16v); \
coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
coefvA = vec_add(coefvA, biasv); \
multEvenvA = vec_mule(coefvA, mfv); \
multOddvA = vec_mulo(coefvA, mfv); \
@@ -158,15 +160,18 @@ temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(mul
temp2v = vec_xor(temp2v, mskA); \
temp2v = vec_add(temp2v, vec_and(mskA, one)); \
temp1v = vec_sel(temp1v, temp2v, sel); \
nz = vec_or(nz, temp1v); \
vec_st(temp1v, (idx0), (int16_t*)dct);
void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
{
LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_s16_t zerov, one;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
vec_s16_t temp1v, temp2v;
@@ -185,42 +190,41 @@ void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
bias_u.s[0]=bias;
biasv = vec_splat(bias_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);
QUANT_4_U_DC(0);
return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}
void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
{
LOAD_ZERO;
vector bool short mskA;
vec_u32_t i_qbitsv;
vec_u16_t coefvA;
vec_u32_t multEvenvA, multOddvA;
vec_u16_t mfvA;
vec_u16_t biasvA;
vec_s16_t zerov, one;
vec_s16_t one = vec_splat_s16(1);
vec_s16_t nz = zero_s16v;
vector bool short mskB;
vec_u16_t coefvB;
vec_u32_t multEvenvB, multOddvB;
vec_u16_t mfvB;
vec_u16_t biasvB;
vec_s16_t temp1v, temp2v;
vec_u32_u qbits_u;
qbits_u.s[0]=16;
i_qbitsv = vec_splat(qbits_u.v, 0);
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
int i;
for ( i=0; i<4; i++ ) {
QUANT_16_U( i*2*16, i*2*16+16 );
}
return vec_any_ne(nz, zero_s16v);
}
#define DEQUANT_SHL() \
......
@@ -21,11 +21,11 @@
#ifndef X264_PPC_QUANT_H
#define X264_PPC_QUANT_H
void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
......
@@ -36,35 +36,41 @@
(coef) = (f + (coef)) * (mf) >> 16; \
else \
(coef) = - ((f - (coef)) * (mf) >> 16); \
nz |= (coef); \
}
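
Only the tail of the scalar QUANT_ONE macro is visible in this hunk; the complete definition implied by that tail and by the callers below would read roughly as follows (a reconstruction, not the committed text, with nz being a local that each quant_* function declares and returns as !!nz):

#define QUANT_ONE( coef, mf, f ) \
{ \
    if( (coef) > 0 ) \
        (coef) = (f + (coef)) * (mf) >> 16; \
    else \
        (coef) = - ((f - (coef)) * (mf) >> 16); \
    nz |= (coef); \
}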
static void quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
static int quant_8x8( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
{
int i;
int i, nz = 0;
for( i = 0; i < 64; i++ )
QUANT_ONE( dct[0][i], mf[i], bias[i] );
return !!nz;
}
static void quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
static int quant_4x4( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
int i;
int i, nz = 0;
for( i = 0; i < 16; i++ )
QUANT_ONE( dct[0][i], mf[i], bias[i] );
return !!nz;
}
static void quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
static int quant_4x4_dc( int16_t dct[4][4], int mf, int bias )
{
int i;
int i, nz = 0;
for( i = 0; i < 16; i++ )
QUANT_ONE( dct[0][i], mf, bias );
return !!nz;
}
static void quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
static int quant_2x2_dc( int16_t dct[2][2], int mf, int bias )
{
int nz = 0;
QUANT_ONE( dct[0][0], mf, bias );
QUANT_ONE( dct[0][1], mf, bias );
QUANT_ONE( dct[0][2], mf, bias );
QUANT_ONE( dct[0][3], mf, bias );
return !!nz;
}
#define DEQUANT_SHL( x ) \
@@ -402,6 +408,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
}
if( cpu&X264_CPU_SSE4 )
{
pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
#endif // HAVE_MMX
#ifdef ARCH_PPC
......
@@ -25,10 +25,10 @@
typedef struct
{
void (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
void (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
void (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
void (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
int (*quant_8x8)( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] );
int (*quant_4x4)( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] );
int (*quant_4x4_dc)( int16_t dct[4][4], int mf, int bias );
int (*quant_2x2_dc)( int16_t dct[2][2], int mf, int bias );
void (*dequant_8x8)( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
void (*dequant_4x4)( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
......
@@ -33,6 +33,7 @@ pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
SECTION .text
@@ -324,6 +325,104 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
movhps [r0+FDEC_STRIDE* 3], xmm5
ret
cglobal x264_add16x16_idct_dc_mmx, 2,3
mov r2, 4
.loop:
movq mm0, [r1]
pxor mm1, mm1
paddw mm0, [pw_32 GLOBAL]
psraw mm0, 6
psubw mm1, mm0
packuswb mm0, mm0
packuswb mm1, mm1
punpcklbw mm0, mm0
punpcklbw mm1, mm1
pshufw mm2, mm0, 0xFA
pshufw mm3, mm1, 0xFA
punpcklbw mm0, mm0
punpcklbw mm1, mm1
ADD_DC mm0, mm1, r0
ADD_DC mm2, mm3, r0+8
add r1, 8
add r0, FDEC_STRIDE*4
dec r2
jg .loop
ret
%macro IDCT_DC_STORE 3
movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
movdqa xmm5, [r0+%1+FDEC_STRIDE*1]
movdqa xmm6, [r0+%1+FDEC_STRIDE*2]
movdqa xmm7, [r0+%1+FDEC_STRIDE*3]
paddusb xmm4, %2
paddusb xmm5, %2
paddusb xmm6, %2
paddusb xmm7, %2
psubusb xmm4, %3
psubusb xmm5, %3
psubusb xmm6, %3
psubusb xmm7, %3
movdqa [r0+%1+FDEC_STRIDE*0], xmm4
movdqa [r0+%1+FDEC_STRIDE*1], xmm5
movdqa [r0+%1+FDEC_STRIDE*2], xmm6
movdqa [r0+%1+FDEC_STRIDE*3], xmm7
%endmacro
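
IDCT_DC_STORE leans on a saturating-add trick: the callers split the per-block DC delta into a non-negative part (%2) and a negated part (%3), both packed to unsigned bytes, so paddusb followed by psubusb applies a signed adjustment with free clipping to [0,255] and no need to widen the pixels to 16 bits. A rough C-intrinsics sketch of the same idea (illustrative only, not code from this commit):

#include <emmintrin.h>  /* SSE2 */

/* pix: 16 pixels of one row; dc_pos holds packed max(dc,0) per pixel,
 * dc_neg holds packed max(-dc,0), exactly as the asm prepares them. */
static __m128i add_dc_row( __m128i pix, __m128i dc_pos, __m128i dc_neg )
{
    pix = _mm_adds_epu8( pix, dc_pos ); /* saturating add of the positive delta */
    pix = _mm_subs_epu8( pix, dc_neg ); /* saturating subtract of the negative delta */
    return pix;
}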
cglobal x264_add16x16_idct_dc_sse2, 2,2
call .loop
add r0, FDEC_STRIDE*4
.loop:
add r0, FDEC_STRIDE*4
movq xmm0, [r1+0]
movq xmm2, [r1+8]
add r1, 16
punpcklwd xmm0, xmm0
punpcklwd xmm2, xmm2
pxor xmm1, xmm1
pxor xmm3, xmm3
paddw xmm0, [pw_32 GLOBAL]
paddw xmm2, [pw_32 GLOBAL]
psraw xmm0, 6
psraw xmm2, 6
psubw xmm1, xmm0
psubw xmm3, xmm2
packuswb xmm0, xmm1
packuswb xmm2, xmm3
movdqa xmm1, xmm0
movdqa xmm3, xmm2
punpcklbw xmm0, xmm0
punpcklbw xmm2, xmm2
punpckhbw xmm1, xmm1
punpckhbw xmm3, xmm3
IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
IDCT_DC_STORE 0, xmm2, xmm3
ret
cglobal x264_add16x16_idct_dc_ssse3, 2,2
call .loop
add r0, FDEC_STRIDE*4
.loop:
add r0, FDEC_STRIDE*4
movdqa xmm0, [r1]
add r1, 16
pxor xmm1, xmm1
paddw xmm0, [pw_32 GLOBAL]
psraw xmm0, 6
psubw xmm1, xmm0
movdqa xmm5, [ pb_idctdc_unpack GLOBAL]
movdqa xmm6, [pb_idctdc_unpack2 GLOBAL]
packuswb xmm0, xmm0
packuswb xmm1, xmm1
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pshufb xmm0, xmm5
pshufb xmm2, xmm6
pshufb xmm1, xmm5
pshufb xmm3, xmm6
IDCT_DC_STORE FDEC_STRIDE*-4, xmm0, xmm1
IDCT_DC_STORE 0, xmm2, xmm3
ret
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;-----------------------------------------------------------------------------
......
@@ -34,9 +34,12 @@ void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4] );
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add8x8_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add16x16_idct_dc_mmx ( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_sse2 ( uint8_t *p_dst, int16_t dct[ 4][4][4] );
void x264_add16x16_idct_sse2 ( uint8_t *p_dst, int16_t dct[16][4][4] );
void x264_add16x16_idct_dc_sse2( uint8_t *p_dst, int16_t dct[4][4] );
void x264_add8x8_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[2][2] );
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct[4][4] );
void x264_dct4x4dc_mmx ( int16_t d[4][4] );
void x264_idct4x4dc_mmx ( int16_t d[4][4] );
......
@@ -29,6 +29,7 @@ SECTION_RODATA
pb_1: times 16 db 1
pw_1: times 8 dw 1
pd_1: times 4 dd 1
pb_01: times 8 db 0, 1
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
@@ -70,7 +71,7 @@ decimate_mask_table4:
SECTION .text
%macro QUANT_DC_START 0
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%ifidn m0, mm0
@@ -84,6 +85,14 @@ SECTION .text
%endif
%endmacro
%macro QUANT_DC_START_SSSE3 0
movdqa m5, [pb_01 GLOBAL]
movd m6, r1m ; mf
movd m7, r2m ; bias
pshufb m6, m5
pshufb m7, m5
%endmacro
%macro PABSW_MMX 2
pxor %1, %1
pcmpgtw %1, %2
@@ -105,7 +114,7 @@ SECTION .text
psignw %1, %2
%endmacro
%macro QUANT_ONE 3
%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
@@ -115,6 +124,62 @@ SECTION .text
pmulhuw m0, %2 ; divide
PSIGNW m0, m1 ; restore sign
mova %1, m0 ; store
%if %4
por m5, m0
%else
SWAP m5, m0
%endif
%endmacro
%macro QUANT_TWO 7
mova m1, %1
mova m3, %2
PABSW m0, m1
PABSW m2, m3
paddusw m0, %5
paddusw m2, %6
pmulhuw m0, %3
pmulhuw m2, %4
PSIGNW m0, m1
PSIGNW m2, m3
mova %1, m0
mova %2, m2
%if %7
por m5, m0
por m5, m2
%else
SWAP m5, m0
por m5, m2
%endif
%endmacro
%macro QUANT_END_MMX 0
xor eax, eax
%ifndef ARCH_X86_64
%if mmsize==8
packsswb m5, m5
movd ecx, m5
test ecx, ecx
%else
pxor m4, m4
pcmpeqb m5, m4
pmovmskb ecx, m5
cmp ecx, (1<<mmsize)-1
%endif
%else
%if mmsize==16
packsswb m5, m5
%endif
movq rcx, m5
test rcx, rcx
%endif
setne al
%endmacro
%macro QUANT_END_SSE4 0
xor eax, eax
ptest m5, m5
setne al
%endmacro
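
QUANT_END_SSE4 is the ptest use mentioned in the commit message: instead of packing the OR-accumulated coefficients in m5 and moving them to a general-purpose register for the final test, SSE4.1 can test the whole vector for zero in one instruction. In C intrinsics the two endings correspond roughly to the following (a sketch, not code from this commit):

#include <smmintrin.h>  /* SSE4.1; also pulls in the SSE2 intrinsics used below */

/* Pre-SSE4 ending: bytewise compare against zero and inspect the byte mask;
 * an all-ones mask (0xFFFF) means every byte of the accumulator was zero. */
static int vec_nonzero_sse2( __m128i v )
{
    return _mm_movemask_epi8( _mm_cmpeq_epi8( v, _mm_setzero_si128() ) ) != 0xFFFF;
}

/* SSE4.1 ending, matching QUANT_END_SSE4: ptest sets ZF iff (v & v) == 0. */
static int vec_nonzero_sse4( __m128i v )
{
    return !_mm_testz_si128( v, v );
}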
;-----------------------------------------------------------------------------
@@ -123,30 +188,38 @@ SECTION .text
%macro QUANT_DC 2
cglobal %1, 1,1
QUANT_DC_START
%if %2==1
QUANT_ONE [r0], m6, m7, 0
%else
%assign x 0
%rep %2
QUANT_ONE [r0+x], m6, m7
%assign x x+mmsize
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
%assign x x+mmsize*2
%endrep
%endif
QUANT_END
RET
%endmacro
;-----------------------------------------------------------------------------
; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%assign x 0
%rep %2
QUANT_ONE [r0+x], [r1+x], [r2+x]
%assign x x+mmsize
%rep %2/2
QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
%assign x x+mmsize*2
%endrep
QUANT_END
RET
%endmacro
INIT_MMX
%define QUANT_END QUANT_END_MMX
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX