Commit 7946d913 authored by Oskar Arvidsson's avatar Oskar Arvidsson Committed by Fiona Glaser

x86 asm for high-bit-depth quant

~3.1-4.2x faster than C.
parent 03c61538
......@@ -114,6 +114,7 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
typedef uint16_t pixel;
typedef uint64_t pixel4;
typedef int32_t dctcoef;
typedef uint32_t udctcoef;
# define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
# define MPIXEL_X4(src) M64(src)
......@@ -121,6 +122,7 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
typedef uint8_t pixel;
typedef uint32_t pixel4;
typedef int16_t dctcoef;
typedef uint16_t udctcoef;
# define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
# define MPIXEL_X4(src) M32(src)
......@@ -452,10 +454,10 @@ struct x264_t
int (*unquant4_mf[4])[16]; /* [4][52][16] */
int (*unquant8_mf[2])[64]; /* [2][52][64] */
/* quantization matrix for deadzone */
uint16_t (*quant4_mf[4])[16]; /* [4][52][16] */
uint16_t (*quant8_mf[2])[64]; /* [2][52][64] */
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
udctcoef (*quant4_mf[4])[16]; /* [4][52][16] */
udctcoef (*quant8_mf[2])[64]; /* [2][52][64] */
udctcoef (*quant4_bias[4])[16]; /* [4][52][16] */
udctcoef (*quant8_bias[2])[64]; /* [2][52][64] */
/* mv/ref cost arrays. Indexed by lambda instead of
* qp because, due to rounding, some quantizers share
......
......@@ -46,7 +46,7 @@
nz |= (coef); \
}
static int quant_8x8( dctcoef dct[64], uint16_t mf[64], uint16_t bias[64] )
static int quant_8x8( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] )
{
int nz = 0;
for( int i = 0; i < 64; i++ )
......@@ -54,7 +54,7 @@ static int quant_8x8( dctcoef dct[64], uint16_t mf[64], uint16_t bias[64] )
return !!nz;
}
static int quant_4x4( dctcoef dct[16], uint16_t mf[16], uint16_t bias[16] )
static int quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] )
{
int nz = 0;
for( int i = 0; i < 16; i++ )
......@@ -285,7 +285,41 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
#if !X264_HIGH_BIT_DEPTH
#if X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
pf->quant_4x4 = x264_quant_4x4_mmx;
pf->quant_8x8 = x264_quant_8x8_mmx;
}
if( cpu&X264_CPU_MMXEXT )
{
pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
}
if( cpu&X264_CPU_SSE2 )
{
pf->quant_4x4 = x264_quant_4x4_sse2;
pf->quant_8x8 = x264_quant_8x8_sse2;
pf->quant_2x2_dc = x264_quant_2x2_dc_sse2;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse2;
}
if( cpu&X264_CPU_SSSE3 )
{
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
pf->quant_2x2_dc = x264_quant_2x2_dc_ssse3;
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
}
if( cpu&X264_CPU_SSE4 )
{
pf->quant_2x2_dc = x264_quant_2x2_dc_sse4;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
#endif // HAVE_MMX
#else // !X264_HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -424,7 +458,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
#endif // X264_HIGH_BIT_DEPTH
pf->coeff_last[ DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
pf->coeff_level_run[ DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
......
......@@ -29,8 +29,8 @@
typedef struct
{
int (*quant_8x8)( dctcoef dct[64], uint16_t mf[64], uint16_t bias[64] );
int (*quant_4x4)( dctcoef dct[16], uint16_t mf[16], uint16_t bias[16] );
int (*quant_8x8)( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int (*quant_4x4)( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int (*quant_4x4_dc)( dctcoef dct[16], int mf, int bias );
int (*quant_2x2_dc)( dctcoef dct[4], int mf, int bias );
......
......@@ -100,7 +100,7 @@ int x264_cqm_init( x264_t *h )
}
else
{
CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(uint16_t) );
CHECKED_MALLOC( h-> quant4_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );
CHECKED_MALLOC( h->dequant4_mf[i], 6*size*sizeof(int) );
CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) );
}
......@@ -112,7 +112,7 @@ int x264_cqm_init( x264_t *h )
if( j < i )
h->quant4_bias[i] = h->quant4_bias[j];
else
CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(uint16_t) );
CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );
}
for( int q = 0; q < 6; q++ )
......@@ -171,7 +171,9 @@ int x264_cqm_init( x264_t *h )
for( int i = 0; i < 64; i++ )
{
h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
h->quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
h->quant8_mf[i_list][q][i] = (uint16_t)j;
if( !j )
{
min_qp_err = X264_MIN( min_qp_err, q );
......
......@@ -6,6 +6,7 @@
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Christian Heine <sennindemokrit@gmx.net>
;* Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
......@@ -78,15 +79,13 @@ cextern pb_01
; Broadcast the scalar mf (r1m) and bias (r2m) arguments into m6/m7 so the
; DC quant kernels can use them as packed vectors.
; NOTE(review): this span interleaves pre-change lines (the %ifidn/pshufw/
; pshuflw/punpcklqdq path) and post-change lines (the %ifdef
; X264_HIGH_BIT_DEPTH SPLATD/SPLATW path) from the rendered diff; reconstruct
; the real macro body from the actual commit before relying on it.
%macro QUANT_DC_START_MMX 0
movd m6, r1m ; mf
movd m7, r2m ; bias
%ifidn m0, mm0
pshufw m6, m6, 0
pshufw m7, m7, 0
%ifdef X264_HIGH_BIT_DEPTH
SPLATD m6, m6 ; 32-bit coeffs: replicate the dword to every lane
SPLATD m7, m7
%else
pshuflw m6, m6, 0
pshuflw m7, m7, 0
punpcklqdq m6, m6
punpcklqdq m7, m7
%endif
SPLATW m6, m6 ; 16-bit coeffs: replicate the word to every lane
SPLATW m7, m7
%endif ; X264_HIGH_BIT_DEPTH
%endmacro
%macro QUANT_DC_START_SSSE3 0
......@@ -118,6 +117,246 @@ cextern pb_01
psignw %1, %2
%endmacro
; Packed-dword psignd emulation for pre-SSSE3 targets:
; %1 = %1 with the sign of %2 applied, assuming %2 holds only the sign mask
; pattern used by PABSD_MMX (all-ones dwords where negative, zero elsewhere):
; xor+sub is two's-complement negation under that mask.
%macro PSIGND_MMX 2
pxor %1, %2
psubd %1, %2
%endmacro
; SSSE3 has a native packed-dword sign instruction; use it directly.
%macro PSIGND_SSSE3 2
psignd %1, %2
%endmacro
; Packed-dword absolute value for pre-SSSE3 targets.
; Builds a sign mask in %1 (all-ones dwords where %2 < 0 via pcmpgtd with 0),
; then conditionally negates %2 with xor+sub.  The SWAP leaves |%2| in %1 and
; the sign mask in %2, which PSIGND_MMX later consumes to restore the sign.
%macro PABSD_MMX 2
pxor %1, %1
pcmpgtd %1, %2
pxor %2, %1
psubd %2, %1
SWAP %1, %2
%endmacro
; SSSE3 native packed-dword absolute value: %1 = |%2|.
%macro PABSD_SSSE3 2
pabsd %1, %2
%endmacro
; Collapse the nonzero-coefficient accumulator m5 into the return value:
; eax = 1 if any quantized coefficient was nonzero, else 0.
%macro QUANT_END_MMX 0
xor eax, eax
%ifdef ARCH_X86_64
%if mmsize == 16
packsswb m5, m5 ; narrow xmm so the low 64 bits carry all lanes' info
%endif
movq rcx, m5
test rcx, rcx ; ZF=1 iff every lane of m5 was zero
%else
%if mmsize == 16
pxor m4, m4
pcmpeqb m5, m4 ; 0xFF per byte that was zero
pmovmskb ecx, m5
cmp ecx, (1<<mmsize)-1 ; all 16 bytes zero -> mask == 0xFFFF -> ZF=1
%else
packsswb m5, m5
movd ecx, m5
test ecx, ecx
%endif
%endif
setne al
%endmacro
; SSE4 variant of QUANT_END: ptest sets ZF iff m5 is all zero,
; so eax = !!(any nonzero coefficient) in two instructions.
%macro QUANT_END_SSE4 0
xor eax, eax
ptest m5, m5
setne al
%endmacro
%ifdef X264_HIGH_BIT_DEPTH
; Quantize one register's worth of 32-bit DC coefficients at [%1] in place:
;   out = sign(dct) * ((|dct| + bias) * mf >> 16)
; %2 = broadcast mf, %3 = broadcast bias, %4 = nonzero means OR the result
; into the m5 accumulator, zero means initialize m5 (first group).
; No pmulld before SSE4.1, so the 32x32 multiply is done with pmuludq on the
; even dword lanes (m1) and on the odd lanes shifted down (m2), then the odd
; products are shifted back up and recombined.
%macro QUANT_ONE_DC_MMX 4
mova m0, [%1]
PABSD m1, m0 ; m1 = |dct|, m0 keeps sign info for PSIGND
paddd m1, %3 ; + bias
mova m2, m1
psrlq m2, 32 ; odd lanes down to even positions
pmuludq m1, %2
pmuludq m2, %2
psllq m2, 32 ; odd products back to their lanes
paddd m1, m2
psrld m1, 16 ; logical shift: values are nonnegative here
PSIGND m1, m0 ; reapply original sign
mova [%1], m1
%if %4
por m5, m1
%else
SWAP m5, m1
%endif
%endmacro
; Process two consecutive registers of DC coefficients; the second call's
; accumulate flag (%4+mmsize) is always nonzero, so it ORs into m5.
%macro QUANT_TWO_DC_MMX 4
QUANT_ONE_DC_MMX %1, %2, %3, %4
QUANT_ONE_DC_MMX %1+mmsize, %2, %3, %4+mmsize
%endmacro
; SSE4 DC quant: native packed 32x32->32 multiply (pmulld) replaces the
; pmuludq even/odd dance of the MMX version.  Same contract as
; QUANT_ONE_DC_MMX: out = sign(dct) * ((|dct| + bias) * mf >> 16),
; %4 selects OR-into vs initialize of the m5 nonzero accumulator.
%macro QUANT_ONE_DC_SSE4 4
mova m0, [%1]
PABSD m1, m0
paddd m1, %3
pmulld m1, %2
psrad m1, 16
PSIGND m1, m0
mova [%1], m1
%if %4
por m5, m1
%else
SWAP m5, m1
%endif
%endmacro
; SSE4 DC quant, two registers at a time with interleaved scheduling so the
; two pmulld chains overlap.  Both results are written back and folded into
; the m5 nonzero accumulator (%4 chooses OR vs initialize for the first one).
%macro QUANT_TWO_DC_SSE4 4
mova m0, [%1]
mova m1, [%1+mmsize]
PABSD m2, m0
PABSD m3, m1
paddd m2, %3
paddd m3, %3
pmulld m2, %2
pmulld m3, %2
psrad m2, 16
psrad m3, 16
PSIGND m2, m0
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
%if %4
por m5, m2
%else
SWAP m5, m2
%endif
por m5, m3
%endmacro
; Quantize one register of 32-bit AC coefficients at [%1] in place:
;   out = sign(dct) * ((|dct| + bias[i]) * mf[i] >> 16)
; Unlike the DC path, mf (%2) and bias (%3) are per-coefficient vectors loaded
; from memory.  Pre-SSE4 32x32 multiply: pmuludq on even lanes (m1*m2) and on
; the odd lanes shifted down (m3*m4), then odd products shifted back up.
; %4 nonzero = OR result into m5 accumulator, zero = initialize m5.
%macro QUANT_ONE_AC_MMX 4
mova m0, [%1]
mova m2, [%2]
PABSD m1, m0 ; m1 = |dct|
mova m4, m2
paddd m1, [%3] ; + bias vector
mova m3, m1
psrlq m4, 32 ; odd mf lanes down
psrlq m3, 32 ; odd coeff lanes down
pmuludq m1, m2
pmuludq m3, m4
psllq m3, 32
paddd m1, m3
psrad m1, 16 ; arithmetic shift here, vs psrld in the DC path
PSIGND m1, m0
mova [%1], m1
%if %4
por m5, m1
%else
SWAP m5, m1
%endif
%endmacro
; Two consecutive AC groups; the second call's flag (%4+mmsize) is nonzero,
; so it always ORs into the m5 accumulator.
%macro QUANT_TWO_AC_MMX 4
QUANT_ONE_AC_MMX %1, %2, %3, %4
QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize
%endmacro
; SSE4 AC quant, two registers at a time: pmulld against the per-coefficient
; mf vector (%2), bias vector added from %3.  Interleaved like
; QUANT_TWO_DC_SSE4; both results fold into the m5 nonzero accumulator.
%macro QUANT_TWO_AC_SSE4 4
mova m0, [%1]
mova m1, [%1+mmsize]
PABSD m2, m0
PABSD m3, m1
paddd m2, [%3]
paddd m3, [%3+mmsize]
pmulld m2, [%2]
pmulld m3, [%2+mmsize]
psrad m2, 16
psrad m3, 16
PSIGND m2, m0
PSIGND m3, m1
mova [%1], m2
mova [%1+mmsize], m3
%if %4
por m5, m2
%else
SWAP m5, m2
%endif
por m5, m3
%endmacro
;-----------------------------------------------------------------------------
; int quant_2x2( int32_t dct[M*N], int mf, int bias )
;-----------------------------------------------------------------------------
; Emit int quant_%1x%2_dc_%3( int32_t dct[%1*%2], int mf, int bias ).
; Broadcasts mf/bias via QUANT_DC_START_MMX, then quantizes %1*%2 dword
; coefficients: one register if they fit (%1*%2 <= mmsize/4 dwords),
; otherwise a fully unrolled loop of two-register QUANT_TWO_DC groups
; (each group covers mmsize/2 dwords = mmsize*2 bytes).
; QUANT_END leaves eax = !!(any nonzero coefficient).
%macro QUANT_DC 3
cglobal quant_%1x%2_dc_%3, 3,3,8*(mmsize/16)
QUANT_DC_START_MMX
%if %1*%2 <= mmsize/4
QUANT_ONE_DC r0, m6, m7, 0
%else
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_DC r0+x, m6, m7, x
%assign x x+mmsize*2
%endrep
%endif
QUANT_END
RET
%endmacro
;-----------------------------------------------------------------------------
; int quant_MxN( int32_t dct[M*N], uint32_t mf[M*N], uint32_t bias[M*N] )
;-----------------------------------------------------------------------------
; Emit int quant_%1x%2_%3( int32_t dct[N], uint32_t mf[N], uint32_t bias[N] ).
; Fully unrolled: each QUANT_TWO_AC group handles two registers
; (mmsize*2 bytes) of dct/mf/bias in lockstep.  eax = !!(any nonzero).
%macro QUANT_AC 3
cglobal quant_%1x%2_%3, 3,3,8*(mmsize/16)
%assign x 0
%rep %1*%2/(mmsize/2)
QUANT_TWO_AC r0+x, r1+x, r2+x, x
%assign x x+mmsize*2
%endrep
QUANT_END
RET
%endmacro
; Instantiate the high-bit-depth quant entry points for each ISA level by
; binding the helper macros, emitting the templates, then rebinding.
%define QUANT_TWO_AC QUANT_TWO_AC_MMX
%define QUANT_ONE_DC QUANT_ONE_DC_MMX
%define QUANT_TWO_DC QUANT_TWO_DC_MMX
%define QUANT_END QUANT_END_MMX
%define PABSD PABSD_MMX
%define PSIGND PSIGND_MMX
INIT_MMX
QUANT_DC 2, 2, mmxext
QUANT_DC 4, 4, mmxext
QUANT_AC 4, 4, mmx
QUANT_AC 8, 8, mmx
INIT_XMM
QUANT_DC 2, 2, sse2
QUANT_DC 4, 4, sse2
QUANT_AC 4, 4, sse2
QUANT_AC 8, 8, sse2
%define PABSD PABSD_SSSE3
%define PSIGND PSIGND_SSSE3
QUANT_DC 2, 2, ssse3
QUANT_DC 4, 4, ssse3
QUANT_AC 4, 4, ssse3
QUANT_AC 8, 8, ssse3
%define QUANT_TWO_AC QUANT_TWO_AC_SSE4
%define QUANT_ONE_DC QUANT_ONE_DC_SSE4
%define QUANT_TWO_DC QUANT_TWO_DC_SSE4
%define QUANT_END QUANT_END_SSE4
QUANT_DC 2, 2, sse4
QUANT_DC 4, 4, sse4
QUANT_AC 4, 4, sse4
QUANT_AC 8, 8, sse4
; Undefine every helper binding so the low-bit-depth section below starts
; clean.  Fixed: this previously read "%undef SIGND" (a name that was never
; defined), silently leaving PSIGND bound past this section.
%undef PSIGND
%undef PABSD
%undef QUANT_END
%undef QUANT_TWO_AC
%undef QUANT_ONE_DC
%undef QUANT_TWO_DC
%endif ; X264_HIGH_BIT_DEPTH
%ifndef X264_HIGH_BIT_DEPTH
%macro QUANT_ONE 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
......@@ -157,35 +396,6 @@ cextern pb_01
%endif
%endmacro
; 16-bit-depth copy of QUANT_END: fold the nonzero accumulator m5 into
; eax = 1 if any quantized coefficient was nonzero, else 0.
; Same logic as the high-bit-depth copy above, with the 32/64-bit branches
; written in the opposite order.
%macro QUANT_END_MMX 0
xor eax, eax
%ifndef ARCH_X86_64
%if mmsize==8
packsswb m5, m5
movd ecx, m5
test ecx, ecx
%else
pxor m4, m4
pcmpeqb m5, m4 ; 0xFF per zero byte
pmovmskb ecx, m5
cmp ecx, (1<<mmsize)-1 ; all-zero -> mask 0xFFFF -> ZF=1
%endif
%else
%if mmsize==16
packsswb m5, m5 ; narrow so low 64 bits carry all lanes
%endif
movq rcx, m5
test rcx, rcx
%endif
setne al
%endmacro
; SSE4 variant: ptest sets ZF iff m5 is entirely zero.
%macro QUANT_END_SSE4 0
xor eax, eax
ptest m5, m5
setne al
%endmacro
;-----------------------------------------------------------------------------
; void quant_4x4_dc( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
......@@ -251,6 +461,7 @@ INIT_XMM
QUANT_DC quant_4x4_dc_sse4, 2, 8
QUANT_AC quant_4x4_sse4, 2
QUANT_AC quant_8x8_sse4, 8
%endif ; !X264_HIGH_BIT_DEPTH
......
......@@ -28,20 +28,22 @@
#ifndef X264_I386_QUANT_H
#define X264_I386_QUANT_H
int x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias );
int x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_mmx( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
int x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
int x264_quant_2x2_dc_ssse3( int16_t dct[4], int mf, int bias );
int x264_quant_4x4_dc_ssse3( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_ssse3( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_ssse3( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
int x264_quant_4x4_dc_sse4( int16_t dct[16], int mf, int bias );
int x264_quant_4x4_sse4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
int x264_quant_8x8_sse4( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
int x264_quant_2x2_dc_mmxext( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_mmxext( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_mmx( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_mmx( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
int x264_quant_2x2_dc_sse4( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_mmxext( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
......
......@@ -602,3 +602,11 @@
pshufw %1, %2, %3*0x55
%endif
%endmacro
; Broadcast dword lane %3 of %2 into every dword lane of %1.
; %3 defaults to 0; valid range 0-3 for xmm (pshufd), 0-1 for mmx (pshufw).
%macro SPLATD 2-3 0
%if mmsize == 16
pshufd %1, %2, %3*0x55
%else
; Replicate words 2*%3 and 2*%3+1 in (lo,hi,lo,hi) order:
;   control = (2*%3) | (2*%3+1)<<2 | (2*%3)<<4 | (2*%3+1)<<6
;           = %3*0xAA + 0x44   (0x44 for dword 0, 0xEE for dword 1)
; Fixed: the previous %3*0x11 + (%3+1)*0x44 evaluated to 0x99 for %3==1,
; selecting words 1,2,1,2 instead of 2,3,2,3 (correct only for %3==0).
pshufw %1, %2, %3*0xAA + 0x44
%endif
%endmacro
......@@ -415,7 +415,7 @@ typedef struct {
static ALWAYS_INLINE
int quant_trellis_cabac( x264_t *h, dctcoef *dct,
const uint16_t *quant_mf, const int *unquant_mf,
const udctcoef *quant_mf, const int *unquant_mf,
const int *coef_weight, const uint8_t *zigzag,
int ctx_block_cat, int i_lambda2, int b_ac,
int dc, int i_coefs, int idx )
......@@ -659,7 +659,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
* such a way that trailing ones and suffix length isn't affected. */
static ALWAYS_INLINE
int quant_trellis_cavlc( x264_t *h, dctcoef *dct,
const uint16_t *quant_mf, const int *unquant_mf,
const udctcoef *quant_mf, const int *unquant_mf,
const int *coef_weight, const uint8_t *zigzag,
int ctx_block_cat, int i_lambda2, int b_ac,
int dc, int i_coefs, int idx, int b_8x8 )
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment