Commit d18bbd3b authored by Loren Merritt

mmx dequant. up to 3% speedup w/ RD.



git-svn-id: svn://svn.videolan.org/x264/trunk@364 df754926-b1dd-0310-bc7b-ec298dee348c
parent d447c2d3
......@@ -35,7 +35,9 @@ BITS 64
%include "amd64inc.asm"
ALIGN 16
SECTION .rodata
; 8-byte constants holding 1 in every lane. The right-shift dequant paths in
; this file load them and shift them to build the rounding term.
; pw_1: four 16-bit words, each equal to 1
pw_1: times 4 dw 1
; pd_1: two 32-bit dwords, each equal to 1
pd_1: times 2 dd 1
SECTION .text
......@@ -54,6 +56,9 @@ cglobal x264_quant_4x4_dc_core32_mmxext
cglobal x264_quant_4x4_core32_mmxext
cglobal x264_quant_8x8_core32_mmxext
cglobal x264_dequant_4x4_mmx
cglobal x264_dequant_8x8_mmx
%macro MMX_QUANT_AC_START 0
; mov rdi, rdi ; &dct[0][0]
; mov rsi, rsi ; &quant_mf[0][0]
......@@ -374,3 +379,139 @@ x264_quant_8x8_core32_mmxext:
ret
;=============================================================================
; dequant
;=============================================================================
%macro DEQUANT16_L_1x4 3
;;; Dequantize four 16-bit coefficients in place with a left shift:
;;;   dct = ( dct * dequant_mf ) << i_qbits, computed in 16-bit lanes.
;;; %1 dct[y][x]                  (8 bytes: four words, read then written back)
;;; %2,%3 dequant_mf[i_mf][y][x]  (two 8-byte operands, two dwords each)
;;; mm5 i_qbits                   (shift count, loaded by the caller)
;;; Overwrites mm0, mm1, mm2.
movq mm1, %2
movq mm2, %3
movq mm0, %1
; narrow the four dword multipliers to words (signed saturation)
packssdw mm1, mm2
; keep the low 16 bits of each product
pmullw mm0, mm1
psllw mm0, mm5
movq %1, mm0
%endmacro
%macro DEQUANT16_R_1x4 3
;;; Dequantize four 16-bit coefficients in place with rounding and an
;;; arithmetic right shift, all in 16-bit lanes:
;;;   dct = ( dct * dequant_mf + f ) >> -i_qbits
;;; %1 dct[y][x]                  (8 bytes: four words, read then written back)
;;; %2,%3 dequant_mf[i_mf][y][x]  (two 8-byte operands, two dwords each)
;;; mm5 -i_qbits                  (shift count, loaded by the caller)
;;; mm6 f as words                (rounding term replicated in every word)
;;; Overwrites mm0, mm1, mm2.
movq mm1, %2
movq mm2, %3
movq mm0, %1
; narrow the four dword multipliers to words (signed saturation)
packssdw mm1, mm2
; keep the low 16 bits of each product
pmullw mm0, mm1
paddw mm0, mm6
psraw mm0, mm5
movq %1, mm0
%endmacro
%macro DEQUANT32_R_1x4 3
;;; Dequantize four 16-bit coefficients in place with rounding and an
;;; arithmetic right shift, carrying each product as a 32-bit dword:
;;;   dct = ( dct * dequant_mf + f ) >> -i_qbits
;;; %1 dct[y][x]                  (8 bytes: four words, read then written back)
;;; %2,%3 dequant_mf[i_mf][y][x]  (two 8-byte operands, two dwords each)
;;; mm5 -i_qbits                  (shift count, loaded by the caller)
;;; mm6 f as dwords               (rounding term replicated in both dwords)
;;; mm7 0                         (zeroed by the caller; no instruction in this macro reads it)
;;; Overwrites mm0, mm1, mm2, mm3.
;;; NOTE(review): the high/low recombination below yields the exact 32-bit
;;; product only when the upper word of each multiplier dword is zero and the
;;; lower word is below 0x8000 -- presumably true for dequant_mf; confirm.
movq mm0, %1
movq mm1, mm0
; duplicate coefficients 0 and 1 so each fills both words of one dword
punpcklwd mm0, mm0
; same for coefficients 2 and 3
punpckhwd mm1, mm1
movq mm2, mm0
movq mm3, mm1
; high 16 bits of the signed word products
pmulhw mm0, %2
pmulhw mm1, %3
; low 16 bits of the same products
pmullw mm2, %2
pmullw mm3, %3
; move the high halves into the upper word of each dword
pslld mm0, 16
pslld mm1, 16
; add the low halves back in to form the 32-bit products
paddd mm0, mm2
paddd mm1, mm3
paddd mm0, mm6
paddd mm1, mm6
psrad mm0, mm5
psrad mm1, mm5
; narrow the four dword results to words (signed saturation)
packssdw mm0, mm1
movq %1, mm0
%endmacro
%macro DEQUANT_WxH 3
ALIGN 16
;;; void %1( int16_t dct[H][W], int dequant_mf[6][H][W], int i_qp )
;;; Generates one in-place dequant function.
;;;   %1  label of the generated function
;;;   %2  number of 4-coefficient groups to process (W*H/4)
;;;   %3  log2(W*H); also the amount subtracted from i_qp/6 to get i_qbits
;;; In:   rdi = dct, rsi = dequant_mf, edx = i_qp
;;; Uses: eax, ecx, rdx, rsi, rdi, mm0-mm3, mm5-mm7, flags
;;; No emms is executed here.
%1:
; eax = i_qp / 6, done as (i_qp * 43) >> 8; exact for the QP range 0..51
imul eax, edx, 0x2b
shr eax, 8
; edx = i_mf = i_qp - 6 * (i_qp / 6)
lea ecx, [eax+eax*2]
add ecx, ecx
sub edx, ecx
; rsi = &dequant_mf[i_mf]: each table holds 1<<%3 ints of 4 bytes
shl edx, %3+2
movsxd rdx, edx
add rsi, rdx
; eax = i_qbits; one compare selects among the three code paths
sub eax, %3
cmp eax, -1
jl .rshift32 ; i_qbits <= -2: dct * dequant overflows 16bit
je .rshift16 ; i_qbits == -1: rightshift in 16-bit lanes
.lshift:
; i_qbits >= 0
movd mm5, eax
%rep %2
DEQUANT16_L_1x4 [rdi], [rsi], [rsi+8]
add rdi, byte 8
add rsi, byte 16
%endrep
ret
.rshift16:
neg eax
movq mm6, [pw_1]
movd mm5, eax
pxor mm7, mm7
; mm6 = f = (1 << -i_qbits) >> 1 in every word
psllw mm6, mm5
psrlw mm6, 1
%rep %2
DEQUANT16_R_1x4 [rdi], [rsi], [rsi+8]
add rdi, byte 8
add rsi, byte 16
%endrep
ret
.rshift32:
neg eax
movq mm6, [pd_1]
movd mm5, eax
pxor mm7, mm7
; mm6 = f = (1 << -i_qbits) >> 1 in both dwords
pslld mm6, mm5
psrld mm6, 1
%rep %2
DEQUANT32_R_1x4 [rdi], [rsi], [rsi+8]
add rdi, byte 8
add rsi, byte 16
%endrep
ret
%endmacro
; 4x4: 16 coefficients = 4 groups, i_qbits = i_qp/6 - 4
DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
; 8x8: 64 coefficients = 16 groups, i_qbits = i_qp/6 - 6
DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6
......@@ -42,7 +42,9 @@ BITS 32
%endif
%endmacro
ALIGN 16
SECTION .rodata
; 8-byte constants holding 1 in every lane. The right-shift dequant paths in
; this file load them and shift them to build the rounding term.
; pw_1: four 16-bit words, each equal to 1
pw_1: times 4 dw 1
; pd_1: two 32-bit dwords, each equal to 1
pd_1: times 2 dd 1
SECTION .text
......@@ -61,6 +63,9 @@ cglobal x264_quant_4x4_dc_core32_mmxext
cglobal x264_quant_4x4_core32_mmxext
cglobal x264_quant_8x8_core32_mmxext
cglobal x264_dequant_4x4_mmx
cglobal x264_dequant_8x8_mmx
%macro MMX_QUANT_AC_START 0
mov eax, [esp+ 4] ; &dct[0][0]
mov ecx, [esp+ 8] ; &quant_mf[0][0]
......@@ -381,3 +386,145 @@ x264_quant_8x8_core32_mmxext:
ret
;=============================================================================
; dequant
;=============================================================================
%macro DEQUANT16_L_1x4 3
;;; Dequantize four 16-bit coefficients in place with a left shift:
;;;   dct = ( dct * dequant_mf ) << i_qbits, computed in 16-bit lanes.
;;; %1 dct[y][x]                  (8 bytes: four words, read then written back)
;;; %2,%3 dequant_mf[i_mf][y][x]  (two 8-byte operands, two dwords each)
;;; mm5 i_qbits                   (shift count, loaded by the caller)
;;; Overwrites mm0, mm1, mm2.
movq mm1, %2
movq mm2, %3
movq mm0, %1
; narrow the four dword multipliers to words (signed saturation)
packssdw mm1, mm2
; keep the low 16 bits of each product
pmullw mm0, mm1
psllw mm0, mm5
movq %1, mm0
%endmacro
%macro DEQUANT16_R_1x4 3
;;; Dequantize four 16-bit coefficients in place with rounding and an
;;; arithmetic right shift, all in 16-bit lanes:
;;;   dct = ( dct * dequant_mf + f ) >> -i_qbits
;;; %1 dct[y][x]                  (8 bytes: four words, read then written back)
;;; %2,%3 dequant_mf[i_mf][y][x]  (two 8-byte operands, two dwords each)
;;; mm5 -i_qbits                  (shift count, loaded by the caller)
;;; mm6 f as words                (rounding term replicated in every word)
;;; Overwrites mm0, mm1, mm2.
movq mm1, %2
movq mm2, %3
movq mm0, %1
; narrow the four dword multipliers to words (signed saturation)
packssdw mm1, mm2
; keep the low 16 bits of each product
pmullw mm0, mm1
paddw mm0, mm6
psraw mm0, mm5
movq %1, mm0
%endmacro
%macro DEQUANT32_R_1x4 3
;;; Dequantize four 16-bit coefficients in place with rounding and an
;;; arithmetic right shift, carrying each product as a 32-bit dword:
;;;   dct = ( dct * dequant_mf + f ) >> -i_qbits
;;; %1 dct[y][x]                  (8 bytes: four words, read then written back)
;;; %2,%3 dequant_mf[i_mf][y][x]  (two 8-byte operands, two dwords each)
;;; mm5 -i_qbits                  (shift count, loaded by the caller)
;;; mm6 f as dwords               (rounding term replicated in both dwords)
;;; mm7 0                         (zeroed by the caller; no instruction in this macro reads it)
;;; Overwrites mm0, mm1, mm2, mm3.
;;; NOTE(review): the high/low recombination below yields the exact 32-bit
;;; product only when the upper word of each multiplier dword is zero and the
;;; lower word is below 0x8000 -- presumably true for dequant_mf; confirm.
movq mm0, %1
movq mm1, mm0
; duplicate coefficients 0 and 1 so each fills both words of one dword
punpcklwd mm0, mm0
; same for coefficients 2 and 3
punpckhwd mm1, mm1
movq mm2, mm0
movq mm3, mm1
; high 16 bits of the signed word products
pmulhw mm0, %2
pmulhw mm1, %3
; low 16 bits of the same products
pmullw mm2, %2
pmullw mm3, %3
; move the high halves into the upper word of each dword
pslld mm0, 16
pslld mm1, 16
; add the low halves back in to form the 32-bit products
paddd mm0, mm2
paddd mm1, mm3
paddd mm0, mm6
paddd mm1, mm6
psrad mm0, mm5
psrad mm1, mm5
; narrow the four dword results to words (signed saturation)
packssdw mm0, mm1
movq %1, mm0
%endmacro
%macro DEQUANT_WxH 3
ALIGN 16
;;; void %1( int16_t dct[H][W], int dequant_mf[6][H][W], int i_qp )
;;; Generates one in-place dequant function.
;;;   %1  label of the generated function
;;;   %2  number of 4-coefficient groups to process (W*H/4); the loops below
;;;       handle two groups per iteration, so it must be even
;;;   %3  log2(W*H); also the amount subtracted from i_qp/6 to get i_qbits
;;; In:   [esp+4] = dct, [esp+8] = dequant_mf, [esp+12] = i_qp
;;; Uses: eax, ecx, edx, mm0-mm3, mm5-mm7, flags
;;; No emms is executed here.
%1:
mov edx, [esp+12] ; i_qp
; eax = i_qp / 6, done as (i_qp * 43) >> 8; exact for the QP range 0..51
imul eax, edx, 0x2b
shr eax, 8
lea ecx, [eax+eax*2]
sub edx, ecx
sub edx, ecx ; i_mf = i_qp % 6
shl edx, %3+2 ; byte offset of table i_mf: 1<<%3 ints of 4 bytes each
add edx, [esp+8] ; dequant_mf[i_mf]
mov ecx, [esp+4] ; dct
sub eax, %3 ; i_qbits
jge .lshift
cmp eax, byte -1
je .rshift16 ; negative qbits => rightshift
jmp .rshift32 ; dct * dequant overflows 16bit
.lshift:
movd mm5, eax
; eax = byte offset of the last group in dct; the loop walks it down to 0.
; The matching dequant_mf offset is eax*2 (4-byte ints vs 2-byte coefficients).
mov eax, 8*(%2-1)
.loopl16:
%rep 2
DEQUANT16_L_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
sub eax, byte 8
%endrep
jge .loopl16 ; flags come from the last sub; the MMX ops leave them alone
; NOTE(review): the nop presumably keeps ret from directly following the
; conditional jump (branch-prediction workaround) -- confirm before removing.
nop
ret
.rshift16:
neg eax
movq mm6, [pw_1]
movd mm5, eax
pxor mm7, mm7
; mm6 = f = (1 << -i_qbits) >> 1 in every word
psllw mm6, mm5
psrlw mm6, 1
mov eax, 8*(%2-1)
.loopr16:
%rep 2
DEQUANT16_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
sub eax, byte 8
%endrep
jge .loopr16
nop
ret
.rshift32:
neg eax
movq mm6, [pd_1]
movd mm5, eax
pxor mm7, mm7
; mm6 = f = (1 << -i_qbits) >> 1 in both dwords
pslld mm6, mm5
psrld mm6, 1
mov eax, 8*(%2-1)
.loopr32:
%rep 2
DEQUANT32_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8]
sub eax, byte 8
%endrep
jge .loopr32
nop
ret
%endmacro
; 4x4: 16 coefficients = 4 groups, i_qbits = i_qp/6 - 4
DEQUANT_WxH x264_dequant_4x4_mmx, 4, 4
; 8x8: 64 coefficients = 16 groups, i_qbits = i_qp/6 - 6
DEQUANT_WxH x264_dequant_8x8_mmx, 16, 6
......@@ -50,4 +50,7 @@ void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
int const i_qmf, int const i_qbits, int const f );
/* MMX dequantizers implemented in assembly. Each rescales dct in place using
 * table dequant_mf[i_qp%6] and a shift derived from i_qp/6. */
void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
void x264_dequant_8x8_mmx( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
#endif
......@@ -84,132 +84,6 @@ int x264_mb_transform_8x8_allowed( x264_t *h )
return 1;
}
/****************************************************************************
* Scan and Quant functions
****************************************************************************/
/* Dequantize the 2x2 chroma DC block in place.
 * All four coefficients use entry [0][0] of table i_qscale%6; the shift is
 * i_qscale/6 - 5. */
void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale )
{
    const int i_dmf = dequant_mf[i_qscale%6][0][0];
    const int i_qbits = i_qscale/6 - 5;
    int i, j;

    for( i = 0; i < 2; i++ )
        for( j = 0; j < 2; j++ )
        {
            if( i_qbits >= 0 )
                dct[i][j] = dct[i][j] * ( i_dmf << i_qbits );
            else
                /* chroma DC is truncated, not rounded */
                dct[i][j] = ( dct[i][j] * i_dmf ) >> (-i_qbits);
        }
}
/* Dequantize the 4x4 luma DC block in place.
 * Every coefficient uses entry [0][0] of table i_qscale%6; the shift is
 * i_qscale/6 - 6, with rounding when it is a right shift. */
void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale )
{
    const int i_dmf = dequant_mf[i_qscale%6][0][0];
    const int i_qbits = i_qscale/6 - 6;
    int y, x;

    if( i_qbits < 0 )
    {
        const int i_shift = -i_qbits;
        const int f = 1 << (i_shift-1);
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
                dct[y][x] = ( dct[y][x] * i_dmf + f ) >> i_shift;
    }
    else
    {
        const int i_scale = i_dmf << i_qbits;
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
                dct[y][x] = dct[y][x] * i_scale;
    }
}
/* Dequantize a 4x4 block in place.
 * dct        coefficients, rescaled in place
 * dequant_mf per-position multipliers; table i_qscale%6 is used
 * i_qscale   quantizer; the shift is i_qscale/6 - 4
 * A non-negative shift scales the product up; a negative one rounds and
 * shifts right. */
void x264_mb_dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale )
{
    const int i_mf = i_qscale%6;
    const int i_qbits = i_qscale/6 - 4;
    int y, x;

    if( i_qbits >= 0 )
    {
        /* Scale by multiplication: the product can be negative, and
         * left-shifting a negative int is undefined behaviour in C. */
        const int i_scale = 1 << i_qbits;
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
                dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) * i_scale;
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
                dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits);
    }
}
/* Dequantize an 8x8 block in place.
 * dct        coefficients, rescaled in place
 * dequant_mf per-position multipliers; table i_qscale%6 is used
 * i_qscale   quantizer; the shift is i_qscale/6 - 6
 * A non-negative shift scales the product up; a negative one rounds and
 * shifts right. */
void x264_mb_dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qscale )
{
    const int i_mf = i_qscale%6;
    const int i_qbits = i_qscale/6 - 6;
    int y, x;

    if( i_qbits >= 0 )
    {
        /* Scale by multiplication: the product can be negative, and
         * left-shifting a negative int is undefined behaviour in C. */
        const int i_scale = 1 << i_qbits;
        for( y = 0; y < 8; y++ )
            for( x = 0; x < 8; x++ )
                dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) * i_scale;
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( y = 0; y < 8; y++ )
            for( x = 0; x < 8; x++ )
                dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits);
    }
}
void x264_mb_predict_mv( x264_t *h, int i_list, int idx, int i_width, int mvp[2] )
{
const int i8 = x264_scan8[idx];
......
......@@ -218,11 +218,6 @@ void x264_macroblock_cache_end( x264_t *h );
void x264_macroblock_bipred_init( x264_t *h );
/* In-place dequantization of residual coefficients. i_qscale selects the
 * dequant_mf table (i_qscale%6) and the shift (derived from i_qscale/6).
 * The _dc variants apply entry [0][0] of that table to every coefficient. */
void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qscale );
void x264_mb_dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qscale );
void x264_mb_dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qscale );
/* x264_mb_predict_mv_16x16:
* set mvp with predicted mv for D_16x16 block
* h->mb. need only valid values from other blocks */
......
......@@ -63,6 +63,132 @@ static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, i
QUANT_ONE( dct[0][3], i_quant_mf );
}
/* Per-coefficient dequant helpers. Both expect dct, dequant_mf, i_mf,
 * i_qbits and y in scope; DEQUANT_SHR additionally expects the rounding
 * term f.
 * DEQUANT_SHL scales by multiplication: the product can be negative, and
 * left-shifting a negative int is undefined behaviour in C. */
#define DEQUANT_SHL( x ) \
    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] ) * ( 1 << i_qbits )
#define DEQUANT_SHR( x ) \
    dct[y][x] = ( dct[y][x] * dequant_mf[i_mf][y][x] + f ) >> (-i_qbits)

/* Dequantize a 4x4 block in place.
 * Table i_qp%6 supplies the multipliers; the shift is i_qp/6 - 4.
 * A negative shift rounds and shifts right. */
static void dequant_4x4( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
{
    const int i_mf = i_qp%6;
    const int i_qbits = i_qp/6 - 4;
    int y, x;

    if( i_qbits >= 0 )
    {
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
                DEQUANT_SHL( x );
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( y = 0; y < 4; y++ )
            for( x = 0; x < 4; x++ )
                DEQUANT_SHR( x );
    }
}

/* Dequantize an 8x8 block in place.
 * Table i_qp%6 supplies the multipliers; the shift is i_qp/6 - 6.
 * A negative shift rounds and shifts right. */
static void dequant_8x8( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
{
    const int i_mf = i_qp%6;
    const int i_qbits = i_qp/6 - 6;
    int y, x;

    if( i_qbits >= 0 )
    {
        for( y = 0; y < 8; y++ )
            for( x = 0; x < 8; x++ )
                DEQUANT_SHL( x );
    }
    else
    {
        const int f = 1 << (-i_qbits-1);
        for( y = 0; y < 8; y++ )
            for( x = 0; x < 8; x++ )
                DEQUANT_SHR( x );
    }
}
/* Dequantize the 2x2 chroma DC block in place.
 * A single multiplier, entry [0][0] of table i_qp%6, is applied to all four
 * coefficients; the shift is i_qp/6 - 5. */
void x264_mb_dequant_2x2_dc( int16_t dct[2][2], int dequant_mf[6][4][4], int i_qp )
{
    const int i_qbits = i_qp/6 - 5;
    const int i_dmf = dequant_mf[i_qp%6][0][0];
    int row, col;

    if( i_qbits < 0 )
    {
        /* chroma DC is truncated, not rounded */
        const int i_shift = -i_qbits;
        for( row = 0; row < 2; row++ )
            for( col = 0; col < 2; col++ )
                dct[row][col] = ( dct[row][col] * i_dmf ) >> i_shift;
    }
    else
    {
        const int i_scale = i_dmf << i_qbits;
        for( row = 0; row < 2; row++ )
            for( col = 0; col < 2; col++ )
                dct[row][col] = dct[row][col] * i_scale;
    }
}
/* Dequantize the 4x4 luma DC block in place.
 * A single multiplier, entry [0][0] of table i_qp%6, is applied to all 16
 * coefficients; the shift is i_qp/6 - 6, rounded when it is a right shift. */
void x264_mb_dequant_4x4_dc( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
{
    const int i_qbits = i_qp/6 - 6;
    const int i_dmf = dequant_mf[i_qp%6][0][0];
    int row, col;

    if( i_qbits < 0 )
    {
        const int i_shift = -i_qbits;
        const int f = 1 << (i_shift-1);
        for( row = 0; row < 4; row++ )
            for( col = 0; col < 4; col++ )
                dct[row][col] = ( dct[row][col] * i_dmf + f ) >> i_shift;
    }
    else
    {
        const int i_scale = i_dmf << i_qbits;
        for( row = 0; row < 4; row++ )
            for( col = 0; col < 4; col++ )
                dct[row][col] = dct[row][col] * i_scale;
    }
}
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
......@@ -73,6 +199,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4_dc_core = quant_4x4_dc_core;
pf->quant_2x2_dc_core = quant_2x2_dc_core;
pf->dequant_4x4 = dequant_4x4;
pf->dequant_8x8 = dequant_8x8;
#ifdef HAVE_MMXEXT
/* determine the biggest coefficient in all quant8_mf tables */
......@@ -133,5 +262,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
}
if( cpu&X264_CPU_MMXEXT )
{
/* dequant is not subject to the above CQM-dependent overflow issues,
* as long as the inputs are in the range generable by dct+quant.
* that is not guaranteed by the standard, but is true within x264 */
pf->dequant_4x4 = x264_dequant_4x4_mmx;
pf->dequant_8x8 = x264_dequant_8x8_mmx;
}
#endif /* HAVE_MMXEXT */
}
......@@ -29,8 +29,14 @@ typedef struct
void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );