Commit e1d852d2 authored by Loren Merritt
Browse files

dequant_mmx made incorrect assumptions about extreme inputs. It now uses 32-bit arithmetic in more cases.

patch by Christian Heine.



git-svn-id: svn://svn.videolan.org/x264/trunk@428 df754926-b1dd-0310-bc7b-ec298dee348c
parent fed2847c
......@@ -36,7 +36,6 @@ BITS 64
%include "amd64inc.asm"
SECTION .rodata
pw_1: times 4 dw 1
pd_1: times 2 dd 1
SECTION .text
......@@ -398,22 +397,6 @@ x264_quant_8x8_core32_mmxext:
movq %1, mm0
%endmacro
%macro DEQUANT16_R_1x4 3
;;; Dequantize four 16-bit coefficients in place using a rounding right
;;; shift, with every intermediate kept in 16-bit lanes:
;;;     %1 = (%1 * mf + f) >> shift      (per word)
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
;;; mm5 -i_qbits
;;; mm6 f as words
;;; Overwrites mm0, mm1 and mm2.
;;; NOTE(review): pmullw keeps only the low 16 bits of each product, so this
;;; path is only exact while coefficient * multiplier fits in a word; the
;;; commit message says that assumption did not hold for extreme inputs.
movq mm1, %2 ; two dword multipliers
movq mm2, %3 ; the next two dword multipliers
movq mm0, %1 ; four word coefficients
packssdw mm1, mm2 ; narrow the four multipliers to words (signed saturation)
pmullw mm0, mm1 ; low 16 bits of coefficient * multiplier
paddw mm0, mm6 ; add the rounding term f
psraw mm0, mm5 ; arithmetic right shift by the count held in mm5
movq %1, mm0 ; write the four results back over the input
%endmacro
%macro DEQUANT32_R_1x4 3
;;; %1 dct[y][x]
;;; %2,%3 dequant_mf[i_mf][y][x]
......@@ -464,10 +447,7 @@ ALIGN 16
add rsi, rdx ; dequant_mf[i_mf]
sub eax, %3
cmp eax, -2
jle .rshift32 ; dct * dequant overflows 16bit
cmp eax, -1
jle .rshift16 ; negative qbits => rightshift
jl .rshift32 ; negative qbits => rightshift
.lshift:
movd mm5, eax
......@@ -480,22 +460,6 @@ ALIGN 16
ret
;;; 16-bit right-shift dequant path (amd64 variant).
;;; Reached with a negative shift amount in eax (see the "negative qbits =>
;;; rightshift" branch). Expands DEQUANT16_R_1x4 %2 times, each expansion
;;; handling four coefficients at rdi with multipliers at rsi.
.rshift16:
neg eax ; turn the negative shift amount into a positive right-shift count
movd mm5, eax ; mm5 = shift count used by psraw inside DEQUANT16_R_1x4
movq mm6, [pw_1 GLOBAL] ; mm6 = four words of 1
pxor mm7, mm7 ; mm7 = 0 (not read by DEQUANT16_R_1x4 as shown in this view)
psllw mm6, mm5 ; each word = 1 << shift
psrlw mm6, 1 ; each word = 1 << (shift-1): the rounding term f
%rep %2
DEQUANT16_R_1x4 [rdi], [rsi], [rsi+8]
add rsi, byte 16 ; advance past four dword multipliers
add rdi, byte 8 ; advance past four word coefficients
%endrep
ret
.rshift32:
neg eax
movd mm5, eax
......
......@@ -36,7 +36,6 @@ BITS 32
%include "i386inc.asm"
SECTION .rodata
pw_1: times 4 dw 1
pd_1: times 2 dd 1
SECTION .text
......@@ -461,10 +460,7 @@ ALIGN 16
mov ecx, [esp+4] ; dct
sub eax, %3
jge .lshift
cmp eax, byte -1
je .rshift16 ; negative qbits => rightshift
jmp .rshift32 ; dct * dequant overflows 16bit
jl .rshift32 ; negative qbits => rightshift
.lshift:
movd mm5, eax
......@@ -480,28 +476,6 @@ ALIGN 16
nop
ret
;;; 16-bit right-shift dequant path (i386 variant).
;;; Reached with a negative shift amount in eax. ecx holds the dct pointer;
;;; edx is passed as the multiplier (dequant_mf) operand of DEQUANT16_R_1x4.
;;; Walks the block from its last group of coefficients down to offset 0.
.rshift16:
neg eax ; turn the negative shift amount into a positive right-shift count
picpush ebx ; NOTE(review): picpush/picgetgot/picpop are defined outside this view;
picgetgot ebx ; presumably they set up ebx so the GLOBAL reference works in PIC builds
movq mm6, [pw_1 GLOBAL] ; mm6 = four words of 1
picpop ebx
movd mm5, eax ; mm5 = shift count; eax is free to be reused after this
pxor mm7, mm7 ; mm7 = 0 (not read by DEQUANT16_R_1x4 as shown in this view)
psllw mm6, mm5 ; each word = 1 << shift
psrlw mm6, 1 ; each word = 1 << (shift-1): the rounding term f
mov eax, 8*(%2-1) ; byte offset of the last group of four word coefficients
.loopr16 ; two groups are processed per pass
%rep 2
DEQUANT16_R_1x4 [ecx+eax], [edx+eax*2], [edx+eax*2+8] ; multipliers are dwords, hence eax*2
sub eax, byte 8 ; step back one group of four coefficients
%endrep
jge .loopr16 ; keep going while the offset left by the last sub is still >= 0
nop
ret
.rshift32:
neg eax
picpush ebx
......
......@@ -262,7 +262,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmxext;
}
if( cpu&X264_CPU_MMXEXT )
if( cpu&X264_CPU_MMX )
{
/* dequant is not subject to the above CQM-dependent overflow issues,
* as long as the inputs are in the range generable by dct+quant.
......
......@@ -398,6 +398,30 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_init( h, cpu_ref, &qf_ref );
x264_quant_init( h, cpu_new, &qf_a );
/* INIT_QUANT8: fill the first 64 entries of dct1 and dct2 with the same
 * pseudo-random test coefficients for an 8x8 block.
 *
 * For position (y,x) the amplitude bound is
 *     scale = 255 * scale1d[y] * scale1d[x] / 16
 * and the value written is rand() % (2*scale+1) - scale, i.e. something in
 * [-scale, +scale]. Both arrays receive the identical value so that two
 * implementations can later be run on equal input.
 *
 * NOTE(review): scale is unsigned, so the subtraction wraps for results
 * below zero and the intended negative value only appears after conversion
 * to the element type of dct1/dct2 (not visible here) -- confirm that type
 * is a signed 16-bit integer.
 * NOTE(review): scale1d presumably models the per-row/column gain of the
 * forward 8x8 transform for 8-bit input -- confirm.
 *
 * Expects dct1 and dct2 to be in scope; declares its own x and y. */
#define INIT_QUANT8() \
{ \
static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
int x, y; \
for( y = 0; y < 8; y++ ) \
for( x = 0; x < 8; x++ ) \
{ \
unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
} \
}
/* INIT_QUANT4: fill the first 16 entries of dct1 and dct2 with the same
 * pseudo-random test coefficients for a 4x4 block.
 *
 * For position (y,x) the amplitude bound is
 *     scale = 255 * scale1d[y] * scale1d[x]
 * and the value written is rand() % (2*scale+1) - scale, i.e. something in
 * [-scale, +scale]. Both arrays receive the identical value so that two
 * implementations can later be run on equal input.
 *
 * NOTE(review): scale is unsigned, so the subtraction wraps for results
 * below zero and the intended negative value only appears after conversion
 * to the element type of dct1/dct2 (not visible here) -- confirm that type
 * is a signed 16-bit integer.
 * NOTE(review): scale1d presumably models the per-row/column gain of the
 * forward 4x4 transform for 8-bit input -- confirm.
 *
 * Expects dct1 and dct2 to be in scope; declares its own x and y. */
#define INIT_QUANT4() \
{ \
static const int scale1d[4] = {4,6,4,6}; \
int x, y; \
for( y = 0; y < 4; y++ ) \
for( x = 0; x < 4; x++ ) \
{ \
unsigned int scale = 255*scale1d[y]*scale1d[x]; \
dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
} \
}
#define TEST_QUANT( name, cqm ) \
if( qf_a.name != qf_ref.name ) \
{ \
......@@ -413,37 +437,97 @@ static int check_quant( int cpu_ref, int cpu_new )
} \
}
TEST_QUANT( quant_8x8_core, *h->quant8_mf[CQM_8IY] );
TEST_QUANT( quant_8x8_core, *h->quant8_mf[CQM_8PY] );
TEST_QUANT( quant_4x4_core, *h->quant4_mf[CQM_4IY] );
TEST_QUANT( quant_4x4_core, *h->quant4_mf[CQM_4PY] );
/* TEST_QUANT8: compare the 8x8 quantizer `qname` of the function table under
 * test (qf_a) against the one in qf_c across quantizer parameters.
 *
 *   qname    member name of the quant function in the function tables
 *   cqm      array of quant matrices, indexed with qp%6
 *   shift    base shift; the call passes shift + qp/6
 *   divider  rounding-bias divisor; the call passes
 *            (1 << (shift + qp/6)) / divider. The failure message reports
 *            divider==3 as intra=1.
 *
 * Does nothing when qf_a.qname equals qf_ref.qname. Otherwise it sets
 * used_asms[0] and loops qp from 51 down to 1 (qp == 0 is not exercised):
 * INIT_QUANT8() loads identical random input into dct1 and dct2, qf_c runs
 * on dct1 and qf_a on dct2, and the first 64*2 bytes are compared. On the
 * first mismatch it clears oks[0], prints a [FAILED] line to stderr and
 * stops.
 *
 * NOTE(review): qf_c is set up outside this view; presumably it is the
 * plain-C reference table -- confirm.
 * Expects dct1, dct2, qf_a, qf_c, qf_ref, used_asms, oks and i_cqm in scope. */
#define TEST_QUANT8( qname, cqm, shift, divider ) \
if( qf_a.qname != qf_ref.qname ) \
{ \
int qp; \
used_asms[0] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT8() \
qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
if( memcmp( dct1, dct2, 64*2 ) ) \
{ \
oks[0] = 0; \
fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
break; \
} \
} \
}
/* TEST_QUANT4: compare the 4x4 quantizer `qname` of the function table under
 * test (qf_a) against the one in qf_c across quantizer parameters.
 *
 *   qname    member name of the quant function in the function tables
 *   cqm      array of quant matrices, indexed with qp%6
 *   shift    base shift; the call passes shift + qp/6
 *   divider  rounding-bias divisor; the call passes
 *            (1 << (shift + qp/6)) / divider. The failure message reports
 *            divider==3 as intra=1.
 *
 * Does nothing when qf_a.qname equals qf_ref.qname. Otherwise it sets
 * used_asms[0] and loops qp from 51 down to 1 (qp == 0 is not exercised):
 * INIT_QUANT4() loads identical random input into dct1 and dct2, qf_c runs
 * on dct1 and qf_a on dct2, and the first 16*2 bytes are compared. On the
 * first mismatch it clears oks[0], prints a [FAILED] line to stderr and
 * stops.
 *
 * NOTE(review): qf_c is set up outside this view; presumably it is the
 * plain-C reference table -- confirm.
 * Expects dct1, dct2, qf_a, qf_c, qf_ref, used_asms, oks and i_cqm in scope. */
#define TEST_QUANT4( qname, cqm, shift, divider ) \
if( qf_a.qname != qf_ref.qname ) \
{ \
int qp; \
used_asms[0] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT4() \
qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
qf_a.qname( (void*)dct2, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
if( memcmp( dct1, dct2, 16*2 ) ) \
{ \
oks[0] = 0; \
fprintf( stderr, #qname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
break; \
} \
} \
}
TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8IY], 16, 3 );
TEST_QUANT8( quant_8x8_core, h->quant8_mf[CQM_8PY], 16, 6 );
TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4IY], 15, 3 );
TEST_QUANT4( quant_4x4_core, h->quant4_mf[CQM_4PY], 15, 6 );
TEST_QUANT( quant_4x4_dc_core, ***h->quant4_mf[CQM_4IY] );
TEST_QUANT( quant_2x2_dc_core, ***h->quant4_mf[CQM_4IC] );
#define TEST_DEQUANT( name, quant, dqm, cqm, shift ) \
if( qf_a.name != qf_ref.name ) \
#define TEST_DEQUANT8( qname, dqname, cqm, dqm, shift, divider ) \
if( qf_a.dqname != qf_ref.dqname ) \
{ \
int qp; \
used_asms[1] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
for( i = 0; i < 64; i++ ) \
dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
qf_c.quant( (void*)dct1, cqm[qp%6], shift+qp/6, 0 ); \
memcpy( dct2, dct1, sizeof(dct2) ); \
qf_c.name( (void*)dct1, dqm, qp ); \
qf_a.name( (void*)dct2, dqm, qp ); \
INIT_QUANT8() \
qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
memcpy( dct2, dct1, 64*2 ); \
qf_c.dqname( (void*)dct1, dqm, qp ); \
qf_a.dqname( (void*)dct2, dqm, qp ); \
if( memcmp( dct1, dct2, 64*2 ) ) \
{ \
oks[1] = 0; \
fprintf( stderr, #name "(qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm ); \
fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
break; \
} \
} \
}
/* TEST_DEQUANT4: compare the 4x4 dequantizer `dqname` of the function table
 * under test (qf_a) against the one in qf_c, feeding it coefficients that
 * were actually produced by the quantizer `qname`.
 *
 *   qname    member name of the quant function used to prepare the input
 *   dqname   member name of the dequant function being checked
 *   cqm      array of quant matrices, indexed with qp%6
 *   dqm      dequant matrix argument, passed through unchanged
 *   shift    base quant shift; the quant call passes shift + qp/6
 *   divider  rounding-bias divisor for the quant call; the failure message
 *            reports divider==3 as intra=1
 *
 * Does nothing when qf_a.dqname equals qf_ref.dqname. Otherwise it sets
 * used_asms[1] and loops qp from 51 down to 1 (qp == 0 is not exercised):
 *   1. INIT_QUANT4() fills dct1/dct2 with random coefficients,
 *   2. qf_c.qname quantizes dct1,
 *   3. the first 16*2 bytes of dct1 are copied over dct2, so both
 *      dequantizers start from the same quantized block,
 *   4. qf_c.dqname runs on dct1 and qf_a.dqname on dct2,
 *   5. the first 16*2 bytes are compared; on the first mismatch oks[1] is
 *      cleared, a [FAILED] line goes to stderr and the loop stops.
 *
 * NOTE(review): qf_c is set up outside this view; presumably it is the
 * plain-C reference table -- confirm.
 * Expects dct1, dct2, qf_a, qf_c, qf_ref, used_asms, oks and i_cqm in scope. */
#define TEST_DEQUANT4( qname, dqname, cqm, dqm, shift, divider ) \
if( qf_a.dqname != qf_ref.dqname ) \
{ \
int qp; \
used_asms[1] = 1; \
for( qp = 51; qp > 0; qp-- ) \
{ \
INIT_QUANT4() \
qf_c.qname( (void*)dct1, cqm[qp%6], shift+qp/6, (1<<(shift+qp/6))/divider ); \
memcpy( dct2, dct1, 16*2 ); \
qf_c.dqname( (void*)dct1, dqm, qp ); \
qf_a.dqname( (void*)dct2, dqm, qp ); \
if( memcmp( dct1, dct2, 16*2 ) ) \
{ \
oks[1] = 0; \
fprintf( stderr, #dqname "(qp=%d, cqm=%d, intra=%d): [FAILED]\n", qp, i_cqm, divider==3 ); \
break; \
} \
} \
}
TEST_DEQUANT( dequant_8x8, quant_8x8_core, h->dequant8_mf[CQM_8PY], h->quant8_mf[CQM_8PY], 16 );
TEST_DEQUANT( dequant_4x4, quant_4x4_core, h->dequant4_mf[CQM_4PY], h->quant4_mf[CQM_4PY], 15 );
TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8IY], h->dequant8_mf[CQM_8IY], 16, 3 );
TEST_DEQUANT8( quant_8x8_core, dequant_8x8, h->quant8_mf[CQM_8PY], h->dequant8_mf[CQM_8PY], 16, 6 );
TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4IY], h->dequant4_mf[CQM_4IY], 15, 3 );
TEST_DEQUANT4( quant_4x4_core, dequant_4x4, h->quant4_mf[CQM_4PY], h->dequant4_mf[CQM_4PY], 15, 6 );
}
ok = oks[0]; used_asm = used_asms[0];
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment