Commit 92c074e2 authored by Henrik Gramner

Rework pixel_var2

The functions are only ever called with pointers to fenc and fdec, and the
strides are always constant, so there is no point in having them as parameters.

Cover both the U and V planes in a single function call. This is more
efficient with SIMD, especially with the wider vectors provided by AVX2 and
AVX-512, even when accounting for losing the possibility of early termination.

Drop the MMX and XOP implementations and update the rest of the x86 assembly
to match the new behavior. Also enable high bit-depth support in the AVX2 version.

Comment out the ARM, AARCH64, and MIPS MSA assembly for now.
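
To illustrate the interface change, here is a minimal before/after sketch of a
call site, simplified from the x264_mb_encode_chroma_internal hunk below
(fenc_u, fenc_v, fdec_u, fdec_v and thresh are placeholder names, not the
encoder's actual locals):

    /* Before: one call per chroma plane, strides passed explicitly,
     * with an early-out between the two planes. */
    int ssd[2];
    int score = pixf->var2[PIXEL_8x8]( fenc_u, FENC_STRIDE, fdec_u, FDEC_STRIDE, &ssd[0] );
    if( score < thresh*4 )
        score += pixf->var2[PIXEL_8x8]( fenc_v, FENC_STRIDE, fdec_v, FDEC_STRIDE, &ssd[1] );

    /* After: a single call covers both planes; the V samples sit
     * FENC_STRIDE/2 (resp. FDEC_STRIDE/2) past the U samples, and the
     * strides are hardcoded inside the function. */
    ALIGNED_ARRAY_8( int, ssd,[2] ); /* ssd[0] = U SSD, ssd[1] = V SSD */
    int score = pixf->var2[PIXEL_8x8]( fenc_u, fdec_u, ssd );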
parent 4c48f9e7
@@ -201,28 +201,32 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 )
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
#define PIXEL_VAR2_C( name, w, h, shift ) \
static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
#define PIXEL_VAR2_C( name, h, shift ) \
static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \
{ \
int var = 0, sum = 0, sqr = 0; \
int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \
for( int y = 0; y < h; y++ ) \
{ \
for( int x = 0; x < w; x++ ) \
for( int x = 0; x < 8; x++ ) \
{ \
int diff = pix1[x] - pix2[x]; \
sum += diff; \
sqr += diff * diff; \
int diff_u = fenc[x] - fdec[x]; \
int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \
sum_u += diff_u; \
sum_v += diff_v; \
sqr_u += diff_u * diff_u; \
sqr_v += diff_v * diff_v; \
} \
pix1 += i_stride1; \
pix2 += i_stride2; \
fenc += FENC_STRIDE; \
fdec += FDEC_STRIDE; \
} \
var = sqr - ((int64_t)sum * sum >> shift); \
*ssd = sqr; \
return var; \
ssd[0] = sqr_u; \
ssd[1] = sqr_v; \
return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \
sqr_v - ((int64_t)sum_v * sum_v >> shift); \
}
PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8, 6 )
PIXEL_VAR2_C( x264_pixel_var2_8x16, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 6 )
#if BIT_DEPTH > 8
typedef uint32_t sum_t;
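
For reference (a restatement of the C code above, not part of the commit): with
d the per-pixel differences over an N-pixel plane, each plane contributes

    sqr - (sum*sum >> shift)  =  Σ d² − (Σ d)²/N

to the return value, i.e. N times the variance of the residual; shift 6
corresponds to N = 64 (8x8) and shift 7 to N = 128 (8x16). ssd[0] and ssd[1]
receive the raw Σ d² terms for U and V respectively.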
@@ -884,10 +888,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _mmx2 );
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
#if ARCH_X86
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
@@ -1039,6 +1039,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
pixf->vsad = x264_pixel_vsad_avx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
@@ -1072,8 +1074,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
@@ -1318,8 +1318,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
@@ -1381,8 +1379,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
//pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
//pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
pixf->vsad = x264_pixel_vsad_neon;
pixf->asd8 = x264_pixel_asd8_neon;
@@ -1436,8 +1434,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
//pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
//pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
pixf->vsad = x264_pixel_vsad_neon;
pixf->asd8 = x264_pixel_asd8_neon;
@@ -1483,8 +1481,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
//pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
//pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8;
}
@@ -93,8 +93,7 @@ typedef struct
uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t (*var[4])( pixel *pix, intptr_t stride );
int (*var2[4])( pixel *pix1, intptr_t stride1,
pixel *pix2, intptr_t stride2, int *ssd );
int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] );
uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride );
void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1,
@@ -981,195 +981,199 @@ cglobal pixel_var_8x16, 2,3
VAR_AVX512_CORE_8x8 1
jmp var_avx512_end
%macro VAR2_END 3
HADDW %2, xm1
movd r1d, %2
imul r1d, r1d
HADDD %3, xm1
shr r1d, %1
movd eax, %3
movd [r4], %3
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%endmacro
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] )
;-----------------------------------------------------------------------------
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, %1
.loop:
%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
psubw m0, [r2]
psubw m1, [r2+mmsize]
%else ; !HIGH_BIT_DEPTH
movq m0, [r0]
movq m1, m0
movq m2, [r2]
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
%endif ; HIGH_BIT_DEPTH
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
add r0, r1
add r2, r3
dec r5d
jg .loop
VAR2_END %2, m5, m6
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
VAR2_8x8_MMX 8, 6
VAR2_8x8_MMX 16, 7
%endif
%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif
%macro VAR2_END 3 ; src, tmp, shift
movifnidn r2, r2mp
pshufd %2, %1, q3331
pmuludq %1, %1
movq [r2], %2 ; sqr_u sqr_v
psrld %1, %3
psubd %2, %1 ; sqr - (sum * sum >> shift)
MOVHL %1, %2
paddd %1, %2
movd eax, %1
RET
%endmacro
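
; A hedged C restatement of what the new VAR2_END computes, derived from the
; inline comments (the source register holds the dwords
; {sum_u, sqr_u, sum_v, sqr_v}; this mirrors the C reference above rather
; than documenting the exact instruction sequence):
;
;     ssd[0] = sqr_u;
;     ssd[1] = sqr_v;
;     return (sqr_u - (int)(((int64_t)sum_u * sum_u) >> shift))
;          + (sqr_v - (int)(((int64_t)sum_v * sum_v) >> shift));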
%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
VAR_START 1
mov r5d, %1/2
%if HIGH_BIT_DEPTH
cglobal pixel_var2_8x%1, 2,3,6
pxor m4, m4
pxor m5, m5
%define %%sum2 m4
%define %%sqr2 m5
%else
cglobal pixel_var2_8x%1, 2,3,7
mova m6, [pw_00ff]
%define %%sum2 m0
%define %%sqr2 m1
%endif
pxor m0, m0 ; sum
pxor m1, m1 ; sqr
mov t0d, (%1-1)*FENC_STRIDEB
.loop:
%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1*2]
mova m2, [r2]
mova m3, [r2+r3*2]
%else ; !HIGH_BIT_DEPTH
movq m1, [r0]
movhps m1, [r0+r1]
movq m3, [r2]
movhps m3, [r2+r3]
DEINTB 0, 1, 2, 3, 7
%endif ; HIGH_BIT_DEPTH
psubw m0, m2
psubw m1, m3
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
VAR2_END %2, m5, m6
mova m2, [r0+1*t0]
psubw m2, [r1+2*t0]
mova m3, [r0+1*t0+16]
psubw m3, [r1+2*t0+32]
%else
mova m3, [r0+1*t0]
movq m5, [r1+2*t0]
punpcklqdq m5, [r1+2*t0+16]
DEINTB 2, 3, 4, 5, 6
psubw m2, m4
psubw m3, m5
%endif
paddw m0, m2
pmaddwd m2, m2
paddw %%sum2, m3
pmaddwd m3, m3
paddd m1, m2
paddd %%sqr2, m3
sub t0d, FENC_STRIDEB
jge .loop
%if HIGH_BIT_DEPTH
SBUTTERFLY dq, 0, 4, 2
paddw m0, m4 ; sum_u sum_v
pmaddwd m0, [pw_1]
SBUTTERFLY dq, 1, 5, 2
paddd m1, m5 ; sqr_u sqr_v
SBUTTERFLY dq, 0, 1, 2
paddd m0, m1
%else
pmaddwd m0, [pw_1]
shufps m2, m0, m1, q2020
shufps m0, m1, q3131
paddd m0, m2
pshufd m0, m0, q3120 ; sum_u sqr_u sum_v sqr_v
%endif
VAR2_END m0, m1, %2
%endmacro
INIT_XMM sse2
VAR2_8x8_SSE2 8, 6
VAR2_8x8_SSE2 16, 7
%macro VAR2_CORE 3 ; src1, src2, accum
%if %3
paddw m0, %1
pmaddwd %1, %1
paddw m0, %2
pmaddwd %2, %2
paddd m1, %1
paddd m1, %2
%else
paddw m0, %1, %2
pmaddwd %1, %1
pmaddwd %2, %2
paddd m1, %1, %2
%endif
%endmacro
%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
mov r5d, %1/4
INIT_XMM ssse3
cglobal pixel_var2_internal
pxor m0, m0 ; sum
pxor m1, m1 ; sqr
.loop:
movq m0, [r0]
movq m2, [r2]
movq m1, [r0+r1]
movq m3, [r2+r3]
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m0, m2
punpcklbw m1, m3
movq m2, [r0]
movq m3, [r2]
punpcklbw m2, m3
movq m3, [r0+r1]
movq m4, [r2+r3]
punpcklbw m3, m4
pmaddubsw m0, m7
pmaddubsw m1, m7
pmaddubsw m2, m7
pmaddubsw m3, m7
paddw m5, m0
paddw m5, m1
paddw m5, m2
paddw m5, m3
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
paddd m6, m0
paddd m6, m1
paddd m6, m2
paddd m6, m3
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
dec r5d
movq m2, [r0+1*t0]
punpcklbw m2, [r1+2*t0]
movq m3, [r0+1*t0-1*FENC_STRIDE]
punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE]
movq m4, [r0+1*t0-2*FENC_STRIDE]
punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE]
movq m5, [r0+1*t0-3*FENC_STRIDE]
punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE]
pmaddubsw m2, m7
pmaddubsw m3, m7
pmaddubsw m4, m7
pmaddubsw m5, m7
VAR2_CORE m2, m3, 1
VAR2_CORE m4, m5, 1
sub t0d, 4*FENC_STRIDE
jg .loop
VAR2_END %2, m5, m6
pmaddwd m0, [pw_1]
ret
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 2,3,8
mova m7, [hsub_mul]
mov t0d, (%1-1)*FENC_STRIDE
call pixel_var2_internal_ssse3 ; u
add r0, 8
add r1, 16
SBUTTERFLY qdq, 0, 1, 6
paddd m1, m0
mov t0d, (%1-1)*FENC_STRIDE
call pixel_var2_internal_ssse3 ; v
SBUTTERFLY qdq, 0, 6, 2
paddd m0, m6
phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v
VAR2_END m1, m0, %2
%endmacro
INIT_XMM ssse3
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
INIT_XMM xop
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset
%if HIGH_BIT_DEPTH
mova xm2, [r1+2*%1+%2*FDEC_STRIDEB]
vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1
mova xm3, [r1+2*%1+%3*FDEC_STRIDEB]
vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1
psubw m2, [r0+1*%1+%2*FENC_STRIDEB]
psubw m3, [r0+1*%1+%3*FENC_STRIDEB]
%else
pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE]
mova m4, [r1+2*%1+%2*FDEC_STRIDE]
pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE]
mova m5, [r1+2*%1+%3*FDEC_STRIDE]
punpcklbw m4, m6
punpcklbw m5, m6
psubw m2, m4
psubw m3, m5
%endif
%endmacro
%macro VAR2_8x8_AVX2 2
cglobal pixel_var2_8x%1, 5,6,6
pxor m3, m3 ; sum
pxor m4, m4 ; sum squared
mova m5, [hsub_mul]
mov r5d, %1/4
%if HIGH_BIT_DEPTH
cglobal pixel_var2_8x%1, 2,3,4
%else
cglobal pixel_var2_8x%1, 2,3,7
pxor m6, m6
%endif
mov t0d, (%1-3)*FENC_STRIDEB
VAR2_AVX2_LOAD t0, 2, 1
VAR2_CORE m2, m3, 0
.loop:
movq xm0, [r0]
movq xm1, [r2]
vinserti128 m0, m0, [r0+r1], 1
vinserti128 m1, m1, [r2+r3], 1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m0, m1
movq xm1, [r0]
movq xm2, [r2]
vinserti128 m1, m1, [r0+r1], 1
vinserti128 m2, m2, [r2+r3], 1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m1, m2
pmaddubsw m0, m5
pmaddubsw m1, m5
paddw m3, m0
paddw m3, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m4, m0
paddd m4, m1
dec r5d
VAR2_AVX2_LOAD t0, 0, -1
VAR2_CORE m2, m3, 1
sub t0d, 2*FENC_STRIDEB
jg .loop
vextracti128 xm0, m3, 1
vextracti128 xm1, m4, 1
paddw xm3, xm0
paddd xm4, xm1
VAR2_END %2, xm3, xm4
pmaddwd m0, [pw_1]
SBUTTERFLY qdq, 0, 1, 2
paddd m0, m1
vextracti128 xm1, m0, 1
phaddd xm0, xm1
VAR2_END xm0, xm1, %2
%endmacro
INIT_YMM avx2
VAR2_8x8_AVX2 8, 6
VAR2_8x8_AVX2 16, 7
%endif ; !HIGH_BIT_DEPTH
;=============================================================================
; SATD
;=============================================================================
@@ -166,16 +166,12 @@ void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
const pixel *pix2, intptr_t stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
@@ -283,13 +283,10 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
{
int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
int ssd[2];
ALIGNED_ARRAY_8( int, ssd,[2] );
int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
if( score < thresh*4 )
score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
if( score < thresh*4 )
if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 )
{
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
@@ -506,15 +506,17 @@ static int check_pixel( int cpu_ref, int cpu_new )
#define TEST_PIXEL_VAR2( i ) \
if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
{ \
int res_c, res_asm, ssd_c, ssd_asm; \
int res_c, res_asm; \
ALIGNED_ARRAY_8( int, ssd_c, [2] ); \
ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \
set_func_name( "%s_%s", "var2", pixel_names[i] ); \
used_asm = 1; \
res_c = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \
res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
if( res_c != res_asm || ssd_c != ssd_asm ) \
res_c = call_c( pixel_c.var2[i], pbuf1, pbuf2, ssd_c ); \
res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \
if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \
{ \
ok = 0; \
fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \
} \
}
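
One plausible reason for the switch to ALIGNED_ARRAY_8 here and in the encoder
(an inference from the asm, not stated in the commit): the new x86 VAR2_END
writes sqr_u and sqr_v back with a single 8-byte movq store, so the ssd[2]
destination is kept 8-byte aligned.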