checkasm.c 42 KB
Newer Older
1
#include <ctype.h>
Laurent Aimar's avatar
Laurent Aimar committed
2
#include <stdlib.h>
3
#include <limits.h>
4
#include <math.h>
Laurent Aimar's avatar
Laurent Aimar committed
5

6
#include "common/common.h"
7
#include "common/cpu.h"
Laurent Aimar's avatar
Laurent Aimar committed
8

9
/* buf1, buf2: input buffers, initialised to random data; tests must not write into them */
uint8_t *buf1, *buf2;
/* buf3, buf4: scratch buffers used to store the output of the functions under test */
uint8_t *buf3, *buf4;

/* when non-zero, report() does not print the per-test status lines */
int quiet = 0;

/* Print the outcome of one group of tests and fold it into the caller's result.
 * Expects the locals `used_asm`, `ok` and `ret` to be in scope where it is used:
 * the status line is printed only if an asm version was actually exercised and
 * `quiet` is off; a failure sets ret to -1 whether or not anything is printed. */
#define report( name ) { \
    if( used_asm && !quiet ) \
        fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
    if( !ok ) ret = -1; \
}

#define BENCH_RUNS 100  // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define MAX_FUNCS 1000  // just has to be big enough to hold all the existing functions
#define MAX_CPUS 10     // number of different combinations of cpu flags

/* Timing record for one version (one cpu-flag combination) of a benchmarked function. */
typedef struct {
    void *pointer; // just for detecting duplicates
    uint32_t cpu;    // cpu flags identifying this version; set by get_bench(), left 0 for slot 0
    uint32_t cycles; // sum of accepted read_time() deltas (call_bench adds tsum; each delta spans 4 calls)
    uint32_t den;    // number of accepted deltas accumulated in cycles (call_bench adds tcount)
} bench_t;

/* All timed versions of one function, keyed by its name. */
typedef struct {
    char *name; // strdup()ed by get_bench(); NULL marks the first unused entry of benchs[]
    bench_t vers[MAX_CPUS]; // slot 0 is returned for cpu==0; slots 1.. are claimed in order of first use
} bench_func_t;

/* non-zero enables the timing code in call_bench() */
int do_bench = 0;
/* call_bench() only times functions whose func_name matches bench_pattern over
 * its first bench_pattern_len characters (length 0 matches every name) */
int bench_pattern_len = 0;
const char *bench_pattern = "";

/* name of the function currently under test: written by set_func_name(),
 * read by call_bench() for pattern filtering and for the get_bench() lookup */
char func_name[100];
/* benchmark table, one entry per distinct function name (filled by get_bench()) */
static bench_func_t benchs[MAX_FUNCS];

/* block-size suffixes, indexed by the loop counter of the per-size tests */
static const char *pixel_names[10] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x2", "2x4", "2x2" };
/* name tables for the intra prediction modes of each block size */
static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
/* the 8x8 table is an alias of the 4x4 one */
static const char **intra_predict_8x8_names = intra_predict_4x4_names;

/* printf-style formatting into func_name, truncated to the buffer size */
#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )

/* Return the low 32 bits of the time-stamp counter on GCC x86/x86_64 builds;
 * every other build gets a constant 0. */
static inline uint32_t read_time(void)
{
    uint32_t tsc_lo = 0;
#if defined(__GNUC__) && (defined(ARCH_X86) || defined(ARCH_X86_64))
    /* rdtsc also writes edx, which is only declared as clobbered */
    asm volatile( "rdtsc" :"=a"(tsc_lo) ::"edx" );
#endif
    return tsc_lo;
}

/* Find, or create on first use, the timing record for (name, cpu).
 *
 * name: function name; copied with strdup() when a new benchs[] entry is claimed.
 * cpu:  cpu flags of the version being timed; 0 selects slot 0 of the entry.
 * Returns a pointer into the static benchs[] table.
 *
 * Both searches check the index bound before dereferencing. The original only
 * asserted inside the loop bodies, so once a table was full the loop condition
 * read benchs[MAX_FUNCS] / vers[MAX_CPUS] before any assert could fire. */
static bench_t* get_bench( const char *name, int cpu )
{
    int i, j;
    for( i=0; i<MAX_FUNCS && benchs[i].name && strcmp(name, benchs[i].name); i++ )
        ;
    assert( i < MAX_FUNCS );
    if( !benchs[i].name )
        benchs[i].name = strdup( name );
    if( !cpu )
        return &benchs[i].vers[0];
    /* slot 0 is reserved for cpu==0; take the matching slot or the first free one */
    for( j=1; j<MAX_CPUS && benchs[i].vers[j].cpu && benchs[i].vers[j].cpu != cpu; j++ )
        ;
    assert( j < MAX_CPUS );
    benchs[i].vers[j].cpu = cpu;
    return &benchs[i].vers[j];
}

/* qsort() comparator ordering uint16_t samples ascending.
 * Both operands are widened to int, so the subtraction cannot overflow. */
int cmp_nop( const void *a, const void *b )
{
    const uint16_t lhs = *(const uint16_t*)a;
    const uint16_t rhs = *(const uint16_t*)b;
    return (int)lhs - (int)rhs;
}

int cmp_bench( const void *a, const void *b )
{
    // asciibetical sort except preserving numbers
    const char *sa = ((bench_func_t*)a)->name;
    const char *sb = ((bench_func_t*)b)->name;
    for(;; sa++, sb++)
    {
        if( !*sa && !*sb ) return 0;
        if( isdigit(*sa) && isdigit(*sb) && isdigit(sa[1]) != isdigit(sb[1]) )
            return isdigit(sa[1]) - isdigit(sb[1]);
        if( *sa != *sb ) return *sa - *sb;
    }
}

static void print_bench(void)
{
    uint16_t nops[10000] = {0};
    int i, j, k, nfuncs, nop_time=0;

    for( i=0; i<10000; i++ )
    {
        int t = read_time();
        nops[i] = read_time() - t;
    }
    qsort( nops, 10000, sizeof(uint16_t), cmp_nop );
    for( i=500; i<9500; i++ )
        nop_time += nops[i];
    nop_time /= 900;
    printf( "nop: %d\n", nop_time );

    for( i=0; i<MAX_FUNCS && benchs[i].name; i++ );
    nfuncs=i;
    qsort( benchs, nfuncs, sizeof(bench_func_t), cmp_bench );
    for( i=0; i<nfuncs; i++ )
        for( j=0; j<MAX_CPUS && (!j || benchs[i].vers[j].cpu); j++ )
        {
            bench_t *b = &benchs[i].vers[j];
            if( !b->den ) continue;
            for( k=0; k<j && benchs[i].vers[k].pointer != b->pointer; k++ );
            if( k<j ) continue;
            printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
125
                    b->cpu&X264_CPU_PHADD_IS_FAST ? "phadd" :
126
127
                    b->cpu&X264_CPU_SSSE3 ? "ssse3" :
                    b->cpu&X264_CPU_SSE3 ? "sse3" :
128
129
                    /* print sse2slow only if there's also a sse2fast version of the same func */
                    b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
                    b->cpu&X264_CPU_SSE2 ? "sse2" :
                    b->cpu&X264_CPU_MMX ? "mmx" : "c",
                    b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
                    b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : "",
                    ((int64_t)10*b->cycles/b->den - nop_time)/4 );
        }
}

/* On x86/x86_64 x264_stack_pagealign is an external function; elsewhere it
 * degrades to a plain call of func and the align argument is ignored. */
#if defined(ARCH_X86) || defined(ARCH_X86_64)
int x264_stack_pagealign( int (*func)(), int align );
#else
#define x264_stack_pagealign( func, align ) func()
#endif

/* call the C reference version directly */
#define call_c1(func,...) func(__VA_ARGS__)

/* call the version under test; on 32-bit x86 the call goes through
 * x264_checkasm_call, which is handed the address of the caller's `ok` */
#ifdef ARCH_X86
/* detect when callee-saved regs aren't saved.
 * needs an explicit asm check because it only sometimes crashes in normal use. */
long x264_checkasm_call( long (*func)(), int *ok, ... );
#define call_a1(func,...) x264_checkasm_call((long(*)())func, &ok, __VA_ARGS__)
#else
#define call_a1 call_c1
#endif

/* Time func(args) and add the result to the bench record for (func_name, cpu).
 * Does nothing unless do_bench is set and func_name starts with bench_pattern.
 *
 * func is first called once through call_a1. Then BENCH_RUNS iterations
 * (BENCH_RUNS/4 when cpu is 0) each time a group of 4 back-to-back calls.
 * The first iteration is never counted; a later sample is kept only if it is
 * at most 4x the running average (t*tcount <= tsum*4), which discards outliers.
 *
 * Expands to an if-block that declares locals, so it must be used as a
 * statement; call_a2/call_c2 below wrap it in a statement expression. */
#define call_bench(func,cpu,...)\
    if( do_bench && !strncmp(func_name, bench_pattern, bench_pattern_len) )\
    {\
        uint32_t tsum = 0;\
        int tcount = 0;\
        int ti;\
        call_a1(func, __VA_ARGS__);\
        for( ti=0; ti<(cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
        {\
            uint32_t t = read_time();\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            func(__VA_ARGS__);\
            t = read_time() - t;\
            if( t*tcount <= tsum*4 && ti > 0 )\
            {\
                tsum += t;\
                tcount++;\
            }\
        }\
        bench_t *b = get_bench( func_name, cpu );\
        b->cycles += tsum;\
        b->den += tcount;\
        b->pointer = func;\
    }

/* for most functions, run benchmark and correctness test at the same time.
 * for those that modify their inputs, run the above macros separately */
#define call_a(func,...) ({ call_a2(func,__VA_ARGS__); call_a1(func,__VA_ARGS__); })
#define call_c(func,...) ({ call_c2(func,__VA_ARGS__); call_c1(func,__VA_ARGS__); })
/* benchmark only: call_a2 records under the local cpu_new, call_c2 under cpu 0 */
#define call_a2(func,...) ({ call_bench(func,cpu_new,__VA_ARGS__); })
#define call_c2(func,...) ({ call_bench(func,0,__VA_ARGS__); })


static int check_pixel( int cpu_ref, int cpu_new )
Laurent Aimar's avatar
Laurent Aimar committed
191
{
192
193
194
    x264_pixel_function_t pixel_c;
    x264_pixel_function_t pixel_ref;
    x264_pixel_function_t pixel_asm;
195
196
197
198
    x264_predict_t predict_16x16[4+3];
    x264_predict_t predict_8x8c[4+3];
    x264_predict_t predict_4x4[9+3];
    x264_predict8x8_t predict_8x8[9+3];
Loren Merritt's avatar
Loren Merritt committed
199
    DECLARE_ALIGNED_16( uint8_t edge[33] );
200
    uint16_t cost_mv[32];
201
    int ret = 0, ok, used_asm;
Loren Merritt's avatar
Loren Merritt committed
202
    int i, j;
Laurent Aimar's avatar
Laurent Aimar committed
203
204

    x264_pixel_init( 0, &pixel_c );
205
206
    x264_pixel_init( cpu_ref, &pixel_ref );
    x264_pixel_init( cpu_new, &pixel_asm );
207
208
209
210
    x264_predict_16x16_init( 0, predict_16x16 );
    x264_predict_8x8c_init( 0, predict_8x8c );
    x264_predict_8x8_init( 0, predict_8x8 );
    x264_predict_4x4_init( 0, predict_4x4 );
211
    x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
Laurent Aimar's avatar
Laurent Aimar committed
212

Loren Merritt's avatar
Loren Merritt committed
213
#define TEST_PIXEL( name, align ) \
214
215
216
217
218
    for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
    { \
        int res_c, res_asm; \
        if( pixel_asm.name[i] != pixel_ref.name[i] ) \
        { \
219
            set_func_name( "%s_%s", #name, pixel_names[i] ); \
220
            for( j=0; j<64; j++ ) \
221
            { \
222
                used_asm = 1; \
223
224
                res_c   = call_c( pixel_c.name[i], buf1, 16, buf2+j*!align, 64 ); \
                res_asm = call_a( pixel_asm.name[i], buf1, 16, buf2+j*!align, 64 ); \
225
226
227
228
229
230
                if( res_c != res_asm ) \
                { \
                    ok = 0; \
                    fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
                    break; \
                } \
231
232
233
234
            } \
        } \
    } \
    report( "pixel " #name " :" );
Laurent Aimar's avatar
Laurent Aimar committed
235

Loren Merritt's avatar
Loren Merritt committed
236
237
238
239
    TEST_PIXEL( sad, 0 );
    TEST_PIXEL( ssd, 1 );
    TEST_PIXEL( satd, 0 );
    TEST_PIXEL( sa8d, 0 );
Laurent Aimar's avatar
Laurent Aimar committed
240

241
242
243
244
245
246
#define TEST_PIXEL_X( N ) \
    for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
    { \
        int res_c[4]={0}, res_asm[4]={0}; \
        if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
        { \
247
            set_func_name( "sad_x%d_%s", N, pixel_names[i] ); \
248
            for( j=0; j<64; j++) \
249
            { \
250
251
                uint8_t *pix2 = buf2+j; \
                used_asm = 1; \
252
253
254
                res_c[0] = pixel_c.sad[i]( buf1, 16, pix2, 64 ); \
                res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+6, 64 ); \
                res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1, 64 ); \
255
256
                if(N==4) \
                { \
257
258
                    res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+10, 64 ); \
                    call_a( pixel_asm.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
259
260
                } \
                else \
261
                    call_a( pixel_asm.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
262
263
264
265
266
267
268
                if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
                { \
                    ok = 0; \
                    fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
                             i, res_c[0], res_c[1], res_c[2], res_c[3], \
                             res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
                } \
269
270
271
272
                if(N==4) \
                    call_c2( pixel_c.sad_x4[i], buf1, pix2, pix2+6, pix2+1, pix2+10, 64, res_asm ); \
                else \
                    call_c2( pixel_c.sad_x3[i], buf1, pix2, pix2+6, pix2+1, 64, res_asm ); \
273
274
275
276
277
278
279
            } \
        } \
    } \
    report( "pixel sad_x"#N" :" );

    TEST_PIXEL_X(3);
    TEST_PIXEL_X(4);
280

281
#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
282
283
284
    if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
    { \
        int res_c[3], res_asm[3]; \
285
        set_func_name( #name );\
286
287
288
289
290
291
292
        used_asm = 1; \
        memcpy( buf3, buf2, 1024 ); \
        for( i=0; i<3; i++ ) \
        { \
            pred[i]( buf3+40, ##__VA_ARGS__ ); \
            res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
        } \
293
        call_a( pixel_asm.name, buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
294
295
296
297
298
299
300
301
302
303
        if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
                     res_c[0], res_c[1], res_c[2], \
                     res_asm[0], res_asm[1], res_asm[2] ); \
        } \
    }

    ok = 1; used_asm = 0;
304
305
306
307
    TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
    TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
    TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
    TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
308
309
    report( "intra satd_x3 :" );

310
311
312
313
    if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
        pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
    {
        float res_c, res_a;
314
315
        int sums[5][4] = {{0}};
        used_asm = ok = 1;
Loren Merritt's avatar
Loren Merritt committed
316
        x264_emms();
317
318
        res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28 );
        res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
319
        if( fabs(res_c - res_a) > 1e-6 )
320
321
322
323
        {
            ok = 0;
            fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
        }
324
325
326
327
328
329
        set_func_name( "ssim_core" );
        call_c2( pixel_c.ssim_4x4x2_core,   buf1+2, 32, buf2+2, 32, sums );
        call_a2( pixel_asm.ssim_4x4x2_core, buf1+2, 32, buf2+2, 32, sums );
        set_func_name( "ssim_end" );
        call_c2( pixel_c.ssim_end4,   sums, sums, 4 );
        call_a2( pixel_asm.ssim_end4, sums, sums, 4 );
330
331
332
        report( "ssim :" );
    }

Loren Merritt's avatar
Loren Merritt committed
333
    ok = 1; used_asm = 0;
334
335
    for( i=0; i<32; i++ )
        cost_mv[i] = i*10;
336
    for( i=0; i<100 && ok; i++ )
337
        if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
Loren Merritt's avatar
Loren Merritt committed
338
        {
Loren Merritt's avatar
Loren Merritt committed
339
340
            DECLARE_ALIGNED_16( uint16_t sums[72] );
            DECLARE_ALIGNED_16( int dc[4] );
341
342
343
            int16_t mvs_a[32], mvs_c[32];
            int mvn_a, mvn_c;
            int thresh = rand() & 0x3fff;
344
            set_func_name( "esa_ads" );
Loren Merritt's avatar
Loren Merritt committed
345
346
347
348
349
            for( j=0; j<72; j++ )
                sums[j] = rand() & 0x3fff;
            for( j=0; j<4; j++ )
                dc[j] = rand() & 0x3fff;
            used_asm = 1;
350
351
            mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
            mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
352
            if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
353
            {
Loren Merritt's avatar
Loren Merritt committed
354
                ok = 0;
355
356
357
358
359
360
361
362
                printf("c%d: ", i&3);
                for(j=0; j<mvn_c; j++)
                    printf("%d ", mvs_c[j]);
                printf("\na%d: ", i&3);
                for(j=0; j<mvn_a; j++)
                    printf("%d ", mvs_a[j]);
                printf("\n\n");
            }
Loren Merritt's avatar
Loren Merritt committed
363
364
365
        }
    report( "esa ads:" );

Laurent Aimar's avatar
Laurent Aimar committed
366
367
368
    return ret;
}

369
static int check_dct( int cpu_ref, int cpu_new )
Laurent Aimar's avatar
Laurent Aimar committed
370
371
{
    x264_dct_function_t dct_c;
372
    x264_dct_function_t dct_ref;
Laurent Aimar's avatar
Laurent Aimar committed
373
    x264_dct_function_t dct_asm;
374
    x264_quant_function_t qf;
375
    int ret = 0, ok, used_asm, i, interlace;
Loren Merritt's avatar
Loren Merritt committed
376
377
378
379
    DECLARE_ALIGNED_16( int16_t dct1[16][4][4] );
    DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
    DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
    DECLARE_ALIGNED_16( int16_t dct8[4][8][8] );
380
381
    x264_t h_buf;
    x264_t *h = &h_buf;
Laurent Aimar's avatar
Laurent Aimar committed
382
383

    x264_dct_init( 0, &dct_c );
384
385
    x264_dct_init( cpu_ref, &dct_ref);
    x264_dct_init( cpu_new, &dct_asm );
386
387
388
389
390
391
392
393
394
395
396
397

    memset( h, 0, sizeof(*h) );
    h->pps = h->pps_array;
    x264_param_default( &h->param );
    h->param.analyse.i_luma_deadzone[0] = 0;
    h->param.analyse.i_luma_deadzone[1] = 0;
    h->param.analyse.b_transform_8x8 = 1;
    for( i=0; i<6; i++ )
        h->pps->scaling_list[i] = x264_cqm_flat16;
    x264_cqm_init( h );
    x264_quant_init( h, 0, &qf );

Laurent Aimar's avatar
Laurent Aimar committed
398
#define TEST_DCT( name, t1, t2, size ) \
399
    if( dct_asm.name != dct_ref.name ) \
Laurent Aimar's avatar
Laurent Aimar committed
400
    { \
401
        set_func_name( #name );\
402
        used_asm = 1; \
403
404
        call_c( dct_c.name, t1, buf1, buf2 ); \
        call_a( dct_asm.name, t2, buf1, buf2 ); \
Laurent Aimar's avatar
Laurent Aimar committed
405
406
407
408
409
410
        if( memcmp( t1, t2, size ) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
    }
411
    ok = 1; used_asm = 0;
Laurent Aimar's avatar
Laurent Aimar committed
412
413
414
    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
    TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
    TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
415
416
417
418
419
420
    report( "sub_dct4 :" );

    ok = 1; used_asm = 0;
    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 );
    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 );
    report( "sub_dct8 :" );
Laurent Aimar's avatar
Laurent Aimar committed
421
422
#undef TEST_DCT

423
424
425
426
427
428
429
430
431
432
433
434
435
436
    // fdct and idct are denormalized by different factors, so quant/dequant
    // is needed to force the coefs into the right range.
    dct_c.sub16x16_dct( dct4, buf1, buf2 );
    dct_c.sub16x16_dct8( dct8, buf1, buf2 );
    for( i=0; i<16; i++ )
    {
        qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
        qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 );
    }
    for( i=0; i<4; i++ )
    {
        qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] );
        qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 );
    }
437

438
#define TEST_IDCT( name, src ) \
439
    if( dct_asm.name != dct_ref.name ) \
Laurent Aimar's avatar
Laurent Aimar committed
440
    { \
441
        set_func_name( #name );\
442
        used_asm = 1; \
Laurent Aimar's avatar
Laurent Aimar committed
443
444
        memcpy( buf3, buf1, 32*32 ); \
        memcpy( buf4, buf1, 32*32 ); \
445
446
        memcpy( dct1, src, 512 ); \
        memcpy( dct2, src, 512 ); \
447
448
        call_c1( dct_c.name, buf3, (void*)dct1 ); \
        call_a1( dct_asm.name, buf4, (void*)dct2 ); \
Laurent Aimar's avatar
Laurent Aimar committed
449
450
451
452
453
        if( memcmp( buf3, buf4, 32*32 ) ) \
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
454
455
        call_c2( dct_c.name, buf3, (void*)dct1 ); \
        call_a2( dct_asm.name, buf4, (void*)dct2 ); \
Laurent Aimar's avatar
Laurent Aimar committed
456
    }
457
    ok = 1; used_asm = 0;
458
459
460
    TEST_IDCT( add4x4_idct, dct4 );
    TEST_IDCT( add8x8_idct, dct4 );
    TEST_IDCT( add16x16_idct, dct4 );
461
462
463
    report( "add_idct4 :" );

    ok = 1; used_asm = 0;
464
465
    TEST_IDCT( add8x8_idct8, dct8 );
    TEST_IDCT( add16x16_idct8, dct8 );
466
    report( "add_idct8 :" );
Laurent Aimar's avatar
Laurent Aimar committed
467
468
#undef TEST_IDCT

469
470
    ok = 1; used_asm = 0;
    if( dct_asm.dct4x4dc != dct_ref.dct4x4dc )
Laurent Aimar's avatar
Laurent Aimar committed
471
    {
Loren Merritt's avatar
Loren Merritt committed
472
473
        DECLARE_ALIGNED_16( int16_t dct1[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
        DECLARE_ALIGNED_16( int16_t dct2[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
474
        set_func_name( "dct4x4dc" );
475
        used_asm = 1;
476
477
        call_c1( dct_c.dct4x4dc, dct1 );
        call_a1( dct_asm.dct4x4dc, dct2 );
Laurent Aimar's avatar
Laurent Aimar committed
478
479
480
481
482
        if( memcmp( dct1, dct2, 32 ) )
        {
            ok = 0;
            fprintf( stderr, " - dct4x4dc :        [FAILED]\n" );
        }
483
484
        call_c2( dct_c.dct4x4dc, dct1 );
        call_a2( dct_asm.dct4x4dc, dct2 );
Laurent Aimar's avatar
Laurent Aimar committed
485
    }
Loren Merritt's avatar
Loren Merritt committed
486
    if( dct_asm.idct4x4dc != dct_ref.idct4x4dc )
Laurent Aimar's avatar
Laurent Aimar committed
487
    {
Loren Merritt's avatar
Loren Merritt committed
488
489
        DECLARE_ALIGNED_16( int16_t dct1[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
        DECLARE_ALIGNED_16( int16_t dct2[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
490
        set_func_name( "idct4x4dc" );
491
        used_asm = 1;
492
493
        call_c1( dct_c.idct4x4dc, dct1 );
        call_a1( dct_asm.idct4x4dc, dct2 );
Laurent Aimar's avatar
Laurent Aimar committed
494
495
496
497
498
        if( memcmp( dct1, dct2, 32 ) )
        {
            ok = 0;
            fprintf( stderr, " - idct4x4dc :        [FAILED]\n" );
        }
499
500
        call_c2( dct_c.idct4x4dc, dct1 );
        call_a2( dct_asm.idct4x4dc, dct2 );
Laurent Aimar's avatar
Laurent Aimar committed
501
    }
502
    report( "(i)dct4x4dc :" );
Laurent Aimar's avatar
Laurent Aimar committed
503

504
505
    ok = 1; used_asm = 0;
    if( dct_asm.dct2x2dc != dct_ref.dct2x2dc )
Laurent Aimar's avatar
Laurent Aimar committed
506
    {
Loren Merritt's avatar
Loren Merritt committed
507
508
        DECLARE_ALIGNED_16( int16_t dct1[2][2] ) = {{-12, 42},{2, 90}};
        DECLARE_ALIGNED_16( int16_t dct2[2][2] ) = {{-12, 42},{2, 90}};
509
        set_func_name( "dct2x2dc" );
510
        used_asm = 1;
511
512
        call_c( dct_c.dct2x2dc, dct1 );
        call_a( dct_asm.dct2x2dc, dct2 );
Laurent Aimar's avatar
Laurent Aimar committed
513
514
515
516
517
518
        if( memcmp( dct1, dct2, 4*2 ) )
        {
            ok = 0;
            fprintf( stderr, " - dct2x2dc :        [FAILED]\n" );
        }
    }
519
    if( dct_asm.idct2x2dc != dct_ref.idct2x2dc )
Laurent Aimar's avatar
Laurent Aimar committed
520
    {
Loren Merritt's avatar
Loren Merritt committed
521
522
        DECLARE_ALIGNED_16( int16_t dct1[2][2] ) = {{-12, 42},{2, 90}};
        DECLARE_ALIGNED_16( int16_t dct2[2][2] ) = {{-12, 42},{2, 90}};
523
        set_func_name( "idct2x2dc" );
524
        used_asm = 1;
525
526
        call_c( dct_c.idct2x2dc, dct1 );
        call_a( dct_asm.idct2x2dc, dct2 );
Laurent Aimar's avatar
Laurent Aimar committed
527
528
529
530
531
532
        if( memcmp( dct1, dct2, 4*2 ) )
        {
            ok = 0;
            fprintf( stderr, " - idct2x2dc :       [FAILED]\n" );
        }
    }
533
    report( "(i)dct2x2dc :" );
Laurent Aimar's avatar
Laurent Aimar committed
534

535
536
537
538
    x264_zigzag_function_t zigzag_c;
    x264_zigzag_function_t zigzag_ref;
    x264_zigzag_function_t zigzag_asm;

Loren Merritt's avatar
Loren Merritt committed
539
540
    DECLARE_ALIGNED_16( int16_t level1[64] );
    DECLARE_ALIGNED_16( int16_t level2[64] );
541
542
543
544

#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size )   \
    if( zigzag_asm.name != zigzag_ref.name ) \
    { \
545
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
546
        used_asm = 1; \
547
548
        call_c( zigzag_c.name, t1, dct ); \
        call_a( zigzag_asm.name, t2, dct ); \
549
        if( memcmp( t1, t2, size*sizeof(int16_t) ) ) \
550
551
552
553
554
555
556
557
558
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
    }

#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
    if( zigzag_asm.name != zigzag_ref.name ) \
    { \
559
        set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
560
561
562
        used_asm = 1; \
        memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
        memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
563
564
        call_c1( zigzag_c.name, t1, buf2, buf3 );  \
        call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
565
        if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) )  \
566
567
568
569
        { \
            ok = 0; \
            fprintf( stderr, #name " [FAILED]\n" ); \
        } \
570
571
        call_c2( zigzag_c.name, t1, buf2, buf3 );  \
        call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
572
573
    }

574
    interlace = 0;
575
576
577
578
579
    x264_zigzag_init( 0, &zigzag_c, 0 );
    x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
    x264_zigzag_init( cpu_new, &zigzag_asm, 0 );

    ok = 1; used_asm = 0;
580
581
582
    TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
    TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
    TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
583
584
    report( "zigzag_frame :" );

585
    interlace = 1;
586
587
588
589
590
    x264_zigzag_init( 0, &zigzag_c, 1 );
    x264_zigzag_init( cpu_ref, &zigzag_ref, 1 );
    x264_zigzag_init( cpu_new, &zigzag_asm, 1 );

    ok = 1; used_asm = 0;
591
592
593
    TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
    TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16  );
    TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
594
595
596
597
    report( "zigzag_field :" );
#undef TEST_ZIGZAG_SCAN
#undef TEST_ZIGZAG_SUB

Laurent Aimar's avatar
Laurent Aimar committed
598
599
600
    return ret;
}

601
static int check_mc( int cpu_ref, int cpu_new )
Laurent Aimar's avatar
Laurent Aimar committed
602
{
603
604
605
    x264_mc_functions_t mc_c;
    x264_mc_functions_t mc_ref;
    x264_mc_functions_t mc_a;
606
    x264_pixel_function_t pixel;
Eric Petit's avatar
Eric Petit committed
607
608

    uint8_t *src     = &buf1[2*32+2];
609
610
611
612
    uint8_t *src2[4] = { &buf1[3*64+2], &buf1[5*64+2],
                         &buf1[7*64+2], &buf1[9*64+2] };
    uint8_t *dst1    = buf3;
    uint8_t *dst2    = buf4;
Eric Petit's avatar
Eric Petit committed
613

614
    int dx, dy, i, j, k, w;
615
    int ret = 0, ok, used_asm;
Laurent Aimar's avatar
Laurent Aimar committed
616

Eric Petit's avatar
Eric Petit committed
617
    x264_mc_init( 0, &mc_c );
618
619
    x264_mc_init( cpu_ref, &mc_ref );
    x264_mc_init( cpu_new, &mc_a );
620
    x264_pixel_init( 0, &pixel );
Laurent Aimar's avatar
Laurent Aimar committed
621

Eric Petit's avatar
Eric Petit committed
622
#define MC_TEST_LUMA( w, h ) \
623
        if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
Eric Petit's avatar
Eric Petit committed
624
        { \
625
            set_func_name( "mc_luma_%dx%d", w, h );\
626
            used_asm = 1; \
Eric Petit's avatar
Eric Petit committed
627
628
            memset(buf3, 0xCD, 1024); \
            memset(buf4, 0xCD, 1024); \
629
630
            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
            call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h ); \
631
            if( memcmp( buf3, buf4, 1024 ) ) \
Eric Petit's avatar
Eric Petit committed
632
            { \
633
634
635
636
637
638
639
                fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                ok = 0; \
            } \
        } \
        if( mc_a.get_ref != mc_ref.get_ref ) \
        { \
            uint8_t *ref = dst2; \
640
            int ref_stride = 32; \
641
            set_func_name( "get_ref_%dx%d", w, h );\
642
643
644
            used_asm = 1; \
            memset(buf3, 0xCD, 1024); \
            memset(buf4, 0xCD, 1024); \
645
646
            call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h ); \
            ref = (uint8_t*) call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h ); \
647
648
649
650
651
652
653
            for( i=0; i<h; i++ ) \
                if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
                { \
                    fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                    ok = 0; \
                    break; \
                } \
Eric Petit's avatar
Eric Petit committed
654
655
656
        }

#define MC_TEST_CHROMA( w, h ) \
657
        if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
Laurent Aimar's avatar
Laurent Aimar committed
658
        { \
659
            set_func_name( "mc_chroma_%dx%d", w, h );\
660
            used_asm = 1; \
661
662
            memset(buf3, 0xCD, 1024); \
            memset(buf4, 0xCD, 1024); \
663
664
            call_c( mc_c.mc_chroma, dst1, 16, src, 32, dx, dy, w, h ); \
            call_a( mc_a.mc_chroma, dst2, 16, src, 32, dx, dy, w, h ); \
Loren Merritt's avatar
Loren Merritt committed
665
666
667
668
            /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
            for( j=0; j<h; j++ ) \
                for( i=w; i<4; i++ ) \
                    dst2[i+j*16] = dst1[i+j*16]; \
669
            if( memcmp( buf3, buf4, 1024 ) ) \
Laurent Aimar's avatar
Laurent Aimar committed
670
            { \
671
                fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
672
                ok = 0; \
Laurent Aimar's avatar
Laurent Aimar committed
673
674
            } \
        }
675
    ok = 1; used_asm = 0;
676
    for( dy = -8; dy < 8; dy++ )
677
        for( dx = -128; dx < 128; dx++ )
Laurent Aimar's avatar
Laurent Aimar committed
678
        {
679
            if( rand()&15 ) continue; // running all of them is too slow
680
            MC_TEST_LUMA( 20, 18 );
Eric Petit's avatar
Eric Petit committed
681
682
            MC_TEST_LUMA( 16, 16 );
            MC_TEST_LUMA( 16, 8 );
683
            MC_TEST_LUMA( 12, 10 );
Eric Petit's avatar
Eric Petit committed
684
685
686
687
688
            MC_TEST_LUMA( 8, 16 );
            MC_TEST_LUMA( 8, 8 );
            MC_TEST_LUMA( 8, 4 );
            MC_TEST_LUMA( 4, 8 );
            MC_TEST_LUMA( 4, 4 );
689
690
        }
    report( "mc luma :" );
Eric Petit's avatar
Eric Petit committed
691

692
    ok = 1; used_asm = 0;
Loren Merritt's avatar
Loren Merritt committed
693
694
    for( dy = -1; dy < 9; dy++ )
        for( dx = -1; dx < 9; dx++ )
695
        {
Eric Petit's avatar
Eric Petit committed
696
697
698
699
700
701
702
            MC_TEST_CHROMA( 8, 8 );
            MC_TEST_CHROMA( 8, 4 );
            MC_TEST_CHROMA( 4, 8 );
            MC_TEST_CHROMA( 4, 4 );
            MC_TEST_CHROMA( 4, 2 );
            MC_TEST_CHROMA( 2, 4 );
            MC_TEST_CHROMA( 2, 2 );
Laurent Aimar's avatar
Laurent Aimar committed
703
        }
704
    report( "mc chroma :" );
Eric Petit's avatar
Eric Petit committed
705
706
#undef MC_TEST_LUMA
#undef MC_TEST_CHROMA
707
708
709
710
711
712
713
714

#define MC_TEST_AVG( name, ... ) \
    for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
    { \
        memcpy( buf3, buf1, 1024 ); \
        memcpy( buf4, buf1, 1024 ); \
        if( mc_a.name[i] != mc_ref.name[i] ) \
        { \
715
            set_func_name( "%s_%s", #name, pixel_names[i] );\
716
            used_asm = 1; \
717
718
            call_c1( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
            call_a1( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
719
720
721
722
723
            if( memcmp( buf3, buf4, 1024 ) )               \
            { \
                ok = 0; \
                fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
            } \
724
725
            call_c2( mc_c.name[i], buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
            call_a2( mc_a.name[i], buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
726
        } \
Laurent Aimar's avatar
Laurent Aimar committed
727
    }
728
729
    MC_TEST_AVG( avg );
    report( "mc avg :" );
730
    ok = 1; used_asm = 0;
731
732
733
734
    for( w = -64; w <= 128 && ok; w++ )
        MC_TEST_AVG( avg_weight, w );
    report( "mc wpredb :" );

735
736
    if( mc_a.hpel_filter != mc_ref.hpel_filter )
    {
737
738
739
        uint8_t *src = buf1+8+2*64;
        uint8_t *dstc[3] = { buf3+8, buf3+8+16*64, buf3+8+32*64 };
        uint8_t *dsta[3] = { buf4+8, buf4+8+16*64, buf4+8+32*64 };
740
        set_func_name( "hpel_filter" );
741
742
743
        ok = 1; used_asm = 1;
        memset( buf3, 0, 4096 );
        memset( buf4, 0, 4096 );
744
745
        call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
        call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
746
747
748
        for( i=0; i<3; i++ )
            for( j=0; j<10; j++ )
                //FIXME ideally the first pixels would match too, but they aren't actually used
749
                if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 43 ) )
750
751
752
753
754
755
756
757
758
759
760
761
762
763
                {
                    ok = 0;
                    fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
                    for( k=0; k<48; k++ )
                        printf("%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " ");
                    printf("\n");
                    for( k=0; k<48; k++ )
                        printf("%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " ");
                    printf("\n");
                    break;
                }
        report( "hpel filter :" );
    }

764
765
766
    return ret;
}

Loren Merritt's avatar
Loren Merritt committed
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
static int check_deblock( int cpu_ref, int cpu_new )
{
    x264_deblock_function_t db_c;
    x264_deblock_function_t db_ref;
    x264_deblock_function_t db_a;
    int ret = 0, ok = 1, used_asm = 0;
    int alphas[36], betas[36];
    int8_t tcs[36][4];
    int a, c, i, j;

    x264_deblock_init( 0, &db_c );
    x264_deblock_init( cpu_ref, &db_ref );
    x264_deblock_init( cpu_new, &db_a );

    /* not exactly the real values of a,b,tc but close enough */
    a = 255; c = 250;
    for( i = 35; i >= 0; i-- )
    {
        alphas[i] = a;
        betas[i] = (i+1)/2;
        tcs[i][0] = tcs[i][2] = (c+6)/10;
        tcs[i][1] = tcs[i][3] = (c+9)/20;
        a = a*9/10;
        c = c*9/10;
    }

793
#define TEST_DEBLOCK( name, align, ... ) \
Loren Merritt's avatar
Loren Merritt committed
794
795
    for( i = 0; i < 36; i++ ) \
    { \
796
        int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */\
Loren Merritt's avatar
Loren Merritt committed
797
798
        for( j = 0; j < 1024; j++ ) \
            /* two distributions of random to excersize different failure modes */\
799
800
            buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
        memcpy( buf4, buf3, 1024 ); \
Loren Merritt's avatar
Loren Merritt committed
801
802
        if( db_a.name != db_ref.name ) \
        { \
803
            set_func_name( #name );\
Loren Merritt's avatar
Loren Merritt committed
804
            used_asm = 1; \
805
806
807
            call_c1( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            call_a1( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            if( memcmp( buf3, buf4, 1024 ) ) \
Loren Merritt's avatar
Loren Merritt committed
808
809
810
811
812
            { \
                ok = 0; \
                fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
                break; \
            } \
813
814
            call_c2( db_c.name, buf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
            call_a2( db_a.name, buf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
Loren Merritt's avatar
Loren Merritt committed
815
816
817
        } \
    }

818
819
820
821
822
823
824
825
    TEST_DEBLOCK( deblock_h_luma, 0, tcs[i] );
    TEST_DEBLOCK( deblock_v_luma, 1, tcs[i] );
    TEST_DEBLOCK( deblock_h_chroma, 0, tcs[i] );
    TEST_DEBLOCK( deblock_v_chroma, 1, tcs[i] );
    TEST_DEBLOCK( deblock_h_luma_intra, 0 );
    TEST_DEBLOCK( deblock_v_luma_intra, 1 );
    TEST_DEBLOCK( deblock_h_chroma_intra, 0 );
    TEST_DEBLOCK( deblock_v_chroma_intra, 1 );
Loren Merritt's avatar
Loren Merritt committed
826
827
828
829
830
831

    report( "deblock :" );

    return ret;
}

832
833
834
835
836
static int check_quant( int cpu_ref, int cpu_new )
{
    x264_quant_function_t qf_c;
    x264_quant_function_t qf_ref;
    x264_quant_function_t qf_a;
Loren Merritt's avatar
Loren Merritt committed
837
838
839
    DECLARE_ALIGNED_16( int16_t dct1[64] );
    DECLARE_ALIGNED_16( int16_t dct2[64] );
    DECLARE_ALIGNED_16( uint8_t cqm_buf[64] );
840
841
    int ret = 0, ok, used_asm;
    int oks[2] = {1,1}, used_asms[2] = {0,0};
Loren Merritt's avatar
Loren Merritt committed
842
    int i, i_cqm, qp;
843
844
    x264_t h_buf;
    x264_t *h = &h_buf;
Loren Merritt's avatar
Loren Merritt committed
845
    memset( h, 0, sizeof(*h) );
846
847
    h->pps = h->pps_array;
    x264_param_default( &h->param );
Loren Merritt's avatar
Loren Merritt committed
848
    h->param.rc.i_qp_min = 26;
Loren Merritt's avatar
Loren Merritt committed
849
    h->param.analyse.b_transform_8x8 = 1;
850
851
852
853

    for( i_cqm = 0; i_cqm < 4; i_cqm++ )
    {
        if( i_cqm == 0 )
854
        {
855
856
            for( i = 0; i < 6; i++ )
                h->pps->scaling_list[i] = x264_cqm_flat16;
857
858
            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_FLAT;
        }
859
        else if( i_cqm == 1 )
860
        {
861
862
            for( i = 0; i < 6; i++ )
                h->pps->scaling_list[i] = x264_cqm_jvt[i];
863
864
            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_JVT;
        }
865
866
867
868
869
870
871
872
873
874
        else
        {
            if( i_cqm == 2 )
                for( i = 0; i < 64; i++ )
                    cqm_buf[i] = 10 + rand() % 246;
            else
                for( i = 0; i < 64; i++ )
                    cqm_buf[i] = 1;
            for( i = 0; i < 6; i++ )
                h->pps->scaling_list[i] = cqm_buf;
875
            h->param.i_cqm_preset = h->pps->i_cqm_preset = X264_CQM_CUSTOM;
876
877
878
879
880
881
882
        }

        x264_cqm_init( h );
        x264_quant_init( h, 0, &qf_c );
        x264_quant_init( h, cpu_ref, &qf_ref );
        x264_quant_init( h, cpu_new, &qf_a );

883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
#define INIT_QUANT8() \
        { \
            static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
            int x, y; \
            for( y = 0; y < 8; y++ ) \
                for( x = 0; x < 8; x++ ) \
                { \
                    unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
                    dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
                } \
        }

#define INIT_QUANT4() \
        { \
            static const int scale1d[4] = {4,6,4,6}; \
            int x, y; \
            for( y = 0; y < 4; y++ ) \
                for( x = 0; x < 4; x++ ) \
                { \
                    unsigned int scale = 255*scale1d[y]*scale1d[x]; \
                    dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
                } \
        }

Loren Merritt's avatar
Loren Merritt committed
907
#define TEST_QUANT_DC( name, cqm ) \
908
909
        if( qf_a.name != qf_ref.name ) \
        { \
910
            set_func_name( #name ); \
911
912
913
            used_asms[0] = 1; \
            for( qp = 51; qp > 0; qp-- ) \
            { \
Loren Merritt's avatar
Loren Merritt committed
914
915
                for( i = 0; i < 16; i++ ) \
                    dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
916
917
                call_c1( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                call_a1( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
Loren Merritt's avatar
Loren Merritt committed
918
                if( memcmp( dct1, dct2, 16*2 ) )       \
919
920
                { \
                    oks[0] = 0; \
Loren Merritt's avatar
Loren Merritt committed
921
                    fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
922
923
                    break; \
                } \
924
925
                call_c2( qf_c.name, (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                call_a2( qf_a.name, (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
926
927
928
            } \
        }

Loren Merritt's avatar
Loren Merritt committed
929
#define TEST_QUANT( qname, block, w ) \
930
931
        if( qf_a.qname != qf_ref.qname ) \
        { \
932
            set_func_name( #qname ); \
933
934
935
            used_asms[0] = 1; \
            for( qp = 51; qp > 0; qp-- ) \
            { \
Loren Merritt's avatar
Loren Merritt committed
936
                INIT_QUANT##w() \
937
938
                call_c1( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                call_a1( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
Loren Merritt's avatar
Loren Merritt committed
939
                if( memcmp( dct1, dct2, w*w*2 ) ) \
940
941
                { \
                    oks[0] = 0; \
Loren Merritt's avatar
Loren Merritt committed
942
                    fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
943
944
                    break; \
                } \
945
946
                call_c2( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                call_a2( qf_a.qname, (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
947
948
949
            } \
        }

Loren Merritt's avatar
Loren Merritt committed
950
951
952
953
954
955
        TEST_QUANT( quant_8x8, CQM_8IY, 8 );
        TEST_QUANT( quant_8x8, CQM_8PY, 8 );
        TEST_QUANT( quant_4x4, CQM_4IY, 4 );
        TEST_QUANT( quant_4x4, CQM_4PY, 4 );
        TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
        TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
956

Loren Merritt's avatar
Loren Merritt committed
957
#define TEST_DEQUANT( qname, dqname, block, w ) \
958
959
        if( qf_a.dqname != qf_ref.dqname ) \
        { \
960
            set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
961
962
963
            used_asms[1] = 1; \
            for( qp = 51; qp > 0; qp-- ) \
            { \
Loren Merritt's avatar
Loren Merritt committed
964
                INIT_QUANT##w() \
965
                call_c( qf_c.qname, (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
Loren Merritt's avatar
Loren Merritt committed
966
                memcpy( dct2, dct1, w*w*2 ); \
967
968
                call_c1( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
                call_a1( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
Loren Merritt's avatar
Loren Merritt committed
969
                if( memcmp( dct1, dct2, w*w*2 ) ) \
970
971
                { \
                    oks[1] = 0; \
Loren Merritt's avatar
Loren Merritt committed
972
                    fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
973
974
                    break; \
                } \
975
976
                call_c2( qf_c.dqname, (void*)dct1, h->dequant##w##_mf[block], qp ); \
                call_a2( qf_a.dqname, (void*)dct2, h->dequant##w##_mf[block], qp ); \
977
978
979
            } \
        }

Loren Merritt's avatar
Loren Merritt committed
980
981
982
983
        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
        TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
        TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
Loren Merritt's avatar
Loren Merritt committed
984
985

        x264_cqm_delete( h );
Laurent Aimar's avatar
Laurent Aimar committed
986
    }
987

988
    ok = oks[0]; used_asm = used_asms[0];
989
990
    report( "quant :" );

991
992
993
    ok = oks[1]; used_asm = used_asms[1];
    report( "dequant :" );

Laurent Aimar's avatar
Laurent Aimar committed
994
995
996
    return ret;
}

997
998
999
1000
static int check_intra( int cpu_ref, int cpu_new )
{
    int ret = 0, ok = 1, used_asm = 0;
    int i;
Loren Merritt's avatar
Loren Merritt committed
1001
    DECLARE_ALIGNED_16( uint8_t edge[33] );
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
    struct
    {
        x264_predict_t      predict_16x16[4+3];
        x264_predict_t      predict_8x8c[4+3];
        x264_predict8x8_t   predict_8x8[9+3];
        x264_predict_t      predict_4x4[9+3];
    } ip_c, ip_ref, ip_a;

    x264_predict_16x16_init( 0, ip_c.predict_16x16 );
    x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
    x264_predict_8x8_init( 0, ip_c.predict_8x8 );
    x264_predict_4x4_init( 0, ip_c.predict_4x4 );

    x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
    x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
    x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8 );
    x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );

    x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
    x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
    x264_predict_8x8_init( cpu_new, ip_a.predict_8x8 );
    x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );

1025
1026
    x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );

1027
#define INTRA_TEST( name, dir, w, ... ) \
1028
1029
    if( ip_a.name[dir] != ip_ref.name[dir] )\
    { \
1030
        set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
1031
1032
1033
        used_asm = 1; \
        memcpy( buf3, buf1, 32*20 );\
        memcpy( buf4, buf1, 32*20 );\
1034
1035
        call_c( ip_c.name[dir], buf3+48, ##__VA_ARGS__ );\
        call_a( ip_a.name[dir], buf4+48, ##__VA_ARGS__ );\
1036
1037
1038
1039
        if( memcmp( buf3, buf4, 32*20 ) )\
        {\
            fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
            ok = 0;\
1040
1041
1042
1043
            int j,k;\
            for(k=-1; k<16; k++)\
                printf("%2x ", edge[16+k]);\
            printf("\n");\
1044
            for(j=0; j<w; j++){\
Loren Merritt's avatar
Loren Merritt committed
1045
                printf("%2x ", edge[14-j]);\
1046
                for(k=0; k<w; k++)\
1047
1048
1049
1050
                    printf("%2x ", buf4[48+k+j*32]);\
                printf("\n");\
            }\
            printf("\n");\
1051
            for(j=0; j<w; j++){\
1052
                printf("   ");\
1053
                for(k=0; k<w; k++)\
1054
1055
1056
                    printf("%2x ", buf3[48+k+j*32]);\
                printf("\n");\
            }\
1057
1058
1059
1060
        }\
    }

    for( i = 0; i < 12; i++ )
1061
        INTRA_TEST( predict_4x4, i, 4 );
1062
    for( i = 0; i < 7; i++ )
1063
        INTRA_TEST( predict_8x8c, i, 8 );
1064
    for( i = 0; i < 7; i++ )
1065
        INTRA_TEST( predict_16x16, i, 16 );
1066
    for( i = 0; i < 12; i++ )
1067
        INTRA_TEST( predict_8x8, i, 8, edge );
1068
1069
1070
1071
1072

    report( "intra pred :" );
    return ret;
}

Loren Merritt's avatar
Loren Merritt committed
1073
1074
1075
1076
1077
1078
1079
#define DECL_CABAC(cpu) \
static void run_cabac_##cpu( uint8_t *dst )\
{\
    int i;\
    x264_cabac_t cb;\
    x264_cabac_context_init( &cb, SLICE_TYPE_P, 26, 0 );\
    x264_cabac_encode_init( &cb, dst, dst+0xff0 );\
1080
    for( i=0; i<0x1000; i++ )\
Loren Merritt's avatar
Loren Merritt committed
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
        x264_cabac_encode_decision_##cpu( &cb, buf1[i]>>1, buf1[i]&1 );\
}
DECL_CABAC(c)
#ifdef HAVE_MMX
DECL_CABAC(asm)
#else
#define run_cabac_asm run_cabac_c
#endif

/* Compare run_cabac_asm against run_cabac_c on identical buffers.
 * The check runs only when cpu_ref is 0 and the two runners are distinct
 * functions; otherwise 0 is returned without reporting anything.
 * Returns 0 on match, -1 on mismatch (set by report()). */
static int check_cabac( int cpu_ref, int cpu_new )
{
    int ret = 0, ok, used_asm = 1;

    if( cpu_ref )
        return 0;
    if( run_cabac_c == run_cabac_asm )
        return 0;

    set_func_name( "cabac_encode_decision" );

    /* Start both runs from the same buffer contents. */
    memcpy( buf4, buf3, 0x1000 );
    call_c( run_cabac_c, buf3 );
    call_a( run_cabac_asm, buf4 );

    ok = ( memcmp( buf3, buf4, 0x1000 ) == 0 );
    report( "cabac :" );
    return ret;
}

1104
/* Run every check_* group for the given pair of cpu flag sets.
 * Returns the sum of the individual results, i.e. 0 when all groups passed
 * and a negative value otherwise.
 *
 * The groups are called as separate statements so they always run in the
 * order written; as operands of a single '+' expression their evaluation
 * order would be unspecified, and they print reports and call rand(). */
int check_all_funcs( int cpu_ref, int cpu_new )
{
    int ret = 0;

    ret += check_pixel( cpu_ref, cpu_new );
    ret += check_dct( cpu_ref, cpu_new );
    ret += check_mc( cpu_ref, cpu_new );
    ret += check_intra( cpu_ref, cpu_new );
    ret += check_deblock( cpu_ref, cpu_new );
    ret += check_quant( cpu_ref, cpu_new );
    ret += check_cabac( cpu_ref, cpu_new );

    return ret;
}

1115
1116
1117
1118
int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
{
    *cpu_ref = *cpu_new;
    *cpu_new |= flags;
1119
1120
    if( *cpu_new & X264_CPU_SSE2_IS_FAST )
        *cpu_new &= ~X264_CPU_SSE2_IS_SLOW;
1121
1122
1123
    if( !quiet )
        fprintf( stderr, "x264: %s\n", name );
    return check_all_funcs( *cpu_ref, *cpu_new );
1124
1125
}

1126
int check_all_flags( void )
Laurent Aimar's avatar
Laurent Aimar committed
1127
{
1128
    int ret = 0;
1129
    int cpu0 = 0, cpu1 = 0;
1130
#ifdef HAVE_MMX
1131
1132
    if( x264_cpu_detect() & X264_CPU_MMXEXT )
    {
1133
1134
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMX" );
        ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "MMX Cache64" );
1135