/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

Loren Merritt's avatar
Loren Merritt committed
24
#include "common.h"

/* Per-architecture pixel headers, each pulled in only when the matching
 * build macro is defined. */
#ifdef HAVE_MMX
#   include "x86/pixel.h"
#endif
#ifdef ARCH_PPC
#   include "ppc/pixel.h"
#endif
#ifdef ARCH_ARM
#   include "arm/pixel.h"
#endif
#ifdef ARCH_UltraSparc
#   include "sparc/pixel.h"
#endif
Laurent Aimar's avatar
Laurent Aimar committed
38 39 40 41 42 43 44 45 46 47


/****************************************************************************
 * pixel_sad_WxH
 ****************************************************************************/
/* PIXEL_SAD_C( name, lx, ly ) defines a static function `name` that returns
 * the sum of absolute differences between two blocks of 8-bit samples that
 * are lx samples wide and ly rows high.
 *   pix1, pix2                    : top-left sample of each block
 *   i_stride_pix1, i_stride_pix2  : offset (in samples) from one row to the
 *                                   next in the respective block */
#define PIXEL_SAD_C( name, lx, ly ) \
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            i_sum += abs( pix1[x] - pix2[x] );      \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}


PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
Laurent Aimar's avatar
Laurent Aimar committed
68

69 70 71 72 73 74 75 76 77

/****************************************************************************
 * pixel_ssd_WxH
 ****************************************************************************/
/* PIXEL_SSD_C( name, lx, ly ) defines a static function `name` that returns
 * the sum of squared differences between two blocks of 8-bit samples that
 * are lx samples wide and ly rows high.
 *   pix1, pix2                    : top-left sample of each block
 *   i_stride_pix1, i_stride_pix2  : offset (in samples) from one row to the
 *                                   next in the respective block */
#define PIXEL_SSD_C( name, lx, ly ) \
static int name( uint8_t *pix1, int i_stride_pix1,  \
                 uint8_t *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            int d = pix1[x] - pix2[x];              \
            i_sum += d*d;                           \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )
98

Loren Merritt's avatar
Loren Merritt committed
99 100 101
/* Sum of squared differences over an arbitrary i_width x i_height region.
 *
 *   pf              : function table; pf->ssd[] supplies the block kernels
 *   pix1, i_pix1    : first plane and its row stride
 *   pix2, i_pix2    : second plane and its row stride
 *   i_width/height  : region size in samples
 *
 * The part of the region that tiles into 8-sample-wide columns is handled by
 * the block kernels (16-row strips first, then at most one 8-row strip); the
 * leftover right-hand columns and bottom rows are accumulated per sample.
 * Returns the total as a 64-bit value. */
int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
{
    int64_t i_ssd = 0;
    int y;
    /* The 16x16 kernel is only used when both base pointers and both strides
     * are multiples of 16. */
    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);

#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                          pix2 + y*i_pix2 + x, i_pix2 );
    /* Full 16-row strips. */
    for( y = 0; y < i_height-15; y += 16 )
    {
        int x = 0;
        if( align )
            for( ; x < i_width-15; x += 16 )
                SSD(PIXEL_16x16);
        /* Whatever the 16-wide pass did not cover, in 8-wide steps. */
        for( ; x < i_width-7; x += 8 )
            SSD(PIXEL_8x16);
    }
    /* One more 8-row strip if at least 8 rows remain. */
    if( y < i_height-7 )
        for( int x = 0; x < i_width-7; x += 8 )
            SSD(PIXEL_8x8);
#undef SSD

#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
    /* Right edge: columns past the last multiple of 8, for the rows that the
     * block kernels covered. */
    if( i_width & 7 )
    {
        for( y = 0; y < (i_height & ~7); y++ )
            for( int x = i_width & ~7; x < i_width; x++ )
                SSD1;
    }
    /* Bottom edge: rows past the last multiple of 8, across the full width. */
    if( i_height & 7 )
    {
        for( y = i_height & ~7; y < i_height; y++ )
            for( int x = 0; x < i_width; x++ )
                SSD1;
    }
#undef SSD1

    return i_ssd;
}

139

140 141 142
/****************************************************************************
 * pixel_var_wxh
 ****************************************************************************/
/* PIXEL_VAR_C( name, w ) defines a static function `name` that scans a w x w
 * block of 8-bit samples (row stride i_stride) and returns two accumulators
 * packed into one 64-bit value:
 *   bits  0..31 : sum of the samples
 *   bits 32..63 : sum of the squared samples */
#define PIXEL_VAR_C( name, w ) \
static uint64_t name( uint8_t *pix, int i_stride ) \
{                                             \
    uint32_t sum = 0, sqr = 0;                \
    for( int y = 0; y < w; y++ )              \
    {                                         \
        for( int x = 0; x < w; x++ )          \
        {                                     \
            sum += pix[x];                    \
            sqr += pix[x] * pix[x];           \
        }                                     \
        pix += i_stride;                      \
    }                                         \
    return sum + ((uint64_t)sqr << 32);       \
}

PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
PIXEL_VAR_C( x264_pixel_var_8x8,    8 )
161

162 163 164 165 166 167
/****************************************************************************
 * pixel_var2_wxh
 ****************************************************************************/
/* Variance of the difference between two 8x8 blocks.
 *
 *   pix1, i_stride1 : first block and its row stride
 *   pix2, i_stride2 : second block and its row stride
 *   ssd             : out; receives the sum of squared differences
 *
 * Returns ssd - (sum_of_differences^2 >> 6), i.e. the squared-difference
 * total with the mean component (over the 64 samples) removed. */
static int pixel_var2_8x8( uint8_t *pix1, int i_stride1, uint8_t *pix2, int i_stride2, int *ssd )
{
    /* Signed accumulators: individual differences may be negative, so the
     * running sum must not be kept in an unsigned type. */
    int sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            int diff = pix1[x] - pix2[x];
            sum += diff;
            sqr += diff * diff;
        }
        pix1 += i_stride1;
        pix2 += i_stride2;
    }
    /* |sum| <= 64*255, so sum*sum fits in an int and is sign-independent. */
    int var = sqr - (sum * sum >> 6);
    *ssd = sqr;
    return var;
}

185

186
/* 4-point Hadamard butterfly:
 *   d0 = s0+s1+s2+s3    d1 = s0-s1+s2-s3
 *   d2 = s0+s1-s2-s3    d3 = s0-s1-s2+s3
 * The temporaries are unsigned because the operands in this file are packed
 * two-lane 32-bit words; modular arithmetic keeps the lanes well-defined where
 * signed int temporaries could overflow.  All sources are read before any
 * destination is written, so destinations may alias sources. */
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    uint32_t t0 = s0 + s1;\
    uint32_t t1 = s0 - s1;\
    uint32_t t2 = s2 + s3;\
    uint32_t t3 = s2 - s3;\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
196

197 198 199 200 201 202 203 204
// Lane-wise absolute value of a packed pair of 16-bit values.
// in:     a = x + (y<<16)
// return: abs(x) + (abs(y)<<16)
static ALWAYS_INLINE uint32_t abs2( uint32_t a )
{
    /* bit 0 / bit 16 are set when the low / high lane is negative */
    uint32_t sign_bits = (a >> 15) & 0x10001;
    /* widen each sign bit into an all-ones 16-bit lane mask */
    uint32_t lane_mask = sign_bits * 0xffff;
    /* per lane: (v - 1) ^ 0xffff == -v for negative v, identity otherwise */
    return (a + lane_mask) ^ lane_mask;
}

205 206 207
/****************************************************************************
 * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 ****************************************************************************/
Laurent Aimar's avatar
Laurent Aimar committed
208

209 210 211
/* SATD of one 4x4 block: the sum of absolute values of the 4x4 Hadamard
 * transform of (pix1 - pix2), halved.
 *
 *   pix1, i_pix1 : first block and its row stride
 *   pix2, i_pix2 : second block and its row stride
 *
 * Two 16-bit coefficients are carried in each 32-bit word so that one
 * butterfly processes two columns at once. */
static NOINLINE int x264_pixel_satd_4x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    uint32_t tmp[4][2];
    uint32_t a0, a1, a2, a3, b0, b1;
    int sum = 0;
    /* Horizontal pass: per row, pack the four transform outputs into two
     * words (one output in the low 16 bits, one in the high 16 bits). */
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<16);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<16);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }
    /* Vertical pass over the packed columns, then fold both lanes. */
    for( int i = 0; i < 2; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((uint16_t)a0) + (a0>>16);
    }
    return sum >> 1;
}

234 235 236
/* SATD of an 8x4 region, computed as two side-by-side 4x4 Hadamard
 * transforms of (pix1 - pix2); returns half the sum of absolute coefficients.
 *
 *   pix1, i_pix1 : first block and its row stride
 *   pix2, i_pix2 : second block and its row stride
 *
 * Column x of the left 4x4 block lives in the low 16 bits of a word and
 * column x+4 (right block) in the high 16 bits, so each butterfly transforms
 * both blocks at once. */
static NOINLINE int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    uint32_t tmp[4][4];
    uint32_t a0, a1, a2, a3;
    uint32_t sum = 0;
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        /* Convert to unsigned before shifting: the difference may be
         * negative, and left-shifting a negative int is undefined. */
        a0 = (pix1[0] - pix2[0]) + ((uint32_t)(pix1[4] - pix2[4]) << 16);
        a1 = (pix1[1] - pix2[1]) + ((uint32_t)(pix1[5] - pix2[5]) << 16);
        a2 = (pix1[2] - pix2[2]) + ((uint32_t)(pix1[6] - pix2[6]) << 16);
        a3 = (pix1[3] - pix2[3]) + ((uint32_t)(pix1[7] - pix2[7]) << 16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Fold the two 16-bit lanes together, then halve. */
    return (((uint16_t)sum) + (sum>>16)) >> 1;
}
Laurent Aimar's avatar
Laurent Aimar committed
254

255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
/* PIXEL_SATD_C( w, h, sub ) defines x264_pixel_satd_<w>x<h> as a sum of calls
 * to the smaller kernel `sub`.  The first two calls cover rows 0-3 and 4-7 of
 * the leftmost column of sub-blocks; further pairs are added at column offset
 * 8 when w is 16, at row offset 8 when h is 16, and at both offsets when both
 * are 16. */
#define PIXEL_SATD_C( w, h, sub )\
static int x264_pixel_satd_##w##x##h( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )\
{\
    int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
            + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
    if( w==16 )\
        sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\
            + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\
    if( h==16 )\
        sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\
            + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\
    if( w==16 && h==16 )\
        sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\
            + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\
    return sum;\
}
PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 16, 8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 4,  8,  x264_pixel_satd_4x4 )
276 277


278 279 280
/* Unscaled sum of absolute values of the 8x8 Hadamard transform of
 * (pix1 - pix2).  Callers in this file apply the final rounding and scaling.
 *
 *   pix1, i_pix1 : first block and its row stride
 *   pix2, i_pix2 : second block and its row stride
 *
 * Coefficients are carried two per 32-bit word (low and high 16 bits). */
static NOINLINE int sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    uint32_t tmp[8][4];
    uint32_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    int sum = 0;
    /* Horizontal 8-point transform of each row of differences: the first
     * butterfly stage is done while packing pairs, the rest by HADAMARD4. */
    for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<16);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<16);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4+a5) + ((a4-a5)<<16);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6+a7) + ((a6-a7)<<16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0,b1,b2,b3 );
    }
    /* Vertical 8-point transform: two 4-point transforms (rows 0-3, rows 4-7)
     * followed by a final sum/difference stage fused with the abs. */
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        HADAMARD4( a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i] );
        b0  = abs2(a0+a4) + abs2(a0-a4);
        b0 += abs2(a1+a5) + abs2(a1-a5);
        b0 += abs2(a2+a6) + abs2(a2-a6);
        b0 += abs2(a3+a7) + abs2(a3-a7);
        sum += (uint16_t)b0 + (b0>>16);
    }
    return sum;
}
311

312 313 314 315
/* 8x8 SA8D score: the raw sa8d_8x8() total divided by 4 with rounding. */
static int x264_pixel_sa8d_8x8( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
    return (sum+2)>>2;
}

318 319 320 321 322 323 324
/* 16x16 SA8D score: the raw sa8d_8x8() totals of the four 8x8 quadrants are
 * added first, and the sum is then divided by 4 with a single rounding. */
static int x264_pixel_sa8d_16x16( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
            + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
            + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )
            + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 );
    return (sum+2)>>2;
}

Loren Merritt's avatar
Loren Merritt committed
327

328
/* Hadamard energy of one 8x8 block of samples with the DC term removed.
 *
 *   pix, stride : top-left sample of the block and its row stride
 *
 * Returns two totals packed into 64 bits:
 *   bits  0..31 : sum of absolute coefficients of the four 4x4 transforms
 *   bits 32..63 : sum of absolute coefficients of the 8x8 transform
 * each minus `dc`, the low 16 bits of tmp[0]+tmp[8]+tmp[16]+tmp[24] after the
 * 4x4 stage.  Coefficients are carried two per 32-bit word. */
static NOINLINE uint64_t pixel_hadamard_ac( uint8_t *pix, int stride )
{
    uint32_t tmp[32];
    uint32_t a0, a1, a2, a3, dc;
    uint32_t sum4 = 0, sum8 = 0;
    /* Horizontal pass.  Rows 0-3 are stored in tmp[0..15] and rows 4-7 in
     * tmp[16..31]; within each half, the four rows of one packed word are
     * adjacent so the next pass can read them as a group. */
    for( int i = 0; i < 8; i++, pix+=stride )
    {
        uint32_t *t = tmp + (i&3) + (i&4)*4;
        /* Convert to unsigned before shifting: the difference may be
         * negative, and left-shifting a negative int is undefined. */
        a0 = (pix[0]+pix[1]) + ((uint32_t)(pix[0]-pix[1])<<16);
        a1 = (pix[2]+pix[3]) + ((uint32_t)(pix[2]-pix[3])<<16);
        t[0] = a0 + a1;
        t[4] = a0 - a1;
        a2 = (pix[4]+pix[5]) + ((uint32_t)(pix[4]-pix[5])<<16);
        a3 = (pix[6]+pix[7]) + ((uint32_t)(pix[6]-pix[7])<<16);
        t[8] = a2 + a3;
        t[12] = a2 - a3;
    }
    /* Vertical 4-point pass: completes the 4x4 transforms.  Results are
     * written back because the 8x8 stage below builds on them. */
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[i*4+0], tmp[i*4+1], tmp[i*4+2], tmp[i*4+3] );
        tmp[i*4+0] = a0;
        tmp[i*4+1] = a1;
        tmp[i*4+2] = a2;
        tmp[i*4+3] = a3;
        sum4 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Combine the four 4x4 results into the 8x8 transform. */
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0,a1,a2,a3, tmp[i], tmp[8+i], tmp[16+i], tmp[24+i] );
        sum8 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    dc = (uint16_t)(tmp[0] + tmp[8] + tmp[16] + tmp[24]);
    /* Fold the two 16-bit lanes of each accumulator and drop the DC term. */
    int ac4 = (uint16_t)sum4 + (sum4>>16) - dc;
    int ac8 = (uint16_t)sum8 + (sum8>>16) - dc;
    return ((uint64_t)ac8<<32) + ac4;
}

/* HADAMARD_AC( w, h ) defines x264_pixel_hadamard_ac_<w>x<h>: the packed
 * pixel_hadamard_ac() results of every 8x8 block in the w x h region are
 * added, then the upper 32-bit total is divided by 4 and the lower 32-bit
 * total by 2. */
#define HADAMARD_AC(w,h) \
static uint64_t x264_pixel_hadamard_ac_##w##x##h( uint8_t *pix, int stride )\
{\
    uint64_t packed = pixel_hadamard_ac( pix, stride );\
    if( w==16 )\
        packed += pixel_hadamard_ac( pix+8, stride );\
    if( h==16 )\
    {\
        packed += pixel_hadamard_ac( pix+8*stride, stride );\
        if( w==16 )\
            packed += pixel_hadamard_ac( pix+8*stride+8, stride );\
    }\
    uint64_t upper = (packed>>34)<<32;\
    uint64_t lower = (uint32_t)packed>>1;\
    return upper + lower;\
}
HADAMARD_AC( 16, 16 )
HADAMARD_AC( 16, 8 )
HADAMARD_AC( 8, 16 )
HADAMARD_AC( 8, 8 )


Loren Merritt's avatar
Loren Merritt committed
383 384 385
/****************************************************************************
 * pixel_sad_x4
 ****************************************************************************/
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414
/* SAD_X( size ) defines x264_pixel_sad_x3_<size> and x264_pixel_sad_x4_<size>:
 * the SAD of `fenc` (row stride FENC_STRIDE) against three or four candidate
 * blocks that share the row stride i_stride.  scores[k] receives the SAD
 * against the k-th candidate. */
#define SAD_X( size ) \
static void x264_pixel_sad_x3_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
{\
    uint8_t *cand[3] = { pix0, pix1, pix2 };\
    for( int k = 0; k < 3; k++ )\
        scores[k] = x264_pixel_sad_##size( fenc, FENC_STRIDE, cand[k], i_stride );\
}\
static void x264_pixel_sad_x4_##size( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
{\
    uint8_t *cand[4] = { pix0, pix1, pix2, pix3 };\
    for( int k = 0; k < 4; k++ )\
        scores[k] = x264_pixel_sad_##size( fenc, FENC_STRIDE, cand[k], i_stride );\
}

SAD_X( 16x16 )
SAD_X( 16x8 )
SAD_X( 8x16 )
SAD_X( 8x8 )
SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )

#ifdef ARCH_UltraSparc
SAD_X( 16x16_vis )
SAD_X( 16x8_vis )
SAD_X( 8x16_vis )
SAD_X( 8x8_vis )
#endif
415

416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434
/****************************************************************************
 * pixel_satd_x4
 * no faster than single satd, but needed for satd to be a drop-in replacement for sad
 ****************************************************************************/

/* SATD_X( size, cpu ) defines x264_pixel_satd_x3_<size><cpu> and
 * x264_pixel_satd_x4_<size><cpu>: the SATD of `fenc` (row stride FENC_STRIDE)
 * against three or four candidate blocks that share the row stride i_stride,
 * one x264_pixel_satd_<size><cpu> call per candidate.  scores[k] receives the
 * result for the k-th candidate. */
#define SATD_X( size, cpu ) \
static void x264_pixel_satd_x3_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, int i_stride, int scores[3] )\
{\
    uint8_t *cand[3] = { pix0, pix1, pix2 };\
    for( int k = 0; k < 3; k++ )\
        scores[k] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, cand[k], i_stride );\
}\
static void x264_pixel_satd_x4_##size##cpu( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2, uint8_t *pix3, int i_stride, int scores[4] )\
{\
    uint8_t *cand[4] = { pix0, pix1, pix2, pix3 };\
    for( int k = 0; k < 4; k++ )\
        scores[k] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, cand[k], i_stride );\
}
435
#define SATD_X_DECL6( cpu )\
436 437 438 439
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
440 441
SATD_X( 8x4, cpu )\
SATD_X( 4x8, cpu )
442
#define SATD_X_DECL7( cpu )\
443
SATD_X_DECL6( cpu )\
444 445 446 447 448
SATD_X( 4x4, cpu )

SATD_X_DECL7()
#ifdef HAVE_MMX
SATD_X_DECL7( _mmxext )
449
SATD_X_DECL6( _sse2 )
Fiona Glaser's avatar
Fiona Glaser committed
450
SATD_X_DECL7( _ssse3 )
451
SATD_X_DECL7( _sse4 )
452 453
#endif

454 455 456 457
#ifdef HAVE_ARMV6
SATD_X_DECL7( _neon )
#endif

Loren Merritt's avatar
Loren Merritt committed
458 459 460
/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
461 462 463 464
/* Gathers the raw sums needed for SSIM over two horizontally adjacent 4x4
 * blocks.
 *
 *   pix1, stride1 : first image, top-left sample of the left block, row stride
 *   pix2, stride2 : second image, same position, row stride
 *   sums          : out; for block z (0 = left, 1 = right):
 *                     sums[z][0] = sum of pix1 samples
 *                     sums[z][1] = sum of pix2 samples
 *                     sums[z][2] = sum of pix1^2 + pix2^2
 *                     sums[z][3] = sum of pix1*pix2 */
static void ssim_4x4x2_core( const uint8_t *pix1, int stride1,
                             const uint8_t *pix2, int stride2,
                             int sums[2][4])
{
    for( int z = 0; z < 2; z++ )
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
            {
                int a = pix1[x+y*stride1];
                int b = pix2[x+y*stride2];
                s1  += a;
                s2  += b;
                ss  += a*a;
                ss  += b*b;
                s12 += a*b;
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        /* step to the next 4x4 block on the right */
        pix1 += 4;
        pix2 += 4;
    }
}

/* Turns accumulated window sums into one SSIM value.
 *
 *   s1, s2 : sums of the samples of the first / second image
 *   ss     : sum of the squared samples of both images
 *   s12    : sum of the products of co-located samples
 *
 * The factors of 64 in the variance/covariance terms and in the constants
 * correspond to a 64-sample window (the only caller in this file, ssim_end4,
 * adds four 4x4 sums per argument). */
static float ssim_end1( int s1, int s2, int ss, int s12 )
{
    static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
    static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
    int vars = ss*64 - s1*s1 - s2*s2;
    int covar = s12*64 - s1*s2;
    return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
         / ((float)(s1*s1 + s2*s2 + ssim_c1) * (float)(vars + ssim_c2));
}

/* Sums the SSIM of `width` horizontally consecutive windows.
 *
 *   sum0, sum1 : two rows of per-4x4-block sums (layout as written by
 *                ssim_4x4x2_core)
 *   width      : number of windows to evaluate
 *
 * Window i combines entries i and i+1 of both rows, so each row is read up to
 * index `width` inclusive. */
static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    float ssim = 0.0;
    for( int i = 0; i < width; i++ )
        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
    return ssim;
}

/* Accumulated SSIM over a width x height region.
 *
 *   pf             : function table supplying ssim_4x4x2_core and ssim_end4
 *   pix1, stride1  : first image and its row stride
 *   pix2, stride2  : second image and its row stride
 *   width, height  : region size in samples
 *   buf            : caller-provided scratch; it is split into two rows of
 *                    int[4] entries, the second starting width/4+3 entries
 *                    after the first
 *
 * Returns the sum (not the mean) of the per-window SSIM values. */
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           uint8_t *pix1, int stride1,
                           uint8_t *pix2, int stride2,
                           int width, int height, void *buf )
{
    int z = 0;
    float ssim = 0.0;
    int (*sum0)[4] = buf;
    int (*sum1)[4] = sum0 + width/4+3;
    /* From here on width and height count 4x4 blocks. */
    width >>= 2;
    height >>= 2;
    for( int y = 1; y < height; y++ )
    {
        /* Fill block rows up to and including y; each new row goes into the
         * buffer that held the older of the two rows. */
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( int x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        /* Evaluate the windows spanning block rows y-1 and y, up to four per call. */
        for( int x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    return ssim;
}


Loren Merritt's avatar
Loren Merritt committed
535 536 537
/****************************************************************************
 * successive elimination
 ****************************************************************************/
538 539
/* Candidate filter using four reference sums per position.
 *
 * For each i in [0, width) computes
 *   |enc_dc[0]-sums[i]| + |enc_dc[1]-sums[i+8]|
 * + |enc_dc[2]-sums[i+delta]| + |enc_dc[3]-sums[i+delta+8]| + cost_mvx[i]
 * and appends i to mvs when the result is below thresh.
 * Returns the number of indices written to mvs. */
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[8] )
                + abs( enc_dc[2] - sums[delta] )
                + abs( enc_dc[3] - sums[delta+8] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

555 556
/* Candidate filter using two reference sums per position.
 *
 * For each i in [0, width) computes
 *   |enc_dc[0]-sums[i]| + |enc_dc[1]-sums[i+delta]| + cost_mvx[i]
 * and appends i to mvs when the result is below thresh.
 * Returns the number of indices written to mvs. */
static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[delta] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

570 571
/* Candidate filter using one reference sum per position.
 *
 * For each i in [0, width) computes |enc_dc[0]-sums[i]| + cost_mvx[i] and
 * appends i to mvs when the result is below thresh.  `delta` is accepted for
 * signature parity with the ads2/ads4 variants and is not used.
 * Returns the number of indices written to mvs. */
static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i<width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}


Laurent Aimar's avatar
Laurent Aimar committed
585 586 587 588 589
/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
590 591
    memset( pixf, 0, sizeof(*pixf) );

592 593 594 595 596 597 598 599 600 601
#define INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
    pixf->name1[PIXEL_16x8]  = x264_pixel_##name2##_16x8##cpu;
#define INIT4_NAME( name1, name2, cpu ) \
    INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x16]  = x264_pixel_##name2##_8x16##cpu;\
    pixf->name1[PIXEL_8x8]   = x264_pixel_##name2##_8x8##cpu;
#define INIT5_NAME( name1, name2, cpu ) \
    INIT4_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x4]   = x264_pixel_##name2##_8x4##cpu;
602
#define INIT6_NAME( name1, name2, cpu ) \
603
    INIT5_NAME( name1, name2, cpu ) \
604 605 606
    pixf->name1[PIXEL_4x8]   = x264_pixel_##name2##_4x8##cpu;
#define INIT7_NAME( name1, name2, cpu ) \
    INIT6_NAME( name1, name2, cpu ) \
607 608 609 610
    pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
611
#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
612
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
613

614 615 616 617 618
#define INIT_ADS( cpu ) \
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;

619
    INIT7( sad, );
620
    INIT7_NAME( sad_aligned, sad, );
621 622 623 624
    INIT7( sad_x3, );
    INIT7( sad_x4, );
    INIT7( ssd, );
    INIT7( satd, );
625 626
    INIT7( satd_x3, );
    INIT7( satd_x4, );
Loren Merritt's avatar
Loren Merritt committed
627
    INIT4( hadamard_ac, );
628
    INIT_ADS( );
629

630 631
    pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
    pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
632 633 634
    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;

635 636
    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
    pixf->ssim_end4 = ssim_end4;
637
    pixf->var2_8x8 = pixel_var2_8x8;
638

639
#ifdef HAVE_MMX
640 641
    if( cpu&X264_CPU_MMX )
    {
642
        INIT7( ssd, _mmx );
643 644
    }

Laurent Aimar's avatar
Laurent Aimar committed
645 646
    if( cpu&X264_CPU_MMXEXT )
    {
647
        INIT7( sad, _mmxext );
648
        INIT7_NAME( sad_aligned, sad, _mmxext );
649 650 651
        INIT7( sad_x3, _mmxext );
        INIT7( sad_x4, _mmxext );
        INIT7( satd, _mmxext );
652 653
        INIT7( satd_x3, _mmxext );
        INIT7( satd_x4, _mmxext );
Loren Merritt's avatar
Loren Merritt committed
654
        INIT4( hadamard_ac, _mmxext );
655
        INIT_ADS( _mmxext );
656 657
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmxext;
658 659 660
#ifdef ARCH_X86
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
661
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
662
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
663
        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
664

665
        if( cpu&X264_CPU_CACHELINE_32 )
666
        {
667 668 669 670 671 672 673 674 675
            INIT5( sad, _cache32_mmxext );
            INIT4( sad_x3, _cache32_mmxext );
            INIT4( sad_x4, _cache32_mmxext );
        }
        else if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT5( sad, _cache64_mmxext );
            INIT4( sad_x3, _cache64_mmxext );
            INIT4( sad_x4, _cache64_mmxext );
676 677
        }
#else
678
        if( cpu&X264_CPU_CACHELINE_64 )
679 680 681 682 683 684 685 686 687
        {
            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmxext;
            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmxext;
        }
688
#endif
689
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
690
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmxext;
691
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
692
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
693
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmxext;
694
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
695
        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmxext;
Laurent Aimar's avatar
Laurent Aimar committed
696
    }
697

698 699 700 701 702 703 704 705 706 707 708 709
    if( cpu&X264_CPU_SSE2 )
    {
        INIT5( ssd, _sse2slow );
        INIT2_NAME( sad_aligned, sad, _sse2_aligned );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
#ifdef ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
710
        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
711 712
    }

713
    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
714
    {
715 716 717
        INIT2( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
718 719 720
        INIT6( satd, _sse2 );
        INIT6( satd_x3, _sse2 );
        INIT6( satd_x4, _sse2 );
721 722 723 724
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse2 );
        }
725
        INIT_ADS( _sse2 );
726
        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
727
        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
728
        if( cpu&X264_CPU_CACHELINE_64 )
729
        {
730
            INIT2( ssd, _sse2); /* faster for width 16 on p4 */
731
#ifdef ARCH_X86
732 733 734 735
            INIT2( sad, _cache64_sse2 );
            INIT2( sad_x3, _cache64_sse2 );
            INIT2( sad_x4, _cache64_sse2 );
#endif
736 737 738 739 740 741 742
           if( cpu&X264_CPU_SSE2_IS_FAST )
           {
               pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
               pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
           }
        }

Fiona Glaser's avatar
Fiona Glaser committed
743 744 745 746 747
        if( cpu&X264_CPU_SSE_MISALIGN )
        {
            INIT2( sad_x3, _sse2_misalign );
            INIT2( sad_x4, _sse2_misalign );
        }
748
    }
749

750 751 752 753 754 755 756 757 758 759 760 761
    if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
    {
        pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
        pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2;
        pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2;
        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
        pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2;
        pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2;
    }

762
    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
763 764 765 766 767 768
    {
        INIT2( sad, _sse3 );
        INIT2( sad_x3, _sse3 );
        INIT2( sad_x4, _sse3 );
    }

769 770
    if( cpu&X264_CPU_SSSE3 )
    {
771 772 773 774
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _ssse3 );
        }
775
        INIT_ADS( _ssse3 );
776 777 778 779 780 781 782 783 784
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            INIT7( ssd, _ssse3 );
            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
            INIT7( satd, _ssse3 );
            INIT7( satd_x3, _ssse3 );
            INIT7( satd_x4, _ssse3 );
        }
785
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
786
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
787
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
Fiona Glaser's avatar
Fiona Glaser committed
788
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
789 790 791
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
Loren Merritt's avatar
Loren Merritt committed
792
#endif
793
        pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
794
        if( cpu&X264_CPU_CACHELINE_64 )
795 796 797 798 799
        {
            INIT2( sad, _cache64_ssse3 );
            INIT2( sad_x3, _cache64_ssse3 );
            INIT2( sad_x4, _cache64_ssse3 );
        }
800
        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
801
        {
802
            INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
803
        }
804
    }
805 806 807

    if( cpu&X264_CPU_SSE4 )
    {
808 809 810 811 812 813 814 815 816
        INIT7( satd, _sse4 );
        INIT7( satd_x3, _sse4 );
        INIT7( satd_x4, _sse4 );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse4 );
        }
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
817
    }
818
#endif //HAVE_MMX
819

820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860
#ifdef HAVE_ARMV6
    if( cpu&X264_CPU_ARMV6 )
    {
        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
    }
    if( cpu&X264_CPU_NEON )
    {
        INIT5( sad, _neon );
        INIT5( sad_aligned, _neon );
        INIT7( sad_x3, _neon );
        INIT7( sad_x4, _neon );
        INIT7( ssd, _neon );
        INIT7( satd, _neon );
        INIT7( satd_x3, _neon );
        INIT7( satd_x4, _neon );
        INIT4( hadamard_ac, _neon );
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
        pixf->var2_8x8          = x264_pixel_var2_8x8_neon;

        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;

        if( cpu&X264_CPU_FAST_NEON_MRC )
        {
            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
        }
        else    // really just scheduled for dual issue / A8
        {
            INIT5( sad_aligned, _neon_dual );
        }
    }
#endif
861
#ifdef HAVE_ALTIVEC
Laurent Aimar's avatar
Laurent Aimar committed
862 863 864 865 866
    if( cpu&X264_CPU_ALTIVEC )
    {
        x264_pixel_altivec_init( pixf );
    }
#endif
867
#ifdef ARCH_UltraSparc
868 869 870
    INIT4( sad, _vis );
    INIT4( sad_x3, _vis );
    INIT4( sad_x4, _vis );
871
#endif
Loren Merritt's avatar
Loren Merritt committed
872 873 874 875 876

    pixf->ads[PIXEL_8x16] =
    pixf->ads[PIXEL_8x4] =
    pixf->ads[PIXEL_4x8] = pixf->ads[PIXEL_16x8];
    pixf->ads[PIXEL_4x4] = pixf->ads[PIXEL_8x8];
Laurent Aimar's avatar
Laurent Aimar committed
877 878
}