/*****************************************************************************
 * pixel.c: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

Loren Merritt's avatar
Loren Merritt committed
24
#include "common.h"
Laurent Aimar's avatar
Laurent Aimar committed
25

Steven Walters's avatar
Steven Walters committed
26
#if HAVE_MMX
27
#   include "x86/pixel.h"
Laurent Aimar's avatar
Laurent Aimar committed
28
#endif
Steven Walters's avatar
Steven Walters committed
29
#if ARCH_PPC
Laurent Aimar's avatar
Laurent Aimar committed
30 31
#   include "ppc/pixel.h"
#endif
Steven Walters's avatar
Steven Walters committed
32
#if ARCH_ARM
33 34
#   include "arm/pixel.h"
#endif
Steven Walters's avatar
Steven Walters committed
35
#if ARCH_UltraSparc
36 37
#   include "sparc/pixel.h"
#endif
Laurent Aimar's avatar
Laurent Aimar committed
38 39 40 41 42 43


/****************************************************************************
 * pixel_sad_WxH
 ****************************************************************************/
/* PIXEL_SAD_C( name, lx, ly ): defines a static function `name` returning the
 * sum of absolute differences between two blocks that are lx samples wide and
 * ly rows tall.  pix1/pix2 point at the first sample of each block; the
 * i_stride_* arguments are the per-row pointer increments, in samples. */
#define PIXEL_SAD_C( name, lx, ly ) \
static int name( pixel *pix1, int i_stride_pix1,  \
                 pixel *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
            i_sum += abs( pix1[x] - pix2[x] );      \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

PIXEL_SAD_C( x264_pixel_sad_16x16, 16, 16 )
PIXEL_SAD_C( x264_pixel_sad_16x8,  16,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x16,   8, 16 )
PIXEL_SAD_C( x264_pixel_sad_8x8,    8,  8 )
PIXEL_SAD_C( x264_pixel_sad_8x4,    8,  4 )
PIXEL_SAD_C( x264_pixel_sad_4x8,    4,  8 )
PIXEL_SAD_C( x264_pixel_sad_4x4,    4,  4 )
Laurent Aimar's avatar
Laurent Aimar committed
68

69 70 71 72 73

/****************************************************************************
 * pixel_ssd_WxH
 ****************************************************************************/
/* PIXEL_SSD_C( name, lx, ly ): defines a static function `name` returning the
 * sum of squared differences between two blocks that are lx samples wide and
 * ly rows tall.  The i_stride_* arguments are the per-row pointer increments,
 * in samples.  The total is accumulated in a plain int. */
#define PIXEL_SSD_C( name, lx, ly ) \
static int name( pixel *pix1, int i_stride_pix1,  \
                 pixel *pix2, int i_stride_pix2 ) \
{                                                   \
    int i_sum = 0;                                  \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        for( int x = 0; x < lx; x++ )               \
        {                                           \
            int d = pix1[x] - pix2[x];              \
            i_sum += d*d;                           \
        }                                           \
        pix1 += i_stride_pix1;                      \
        pix2 += i_stride_pix2;                      \
    }                                               \
    return i_sum;                                   \
}

PIXEL_SSD_C( x264_pixel_ssd_16x16, 16, 16 )
PIXEL_SSD_C( x264_pixel_ssd_16x8,  16,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x16,   8, 16 )
PIXEL_SSD_C( x264_pixel_ssd_8x8,    8,  8 )
PIXEL_SSD_C( x264_pixel_ssd_8x4,    8,  4 )
PIXEL_SSD_C( x264_pixel_ssd_4x8,    4,  8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4,    4,  4 )
98

99
/* Sum of squared differences over an arbitrary i_width x i_height region.
 *
 * pf              function table supplying the fixed-size ssd[] kernels
 * pix1, i_pix1    first plane and its row stride (in samples)
 * pix2, i_pix2    second plane and its row stride (in samples)
 *
 * The region is tiled with 16x16 / 8x16 kernels over full 16-row bands, then
 * one band of 8x8 kernels if at least 8 rows remain.  Whatever the kernels
 * did not cover (right-hand columns when the width is not a multiple of 8,
 * bottom rows when the height is not a multiple of 8) is summed sample by
 * sample.  Returns the total as a 64-bit value. */
uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
{
    uint64_t i_ssd = 0;
    int y;
    /* The 16-wide kernel is only used when the low four bits of both
     * pointers and both strides are all zero. */
    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);

#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                          pix2 + y*i_pix2 + x, i_pix2 );
    for( y = 0; y < i_height-15; y += 16 )
    {
        int x = 0;
        if( align )
            for( ; x < i_width-15; x += 16 )
                SSD(PIXEL_16x16);
        /* Picks up where the 16-wide loop stopped (or at 0 if unaligned). */
        for( ; x < i_width-7; x += 8 )
            SSD(PIXEL_8x16);
    }
    /* y is now the first row not covered by a 16-row band. */
    if( y < i_height-7 )
        for( int x = 0; x < i_width-7; x += 8 )
            SSD(PIXEL_8x8);
#undef SSD

#define SSD1 { int d = pix1[y*i_pix1+x] - pix2[y*i_pix2+x]; i_ssd += d*d; }
    /* Right edge: columns beyond the last multiple of 8, for the kernel-covered rows. */
    if( i_width & 7 )
    {
        for( y = 0; y < (i_height & ~7); y++ )
            for( int x = i_width & ~7; x < i_width; x++ )
                SSD1;
    }
    /* Bottom edge: rows beyond the last multiple of 8, full width. */
    if( i_height & 7 )
    {
        for( y = i_height & ~7; y < i_height; y++ )
            for( int x = 0; x < i_width; x++ )
                SSD1;
    }
#undef SSD1

    return i_ssd;
}

139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
/* Sum of squared differences over interleaved two-component rows.
 * For every x in [0,width) the samples at 2*x and 2*x+1 are treated as one
 * pair; the first component is accumulated into ssd_u, the second into ssd_v.
 * stride1/stride2 are per-row pointer increments in samples.
 * Returns ssd_u in the low 32 bits and ssd_v in the high 32 bits; both
 * accumulators are 32-bit and wrap on overflow. */
static uint64_t pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height )
{
    uint32_t ssd_u = 0;
    uint32_t ssd_v = 0;
    for( int row = 0; row < height; row++ )
    {
        for( int pair = 0; pair < width; pair++ )
        {
            int du = pixuv1[2*pair]   - pixuv2[2*pair];
            int dv = pixuv1[2*pair+1] - pixuv2[2*pair+1];
            ssd_u += du*du;
            ssd_v += dv*dv;
        }
        pixuv1 += stride1;
        pixuv2 += stride2;
    }
    return ((uint64_t)ssd_v << 32) + ssd_u;
}

// SSD in uint32 (i.e. packing two into uint64) can potentially overflow on
// image widths >= 11008 (or 6604 if interlaced), since this is called on blocks
// of height up to 12 (resp 20). Though it will probably take significantly more
// than that at sane distortion levels.
/* Sum of squared differences for an interleaved two-component plane.
 * i_width is counted in pairs (the core reads samples 2*x and 2*x+1 per x).
 * The pf->ssd_nv12_core kernel handles the largest multiple of 8 pairs; any
 * remaining pairs go through the C core.  Returns the two sums packed as the
 * core does (first component low 32 bits, second component high 32 bits). */
uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
{
    int i_width8 = i_width & ~7;
    uint64_t ssd = pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width8, i_height );
    if( i_width & 7 )
    {
        /* Each pair occupies two consecutive samples, so the pairs the kernel
         * did not cover start 2*i_width8 samples into the row. */
        ssd += pixel_ssd_nv12_core( pix1 + 2*i_width8, i_pix1, pix2 + 2*i_width8, i_pix2, i_width & 7, i_height );
    }
    return ssd;
}
164

165 166 167
/****************************************************************************
 * pixel_var_wxh
 ****************************************************************************/
Fiona Glaser's avatar
Fiona Glaser committed
168
/* PIXEL_VAR_C( name, w ): defines a static function `name` that walks a w x w
 * block and returns the sum of its samples in the low 32 bits and the sum of
 * their squares in the high 32 bits.  i_stride is the per-row pointer
 * increment in samples. */
#define PIXEL_VAR_C( name, w ) \
static uint64_t name( pixel *pix, int i_stride ) \
{                                             \
    uint32_t sum = 0, sqr = 0;                \
    for( int y = 0; y < w; y++ )              \
    {                                         \
        for( int x = 0; x < w; x++ )          \
        {                                     \
            sum += pix[x];                    \
            sqr += pix[x] * pix[x];           \
        }                                     \
        pix += i_stride;                      \
    }                                         \
    return sum + ((uint64_t)sqr << 32);       \
}

PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
PIXEL_VAR_C( x264_pixel_var_8x8,    8 )
186

187 188 189
/****************************************************************************
 * pixel_var2_wxh
 ****************************************************************************/
190
/* Statistics of the difference between two 8x8 blocks.
 * Writes the sum of squared differences to *ssd and returns
 * sqr - (sum*sum >> 6), where sum is the sum of the 64 differences and sqr
 * the sum of their squares.  Strides are per-row pointer increments in
 * samples. */
static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd )
{
    /* The running sum can be negative, so keep it signed; squaring it below
     * makes the sign irrelevant. */
    int sum = 0;
    uint32_t sqr = 0;
    for( int y = 0; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            int diff = pix1[x] - pix2[x];
            sum += diff;
            sqr += diff * diff;
        }
        pix1 += i_stride1;
        pix2 += i_stride2;
    }
    /* Square in 64 bits, then reduce modulo 2^32 as the 32-bit result does. */
    uint32_t var = sqr - (uint32_t)(((int64_t)sum * sum) >> 6);
    *ssd = sqr;
    return var;
}

210

211
/* HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ): 4-point butterfly.
 *   d0 = s0+s1+s2+s3    d1 = s0-s1+s2-s3
 *   d2 = s0+s1-s2-s3    d3 = s0-s1-s2+s3
 * All sources are read into temporaries before any destination is written,
 * so destinations may alias sources.  Arguments are parenthesized so that
 * expression arguments (e.g. `b-c`) are evaluated as a unit.  Expands to a
 * braced block that declares locals t0..t3. */
#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
    int t0 = (s0) + (s1);\
    int t1 = (s0) - (s1);\
    int t2 = (s2) + (s3);\
    int t3 = (s2) - (s3);\
    (d0) = t0 + t2;\
    (d2) = t0 - t2;\
    (d1) = t1 + t3;\
    (d3) = t1 - t3;\
}
221

222 223 224 225 226 227 228 229
// in: a pseudo-simd number of the form x+(y<<16)
// return: abs(x)+(abs(y)<<16)
static ALWAYS_INLINE uint32_t abs2( uint32_t a )
{
    // Move bit 15 and bit 31 of a down to bits 0 and 16, then multiply by
    // 0xffff: s holds 0xffff in each 16-bit half whose top bit was set in a,
    // and 0 in the other half.
    uint32_t s = ((a>>15)&0x10001)*0xffff;
    // Add-then-xor with s: a half whose mask is 0 passes through unchanged.
    return (a+s)^s;
}

230 231 232
/****************************************************************************
 * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
 ****************************************************************************/
Laurent Aimar's avatar
Laurent Aimar committed
233

234
/* 4x4 SATD: Hadamard-transform the 4x4 difference block and sum the absolute
 * values of the coefficients, halved.  i_pix1/i_pix2 are per-row pointer
 * increments in samples.
 * Two 16-bit values are packed into each uint32_t (x + (y<<16)) so that one
 * 32-bit operation processes two lanes at once. */
static NOINLINE int x264_pixel_satd_4x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    uint32_t tmp[4][2];
    uint32_t a0, a1, a2, a3, b0, b1;
    int sum = 0;
    /* Horizontal pass: one row of differences per iteration, with the
     * sum/difference of each neighbouring pair packed into b0 and b1. */
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<16);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<16);
        tmp[i][0] = b0 + b1;
        tmp[i][1] = b0 - b1;
    }
    /* Vertical pass on the packed columns, then fold both 16-bit lanes of the
     * absolute values into the scalar total. */
    for( int i = 0; i < 2; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        a0 = abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
        sum += ((uint16_t)a0) + (a0>>16);
    }
    return sum >> 1;
}

259
/* SATD of an 8-wide, 4-tall region, computed as two side-by-side 4x4
 * transforms: columns 0..3 live in the low 16 bits of each packed word and
 * columns 4..7 in the high 16 bits.  Returns the combined total, halved.
 * i_pix1/i_pix2 are per-row pointer increments in samples. */
static NOINLINE int x264_pixel_satd_8x4( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    uint32_t tmp[4][4];
    uint32_t a0, a1, a2, a3;
    int sum = 0;
    /* Horizontal pass: pack the left and right 4x4 differences, transform rows. */
    for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
        a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
        a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
        a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
    }
    /* Vertical pass; sum still holds two packed 16-bit lanes here. */
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Add the two lanes together and halve. */
    return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
}
Laurent Aimar's avatar
Laurent Aimar committed
279

280
#define PIXEL_SATD_C( w, h, sub )\
281
static int x264_pixel_satd_##w##x##h( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )\
282 283 284 285 286 287 288 289 290 291 292 293 294
{\
    int sum = sub( pix1, i_pix1, pix2, i_pix2 )\
            + sub( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 );\
    if( w==16 )\
        sum+= sub( pix1+8, i_pix1, pix2+8, i_pix2 )\
            + sub( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 );\
    if( h==16 )\
        sum+= sub( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )\
            + sub( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 );\
    if( w==16 && h==16 )\
        sum+= sub( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 )\
            + sub( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 );\
    return sum;\
295
}
296 297 298 299 300
PIXEL_SATD_C( 16, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 16, 8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8,  8,  x264_pixel_satd_8x4 )
PIXEL_SATD_C( 4,  8,  x264_pixel_satd_4x4 )
301 302


303
/* Unnormalized 8x8 SA8D: sum of absolute values of the 8x8 Hadamard transform
 * of the difference block.  Callers apply the final rounding/scaling.
 * Values are packed two 16-bit lanes per uint32_t (x + (y<<16)).
 * i_pix1/i_pix2 are per-row pointer increments in samples. */
static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    uint32_t tmp[8][4];
    uint32_t a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3;
    int sum = 0;
    /* Horizontal pass: pair up neighbouring differences (sum in the low lane,
     * difference in the high lane), then butterfly the four packed words. */
    for( int i = 0; i < 8; i++, pix1 += i_pix1, pix2 += i_pix2 )
    {
        a0 = pix1[0] - pix2[0];
        a1 = pix1[1] - pix2[1];
        b0 = (a0+a1) + ((a0-a1)<<16);
        a2 = pix1[2] - pix2[2];
        a3 = pix1[3] - pix2[3];
        b1 = (a2+a3) + ((a2-a3)<<16);
        a4 = pix1[4] - pix2[4];
        a5 = pix1[5] - pix2[5];
        b2 = (a4+a5) + ((a4-a5)<<16);
        a6 = pix1[6] - pix2[6];
        a7 = pix1[7] - pix2[7];
        b3 = (a6+a7) + ((a6-a7)<<16);
        HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], b0,b1,b2,b3 );
    }
    /* Vertical pass: two 4-point butterflies (rows 0..3 and 4..7) followed by
     * the final sum/difference stage, taken inside abs2(). */
    for( int i = 0; i < 4; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );
        HADAMARD4( a4, a5, a6, a7, tmp[4][i], tmp[5][i], tmp[6][i], tmp[7][i] );
        b0  = abs2(a0+a4) + abs2(a0-a4);
        b0 += abs2(a1+a5) + abs2(a1-a5);
        b0 += abs2(a2+a6) + abs2(a2-a6);
        b0 += abs2(a3+a7) + abs2(a3-a7);
        /* Fold both 16-bit lanes into the scalar total. */
        sum += (uint16_t)b0 + (b0>>16);
    }
    return sum;
}
336

337
/* 8x8 SA8D: the raw sa8d_8x8() total divided by 4 with rounding. */
static int x264_pixel_sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 );
    return (sum+2)>>2;
}

343
/* 16x16 SA8D: raw sa8d_8x8() totals of the four 8x8 quadrants are added
 * first, and the combined total is divided by 4 with rounding once. */
static int x264_pixel_sa8d_16x16( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    int sum = sa8d_8x8( pix1, i_pix1, pix2, i_pix2 )
            + sa8d_8x8( pix1+8, i_pix1, pix2+8, i_pix2 )
            + sa8d_8x8( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 )
            + sa8d_8x8( pix1+8+8*i_pix1, i_pix1, pix2+8+8*i_pix2, i_pix2 );
    return (sum+2)>>2;
}

Loren Merritt's avatar
Loren Merritt committed
352

353
/* Hadamard energy of one 8x8 block of samples (no reference block).
 * Produces two totals of absolute transform coefficients, each with the
 * packed DC term subtracted: sum4 is taken after the second butterfly stage,
 * sum8 after the third.  Returns sum8 in the high 32 bits and sum4 in the
 * low 32 bits; callers apply the final scaling.
 * Values are packed two 16-bit lanes per uint32_t (x + (y<<16)); stride is
 * the per-row pointer increment in samples. */
static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, int stride )
{
    uint32_t tmp[32];
    uint32_t a0, a1, a2, a3, dc;
    int sum4 = 0, sum8 = 0;
    /* Horizontal pass.  Row i writes to tmp[(i&3) + (i&4)*4 + {0,4,8,12}], so
     * rows 0..3 fill tmp[0..15] and rows 4..7 fill tmp[16..31], with the four
     * rows of one group adjacent for the next pass. */
    for( int i = 0; i < 8; i++, pix+=stride )
    {
        uint32_t *t = tmp + (i&3) + (i&4)*4;
        a0 = (pix[0]+pix[1]) + ((pix[0]-pix[1])<<16);
        a1 = (pix[2]+pix[3]) + ((pix[2]-pix[3])<<16);
        t[0] = a0 + a1;
        t[4] = a0 - a1;
        a2 = (pix[4]+pix[5]) + ((pix[4]-pix[5])<<16);
        a3 = (pix[6]+pix[7]) + ((pix[6]-pix[7])<<16);
        t[8] = a2 + a3;
        t[12] = a2 - a3;
    }
    /* Butterfly each group of four adjacent words in place; accumulate sum4. */
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0, a1, a2, a3, tmp[i*4+0], tmp[i*4+1], tmp[i*4+2], tmp[i*4+3] );
        tmp[i*4+0] = a0;
        tmp[i*4+1] = a1;
        tmp[i*4+2] = a2;
        tmp[i*4+3] = a3;
        sum4 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Butterfly across the four groups (stride 8 in tmp); accumulate sum8. */
    for( int i = 0; i < 8; i++ )
    {
        HADAMARD4( a0,a1,a2,a3, tmp[i], tmp[8+i], tmp[16+i], tmp[24+i] );
        sum8 += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    }
    /* Low lane of the summed first words of each group is the DC term. */
    dc = (uint16_t)(tmp[0] + tmp[8] + tmp[16] + tmp[24]);
    /* Fold both lanes of each packed total together and remove DC. */
    sum4 = (uint16_t)sum4 + ((uint32_t)sum4>>16) - dc;
    sum8 = (uint16_t)sum8 + ((uint32_t)sum8>>16) - dc;
    return ((uint64_t)sum8<<32) + sum4;
}

#define HADAMARD_AC(w,h) \
391
static uint64_t x264_pixel_hadamard_ac_##w##x##h( pixel *pix, int stride )\
Loren Merritt's avatar
Loren Merritt committed
392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
{\
    uint64_t sum = pixel_hadamard_ac( pix, stride );\
    if( w==16 )\
        sum += pixel_hadamard_ac( pix+8, stride );\
    if( h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride, stride );\
    if( w==16 && h==16 )\
        sum += pixel_hadamard_ac( pix+8*stride+8, stride );\
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);\
}
HADAMARD_AC( 16, 16 )
HADAMARD_AC( 16, 8 )
HADAMARD_AC( 8, 16 )
HADAMARD_AC( 8, 8 )


Loren Merritt's avatar
Loren Merritt committed
408 409 410
/****************************************************************************
 * pixel_sad_x4
 ****************************************************************************/
411
#define SAD_X( size ) \
412
static void x264_pixel_sad_x3_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
413 414 415 416 417
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
}\
418
static void x264_pixel_sad_x4_##size( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
{\
    scores[0] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_sad_##size( fenc, FENC_STRIDE, pix3, i_stride );\
}

SAD_X( 16x16 )
SAD_X( 16x8 )
SAD_X( 8x16 )
SAD_X( 8x8 )
SAD_X( 8x4 )
SAD_X( 4x8 )
SAD_X( 4x4 )

434
#if !X264_HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
435
#if ARCH_UltraSparc
436 437 438 439 440
SAD_X( 16x16_vis )
SAD_X( 16x8_vis )
SAD_X( 8x16_vis )
SAD_X( 8x8_vis )
#endif
441
#endif // !X264_HIGH_BIT_DEPTH
442

443 444 445 446 447 448
/****************************************************************************
 * pixel_satd_x4
 * no faster than single satd, but needed for satd to be a drop-in replacement for sad
 ****************************************************************************/

#define SATD_X( size, cpu ) \
449
static void x264_pixel_satd_x3_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, int i_stride, int scores[3] )\
450 451 452 453 454
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
}\
455
static void x264_pixel_satd_x4_##size##cpu( pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, pixel *pix3, int i_stride, int scores[4] )\
456 457 458 459 460 461
{\
    scores[0] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = x264_pixel_satd_##size##cpu( fenc, FENC_STRIDE, pix3, i_stride );\
}
462
#define SATD_X_DECL6( cpu )\
463 464 465 466
SATD_X( 16x16, cpu )\
SATD_X( 16x8, cpu )\
SATD_X( 8x16, cpu )\
SATD_X( 8x8, cpu )\
467 468
SATD_X( 8x4, cpu )\
SATD_X( 4x8, cpu )
469
#define SATD_X_DECL7( cpu )\
470
SATD_X_DECL6( cpu )\
471 472 473
SATD_X( 4x4, cpu )

SATD_X_DECL7()
474
#if !X264_HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
475
#if HAVE_MMX
476
SATD_X_DECL7( _mmxext )
477
SATD_X_DECL6( _sse2 )
Fiona Glaser's avatar
Fiona Glaser committed
478
SATD_X_DECL7( _ssse3 )
479
SATD_X_DECL7( _sse4 )
480 481
#endif

Steven Walters's avatar
Steven Walters committed
482
#if HAVE_ARMV6
483 484
SATD_X_DECL7( _neon )
#endif
485
#endif // !X264_HIGH_BIT_DEPTH
486

487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519
/* INTRA_MBCMP_8x8( mbcmp ): defines x264_intra_<mbcmp>_x3_8x8.
 * The generated function fills a local 8*FDEC_STRIDE scratch buffer with
 * three predictions built from `edge` (x264_predict_8x8_v_c, _h_c and _dc_c,
 * in that order) and scores each against `fenc` with x264_pixel_<mbcmp>_8x8.
 * The three scores are stored in res[0], res[1], res[2] in the same order. */
#define INTRA_MBCMP_8x8( mbcmp )\
void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\
{\
    pixel pix[8*FDEC_STRIDE];\
    x264_predict_8x8_v_c( pix, edge );\
    res[0] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h_c( pix, edge );\
    res[1] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc_c( pix, edge );\
    res[2] = x264_pixel_##mbcmp##_8x8( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8(sad)
INTRA_MBCMP_8x8(sa8d)

/* INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma ): defines
 * x264_intra_<mbcmp>_x3_<size>x<size><chroma>.
 * The generated function runs the three predictors
 * x264_predict_<size>x<size><chroma>_<predN>_c directly on `fdec`
 * (overwriting it each time) and, after each one, scores fdec against `fenc`
 * with x264_pixel_<mbcmp>_<size>x<size>.  res[0..2] follow the pred1..pred3
 * order given at the instantiation; note that the 8x8 `c` instantiations
 * below pass dc, h, v while the others pass v, h, dc. */
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
void x264_intra_##mbcmp##_x3_##size##x##size##chroma( pixel *fenc, pixel *fdec, int res[3] )\
{\
    x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
    res[0] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
    res[1] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
    res[2] = x264_pixel_##mbcmp##_##size##x##size( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP(sad, 4, v, h, dc, )
INTRA_MBCMP(satd, 4, v, h, dc, )
INTRA_MBCMP(sad, 8, dc, h, v, c )
INTRA_MBCMP(satd, 8, dc, h, v, c )
INTRA_MBCMP(sad, 16, v, h, dc, )
INTRA_MBCMP(satd, 16, v, h, dc, )

Loren Merritt's avatar
Loren Merritt committed
520 521 522
/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
523 524
/* Gathers the raw SSIM statistics for two horizontally adjacent 4x4 blocks.
 * For block z (0 = columns 0..3, 1 = columns 4..7) it stores:
 *   sums[z][0] = sum of pix1 samples
 *   sums[z][1] = sum of pix2 samples
 *   sums[z][2] = sum of pix1^2 + pix2^2
 *   sums[z][3] = sum of pix1*pix2
 * stride1/stride2 are row strides in samples. */
static void ssim_4x4x2_core( const pixel *pix1, int stride1,
                             const pixel *pix2, int stride2,
                             int sums[2][4])
{
    for( int z = 0; z < 2; z++ )
    {
        uint32_t s1 = 0, s2 = 0, ss = 0, s12 = 0;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
            {
                int a = pix1[x+y*stride1];
                int b = pix2[x+y*stride2];
                s1  += a;
                s2  += b;
                ss  += a*a;
                ss  += b*b;
                s12 += a*b;
            }
        sums[z][0] = s1;
        sums[z][1] = s2;
        sums[z][2] = ss;
        sums[z][3] = s12;
        /* Step to the next 4x4 block on the same rows. */
        pix1 += 4;
        pix2 += 4;
    }
}

/* Turns one set of accumulated statistics into an SSIM term.
 *   s1, s2  sums of the two signals
 *   ss      sum of squares of both signals
 *   s12     sum of cross products
 * The *64 factors match the way ssim_end4() combines four 4x4 blocks (64
 * samples) per call.  All integer products are formed in 64 bits: in plain
 * int, terms such as ss*64, s1*s1 and 2*s1*s2 can exceed INT_MAX for wide
 * sample ranges, which is undefined behaviour.  Results are unchanged for
 * inputs that did not overflow before. */
static float ssim_end1( int s1, int s2, int ss, int s12 )
{
    static const int64_t ssim_c1 = (int64_t)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
    static const int64_t ssim_c2 = (int64_t)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
    int64_t s1s1 = (int64_t)s1 * s1;
    int64_t s2s2 = (int64_t)s2 * s2;
    int64_t s1s2 = (int64_t)s1 * s2;
    int64_t vars  = (int64_t)ss * 64 - s1s1 - s2s2;
    int64_t covar = (int64_t)s12 * 64 - s1s2;
    return (float)(2*s1s2 + ssim_c1) * (float)(2*covar + ssim_c2)
         / ((float)(s1s1 + s2s2 + ssim_c1) * (float)(vars + ssim_c2));
}

/* Sums up to four SSIM terms.  Term i combines the statistics of entries i
 * and i+1 from both rows sum0 and sum1 (a 2x2 group of 4x4 blocks) and passes
 * them to ssim_end1().  `width` is the number of terms to produce; entries up
 * to index `width` of each row are read. */
static float ssim_end4( int sum0[5][4], int sum1[5][4], int width )
{
    float ssim = 0.0;
    for( int i = 0; i < width; i++ )
        ssim += ssim_end1( sum0[i][0] + sum0[i+1][0] + sum1[i][0] + sum1[i+1][0],
                           sum0[i][1] + sum0[i+1][1] + sum1[i][1] + sum1[i+1][1],
                           sum0[i][2] + sum0[i+1][2] + sum1[i][2] + sum1[i+1][2],
                           sum0[i][3] + sum0[i+1][3] + sum1[i][3] + sum1[i+1][3] );
    return ssim;
}

/* Accumulated SSIM over a width x height region, evaluated on overlapping
 * 8x8 windows that step by 4 samples in each direction.
 *
 * pf        function table supplying ssim_4x4x2_core and ssim_end4
 * pix1/2    the two planes; stride1/2 are their row strides in samples
 * buf       scratch memory used as two rows of int[4] statistics; the second
 *           row starts (width>>2)+3 entries after the first
 *
 * Returns the sum of all per-window SSIM terms (not an average). */
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
                           pixel *pix1, int stride1,
                           pixel *pix2, int stride2,
                           int width, int height, void *buf )
{
    int z = 0;
    float ssim = 0.0;
    int (*sum0)[4] = buf;
    int (*sum1)[4] = sum0 + (width >> 2) + 3;
    /* From here on width/height count 4x4 blocks. */
    width >>= 2;
    height >>= 2;
    for( int y = 1; y < height; y++ )
    {
        /* Compute statistics for every block row up to y that has not been
         * done yet; the swap keeps the newest row in sum0 and the previous
         * one in sum1. */
        for( ; z <= y; z++ )
        {
            XCHG( void*, sum0, sum1 );
            for( int x = 0; x < width; x+=2 )
                pf->ssim_4x4x2_core( &pix1[4*(x+z*stride1)], stride1, &pix2[4*(x+z*stride2)], stride2, &sum0[x] );
        }
        /* Each window spans two block columns, so there are width-1 of them. */
        for( int x = 0; x < width-1; x += 4 )
            ssim += pf->ssim_end4( sum0+x, sum1+x, X264_MIN(4,width-x-1) );
    }
    return ssim;
}


Loren Merritt's avatar
Loren Merritt committed
597 598 599
/****************************************************************************
 * successive elimination
 ****************************************************************************/
600 601
/* Successive-elimination filter using four partial sums per candidate.
 * For each position i in [0,width) the cost is
 *   |enc_dc[0]-sums[i]| + |enc_dc[1]-sums[i+8]|
 * + |enc_dc[2]-sums[i+delta]| + |enc_dc[3]-sums[i+delta+8]| + cost_mvx[i].
 * Positions whose cost is strictly below thresh are appended to mvs[].
 * Returns the number of entries written to mvs. */
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[8] )
                + abs( enc_dc[2] - sums[delta] )
                + abs( enc_dc[3] - sums[delta+8] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

617 618
/* Successive-elimination filter using two partial sums per candidate.
 * For each position i in [0,width) the cost is
 *   |enc_dc[0]-sums[i]| + |enc_dc[1]-sums[i+delta]| + cost_mvx[i].
 * Positions whose cost is strictly below thresh are appended to mvs[].
 * Returns the number of entries written to mvs. */
static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[delta] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}

632 633
/* Successive-elimination filter using a single sum per candidate.
 * For each position i in [0,width) the cost is
 *   |enc_dc[0]-sums[i]| + cost_mvx[i].
 * Positions whose cost is strictly below thresh are appended to mvs[].
 * `delta` is accepted for signature parity with the ads2/ads4 variants and is
 * not used.  Returns the number of entries written to mvs. */
static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i<width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}


Laurent Aimar's avatar
Laurent Aimar committed
647 648 649 650 651
/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
652 653
    memset( pixf, 0, sizeof(*pixf) );

654 655 656 657 658 659 660 661 662 663
#define INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu;\
    pixf->name1[PIXEL_16x8]  = x264_pixel_##name2##_16x8##cpu;
#define INIT4_NAME( name1, name2, cpu ) \
    INIT2_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x16]  = x264_pixel_##name2##_8x16##cpu;\
    pixf->name1[PIXEL_8x8]   = x264_pixel_##name2##_8x8##cpu;
#define INIT5_NAME( name1, name2, cpu ) \
    INIT4_NAME( name1, name2, cpu ) \
    pixf->name1[PIXEL_8x4]   = x264_pixel_##name2##_8x4##cpu;
664
#define INIT6_NAME( name1, name2, cpu ) \
665
    INIT5_NAME( name1, name2, cpu ) \
666 667 668
    pixf->name1[PIXEL_4x8]   = x264_pixel_##name2##_4x8##cpu;
#define INIT7_NAME( name1, name2, cpu ) \
    INIT6_NAME( name1, name2, cpu ) \
669 670 671 672
    pixf->name1[PIXEL_4x4]   = x264_pixel_##name2##_4x4##cpu;
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
673
#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
674
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
675

676 677 678 679 680
#define INIT_ADS( cpu ) \
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;

681
    INIT7( sad, );
682
    INIT7_NAME( sad_aligned, sad, );
683 684 685 686
    INIT7( sad_x3, );
    INIT7( sad_x4, );
    INIT7( ssd, );
    INIT7( satd, );
687 688
    INIT7( satd_x3, );
    INIT7( satd_x4, );
Loren Merritt's avatar
Loren Merritt committed
689
    INIT4( hadamard_ac, );
690
    INIT_ADS( );
691

692 693
    pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
    pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8;
694 695 696
    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;

697
    pixf->ssd_nv12_core = pixel_ssd_nv12_core;
698 699
    pixf->ssim_4x4x2_core = ssim_4x4x2_core;
    pixf->ssim_end4 = ssim_end4;
700
    pixf->var2_8x8 = pixel_var2_8x8;
701

702 703 704 705 706 707 708 709 710
    pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
    pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4;
    pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8;
    pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8;
    pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c;
    pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c;
    pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
    pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;

711
#if !X264_HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
712
#if HAVE_MMX
713 714
    if( cpu&X264_CPU_MMX )
    {
715
        INIT7( ssd, _mmx );
716 717
    }

Laurent Aimar's avatar
Laurent Aimar committed
718 719
    if( cpu&X264_CPU_MMXEXT )
    {
720
        INIT7( sad, _mmxext );
721
        INIT7_NAME( sad_aligned, sad, _mmxext );
722 723 724
        INIT7( sad_x3, _mmxext );
        INIT7( sad_x4, _mmxext );
        INIT7( satd, _mmxext );
725 726
        INIT7( satd_x3, _mmxext );
        INIT7( satd_x4, _mmxext );
Loren Merritt's avatar
Loren Merritt committed
727
        INIT4( hadamard_ac, _mmxext );
728
        INIT_ADS( _mmxext );
729 730
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmxext;
731
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_mmxext;
Steven Walters's avatar
Steven Walters committed
732
#if ARCH_X86
733 734
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
735
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
736
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
737
        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
738

739
        if( cpu&X264_CPU_CACHELINE_32 )
740
        {
741 742 743 744 745 746 747 748 749
            INIT5( sad, _cache32_mmxext );
            INIT4( sad_x3, _cache32_mmxext );
            INIT4( sad_x4, _cache32_mmxext );
        }
        else if( cpu&X264_CPU_CACHELINE_64 )
        {
            INIT5( sad, _cache64_mmxext );
            INIT4( sad_x3, _cache64_mmxext );
            INIT4( sad_x4, _cache64_mmxext );
750 751
        }
#else
752
        if( cpu&X264_CPU_CACHELINE_64 )
753 754 755 756 757 758 759 760 761
        {
            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
            pixf->sad[PIXEL_8x8]  = x264_pixel_sad_8x8_cache64_mmxext;
            pixf->sad[PIXEL_8x4]  = x264_pixel_sad_8x4_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
            pixf->sad_x3[PIXEL_8x8]  = x264_pixel_sad_x3_8x8_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
            pixf->sad_x4[PIXEL_8x8]  = x264_pixel_sad_x4_8x8_cache64_mmxext;
        }
762
#endif
763
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
764
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_mmxext;
765
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
766
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
767
        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_mmxext;
768
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_mmxext;
Fiona Glaser's avatar
Fiona Glaser committed
769
        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_mmxext;
Laurent Aimar's avatar
Laurent Aimar committed
770
    }
771

772 773 774 775 776
    if( cpu&X264_CPU_SSE2 )
    {
        INIT5( ssd, _sse2slow );
        INIT2_NAME( sad_aligned, sad, _sse2_aligned );
        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
777
        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_sse2;
778 779 780 781
        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
        pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
Steven Walters's avatar
Steven Walters committed
782
#if ARCH_X86_64
783 784
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
785
        pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
786 787
    }

788
    if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
789
    {
790 791 792
        INIT2( sad, _sse2 );
        INIT2( sad_x3, _sse2 );
        INIT2( sad_x4, _sse2 );
793 794 795
        INIT6( satd, _sse2 );
        INIT6( satd_x3, _sse2 );
        INIT6( satd_x4, _sse2 );
796 797 798 799
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse2 );
        }
800
        INIT_ADS( _sse2 );
801
        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
802
        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
803
        if( cpu&X264_CPU_CACHELINE_64 )
804
        {
805
            INIT2( ssd, _sse2); /* faster for width 16 on p4 */
Steven Walters's avatar
Steven Walters committed
806
#if ARCH_X86
807 808 809 810
            INIT2( sad, _cache64_sse2 );
            INIT2( sad_x3, _cache64_sse2 );
            INIT2( sad_x4, _cache64_sse2 );
#endif
811 812 813 814 815 816 817
           if( cpu&X264_CPU_SSE2_IS_FAST )
           {
               pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_sse2;
               pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_sse2;
           }
        }

Fiona Glaser's avatar
Fiona Glaser committed
818 819 820 821 822
        if( cpu&X264_CPU_SSE_MISALIGN )
        {
            INIT2( sad_x3, _sse2_misalign );
            INIT2( sad_x4, _sse2_misalign );
        }
823
    }
824

825 826 827 828 829 830 831 832 833 834 835 836
    if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
    {
        pixf->sad_aligned[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_sse2;
        pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_sse2;
        pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_sse2;
        pixf->sad_x3[PIXEL_8x4] = x264_pixel_sad_x3_8x4_sse2;
        pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_sse2;
        pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_sse2;
        pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_sse2;
    }

837
    if( (cpu&X264_CPU_SSE3) && (cpu&X264_CPU_CACHELINE_64) )
838 839 840 841 842 843
    {
        INIT2( sad, _sse3 );
        INIT2( sad_x3, _sse3 );
        INIT2( sad_x4, _sse3 );
    }

844 845
    if( cpu&X264_CPU_SSSE3 )
    {
846 847 848 849
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _ssse3 );
        }
850
        INIT_ADS( _ssse3 );
851 852 853 854 855 856 857 858 859
        if( !(cpu&X264_CPU_SLOW_ATOM) )
        {
            INIT7( ssd, _ssse3 );
            pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
            pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
            INIT7( satd, _ssse3 );
            INIT7( satd_x3, _ssse3 );
            INIT7( satd_x4, _ssse3 );
        }
860
        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
861
        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
862
        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_ssse3;
Fiona Glaser's avatar
Fiona Glaser committed
863
        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
864
        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_ssse3;
Steven Walters's avatar
Steven Walters committed
865
#if ARCH_X86_64
866
        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
Loren Merritt's avatar
Loren Merritt committed
867
#endif
868
        pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
869
        if( cpu&X264_CPU_CACHELINE_64 )
870 871 872 873 874
        {
            INIT2( sad, _cache64_ssse3 );
            INIT2( sad_x3, _cache64_ssse3 );
            INIT2( sad_x4, _cache64_ssse3 );
        }
875
        if( cpu&X264_CPU_SLOW_ATOM || !(cpu&X264_CPU_SHUFFLE_IS_FAST) )
876
        {
877
            INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
878
        }
879
    }
880 881 882

    if( cpu&X264_CPU_SSE4 )
    {
883 884 885 886 887 888 889 890 891
        INIT7( satd, _sse4 );
        INIT7( satd_x3, _sse4 );
        INIT7( satd_x4, _sse4 );
        if( !(cpu&X264_CPU_STACK_MOD4) )
        {
            INIT4( hadamard_ac, _sse4 );
        }
        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
892 893 894
        pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
        /* Slower on Conroe, so only enable under SSE4 */
        pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_ssse3;
895
    }
896
#endif //HAVE_MMX
897

Steven Walters's avatar
Steven Walters committed
898
#if HAVE_ARMV6
899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938
    if( cpu&X264_CPU_ARMV6 )
    {
        pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
        pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_4x8_armv6;
        pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_4x4_armv6;
    }
    if( cpu&X264_CPU_NEON )
    {
        INIT5( sad, _neon );
        INIT5( sad_aligned, _neon );
        INIT7( sad_x3, _neon );
        INIT7( sad_x4, _neon );
        INIT7( ssd, _neon );
        INIT7( satd, _neon );
        INIT7( satd_x3, _neon );
        INIT7( satd_x4, _neon );
        INIT4( hadamard_ac, _neon );
        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
        pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
        pixf->var2_8x8          = x264_pixel_var2_8x8_neon;

        pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
        pixf->ssim_end4         = x264_pixel_ssim_end4_neon;

        if( cpu&X264_CPU_FAST_NEON_MRC )
        {
            pixf->sad[PIXEL_4x8] = x264_pixel_sad_4x8_neon;
            pixf->sad[PIXEL_4x4] = x264_pixel_sad_4x4_neon;
            pixf->sad_aligned[PIXEL_4x8] = x264_pixel_sad_aligned_4x8_neon;
            pixf->sad_aligned[PIXEL_4x4] = x264_pixel_sad_aligned_4x4_neon;
        }
        else    // really just scheduled for dual issue / A8
        {
            INIT5( sad_aligned, _neon_dual );
        }
    }
#endif
939
#endif // !X264_HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
940
#if HAVE_ALTIVEC
Laurent Aimar's avatar
Laurent Aimar committed
941 942 943 944 945
    if( cpu&X264_CPU_ALTIVEC )
    {
        x264_pixel_altivec_init( pixf );
    }
#endif
946
#if !X264_HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
947
#if ARCH_UltraSparc
948 949 950
    INIT4( sad, _vis );
    INIT4( sad_x3, _vis );
    INIT4( sad_x4, _vis );
951
#endif
952
#endif // !X264_HIGH_BIT_DEPTH
Loren Merritt's avatar