mc-c.c 30.1 KB
Newer Older
Laurent Aimar's avatar
Laurent Aimar committed
1
/*****************************************************************************
 * mc-c.c: x86 motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2012 x264 project
 *
 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
 *          Loren Merritt <lorenm@u.washington.edu>
 *          Fiona Glaser <fiona@x264.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

Loren Merritt's avatar
Loren Merritt committed
32
#include "common/common.h"
Loic Le Loarer's avatar
Loic Le Loarer committed
33
#include "mc.h"
Laurent Aimar's avatar
Laurent Aimar committed
34

Loren Merritt's avatar
Loren Merritt committed
35
#define DECL_SUF( func, args )\
Loren Merritt's avatar
Loren Merritt committed
36
    void func##_mmx2 args;\
Loren Merritt's avatar
Loren Merritt committed
37 38 39
    void func##_sse2 args;\
    void func##_ssse3 args;

40 41 42 43 44
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_16x8,  ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x16,  ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x8,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_8x4,   ( pixel *, int, pixel *, int, pixel *, int, int ))
Fiona Glaser's avatar
Fiona Glaser committed
45
DECL_SUF( x264_pixel_avg_4x16,  ( pixel *, int, pixel *, int, pixel *, int, int ))
46 47 48
DECL_SUF( x264_pixel_avg_4x8,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x4,   ( pixel *, int, pixel *, int, pixel *, int, int ))
DECL_SUF( x264_pixel_avg_4x2,   ( pixel *, int, pixel *, int, pixel *, int, int ))
Dylan Yudaken's avatar
Dylan Yudaken committed
49 50

#define MC_WEIGHT(w,type) \
51
    void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
Dylan Yudaken's avatar
Dylan Yudaken committed
52 53

#define MC_WEIGHT_OFFSET(w,type) \
54 55
    void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
    void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
Dylan Yudaken's avatar
Dylan Yudaken committed
56 57
    MC_WEIGHT(w,type)

Loren Merritt's avatar
Loren Merritt committed
58 59 60 61 62
MC_WEIGHT_OFFSET( 4, mmx2 )
MC_WEIGHT_OFFSET( 8, mmx2 )
MC_WEIGHT_OFFSET( 12, mmx2 )
MC_WEIGHT_OFFSET( 16, mmx2 )
MC_WEIGHT_OFFSET( 20, mmx2 )
Dylan Yudaken's avatar
Dylan Yudaken committed
63 64 65
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
66 67 68
#if HIGH_BIT_DEPTH
MC_WEIGHT_OFFSET( 8, sse2 )
#endif
Dylan Yudaken's avatar
Dylan Yudaken committed
69 70 71 72 73 74 75 76 77
MC_WEIGHT( 8, sse2  )
MC_WEIGHT( 4, ssse3 )
MC_WEIGHT( 8, ssse3 )
MC_WEIGHT( 12, ssse3 )
MC_WEIGHT( 16, ssse3 )
MC_WEIGHT( 20, ssse3 )
#undef MC_OFFSET
#undef MC_WEIGHT

78 79 80 81 82 83
/* Prototypes of routines that are defined outside this file (presumably the
 * x86 assembly sources, plus the *_c reference versions). The ssse3/avx
 * (de)interleave variants are declared with a fixed sample type
 * (uint8_t / uint16_t) instead of `pixel`. */

/* block copy */
void x264_mc_copy_w4_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w8_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );

/* prefetch */
void x264_prefetch_fenc_420_mmx2( pixel *, int, pixel *, int, int );
void x264_prefetch_fenc_422_mmx2( pixel *, int, pixel *, int, int );
void x264_prefetch_ref_mmx2( pixel *, int, int );

/* plane copy / interleave / deinterleave */
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
                                           pixel *srcu, int i_srcu,
                                           pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
                                           pixel *srcu, int i_srcu,
                                           pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_core_avx( pixel *dst, int i_dst,
                                          pixel *srcu, int i_srcu,
                                          pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
                                   pixel *srcu, int i_srcu,
                                   pixel *srcv, int i_srcv, int w, int h );
void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu,
                                       pixel *dstv, int i_dstv,
                                       pixel *src, int i_src, int w, int h );
void x264_plane_copy_deinterleave_sse2( pixel *dstu, int i_dstu,
                                        pixel *dstv, int i_dstv,
                                        pixel *src, int i_src, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
                                         uint8_t *dstv, int i_dstv,
                                         uint8_t *src, int i_src, int w, int h );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
                                       uint16_t *dstv, int i_dstv,
                                       uint16_t *src, int i_src, int w, int h );

/* chroma store / load */
void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height );

/* aligned memcpy / memzero */
void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
void x264_memzero_aligned_mmx( void * dst, int n );
void x264_memzero_aligned_sse2( void * dst, int n );

/* integral image initialisation */
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init8h_avx ( uint16_t *sum, uint8_t *pix, int stride );
void x264_integral_init4v_mmx( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_integral_init4v_sse2( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );

/* mbtree cost propagation */
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                     uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
142 143

#define MC_CHROMA(cpu)\
144 145
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
                           pixel *src, int i_src,\
146
                           int dx, int dy, int i_width, int i_height );
Loren Merritt's avatar
Loren Merritt committed
147
MC_CHROMA(mmx2)
148 149 150 151
MC_CHROMA(sse2)
MC_CHROMA(sse2_misalign)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
152 153
MC_CHROMA(avx)
MC_CHROMA(avx_cache64)
154

Anton Mitrofanov's avatar
Anton Mitrofanov committed
155
#define LOWRES(cpu)\
156
void x264_frame_init_lowres_core_##cpu( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,\
157
                                        int src_stride, int dst_stride, int width, int height );
Loren Merritt's avatar
Loren Merritt committed
158 159
LOWRES(mmx2)
LOWRES(cache32_mmx2)
Loren Merritt's avatar
Loren Merritt committed
160 161
LOWRES(sse2)
LOWRES(ssse3)
Fiona Glaser's avatar
Fiona Glaser committed
162 163
LOWRES(avx)
LOWRES(xop)
Loren Merritt's avatar
Loren Merritt committed
164

165
#define PIXEL_AVG_W(width,cpu)\
166
void x264_pixel_avg2_w##width##_##cpu( pixel *, int, pixel *, int, pixel *, int );
167 168
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
169
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(10,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(18,cpu); PIXEL_AVG_W(20,cpu);
170

Loren Merritt's avatar
Loren Merritt committed
171 172 173
PIXEL_AVG_WALL(mmx2)
PIXEL_AVG_WALL(cache32_mmx2)
PIXEL_AVG_WALL(cache64_mmx2)
174 175
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
Fiona Glaser's avatar
Fiona Glaser committed
176
PIXEL_AVG_WALL(sse2_misalign)
177
PIXEL_AVG_WALL(cache64_ssse3)
178 179

/* Define x264_pixel_avg_wtab_<instr>: a table of avg2 routines for widths
 * 4, 8, 12, 16 and 20 in slots 1..5 (slot 0 is NULL). Users in this file
 * index it with i_width>>2. name1..name5 select the cpu variant used for
 * each width. */
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( pixel *, int, pixel *, int, pixel *, int ) =\
{\
    NULL,\
    x264_pixel_avg2_w4_##name1,\
    x264_pixel_avg2_w8_##name2,\
    x264_pixel_avg2_w12_##name3,\
    x264_pixel_avg2_w16_##name4,\
    x264_pixel_avg2_w20_##name5,\
};
189

190
/* Name aliases so that PIXEL_AVG_WTAB can always refer to w12/w20 entries. */
#if HIGH_BIT_DEPTH
/* we can replace w12/w20 with w10/w18 as only 9/17 pixels in fact are important */
#define x264_pixel_avg2_w12_mmx2          x264_pixel_avg2_w10_mmx2
#define x264_pixel_avg2_w20_mmx2          x264_pixel_avg2_w18_mmx2
#define x264_pixel_avg2_w12_sse2          x264_pixel_avg2_w10_sse2
#define x264_pixel_avg2_w20_sse2          x264_pixel_avg2_w18_sse2
#else
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_ssse3 x264_pixel_avg2_w16_cache64_ssse3
#define x264_pixel_avg2_w12_cache64_sse2  x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3          x264_pixel_avg2_w16_sse3
#define x264_pixel_avg2_w12_sse2          x264_pixel_avg2_w16_sse2
#endif // HIGH_BIT_DEPTH
203

Loren Merritt's avatar
Loren Merritt committed
204
PIXEL_AVG_WTAB(mmx2, mmx2, mmx2, mmx2, mmx2, mmx2)
205
#if HIGH_BIT_DEPTH
Loren Merritt's avatar
Loren Merritt committed
206
PIXEL_AVG_WTAB(sse2, mmx2, sse2, sse2, sse2, sse2)
207
#else // !HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
208
#if ARCH_X86
Loren Merritt's avatar
Loren Merritt committed
209 210
PIXEL_AVG_WTAB(cache32_mmx2, mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2, cache32_mmx2)
PIXEL_AVG_WTAB(cache64_mmx2, mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2, cache64_mmx2)
211
#endif
Loren Merritt's avatar
Loren Merritt committed
212 213 214 215
PIXEL_AVG_WTAB(sse2, mmx2, mmx2, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmx2, mmx2, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmx2, cache64_mmx2, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmx2, cache64_mmx2, cache64_ssse3, cache64_ssse3, cache64_sse2)
216
#endif // HIGH_BIT_DEPTH
217 218

#define MC_COPY_WTAB(instr, name1, name2, name3)\
219
static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int, int ) =\
220 221 222 223 224 225
{\
    NULL,\
    x264_mc_copy_w4_##name1,\
    x264_mc_copy_w8_##name2,\
    NULL,\
    x264_mc_copy_w16_##name3,\
Fiona Glaser's avatar
Fiona Glaser committed
226
};
227 228

MC_COPY_WTAB(mmx,mmx,mmx,mmx)
229 230 231
#if HIGH_BIT_DEPTH
MC_COPY_WTAB(sse2,mmx,sse2,sse2)
#else
232
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
233
#endif
234

Dylan Yudaken's avatar
Dylan Yudaken committed
235
/* Define x264_mc_<function>_wtab_<instr>: a six-entry table of weight /
 * offset routines. Slots 0 and 1 both hold the w4 routine, slot 2 the w8
 * routine, slot 3 the routine for width `w12version`, slots 4 and 5 the
 * w16 and w20 routines. name1/name2 pick the cpu variant for w4/w8; the
 * remaining entries use `instr`. */
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
    static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
{\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w4_##name1,\
    x264_mc_##function##_w8_##name2,\
    x264_mc_##function##_w##w12version##_##instr,\
    x264_mc_##function##_w16_##instr,\
    x264_mc_##function##_w20_##instr,\
};

246
#if HIGH_BIT_DEPTH
/* Weight / offset tables for the high bit depth build. */
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,12)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,sse2,16)

/* High bit depth: select the weight function table for `w` and fill its
 * cachea / cacheb constant arrays.
 *
 * When i_scale == 1<<i_denom only the offset matters, so the offsetadd /
 * offsetsub table is chosen by the sign of i_offset and cachea holds the
 * offset magnitude scaled by BIT_DEPTH-8 bits. Otherwise the general weight
 * table is used: cachea holds 1<<i_denom, cacheb alternates between
 * i_scale<<1 (even slots) and 1+(i_offset<<(BIT_DEPTH-8+1)) (odd slots). */
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        /* the value is the same for every slot, so compute it once */
        int offset_abs = abs(w->i_offset<<(BIT_DEPTH-8));
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = offset_abs;
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    int den2 = w->i_scale<<1;
    int den3 = 1+(w->i_offset<<(BIT_DEPTH-8+1));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
276
#else
Loren Merritt's avatar
Loren Merritt committed
277 278 279 280 281 282
MC_WEIGHT_WTAB(weight,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetadd,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(offsetsub,mmx2,mmx2,mmx2,12)
MC_WEIGHT_WTAB(weight,sse2,mmx2,sse2,16)
MC_WEIGHT_WTAB(offsetadd,sse2,mmx2,mmx2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmx2,mmx2,16)
Dylan Yudaken's avatar
Dylan Yudaken committed
283 284
MC_WEIGHT_WTAB(weight,ssse3,ssse3,ssse3,16)

Loren Merritt's avatar
Loren Merritt committed
285
/* 8-bit: select the weight function table for `w` and fill its cachea /
 * cacheb constant arrays.
 *
 * When i_scale == 1<<i_denom only the offset matters: the offsetadd /
 * offsetsub table is chosen by the sign of i_offset and every byte of
 * cachea is set to |i_offset|. Otherwise the general weight table is used,
 * cachea holds i_scale and cacheb holds the offset shifted up by i_denom
 * combined with bit (i_denom-1) (presumably the rounding half-unit). */
static void x264_weight_cache_mmx2( x264_t *h, x264_weight_t *w )
{
    int i;
    int16_t den1;

    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        memset( w->cachea, abs(w->i_offset), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    /* With i_denom == 0 there is no bit (i_denom-1) to set, and shifting by
     * -1 would be undefined behaviour, so that term is dropped. */
    den1 = w->i_offset << w->i_denom | (w->i_denom ? 1 << (w->i_denom - 1) : 0);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = w->i_scale;
        w->cacheb[i] = den1;
    }
}

/* 8-bit SSSE3: select the weight function table for `w` and fill its
 * cachea / cacheb constant arrays.
 *
 * The offset-only case (i_scale == 1<<i_denom) is handled exactly like the
 * mmx2 version. Otherwise cachea holds i_scale << (8 - i_denom) and cacheb
 * holds the plain offset. */
static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
{
    int i, den1;
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;

        memset( w->cachea, abs( w->i_offset ), sizeof(w->cachea) );
        return;
    }
    w->weightfn = h->mc.weight;
    den1 = w->i_scale << (8 - w->i_denom);
    for( i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = w->i_offset;
    }
}
329
#endif // !HIGH_BIT_DEPTH
Dylan Yudaken's avatar
Dylan Yudaken committed
330

331 332
/* Plane-selection tables used by MC_LUMA and GET_REF below. Both are indexed
 * by qpel_idx = ((mvy&3)<<2) + (mvx&3); each entry (0..3) is an index into
 * the src[4] plane array. hpel_ref0 picks the first source plane, hpel_ref1
 * the second one that is averaged with it when interpolation is needed. */
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
333

Fiona Glaser's avatar
Fiona Glaser committed
334
/* Define mc_luma_<name>: luma motion compensation into dst.
 *
 * The low two bits of mvx/mvy form qpel_idx, which selects the source
 * plane(s) through hpel_ref0 / hpel_ref1; the remaining bits give the
 * pixel offset into the plane.
 *  - If qpel_idx has bit 0 or bit 2 set, two planes are averaged with the
 *    x264_pixel_avg_wtab_<instr1> routine for this width, and the weight
 *    function (if any) is then applied in place on dst.
 *  - Otherwise, if a weight function is set, it is applied from src1 to dst.
 *  - Otherwise the block is copied with x264_mc_copy_wtab_<instr2>.
 * All tables are indexed with i_width>>2. */
#define MC_LUMA(name,instr1,instr2)\
static void mc_luma_##name( pixel *dst,    int i_dst_stride,\
                            pixel *src[4], int i_src_stride,\
                            int mvx, int mvy,\
                            int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        x264_pixel_avg_wtab_##instr1[i_width>>2](\
                dst, i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );\
    }\
    else if( weight->weightfn )\
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );\
    else\
        x264_mc_copy_wtab_##instr2[i_width>>2](dst, i_dst_stride, src1, i_src_stride, i_height );\
}

Loren Merritt's avatar
Loren Merritt committed
358
MC_LUMA(mmx2,mmx2,mmx)
359
MC_LUMA(sse2,sse2,sse2)
360
#if !HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
361
#if ARCH_X86
Loren Merritt's avatar
Loren Merritt committed
362 363
MC_LUMA(cache32_mmx2,cache32_mmx2,mmx)
MC_LUMA(cache64_mmx2,cache64_mmx2,mmx)
364 365
#endif
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
366
MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
367
#endif // !HIGH_BIT_DEPTH
Fiona Glaser's avatar
Fiona Glaser committed
368 369

/* Define get_ref_<name>: like mc_luma, but avoids the copy when possible.
 *
 * Plane selection and offsets are computed exactly as in MC_LUMA.
 *  - If interpolation is needed (qpel_idx & 5), the averaged (and, if a
 *    weight function is set, weighted) block is written to dst and dst is
 *    returned; *i_dst_stride is left unchanged.
 *  - Otherwise, if a weight function is set, the weighted block is written
 *    to dst and dst is returned.
 *  - Otherwise nothing is written: the function returns a pointer into the
 *    source plane and stores the source stride in *i_dst_stride. */
#define GET_REF(name)\
static pixel *get_ref_##name( pixel *dst,   int *i_dst_stride,\
                              pixel *src[4], int i_src_stride,\
                              int mvx, int mvy,\
                              int i_width, int i_height, const x264_weight_t *weight )\
{\
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);\
    int offset = (mvy>>2)*i_src_stride + (mvx>>2);\
    pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\
    if( qpel_idx & 5 ) /* qpel interpolation needed */\
    {\
        pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\
        x264_pixel_avg_wtab_##name[i_width>>2](\
                dst, *i_dst_stride, src1, i_src_stride,\
                src2, i_height );\
        if( weight->weightfn )\
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );\
        return dst;\
    }\
    else if( weight->weightfn )\
    {\
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );\
        return dst;\
    }\
    else\
    {\
        *i_dst_stride = i_src_stride;\
        return src1;\
    }\
}

Loren Merritt's avatar
Loren Merritt committed
400
GET_REF(mmx2)
401
GET_REF(sse2)
402
#if !HIGH_BIT_DEPTH
Steven Walters's avatar
Steven Walters committed
403
#if ARCH_X86
Loren Merritt's avatar
Loren Merritt committed
404 405
GET_REF(cache32_mmx2)
GET_REF(cache64_mmx2)
406
#endif
Fiona Glaser's avatar
Fiona Glaser committed
407
GET_REF(sse2_misalign)
408
GET_REF(cache64_sse2)
409
GET_REF(cache64_ssse3)
410
#endif // !HIGH_BIT_DEPTH
411

412
/* Declare the three per-row half-pel filter passes (vertical, centre,
 * horizontal) and define x264_hpel_filter_<cpu>, which drives them over
 * `height` rows.
 *
 * Before the loop, src and all three destination pointers are moved back by
 * `realign` (the low bits of the src address selected by align-1) and width
 * is grown by the same amount. Each row runs the v, c and h passes in that
 * order; the v pass writes into buf+8 and the c pass reads from it.
 * x264_sfence() is called once after the last row. */
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( pixel *dst, pixel *src, int16_t *buf, int stride, int width);\
void x264_hpel_filter_c_##cpuc( pixel *dst, int16_t *buf, int width );\
void x264_hpel_filter_h_##cpuh( pixel *dst, pixel *src, int width );\
static void x264_hpel_filter_##cpu( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,\
                                    int stride, int width, int height, int16_t *buf )\
{\
    int realign = (intptr_t)src & ((align)-1);\
    src -= realign;\
    dstv -= realign;\
    dstc -= realign;\
    dsth -= realign;\
    width += realign;\
    while( height-- )\
    {\
        x264_hpel_filter_v_##cpuv( dstv, src, buf+8, stride, width );\
        x264_hpel_filter_c_##cpuc( dstc, buf+8, width );\
        x264_hpel_filter_h_##cpuh( dsth, src, width );\
        dsth += stride;\
        dstv += stride;\
        dstc += stride;\
        src  += stride;\
    }\
    x264_sfence();\
}

Loren Merritt's avatar
Loren Merritt committed
438
/* Instantiate the half-pel filter drivers. In the 8-bit x86_64 build the
 * sse2/ssse3/avx filters are only declared here (defined elsewhere) instead
 * of being generated by HPEL. */
HPEL(8, mmx2, mmx2, mmx2, mmx2)
#if HIGH_BIT_DEPTH
HPEL(16, sse2, sse2, sse2, sse2)
#else // !HIGH_BIT_DEPTH
HPEL(16, sse2_amd, mmx2, mmx2, sse2)
#if ARCH_X86_64
void x264_hpel_filter_sse2( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, int stride, int width, int height, int16_t *buf );
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
HPEL(16, avx, avx, avx, avx)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
454

Loren Merritt's avatar
Loren Merritt committed
455
/* Copy a w x h plane from src (stride i_src) to dst (stride i_dst).
 *
 * c_w is the number of pixels in 16 bytes minus one and is used as a mask.
 *  - w < 256: use the plain C copy.
 *  - w is a multiple of c_w+1: use the mmx2 core for the whole plane.
 *  - otherwise the core is called with w rounded up to that multiple for
 *    h-1 rows, and the row that comes last in memory order (the last row
 *    for a positive i_src, the first row otherwise) is copied with memcpy
 *    at the exact width. */
static void x264_plane_copy_mmx2( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
{
    int c_w = 16/sizeof(pixel) - 1;
    if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
        x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
    } else if( !(w&c_w) ) {
        x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h );
    } else if( i_src > 0 ) {
        // have to use plain memcpy on the last line (in memory order) to avoid overreading src
        x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
        memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) );
    } else {
        memcpy( dst, src, w*sizeof(pixel) );
        x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
    }
}
471 472

/* Define x264_plane_copy_interleave_<cpu>: interleave the srcu and srcv
 * planes into dst.
 *  - w is a multiple of 16: use the <cpu> core for the whole plane.
 *  - w < 16, or the two source strides differ: use the C version.
 *  - otherwise the core is called with w rounded up to a multiple of 16 for
 *    h-1 rows, and the remaining row (the last one for a positive i_srcu,
 *    the first one otherwise) is done by the C version at the exact width. */
#define PLANE_INTERLEAVE(cpu) \
static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
                                              pixel *srcu, int i_srcu,\
                                              pixel *srcv, int i_srcv, int w, int h )\
{\
    if( !(w&15) ) {\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    } else if( w < 16 || (i_srcu ^ i_srcv) ) {\
        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
    } else if( i_srcu > 0 ) {\
        x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\
        x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\
    } else {\
        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
        x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\
    }\
}

Loren Merritt's avatar
Loren Merritt committed
490
PLANE_INTERLEAVE(mmx2)
491
PLANE_INTERLEAVE(sse2)
492 493 494
#if HIGH_BIT_DEPTH
PLANE_INTERLEAVE(avx)
#endif
495

Loren Merritt's avatar
Loren Merritt committed
496
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
Laurent Aimar's avatar
Laurent Aimar committed
497
{
Loren Merritt's avatar
Loren Merritt committed
498 499 500
    if( !(cpu&X264_CPU_MMX) )
        return;

Henrik Gramner's avatar
Henrik Gramner committed
501 502
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;
503 504 505

    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;

506
    pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
Loren Merritt's avatar
Loren Merritt committed
507 508 509
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
    pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
    pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
510 511
    pf->memcpy_aligned  = x264_memcpy_aligned_mmx;
    pf->memzero_aligned = x264_memzero_aligned_mmx;
Loren Merritt's avatar
Loren Merritt committed
512 513
    pf->integral_init4v = x264_integral_init4v_mmx;
    pf->integral_init8v = x264_integral_init8v_mmx;
Loren Merritt's avatar
Loren Merritt committed
514

Loren Merritt's avatar
Loren Merritt committed
515
    if( !(cpu&X264_CPU_MMX2) )
Loren Merritt's avatar
Loren Merritt committed
516 517
        return;

518 519
    pf->prefetch_fenc_420 = x264_prefetch_fenc_420_mmx2;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2;
520 521
    pf->prefetch_ref  = x264_prefetch_ref_mmx2;

Loren Merritt's avatar
Loren Merritt committed
522 523
    pf->plane_copy = x264_plane_copy_mmx2;
    pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
Henrik Gramner's avatar
Henrik Gramner committed
524
    pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
Loren Merritt's avatar
Loren Merritt committed
525 526 527 528 529 530

    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmx2;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmx2;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_mmx2;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_mmx2;
Fiona Glaser's avatar
Fiona Glaser committed
531
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_mmx2;
Loren Merritt's avatar
Loren Merritt committed
532 533 534 535 536 537 538 539 540 541 542 543 544 545
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_mmx2;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_mmx2;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_mmx2;

    pf->mc_luma = mc_luma_mmx2;
    pf->get_ref = get_ref_mmx2;
    pf->mc_chroma = x264_mc_chroma_mmx2;
    pf->hpel_filter = x264_hpel_filter_mmx2;
    pf->weight = x264_mc_weight_wtab_mmx2;
    pf->weight_cache = x264_weight_cache_mmx2;
    pf->offsetadd = x264_mc_offsetadd_wtab_mmx2;
    pf->offsetsub = x264_mc_offsetsub_wtab_mmx2;

    pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmx2;
546

547
#if HIGH_BIT_DEPTH
548 549
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
    if( cpu&(X264_CPU_CACHELINE_32|X264_CPU_CACHELINE_64) )
Loren Merritt's avatar
Loren Merritt committed
550
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
551 552
#endif

553 554 555
    if( !(cpu&X264_CPU_SSE2) )
        return;

556 557
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;

Henrik Gramner's avatar
Henrik Gramner committed
558 559
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
560

561
    pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
562 563
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;

564 565 566 567 568 569 570 571 572 573 574 575
    if( cpu&X264_CPU_SSE2_IS_FAST )
    {
        pf->get_ref = get_ref_sse2;
        pf->mc_luma = mc_luma_sse2;
        pf->hpel_filter = x264_hpel_filter_sse2;
    }

    pf->memcpy_aligned  = x264_memcpy_aligned_sse2;
    pf->memzero_aligned = x264_memzero_aligned_sse2;
    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
Henrik Gramner's avatar
Henrik Gramner committed
576
    pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
577 578
    pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
    pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
579 580 581 582

    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;

583 584 585 586 587
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_sse2;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_sse2;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_sse2;
Fiona Glaser's avatar
Fiona Glaser committed
588
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sse2;
589 590 591 592
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sse2;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sse2;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sse2;

593 594 595 596 597 598 599 600 601
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
    pf->weight = x264_mc_weight_wtab_sse2;

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_sse2;

    if( !(cpu&X264_CPU_SSSE3) )
        return;

602 603
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;

604 605
    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
        pf->integral_init4v = x264_integral_init4v_ssse3;
606 607 608 609

    if( !(cpu&X264_CPU_AVX) )
        return;

Fiona Glaser's avatar
Fiona Glaser committed
610
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
Henrik Gramner's avatar
Henrik Gramner committed
611 612
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
613 614
    pf->plane_copy_interleave        = x264_plane_copy_interleave_avx;
    pf->plane_copy_deinterleave      = x264_plane_copy_deinterleave_avx;
Henrik Gramner's avatar
Henrik Gramner committed
615
    pf->store_interleave_chroma      = x264_store_interleave_chroma_avx;
616 617 618

    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;
Fiona Glaser's avatar
Fiona Glaser committed
619 620 621

    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
622
#else // !HIGH_BIT_DEPTH
Loren Merritt's avatar
Loren Merritt committed
623

Steven Walters's avatar
Steven Walters committed
624
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
625 626
    if( cpu&X264_CPU_CACHELINE_32 )
    {
Loren Merritt's avatar
Loren Merritt committed
627 628 629
        pf->mc_luma = mc_luma_cache32_mmx2;
        pf->get_ref = get_ref_cache32_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
630
    }
631
    else if( cpu&X264_CPU_CACHELINE_64 )
632
    {
Loren Merritt's avatar
Loren Merritt committed
633 634 635
        pf->mc_luma = mc_luma_cache64_mmx2;
        pf->get_ref = get_ref_cache64_mmx2;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmx2;
636 637 638
    }
#endif

Fiona Glaser's avatar
Fiona Glaser committed
639 640
    if( !(cpu&X264_CPU_SSE2) )
        return;
Fiona Glaser's avatar
Fiona Glaser committed
641

Fiona Glaser's avatar
Fiona Glaser committed
642
    pf->memcpy_aligned = x264_memcpy_aligned_sse2;
Fiona Glaser's avatar
Fiona Glaser committed
643
    pf->memzero_aligned = x264_memzero_aligned_sse2;
Loren Merritt's avatar
Loren Merritt committed
644 645
    pf->integral_init4v = x264_integral_init4v_sse2;
    pf->integral_init8v = x264_integral_init8v_sse2;
646
    pf->hpel_filter = x264_hpel_filter_sse2_amd;
647
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
Fiona Glaser's avatar
Fiona Glaser committed
648

649
    if( cpu&X264_CPU_SSE2_IS_SLOW )
Fiona Glaser's avatar
Fiona Glaser committed
650 651
        return;

Dylan Yudaken's avatar
Dylan Yudaken committed
652
    pf->weight = x264_mc_weight_wtab_sse2;
653 654 655 656 657
    if( !(cpu&X264_CPU_SLOW_ATOM) )
    {
        pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
        pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
    }
Dylan Yudaken's avatar
Dylan Yudaken committed
658

659
    pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
Fiona Glaser's avatar
Fiona Glaser committed
660 661
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_sse2;
Fiona Glaser's avatar
Fiona Glaser committed
662 663 664
    pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
    pf->avg[PIXEL_8x8]  = x264_pixel_avg_8x8_sse2;
    pf->avg[PIXEL_8x4]  = x264_pixel_avg_8x4_sse2;
665
    pf->hpel_filter = x264_hpel_filter_sse2;
Fiona Glaser's avatar
Fiona Glaser committed
666 667
    if( cpu&X264_CPU_SSE_MISALIGN )
        pf->hpel_filter = x264_hpel_filter_sse2_misalign;
Loren Merritt's avatar
Loren Merritt committed
668
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
669 670
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_sse2;
671

672
    if( cpu&X264_CPU_SSE2_IS_FAST )
673
    {
Henrik Gramner's avatar
Henrik Gramner committed
674 675 676
        pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
677 678
        pf->plane_copy_interleave   = x264_plane_copy_interleave_sse2;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
679 680 681
        pf->mc_luma = mc_luma_sse2;
        pf->get_ref = get_ref_sse2;
        if( cpu&X264_CPU_CACHELINE_64 )
682
        {
683 684
            pf->mc_luma = mc_luma_cache64_sse2;
            pf->get_ref = get_ref_cache64_sse2;
685
        }
Fiona Glaser's avatar
Fiona Glaser committed
686
        if( cpu&X264_CPU_SSE_MISALIGN )
687
        {
Fiona Glaser's avatar
Fiona Glaser committed
688
            pf->get_ref = get_ref_sse2_misalign;
689 690
            if( !(cpu&X264_CPU_STACK_MOD4) )
                pf->mc_chroma = x264_mc_chroma_sse2_misalign;
691
        }
692 693
    }

694 695 696
    if( !(cpu&X264_CPU_SSSE3) )
        return;

Loren Merritt's avatar
Loren Merritt committed
697 698 699 700 701
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_ssse3;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_ssse3;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_ssse3;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_ssse3;
Fiona Glaser's avatar
Fiona Glaser committed
702
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_ssse3;
Loren Merritt's avatar
Loren Merritt committed
703 704 705 706
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;

Henrik Gramner's avatar
Henrik Gramner committed
707 708
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
709 710
    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;

711
    pf->hpel_filter = x264_hpel_filter_ssse3;
Loren Merritt's avatar
Loren Merritt committed
712
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
713 714 715
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_ssse3;

Fiona Glaser's avatar
Fiona Glaser committed
716
    if( cpu&X264_CPU_CACHELINE_64 )
717
    {
718 719
        if( !(cpu&X264_CPU_STACK_MOD4) )
            pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
720 721
        pf->mc_luma = mc_luma_cache64_ssse3;
        pf->get_ref = get_ref_cache64_ssse3;
Dylan Yudaken's avatar
Dylan Yudaken committed
722 723 724 725

        /* ssse3 weight is slower on Nehalem, so only assign it inside this
         * X264_CPU_CACHELINE_64 branch rather than for every SSSE3 cpu. */
        pf->weight_cache = x264_weight_cache_ssse3;
        pf->weight = x264_mc_weight_wtab_ssse3;
726
    }
Loren Merritt's avatar
Loren Merritt committed
727

728
    if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
729 730
        pf->integral_init4v = x264_integral_init4v_ssse3;

Loren Merritt's avatar
Loren Merritt committed
731 732 733 734 735
    if( !(cpu&X264_CPU_SSE4) )
        return;

    pf->integral_init4h = x264_integral_init4h_sse4;
    pf->integral_init8h = x264_integral_init8h_sse4;
736 737 738 739

    if( !(cpu&X264_CPU_AVX) )
        return;

Fiona Glaser's avatar
Fiona Glaser committed
740
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx;
741 742
    pf->integral_init8h = x264_integral_init8h_avx;
    pf->hpel_filter = x264_hpel_filter_avx;
743 744 745 746

    /* ssse3 weight seems to be faster again on Sandy Bridge and Bulldozer. */
    pf->weight_cache = x264_weight_cache_ssse3;
    pf->weight = x264_mc_weight_wtab_ssse3;
747 748
    if( !(cpu&X264_CPU_STACK_MOD4) )
        pf->mc_chroma = x264_mc_chroma_avx;
Fiona Glaser's avatar
Fiona Glaser committed
749 750 751

    if( cpu&X264_CPU_XOP )
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
752
#endif // HIGH_BIT_DEPTH
Fiona Glaser's avatar
Fiona Glaser committed
753 754 755 756

    if( !(cpu&X264_CPU_AVX) )
        return;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
757 758 759 760

    if( !(cpu&X264_CPU_FMA4) )
        return;
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
Laurent Aimar's avatar
Laurent Aimar committed
761
}