/*****************************************************************************
 * mc.c: ppc motion compensation
 *****************************************************************************
 * Copyright (C) 2003-2017 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "mc.h"

#if !HIGH_BIT_DEPTH
typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
                         uint8_t *dst, intptr_t i_dst, int i_height );

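/* pixel_avg2_w*: rounding average of two prediction blocks,
 * dst[x] = ( src1[x] + src2[x] + 1 ) >> 1, for block widths 4/8/16/20. */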
static inline void pixel_avg2_w4_altivec( uint8_t *dst,  intptr_t i_dst,
                                          uint8_t *src1, intptr_t i_src1,
                                          uint8_t *src2, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < 4; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}

static inline void pixel_avg2_w8_altivec( uint8_t *dst,  intptr_t i_dst,
                                          uint8_t *src1, intptr_t i_src1,
                                          uint8_t *src2, int i_height )
{
    vec_u8_t src1v, src2v;
    PREP_STORE8;

    for( int y = 0; y < i_height; y++ )
    {
        src1v = vec_vsx_ld( 0, src1 );
        src2v = vec_vsx_ld( 0, src2 );
        src1v = vec_avg( src1v, src2v );

        VEC_STORE8(src1v, dst);

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}

static inline void pixel_avg2_w16_altivec( uint8_t *dst,  intptr_t i_dst,
                                           uint8_t *src1, intptr_t i_src1,
                                           uint8_t *src2, int i_height )
{
    vec_u8_t src1v, src2v;

    for( int y = 0; y < i_height; y++ )
    {
        src1v = vec_vsx_ld( 0, src1 );
        src2v = vec_vsx_ld( 0, src2 );
        src1v = vec_avg( src1v, src2v );
        vec_st(src1v, 0, dst);

        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}

static inline void pixel_avg2_w20_altivec( uint8_t *dst,  intptr_t i_dst,
                                           uint8_t *src1, intptr_t i_src1,
                                           uint8_t *src2, int i_height )
{
    pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
    pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
}

/* mc_copy: plain c */

#define MC_COPY( name, a )                                \
static void name( uint8_t *dst, intptr_t i_dst,           \
                  uint8_t *src, intptr_t i_src, int i_height ) \
{                                                         \
    int y;                                                \
    for( y = 0; y < i_height; y++ )                       \
    {                                                     \
        memcpy( dst, src, a );                            \
        src += i_src;                                     \
        dst += i_dst;                                     \
    }                                                     \
}

MC_COPY( mc_copy_w4_altivec,  4  )
MC_COPY( mc_copy_w8_altivec,  8  )

static void mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst,
                                 uint8_t *src, intptr_t i_src, int i_height )
{
    vec_u8_t cpyV;

    for( int y = 0; y < i_height; y++ )
    {
        cpyV = vec_vsx_ld( 0, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}


static void mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst,
                                         uint8_t *src, intptr_t i_src, int i_height )
{
    for( int y = 0; y < i_height; ++y )
    {
        vec_u8_t cpyV = vec_ld( 0, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}

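/* plane_copy_swap: copy a plane while swapping each byte pair (UV <-> VU). */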
#define x264_plane_copy_swap_core_altivec x264_template(plane_copy_swap_core_altivec)
void x264_plane_copy_swap_core_altivec( uint8_t *dst, intptr_t i_dst,
                                        uint8_t *src, intptr_t i_src, int w, int h )
{
    const vec_u8_t mask = { 0x01, 0x00, 0x03, 0x02, 0x05, 0x04, 0x07, 0x06, 0x09, 0x08, 0x0B, 0x0A, 0x0D, 0x0C, 0x0F, 0x0E };

    for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
        for( int x = 0; x < 2 * w; x += 16 )
        {
            vec_u8_t srcv = vec_vsx_ld( x, src );
            vec_u8_t dstv = vec_perm( srcv, srcv, mask );

            vec_vsx_st( dstv, x, dst );
        }
}

#define x264_plane_copy_interleave_core_altivec x264_template(plane_copy_interleave_core_altivec)
void x264_plane_copy_interleave_core_altivec( uint8_t *dst, intptr_t i_dst,
                                              uint8_t *srcu, intptr_t i_srcu,
                                              uint8_t *srcv, intptr_t i_srcv, int w, int h )
{
    for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
        for( int x = 0; x < w; x += 16 )
        {
            vec_u8_t srcvv = vec_vsx_ld( x, srcv );
            vec_u8_t srcuv = vec_vsx_ld( x, srcu );
            vec_u8_t dstv1 = vec_mergeh( srcuv, srcvv );
            vec_u8_t dstv2 = vec_mergel( srcuv, srcvv );

            vec_vsx_st( dstv1, 2 * x, dst );
            vec_vsx_st( dstv2, 2 * x + 16, dst );
        }
}

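/* Interleave/deinterleave helpers for packed 4:2:0 chroma (NV12-style UVUV layout). */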
void x264_store_interleave_chroma_altivec( uint8_t *dst, intptr_t i_dst,
                                           uint8_t *srcu, uint8_t *srcv, int height )
{
    for( int y = 0; y < height; y++, dst += i_dst, srcu += FDEC_STRIDE, srcv += FDEC_STRIDE )
    {
        vec_u8_t srcvv = vec_vsx_ld( 0, srcv );
        vec_u8_t srcuv = vec_vsx_ld( 0, srcu );
        vec_u8_t dstv = vec_mergeh( srcuv, srcvv );

        vec_vsx_st(dstv, 0, dst);
    }
}

void x264_plane_copy_deinterleave_altivec( uint8_t *dstu, intptr_t i_dstu,
                                           uint8_t *dstv, intptr_t i_dstv,
                                           uint8_t *src, intptr_t i_src, int w, int h )
{
    const vec_u8_t mask[2] = {
        { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E },
        { 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F }
    };
    for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
    {
        for( int x = 0; x < w; x += 16 )
        {
            vec_u8_t srcv1 = vec_vsx_ld( 2 * x, src );
            vec_u8_t srcv2 = vec_vsx_ld( 2 * x + 16, src );
            vec_u8_t dstuv = vec_perm( srcv1, srcv2, mask[0] );
            vec_u8_t dstvv = vec_perm( srcv1, srcv2, mask[1] );

            vec_vsx_st( dstuv, x, dstu );
            vec_vsx_st( dstvv, x, dstv );
        }
    }
}

static void load_deinterleave_chroma_fenc_altivec( uint8_t *dst, uint8_t *src, intptr_t i_src, int height )
{
    const vec_u8_t mask = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F };

    for( int y = 0; y < height; y += 2, dst += 2*FENC_STRIDE, src += 2*i_src )
    {
        vec_u8_t src0 = vec_ld( 0, src );
        vec_u8_t src1 = vec_ld( i_src, src );

        vec_st( vec_perm( src0, src0, mask ), 0*FENC_STRIDE, dst );
        vec_st( vec_perm( src1, src1, mask ), 1*FENC_STRIDE, dst );
    }
}

#if HAVE_VSX
void x264_plane_copy_deinterleave_rgb_altivec( uint8_t *dsta, intptr_t i_dsta,
                                               uint8_t *dstb, intptr_t i_dstb,
                                               uint8_t *dstc, intptr_t i_dstc,
                                               uint8_t *src, intptr_t i_src,
                                               int pw, int w, int h )
{
    if( pw == 3 )
    {
        const vec_u8_t mask[4] = {
            { 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0x10, 0x13, 0x16 },
            { 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x18, 0x1B, 0x1E },
            { 0x02, 0x05, 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
            { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F }
        };

        for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src )
        {
            for( int x = 0; x < w; x += 16 )
            {
                vec_u8_t srcv1 = vec_vsx_ld( 3 * x, src );
                vec_u8_t srcv2 = vec_vsx_ld( 3 * x + 16, src );
                vec_u8_t srcv3 = vec_vsx_ld( 3 * x + 32, src );
                vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0  a1  a2  a3  a4  a5  a6  a7  b0  b1  b2  b3  b4  b5  b6  b7
                vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv2, srcv3, mask[1] ); // a8  a9  a10 a11 a12 a13 a14 a15 b8  b9  b10 b11 b12 b13 b14 b15
                vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta );
                vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb );

                srcv1 = vec_perm( srcv1, srcv2, mask[2] );          // c0  c1  c2  c3  c4  c5  c6  c7  c8  c9
                srcv1 = vec_perm( srcv1, srcv3, mask[3] );          // c0  c1  c2  c3  c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
                vec_st( srcv1, x, dstc );
            }
        }
    }
    else
    {
        const vec_u8_t mask[2] = {
            { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D },
            { 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x17, 0x1B, 0x1F }
        };

        for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src )
        {
            for( int x = 0; x < w; x += 16 )
            {
                vec_u8_t srcv1 = vec_vsx_ld( 4 * x, src );
                vec_u8_t srcv2 = vec_vsx_ld( 4 * x + 16, src );
                vec_u8_t srcv3 = vec_vsx_ld( 4 * x + 32, src );
                vec_u8_t srcv4 = vec_vsx_ld( 4 * x + 48, src );

                vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0  a1  a2  a3  a4  a5  a6  a7  b0  b1  b2  b3  b4  b5  b6  b7
                vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[0] ); // a8  a9  a10 a11 a12 a13 a14 a15 b8  b9  b10 b11 b12 b13 b14 b15
                vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta );
                vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb );

                tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[1] );           // c0  c1  c2  c3  c4  c5  c6  c7
                tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[1] );           // c8  c9  c10 c11 c12 c13 c14 c15
                vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dstc );
            }
        }
    }
}
#endif

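/* mc_luma/get_ref: select the precomputed half-pel plane(s) for a quarter-pel
 * motion vector; when quarter-pel interpolation is needed, average two planes. */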
static void mc_luma_altivec( uint8_t *dst,    intptr_t i_dst_stride,
                             uint8_t *src[4], intptr_t i_src_stride,
                             int mvx, int mvy,
                             int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);

        switch( i_width )
        {
            case 4:
                pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 8:
                pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 16:
            default:
                pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
        }
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
    else if( weight->weightfn )
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
    else
    {
        switch( i_width )
        {
            case 4:
                mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
            case 8:
                mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
            case 16:
                mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
        }
    }
}

static uint8_t *get_ref_altivec( uint8_t *dst,   intptr_t *i_dst_stride,
                                 uint8_t *src[4], intptr_t i_src_stride,
                                 int mvx, int mvy,
                                 int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);

        switch( i_width )
        {
            case 4:
                pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 8:
                pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 12:
            case 16:
            default:
                pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 20:
                pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
        }
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }
    else
    {
        *i_dst_stride = i_src_stride;
        return src1;
    }
}

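/* mc_chroma: 1/8-pel bilinear chroma interpolation on an interleaved UV plane.
 * The four tap weights cA..cD sum to 64; results are rounded with +32 and >>6. */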
static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                           uint8_t *src, intptr_t i_src_stride,
                           int mvx, int mvy, int i_height )
{
    uint8_t *srcp;
    int d8x = mvx&0x07;
    int d8y = mvy&0x07;

    int cA = (8-d8x)*(8-d8y);
    int cB = d8x    *(8-d8y);
    int cC = (8-d8x)*d8y;
    int cD = d8x    *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    for( int y = 0; y < i_height; y++ )
    {
        dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6;
        dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6;
        dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6;
        dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6;

        src  += i_src_stride;
        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

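/* VSLD: endian-neutral wrapper for vec_sld (byte shift across a vector pair). */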
#ifdef WORDS_BIGENDIAN
#define VSLD(a,b,n) vec_sld(a,b,n)
#else
#define VSLD(a,b,n) vec_sld(b,a,16-n)
#endif

static void mc_chroma_4xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                                   uint8_t *src, intptr_t i_src_stride,
                                   int mvx, int mvy, int i_height )
{
    uint8_t *srcp;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    LOAD_ZERO;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src2v_8, dstuv, dstvv;
    vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
    vec_u16_t   shiftv, k32v;

#ifdef WORDS_BIGENDIAN
    static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
    static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
#else
    static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12);
    static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14);
#endif

    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
    shiftv  = vec_splat_u16( 6 );

    src2v_8 = vec_vsx_ld( 0, src );
    src2v_16 = vec_u8_to_u16( src2v_8 );
    src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );

    for( int y = 0; y < i_height; y += 2 )
    {
        src0v_16 = src2v_16;
        src1v_16 = src3v_16;
        src2v_8 = vec_vsx_ld( 0, srcp );
        src2v_16 = vec_u8_to_u16( src2v_8 );
        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );

        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );

        dstv16 = vec_sr( dstv16, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;

        src0v_16 = src2v_16;
        src1v_16 = src3v_16;
        src2v_8 = vec_vsx_ld( 0, srcp );
        src2v_16 = vec_u8_to_u16( src2v_8 );
        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );

        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );

        dstv16 = vec_sr( dstv16, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                                   uint8_t *src, intptr_t i_src_stride,
                                   int mvx, int mvy, int i_height )
{
    uint8_t *srcp;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x    *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x    *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    LOAD_ZERO;
    PREP_STORE8;
    vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u8_t    dstuv, dstvv;
    vec_u16_t   src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
    vec_u16_t   src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
    vec_u16_t   shiftv, k32v;

    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
    shiftv  = vec_splat_u16( 6 );

#ifdef WORDS_BIGENDIAN
    static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
    static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
#else
    static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1);
    static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
#endif

    src2v_8 = vec_vsx_ld( 0, src );
    src3v_8 = vec_vsx_ld( 16, src );
    src3v_8 = VSLD( src2v_8, src3v_8, 2 );

    for( int y = 0; y < i_height; y += 2 )
    {
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        src2v_8 = vec_vsx_ld( 0, srcp );
        src3v_8 = vec_vsx_ld( 16, srcp );

        src3v_8 = VSLD( src2v_8, src3v_8, 2 );

        src0v_16h = vec_u8_to_u16_h( src0v_8 );
        src0v_16l = vec_u8_to_u16_l( src0v_8 );
        src1v_16h = vec_u8_to_u16_h( src1v_8 );
        src1v_16l = vec_u8_to_u16_l( src1v_8 );
        src2v_16h = vec_u8_to_u16_h( src2v_8 );
        src2v_16l = vec_u8_to_u16_l( src2v_8 );
        src3v_16h = vec_u8_to_u16_h( src3v_8 );
        src3v_16l = vec_u8_to_u16_l( src3v_8 );

        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );

        dstv_16h = vec_sr( dstv_16h, shiftv );
        dstv_16l = vec_sr( dstv_16l, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );

        VEC_STORE8( dstuv, dstu );
        VEC_STORE8( dstvv, dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;

        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        src2v_8 = vec_vsx_ld( 0, srcp );
        src3v_8 = vec_vsx_ld( 16, srcp );

        src3v_8 = VSLD( src2v_8, src3v_8, 2 );

        src0v_16h = vec_u8_to_u16_h( src0v_8 );
        src0v_16l = vec_u8_to_u16_l( src0v_8 );
        src1v_16h = vec_u8_to_u16_h( src1v_8 );
        src1v_16l = vec_u8_to_u16_l( src1v_8 );
        src2v_16h = vec_u8_to_u16_h( src2v_8 );
        src2v_16l = vec_u8_to_u16_l( src2v_8 );
        src3v_16h = vec_u8_to_u16_h( src3v_8 );
        src3v_16l = vec_u8_to_u16_l( src3v_8 );

        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );

        dstv_16h = vec_sr( dstv_16h, shiftv );
        dstv_16l = vec_sr( dstv_16l, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );

        VEC_STORE8( dstuv, dstu );
        VEC_STORE8( dstvv, dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}

static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                               uint8_t *src, intptr_t i_src_stride,
                               int mvx, int mvy, int i_width, int i_height )
{
    if( i_width == 8 )
        mc_chroma_8xh_altivec( dstu, dstv, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else if( i_width == 4 )
        mc_chroma_4xh_altivec( dstu, dstv, i_dst_stride, src, i_src_stride,
                               mvx, mvy, i_height );
    else
        mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                       mvx, mvy, i_height );
}

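/* Half-pel filter building blocks: HPEL_FILTER_1 computes the 6-tap sum
 * a-5*b+20*c from the three symmetric pair sums; HPEL_FILTER_2 computes the
 * same value scaled by 1/16, as used by the central (diagonal) filter. */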
#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
}

#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}

#define HPEL_FILTER_HORIZONTAL()                             \
{                                                            \
    src1v = vec_vsx_ld( x- 2+i_stride*y, src );              \
    src6v = vec_vsx_ld( x+14+i_stride*y, src );              \
                                                             \
    src2v = VSLD( src1v, src6v,  1 );                        \
    src3v = VSLD( src1v, src6v,  2 );                        \
    src4v = VSLD( src1v, src6v,  3 );                        \
    src5v = VSLD( src1v, src6v,  4 );                        \
    src6v = VSLD( src1v, src6v,  5 );                        \
                                                             \
    temp1v = vec_u8_to_s16_h( src1v );                       \
    temp2v = vec_u8_to_s16_h( src2v );                       \
    temp3v = vec_u8_to_s16_h( src3v );                       \
    temp4v = vec_u8_to_s16_h( src4v );                       \
    temp5v = vec_u8_to_s16_h( src5v );                       \
    temp6v = vec_u8_to_s16_h( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest1v = vec_add( temp1v, sixteenv );                    \
    dest1v = vec_sra( dest1v, fivev );                       \
                                                             \
    temp1v = vec_u8_to_s16_l( src1v );                       \
    temp2v = vec_u8_to_s16_l( src2v );                       \
    temp3v = vec_u8_to_s16_l( src3v );                       \
    temp4v = vec_u8_to_s16_l( src4v );                       \
    temp5v = vec_u8_to_s16_l( src5v );                       \
    temp6v = vec_u8_to_s16_l( src6v );                       \
                                                             \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
                   temp4v, temp5v, temp6v );                 \
                                                             \
    dest2v = vec_add( temp1v, sixteenv );                    \
    dest2v = vec_sra( dest2v, fivev );                       \
                                                             \
    destv = vec_packsu( dest1v, dest2v );                    \
                                                             \
    vec_vsx_st( destv, x+i_stride*y, dsth );                 \
}

#define HPEL_FILTER_VERTICAL()                                    \
{                                                                 \
    src1v = vec_vsx_ld( x+i_stride*(y-2), src );                  \
    src2v = vec_vsx_ld( x+i_stride*(y-1), src );                  \
    src3v = vec_vsx_ld( x+i_stride*(y-0), src );                  \
    src4v = vec_vsx_ld( x+i_stride*(y+1), src );                  \
    src5v = vec_vsx_ld( x+i_stride*(y+2), src );                  \
    src6v = vec_vsx_ld( x+i_stride*(y+3), src );                  \
                                                                  \
    temp1v = vec_u8_to_s16_h( src1v );                            \
    temp2v = vec_u8_to_s16_h( src2v );                            \
    temp3v = vec_u8_to_s16_h( src3v );                            \
    temp4v = vec_u8_to_s16_h( src4v );                            \
    temp5v = vec_u8_to_s16_h( src5v );                            \
    temp6v = vec_u8_to_s16_h( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
                   temp4v, temp5v, temp6v );                      \
                                                                  \
    dest1v = vec_add( temp1v, sixteenv );                         \
    dest1v = vec_sra( dest1v, fivev );                            \
                                                                  \
    temp4v = vec_u8_to_s16_l( src1v );                            \
    temp5v = vec_u8_to_s16_l( src2v );                            \
    temp6v = vec_u8_to_s16_l( src3v );                            \
    temp7v = vec_u8_to_s16_l( src4v );                            \
    temp8v = vec_u8_to_s16_l( src5v );                            \
    temp9v = vec_u8_to_s16_l( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
                   temp7v, temp8v, temp9v );                      \
                                                                  \
    dest2v = vec_add( temp4v, sixteenv );                         \
    dest2v = vec_sra( dest2v, fivev );                            \
                                                                  \
    destv = vec_packsu( dest1v, dest2v );                         \
                                                                  \
    vec_vsx_st( destv, x+i_stride*y, dstv );                      \
}

#define HPEL_FILTER_CENTRAL()                           \
{                                                       \
    temp1v = VSLD( tempav, tempbv, 12 );                \
    temp2v = VSLD( tempav, tempbv, 14 );                \
    temp3v = tempbv;                                    \
    temp4v = VSLD( tempbv, tempcv,  2 );                \
    temp5v = VSLD( tempbv, tempcv,  4 );                \
    temp6v = VSLD( tempbv, tempcv,  6 );                \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest1v = vec_add( temp1v, thirtytwov );             \
    dest1v = vec_sra( dest1v, sixv );                   \
                                                        \
    temp1v = VSLD( tempbv, tempcv, 12 );                \
    temp2v = VSLD( tempbv, tempcv, 14 );                \
    temp3v = tempcv;                                    \
    temp4v = VSLD( tempcv, tempdv,  2 );                \
    temp5v = VSLD( tempcv, tempdv,  4 );                \
    temp6v = VSLD( tempcv, tempdv,  6 );                \
                                                        \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
                   temp4v, temp5v, temp6v );            \
                                                        \
    dest2v = vec_add( temp1v, thirtytwov );             \
    dest2v = vec_sra( dest2v, sixv );                   \
                                                        \
    destv = vec_packsu( dest1v, dest2v );               \
                                                        \
    vec_vsx_st( destv, x-16+i_stride*y, dstc );         \
}

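/* hpel_filter: generate the horizontal, vertical and centre half-pel planes
 * (dsth, dstv, dstc) for a whole plane, processing 16 pixels per iteration. */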
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               intptr_t i_stride, int i_width, int i_height, int16_t *buf )
{
    vec_u8_t destv;
    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
    vec_s16_t dest1v, dest2v;
    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;

    LOAD_ZERO;

    vec_u16_t twov, fourv, fivev, sixv;
    vec_s16_t sixteenv, thirtytwov;
    vec_u16_u temp_u;

    temp_u.s[0]=2;
    twov = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=4;
    fourv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=5;
    fivev = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=6;
    sixv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=16;
    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
    temp_u.s[0]=32;
    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );

    for( int y = 0; y < i_height; y++ )
    {
        int x = 0;

        /* horizontal_filter */
        HPEL_FILTER_HORIZONTAL();

        /* vertical_filter */
        HPEL_FILTER_VERTICAL();

        /* central_filter */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = vec_splat( temp1v, 0 ); /* first only */
        tempdv = temp1v;
        tempev = temp4v;

        for( x = 16; x < i_width; x+=16 )
        {
            /* horizontal_filter */
            HPEL_FILTER_HORIZONTAL();

            /* vertical_filter */
            HPEL_FILTER_VERTICAL();

            /* central_filter */
            tempav = tempcv;
            tempbv = tempdv;
            tempcv = tempev;
            tempdv = temp1v;
            tempev = temp4v;

            HPEL_FILTER_CENTRAL();
        }

        /* Partial vertical filter */
        src1v = vec_vsx_ld( x+i_stride*(y-2), src );
        src2v = vec_vsx_ld( x+i_stride*(y-1), src );
        src3v = vec_vsx_ld( x+i_stride*(y-0), src );
        src4v = vec_vsx_ld( x+i_stride*(y+1), src );
        src5v = vec_vsx_ld( x+i_stride*(y+2), src );
        src6v = vec_vsx_ld( x+i_stride*(y+3), src );

        temp1v = vec_u8_to_s16_h( src1v );
        temp2v = vec_u8_to_s16_h( src2v );
        temp3v = vec_u8_to_s16_h( src3v );
        temp4v = vec_u8_to_s16_h( src4v );
        temp5v = vec_u8_to_s16_h( src5v );
        temp6v = vec_u8_to_s16_h( src6v );

        HPEL_FILTER_1( temp1v, temp2v, temp3v, temp4v, temp5v, temp6v );

        /* central_filter */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = tempev;
        tempdv = temp1v;
        /* tempev is not used */

        HPEL_FILTER_CENTRAL();
    }
}

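/* frame_init_lowres_core: build the four half-resolution planes used by the
 * lookahead (integer, +half-x, +half-y, +half-xy) by averaging pixel pairs. */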
static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                            intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
    int w = width >> 4;
    int end = (width & 15);
    vec_u8_t src0v, src1v, src2v;
    vec_u8_t lv, hv, src1p1v;
    vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
    static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );
#ifndef WORDS_BIGENDIAN
    static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F );
#endif

    for( int y = 0; y < height; y++ )
    {
        int x;
        uint8_t *src1 = src0+src_stride;
        uint8_t *src2 = src1+src_stride;

        src0v = vec_ld(0, src0);
        src1v = vec_ld(0, src1);
        src2v = vec_ld(0, src2);

        avg0v = vec_avg(src0v, src1v);
        avg1v = vec_avg(src1v, src2v);

        for( x = 0; x < w; x++ )
        {
            lv = vec_ld(16*(x*2+1), src0);
            src1v = vec_ld(16*(x*2+1), src1);
            avghv = vec_avg(lv, src1v);

            lv = vec_ld(16*(x*2+2), src0);
            src1p1v = vec_ld(16*(x*2+2), src1);
            avghp1v = vec_avg(lv, src1p1v);

            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);

            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
#ifdef WORDS_BIGENDIAN
            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);
#else
            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth);
#endif

            avg0v = avghp1v;

            hv = vec_ld(16*(x*2+1), src2);
            avghv = vec_avg(src1v, hv);

            hv = vec_ld(16*(x*2+2), src2);
            avghp1v = vec_avg(src1p1v, hv);

            avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v);
            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);

            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
#ifdef WORDS_BIGENDIAN
            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);
#else
            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc);
#endif

            avg1v = avghp1v;
        }
        if( end )
        {
            lv = vec_ld(16*(x*2+1), src0);
            src1v = vec_ld(16*(x*2+1), src1);
            avghv = vec_avg(lv, src1v);

            lv = vec_ld(16*(x*2+1), src2);
            avghp1v = vec_avg(src1v, lv);

            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
            avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v);

            lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
#ifdef WORDS_BIGENDIAN
            hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);
#else
            hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1);
#endif

            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0);
            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dsth);
            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dsth);

            lv = vec_sld(lv, lv, 8);
            hv = vec_sld(hv, hv, 8);

            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dstv);
            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dstv);
            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dstc);
            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dstc);
        }

        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}

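/* mc_weight_w*: explicit weighted prediction.
 * dst = clip( ((src * scale + (1 << (denom-1))) >> denom) + offset ),
 * or simply clip( src * scale + offset ) when denom == 0. */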
static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
        }
    }
}
static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
        }
    }
}
static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_STORE8;
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            VEC_STORE8( srcv, dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            VEC_STORE8( srcv, dst );
        }
    }
}
static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                   const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    vec_u8_t srcv;
    vec_s16_t weight_lv, weight_hv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weight_hv = vec_u8_to_s16_h( srcv );
            weight_lv = vec_u8_to_s16_l( srcv );

            weight_hv = vec_mladd( weight_hv, scalev, roundv );
            weight_lv = vec_mladd( weight_lv, scalev, roundv );
            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
            weight_hv = vec_add( weight_hv, offsetv );
            weight_lv = vec_add( weight_lv, offsetv );

            srcv = vec_packsu( weight_hv, weight_lv );
            vec_st( srcv, 0, dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            srcv = vec_vsx_ld( 0, src );
            weight_hv = vec_u8_to_s16_h( srcv );
            weight_lv = vec_u8_to_s16_l( srcv );

            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
            weight_lv = vec_mladd( weight_lv, scalev, offsetv );

            srcv = vec_packsu( weight_hv, weight_lv );
            vec_st( srcv, 0, dst );
        }
    }
}
static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,