/*****************************************************************************
 * quant.c: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2013 x264 project
 *
 * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"

#if !HIGH_BIT_DEPTH
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
//
// Quantizes 16 coefficients (two 8-lane vectors) in place:
//   abs(coef) -> + bias (saturated) -> * mf -> >> 16 -> restore sign.
// The sign is restored via xor-with-mask plus +1 on negative lanes
// (two's-complement negation). Accumulates any nonzero result into `nz`.
// Relies on locals declared by the caller: temp1v/temp2v, mfvA/B, biasvA/B,
// mskA/B, coefvA/B, multEvenvA/B, multOddvA/B, i_qbitsv, one, nz, zero_s16v.
#define QUANT_16_U( idx0, idx1 )                                    \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mfvA = vec_ld((idx0), mf);                                      \
    mfvB = vec_ld((idx1), mf);                                      \
    biasvA = vec_ld((idx0), bias);                                  \
    biasvB = vec_ld((idx1), bias);                                  \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_adds(coefvA, biasvA);                              \
    coefvB = vec_adds(coefvB, biasvB);                              \
    multEvenvA = vec_mule(coefvA, mfvA);                            \
    multOddvA = vec_mulo(coefvA, mfvA);                             \
    multEvenvB = vec_mule(coefvB, mfvB);                            \
    multOddvB = vec_mulo(coefvB, mfvB);                             \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                  \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                  \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

65
int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
66
{
67
    LOAD_ZERO;
68
    vector bool short mskA;
69
    vec_u32_t i_qbitsv;
70 71
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
72
    vec_u16_t mfvA;
73
    vec_u16_t biasvA;
74 75
    vec_s16_t one = vec_splat_s16(1);;
    vec_s16_t nz = zero_s16v;
76 77 78 79

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
80
    vec_u16_t mfvB;
81
    vec_u16_t biasvB;
82 83 84

    vec_s16_t temp1v, temp2v;

85
    vec_u32_u qbits_u;
86
    qbits_u.s[0]=16;
87 88
    i_qbitsv = vec_splat(qbits_u.v, 0);

89
    QUANT_16_U( 0, 16 );
90
    return vec_any_ne(nz, zero_s16v);
91 92 93
}

// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
//
// Same pipeline as QUANT_16_U but with a single splatted mf/bias pair
// (all DC coefficients share one scale), and non-saturating vec_add
// for the bias and sign-restore steps.
// Relies on caller locals: temp1v/temp2v, mskA/B, coefvA/B,
// multEvenvA/B, multOddvA/B, mfv, biasv, i_qbitsv, one, nz, zero_s16v.
#define QUANT_16_U_DC( idx0, idx1 )                                 \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_add(coefvA, biasv);                                \
    coefvB = vec_add(coefvB, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvB = vec_mule(coefvB, mfv);                             \
    multOddvB = vec_mulo(coefvB, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

123
int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
124
{
125
    LOAD_ZERO;
126
    vector bool short mskA;
127
    vec_u32_t i_qbitsv;
128 129
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
130 131
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;
132 133 134 135 136 137 138

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;

    vec_s16_t temp1v, temp2v;

139
    vec_u16_t mfv;
140 141
    vec_u16_t biasv;

142
    vec_u16_u mf_u;
143
    mf_u.s[0]=mf;
144 145
    mfv = vec_splat( mf_u.v, 0 );

146
    vec_u32_u qbits_u;
147
    qbits_u.s[0]=16;
148 149
    i_qbitsv = vec_splat(qbits_u.v, 0);

150
    vec_u16_u bias_u;
151 152
    bias_u.s[0]=bias;
    biasv = vec_splat(bias_u.v, 0);
153 154

    QUANT_16_U_DC( 0, 16 );
155
    return vec_any_ne(nz, zero_s16v);
156 157
}

// DC quant of a whole 2x2 block
//
// Quantizes only the first 4 lanes of the vector at idx0 (the 2x2 DC
// block); `sel` keeps the upper 4 lanes of the original load untouched
// when storing back. Same abs/bias/mul/shift/sign pipeline as above.
// Relies on caller locals: temp1v/temp2v, mskA, coefvA, multEvenvA,
// multOddvA, mfv, biasv, i_qbitsv, one, nz, zero_s16v.
#define QUANT_4_U_DC( idx0 )                                        \
{                                                                   \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
    temp1v = vec_ld((idx0), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvA = vec_add(coefvA, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                                 \
    temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
    temp1v = vec_sel(temp1v, temp2v, sel);                          \
    nz = vec_or(nz, temp1v);                                        \
    vec_st(temp1v, (idx0), dct);                                    \
}
177

178
int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
179
{
180
    LOAD_ZERO;
181 182 183 184
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
185 186
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;
187 188 189 190 191 192

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

193
    vec_u16_u mf_u;
194 195 196
    mf_u.s[0]=mf;
    mfv = vec_splat( mf_u.v, 0 );

197
    vec_u32_u qbits_u;
198 199 200
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

201
    vec_u16_u bias_u;
202 203 204
    bias_u.s[0]=bias;
    biasv = vec_splat(bias_u.v, 0);

205
    static const vec_s16_t mask2 = CV(-1, -1, -1, -1,  0, 0, 0, 0);
206
    QUANT_4_U_DC(0);
207
    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
208
}
209

210
int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
211
{
212
    LOAD_ZERO;
213
    vector bool short mskA;
214
    vec_u32_t i_qbitsv;
215
    vec_u16_t coefvA;
216 217
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
218
    vec_u16_t biasvA;
219 220 221
    vec_s16_t one = vec_splat_s16(1);;
    vec_s16_t nz = zero_s16v;

222 223
    vector bool short mskB;
    vec_u16_t coefvB;
224 225
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
226
    vec_u16_t biasvB;
227

228
    vec_s16_t temp1v, temp2v;
Anton Mitrofanov's avatar
Anton Mitrofanov committed
229

230
    vec_u32_u qbits_u;
231
    qbits_u.s[0]=16;
232 233
    i_qbitsv = vec_splat(qbits_u.v, 0);

234 235
    for( int i = 0; i < 4; i++ )
        QUANT_16_U( i*2*16, i*2*16+16 );
236
    return vec_any_ne(nz, zero_s16v);
237 238
}

// Dequant step for i_qbits >= 0: dct[i] = (dct[i] * mf[i]) << i_qbits.
// Loads 8 int16 coefficients and 8 int32 mf entries (two vectors, packed
// down to int16), multiplies, repacks with saturation, then shifts left.
// Relies on caller locals: y, dctv, mf1v, mf2v, mfv, multEvenvA,
// multOddvA, i_qbitsv, and parameters dct, dequant_mf, i_mf.
#define DEQUANT_SHL()                                                \
{                                                                    \
    dctv = vec_ld(8*y, dct);                                         \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                           \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                        \
    mfv  = vec_packs(mf1v, mf2v);                                    \
                                                                     \
    multEvenvA = vec_mule(dctv, mfv);                                \
    multOddvA = vec_mulo(dctv, mfv);                                 \
    dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA),  \
                                 vec_mergel(multEvenvA, multOddvA)); \
    dctv = vec_sl(dctv, i_qbitsv);                                   \
    vec_st(dctv, 8*y, dct);                                          \
}

// Dequant step for i_qbits < 0: dct[i] = (dct[i] * mf[i] + f) >> (-i_qbits).
// Widens the int16*int16 products to full 32-bit results by recombining
// the even (high-half) and odd (low-half) partial products:
//   full = (even << 16) + odd, then adds rounding term f and shifts right.
// Relies on caller locals: y, dctv, dct1v, dct2v, mf1v, mf2v, multEvenvA,
// multOddvA, temp1v, temp2v, fv, sixteenv, i_qbitsv, and parameters
// dct, dequant_mf, i_mf.
#define DEQUANT_SHR()                                          \
{                                                              \
    dctv = vec_ld(8*y, dct);                                   \
    dct1v = vec_mergeh(dctv, dctv);                            \
    dct2v = vec_mergel(dctv, dctv);                            \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                     \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                  \
                                                               \
    multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v);             \
    multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v);              \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
    temp1v = vec_add(temp1v, fv);                              \
    temp1v = vec_sra(temp1v, i_qbitsv);                        \
                                                               \
    multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v);             \
    multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v);              \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
    temp2v = vec_add(temp2v, fv);                              \
    temp2v = vec_sra(temp2v, i_qbitsv);                        \
                                                               \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);               \
    vec_st(dctv, y*8, dct);                                    \
}

278
void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
279
{
280 281
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 4;
282 283 284 285 286 287 288 289 290 291 292

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
293
        vec_u16_u qbits_u;
294 295 296
        qbits_u.s[0]=i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

297
        for( int y = 0; y < 4; y+=2 )
298 299 300 301 302 303 304
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
305
        vec_u32_u f_u;
306 307 308 309
        f_u.s[0]=f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
310
        vec_u32_u qbits_u;
311 312 313 314
        qbits_u.s[0]=-i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
315
        vec_u32_u sixteen_u;
316 317 318
        sixteen_u.s[0]=16;
        sixteenv = vec_splat(sixteen_u.v, 0);

319
        for( int y = 0; y < 4; y+=2 )
320 321 322 323
            DEQUANT_SHR();
    }
}

324
void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
325
{
326 327
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 6;
328 329 330 331 332 333 334 335 336 337 338

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
339
        vec_u16_u qbits_u;
340 341 342
        qbits_u.s[0]=i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

343
        for( int y = 0; y < 16; y+=2 )
344 345 346 347 348 349 350
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
351
        vec_u32_u f_u;
352 353 354 355
        f_u.s[0]=f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
356
        vec_u32_u qbits_u;
357 358 359 360
        qbits_u.s[0]=-i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
361
        vec_u32_u sixteen_u;
362 363 364
        sixteen_u.s[0]=16;
        sixteenv = vec_splat(sixteen_u.v, 0);

365
        for( int y = 0; y < 16; y+=2 )
366 367 368
            DEQUANT_SHR();
    }
}
#endif // !HIGH_BIT_DEPTH