/*****************************************************************************
 * ppccommon.h: h264 encoder
 *****************************************************************************
 * Copyright (C) 2003 Eric Petit <eric.petit@lapsus.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *****************************************************************************/

/* <altivec.h> exists only on some toolchains (e.g. GCC on Linux);
 * others provide the AltiVec intrinsics without a header. */
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif

/***********************************************************************
 * For constant vectors, use parentheses on OS X and braces on Linux
 **********************************************************************/
#if defined(__APPLE__) && __GNUC__ < 4
#define CV(a...) (a)
#else
#define CV(a...) {a}
#endif

/***********************************************************************
 * Vector types
 **********************************************************************/
#define vec_u8_t  vector unsigned char
#define vec_s8_t  vector signed char
#define vec_u16_t vector unsigned short
#define vec_s16_t vector signed short
#define vec_u32_t vector unsigned int
#define vec_s32_t vector signed int
Laurent Aimar's avatar
Laurent Aimar committed
43

44
typedef union {
45 46 47
  uint32_t s[4];
  vec_u32_t v;
} vec_u32_u;
48 49

typedef union {
50 51 52
  uint16_t s[8];
  vec_u16_t v;
} vec_u16_u;
53

54
typedef union {
55 56 57
  int16_t s[8];
  vec_s16_t v;
} vec_s16_u;
58

59
typedef union {
60 61
  uint8_t s[16];
  vec_u8_t v;
62 63
} vec_u8_u;

/***********************************************************************
 * Null vector
 **********************************************************************/
#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )

/* The zero vector reinterpreted as each element type. */
#define zero_u8v  (vec_u8_t)  zerov
#define zero_s8v  (vec_s8_t)  zerov
#define zero_u16v (vec_u16_t) zerov
#define zero_s16v (vec_s16_t) zerov
#define zero_u32v (vec_u32_t) zerov
#define zero_s32v (vec_s32_t) zerov

/***********************************************************************
 * 8 <-> 16 bits conversions
 **********************************************************************/
/* Widen by interleaving with the zero vector (big-endian: zero byte
 * becomes the high half of each 16-bit element). */
#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )

#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)

/* Narrow the 8 elements of v into the high half of the result. */
#define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )

/***********************************************************************
 * 16 <-> 32 bits conversions
 **********************************************************************/
#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )

#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)

#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )


/***********************************************************************
 * PREP_LOAD:       declares two vectors required to perform unaligned loads
 * VEC_LOAD:        loads n bytes from u8 * p into vector v of type t where g
 *                  is a permute vector prepared from the original src offset
 *                  (via PREP_LOAD_SRC)
 * VEC_LOAD_G:      loads n bytes from u8 * p into vector v of type t - use
 *                  when the offset is not known in advance
 * VEC_LOAD_OFFSET: as above, but with the offset (permute) vector o known
 *                  in advance
 **********************************************************************/
#define PREP_LOAD     \
    vec_u8_t _hv, _lv

/* Cache the alignment permute vector for src so repeated loads from the
 * same base pointer skip the per-load vec_lvsl. */
#define PREP_LOAD_SRC( src )              \
    vec_u8_t _##src##_ = vec_lvsl(0, src)

#define VEC_LOAD_G( p, v, n, t )                 \
    _hv = vec_ld( 0, p );                        \
    v   = (t) vec_lvsl( 0, p );                  \
    _lv = vec_ld( n - 1, p );                    \
    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) v )

#define VEC_LOAD( p, v, n, t, g )                   \
    _hv = vec_ld( 0, p );                           \
    _lv = vec_ld( n - 1, p );                       \
    v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )

#define VEC_LOAD_OFFSET( p, v, n, t, o )         \
    _hv = vec_ld( 0, p);                         \
    _lv = vec_ld( n - 1, p );                    \
    v   = (t) vec_perm( _hv, _lv, (vec_u8_t) o )

/* Single-quadword variant: valid only when the n bytes at p do not cross
 * a 16-byte boundary (only the aligned quadword containing p is read). */
#define VEC_LOAD_PARTIAL( p, v, n, t, g)               \
    _hv = vec_ld( 0, p);                               \
    v   = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )

/***********************************************************************
 * PREP_STORE##n: declares required vectors to store n bytes to a
 *                potentially unaligned address
 * VEC_STORE##n:  stores n bytes from vector v to address p
 **********************************************************************/
#define PREP_STORE16 \
    vec_u8_t _tmp1v

#define PREP_STORE16_DST( dst )             \
    vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
    vec_u8_t _##dst##r_ = vec_lvsr(0, dst);

/* Read-modify-write: merge v into the two aligned quadwords covering p
 * (permute vectors come from PREP_STORE16_DST on o), then store both. */
#define VEC_STORE16( v, p, o )                           \
    _hv    = vec_ld( 0, p );                             \
    _lv    = vec_ld( 15, p );                            \
    _tmp1v = vec_perm( _lv, _hv, _##o##l_ );             \
    _lv    = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
    vec_st( _lv, 15, (uint8_t *) p );                    \
    _hv    = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
    vec_st( _hv, 0, (uint8_t *) p )

#define PREP_STORE8 \
    vec_u8_t _tmp3v

/* Rotate v so its data lines up with p, then store two 32-bit elements.
 * NOTE(review): vec_ste writes at the element-aligned address, so this
 * presumably requires p to be 4-byte aligned - confirm at call sites. */
#define VEC_STORE8( v, p )                \
    _tmp3v = vec_lvsl(0, p);              \
    v = vec_perm(v, v, _tmp3v);           \
    vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
    vec_ste((vec_u32_t)v,4,(uint32_t*)p)

/* sel marks (with -1 bytes) the 4 lanes of v to merge into memory. */
#define PREP_STORE4                                        \
    PREP_STORE16;                                          \
    vec_u8_t _tmp2v, _tmp3v;                               \
    const vec_u8_t sel =                                   \
        (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)

/* Read-modify-write 4 bytes: load the covering quadword(s), select the
 * rotated v bytes through the rotated mask, store back. */
#define VEC_STORE4( v, p )                      \
    _tmp3v = vec_lvsr( 0, p );                  \
    v      = vec_perm( v, v, _tmp3v );          \
    _lv    = vec_ld( 3, p );                    \
    _tmp1v = vec_perm( sel, zero_u8v, _tmp3v ); \
    _lv    = vec_sel( _lv, v, _tmp1v );         \
    vec_st( _lv, 3, p );                        \
    _hv    = vec_ld( 0, p );                    \
    _tmp2v = vec_perm( zero_u8v, sel, _tmp3v ); \
    _hv    = vec_sel( _hv, v, _tmp2v );         \
    vec_st( _hv, 0, p )

/***********************************************************************
 * VEC_TRANSPOSE_8
 ***********************************************************************
 * Transposes a 8x8 matrix of s16 vectors
 **********************************************************************/
#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
    b0 = vec_mergeh( a0, a4 ); \
    b1 = vec_mergel( a0, a4 ); \
    b2 = vec_mergeh( a1, a5 ); \
    b3 = vec_mergel( a1, a5 ); \
    b4 = vec_mergeh( a2, a6 ); \
    b5 = vec_mergel( a2, a6 ); \
    b6 = vec_mergeh( a3, a7 ); \
    b7 = vec_mergel( a3, a7 ); \
    a0 = vec_mergeh( b0, b4 ); \
    a1 = vec_mergel( b0, b4 ); \
    a2 = vec_mergeh( b1, b5 ); \
    a3 = vec_mergel( b1, b5 ); \
    a4 = vec_mergeh( b2, b6 ); \
    a5 = vec_mergel( b2, b6 ); \
    a6 = vec_mergeh( b3, b7 ); \
    a7 = vec_mergel( b3, b7 ); \
    b0 = vec_mergeh( a0, a4 ); \
    b1 = vec_mergel( a0, a4 ); \
    b2 = vec_mergeh( a1, a5 ); \
    b3 = vec_mergel( a1, a5 ); \
    b4 = vec_mergeh( a2, a6 ); \
    b5 = vec_mergel( a2, a6 ); \
    b6 = vec_mergeh( a3, a7 ); \
    b7 = vec_mergel( a3, a7 )

/***********************************************************************
 * VEC_TRANSPOSE_4
 ***********************************************************************
 * Transposes a 4x4 matrix of s16 vectors.
 * Actually source and destination are 8x4. The low elements of the
 * source are discarded and the low elements of the destination mustn't
 * be used.
 **********************************************************************/
#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    b0 = vec_mergeh( a0, a0 ); \
    b1 = vec_mergeh( a1, a0 ); \
    b2 = vec_mergeh( a2, a0 ); \
    b3 = vec_mergeh( a3, a0 ); \
    a0 = vec_mergeh( b0, b2 ); \
    a1 = vec_mergel( b0, b2 ); \
    a2 = vec_mergeh( b1, b3 ); \
    a3 = vec_mergel( b1, b3 ); \
    b0 = vec_mergeh( a0, a2 ); \
    b1 = vec_mergel( a0, a2 ); \
    b2 = vec_mergeh( a1, a3 ); \
    b3 = vec_mergel( a1, a3 )

/***********************************************************************
 * VEC_DIFF_H
 ***********************************************************************
 * p1, p2:    u8 *
 * i1, i2, n: int
 * d:         s16v
 *
 * Loads n bytes from p1 and p2, do the diff of the high elements into
 * d, increments p1 and p2 by i1 and i2; g is the permute vector name
 * prepared with PREP_LOAD_SRC for the p2 stream
 **********************************************************************/
#define PREP_DIFF           \
    LOAD_ZERO;              \
    PREP_LOAD;              \
    vec_s16_t pix1v, pix2v;

#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g)               \
    VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
    pix1v = vec_u8_to_s16( pix1v );                 \
    VEC_LOAD( p2, pix2v, n, vec_s16_t, g);          \
    pix2v = vec_u8_to_s16( pix2v );                 \
    d     = vec_sub( pix1v, pix2v );                \
    p1   += i1;                                     \
    p2   += i2

/***********************************************************************
 * VEC_DIFF_HL
 ***********************************************************************
 * p1, p2: u8 *
 * i1, i2: int
 * dh, dl: s16v
 *
 * Loads 16 bytes from p1 and p2, do the diff of the high elements into
 * dh, the diff of the low elements into dl, increments p1 and p2 by i1
 * and i2
 * NOTE(review): the plain vec_ld on p1 reads an aligned quadword, so p1
 * is presumably 16-byte aligned here - verify against callers.
 **********************************************************************/
#define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl)       \
    pix1v = (vec_s16_t)vec_ld(0, p1);        \
    temp0v = vec_u8_to_s16_h( pix1v );       \
    temp1v = vec_u8_to_s16_l( pix1v );       \
    VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
    temp2v = vec_u8_to_s16_h( pix2v );       \
    temp3v = vec_u8_to_s16_l( pix2v );       \
    dh     = vec_sub( temp0v, temp2v );      \
    dl     = vec_sub( temp1v, temp3v );      \
    p1    += i1;                             \
    p2    += i2
/***********************************************************************
* VEC_DIFF_H_8BYTE_ALIGNED
***********************************************************************
* p1, p2:    u8 *
* i1, i2, n: int
* d:         s16v
*
* Loads n bytes from p1 and p2, do the diff of the high elements into
* d, increments p1 and p2 by i1 and i2
* Slightly faster when we know we are loading/diffing 8bytes which
* are 8 byte aligned. Reduces need for two loads and two vec_lvsl()'s
**********************************************************************/
/* NOTE(review): relies on variables named pix1/pix2 being in scope at
 * the expansion site. */
#define PREP_DIFF_8BYTEALIGNED \
LOAD_ZERO;                     \
vec_s16_t pix1v, pix2v;        \
vec_u8_t pix1v8, pix2v8;       \
vec_u8_t permPix1, permPix2;   \
permPix1 = vec_lvsl(0, pix1);  \
permPix2 = vec_lvsl(0, pix2);

#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d)     \
pix1v8 = vec_perm(vec_ld(0,p1), zero_u8v, permPix1);  \
pix2v8 = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
pix1v = vec_u8_to_s16( pix1v8 );                      \
pix2v = vec_u8_to_s16( pix2v8 );                      \
d = vec_sub( pix1v, pix2v);                           \
p1 += i1;                                             \
p2 += i2;