/*****************************************************************************
 * video_yuv_asm.h: MMX YUV transformation assembly
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 *
 * Authors:
 * Olie Lho <ollie@sis.com.tw>
 *
 * Adapted to VideoLAN by:
 * Gaël Hendryckx <jimmy@via.ecp.fr>
 * Samuel Hocevar <sam@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *****************************************************************************/

/*
 * Constant operands for the MMX macros below.
 *
 * The assembly strings name these objects directly by symbol (for example
 * "psubsw mmx_80w, %%mm0"), which is a use the compiler cannot see.  Each
 * object is therefore marked `used` so it is always emitted instead of being
 * dropped as an unreferenced static.  `aligned(8)` guarantees that every
 * 64-bit memory operand is naturally aligned rather than hoping it is, and
 * the ULL suffix gives every literal an explicit 64-bit type.
 *
 * The objects are deliberately left non-const.
 * NOTE(review): the masks and shift counts look meant to be adjustable per
 * pixel format -- confirm against the including source before adding const.
 */

/* Luma/chroma preparation (see MMX_YUV_MUL and MMX_YUV_GRAY). */
static unsigned long long mmx_80w     __attribute__((used, aligned(8)))
    = 0x0080008000800080ULL;        /* subtracted from Cb/Cr words (128)  */
static unsigned long long mmx_10w     __attribute__((used, aligned(8)))
    = 0x1010101010101010ULL;        /* subtracted from Y bytes (16)       */
static unsigned long long mmx_00ffw   __attribute__((used, aligned(8)))
    = 0x00ff00ff00ff00ffULL;        /* keeps the even Y byte of each word */
static unsigned long long mmx_Y_coeff __attribute__((used, aligned(8)))
    = 0x253f253f253f253fULL;        /* pmulhw multiplier for Y            */

/* Chroma multipliers, one 16-bit coefficient replicated in each word. */
static unsigned long long mmx_U_green __attribute__((used, aligned(8)))
    = 0xf37df37df37df37dULL;        /* Cb contribution to green           */
static unsigned long long mmx_U_blue  __attribute__((used, aligned(8)))
    = 0x4093409340934093ULL;        /* Cb contribution to blue            */
static unsigned long long mmx_V_red   __attribute__((used, aligned(8)))
    = 0x3312331233123312ULL;        /* Cr contribution to red             */
static unsigned long long mmx_V_green __attribute__((used, aligned(8)))
    = 0xe5fce5fce5fce5fcULL;        /* Cr contribution to green           */

/* Packing parameters used by MMX_UNPACK_16 and MMX_UNPACK_16_GRAY. */
static unsigned long long mmx_redmask __attribute__((used, aligned(8)))
    = 0xf8f8f8f8f8f8f8f8ULL;        /* keeps the top 5 bits of each byte  */
static unsigned long long mmx_grnmask __attribute__((used, aligned(8)))
    = 0xfcfcfcfcfcfcfcfcULL;        /* keeps the top 6 bits of each byte  */
static unsigned long long mmx_grnshift  __attribute__((used, aligned(8)))
    = 0x03ULL;                      /* not referenced by any macro in this
                                     * header                             */
static unsigned long long mmx_blueshift __attribute__((used, aligned(8)))
    = 0x03ULL;                      /* psrlw/psllw shift count            */

/*
 * Prologue for the 16-bit conversion.
 *
 * Loads 4 bytes from (%1) into mm0 and 4 bytes from (%2) into mm1 (Cb and Cr
 * according to the inline notes), clears mm4, and loads 8 luma bytes from
 * (%0) into mm6.  The store to (%3) is commented out in this variant; it is
 * live in MMX_INIT_32.
 *
 * The %0-%3 operands are bound by the asm statement that pastes this string,
 * which is not part of this header.
 */
#define MMX_INIT_16 "                                                       \n\
                                                                            \n\
movd      (%1), %%mm0       # Load 4 Cb       00 00 00 00 u3 u2 u1 u0       \n\
movd      (%2), %%mm1       # Load 4 Cr       00 00 00 00 v3 v2 v1 v0       \n\
pxor      %%mm4, %%mm4      # zero mm4                                      \n\
movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
#movl      $0, (%3)         # cache preload for image                       \n\
"

/*
 * Prologue for the 16-bit grayscale conversion: only the 8 luma bytes at
 * (%0) are loaded, into mm6.  No chroma is read and mm4 is not cleared.
 * The store to (%3) is commented out.
 */
#define MMX_INIT_16_GRAY "                                                  \n\
                                                                            \n\
movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
#movl      $0, (%3)         # cache preload for image                       \n\
"

/*
 * Prologue for the 32-bit conversion.
 *
 * Same loads as MMX_INIT_16 (4 bytes from (%1) into mm0, 4 bytes from (%2)
 * into mm1, mm4 cleared, 8 luma bytes from (%0) into mm6), plus a 32-bit
 * store of zero to (%3), which the inline note describes as a cache preload
 * of the destination image.
 */
#define MMX_INIT_32 "                                                       \n\
                                                                            \n\
movd      (%1), %%mm0       # Load 4 Cb       00 00 00 00 u3 u2 u1 u0       \n\
movl      $0, (%3)          # cache preload for image                       \n\
movd      (%2), %%mm1       # Load 4 Cr       00 00 00 00 v3 v2 v1 v0       \n\
pxor      %%mm4, %%mm4      # zero mm4                                      \n\
movq      (%0), %%mm6       # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0       \n\
"

/*
 * Do the multiply part of the conversion.
 *
 * On entry: mm0 = 4 Cb bytes, mm1 = 4 Cr bytes (low halves), mm4 = 0
 *           (used to widen the chroma bytes to words), mm6 = 8 Y bytes.
 * On exit:  mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen (4 words each,
 *           shared by the even and the odd pixel of every pair),
 *           mm6 -> Y even, mm7 -> Y odd (4 words each).
 *           mm3 is scratch (it holds the Cr green term before the add).
 *
 * Every value is shifted left by 3 before pmulhw, which keeps the high
 * 16 bits of the product, so each mmx_* coefficient is effectively applied
 * divided by 2^13.  The luma offset is removed with an unsigned saturating
 * byte subtract, so Y values below 16 become 0.
 */

#define MMX_YUV_MUL "                                                       \n\
                                                                            \n\
# convert the chroma part                                                   \n\
punpcklbw %%mm4, %%mm0          # scatter 4 Cb    00 u3 00 u2 00 u1 00 u0   \n\
punpcklbw %%mm4, %%mm1          # scatter 4 Cr    00 v3 00 v2 00 v1 00 v0   \n\
psubsw    mmx_80w, %%mm0        # Cb -= 128                                 \n\
psubsw    mmx_80w, %%mm1        # Cr -= 128                                 \n\
psllw     $3, %%mm0             # Promote precision                         \n\
psllw     $3, %%mm1             # Promote precision                         \n\
movq      %%mm0, %%mm2          # Copy 4 Cb       00 u3 00 u2 00 u1 00 u0   \n\
movq      %%mm1, %%mm3          # Copy 4 Cr       00 v3 00 v2 00 v1 00 v0   \n\
pmulhw    mmx_U_green, %%mm2    # Mul Cb with green coeff -> Cb green       \n\
pmulhw    mmx_V_green, %%mm3    # Mul Cr with green coeff -> Cr green       \n\
pmulhw    mmx_U_blue, %%mm0     # Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0   \n\
pmulhw    mmx_V_red, %%mm1      # Mul Cr -> Cred  00 r3 00 r2 00 r1 00 r0   \n\
paddsw    %%mm3, %%mm2          # Cb green + Cr green -> Cgreen             \n\
                                                                            \n\
# convert the luma part                                                     \n\
psubusb   mmx_10w, %%mm6        # Y -= 16                                   \n\
movq      %%mm6, %%mm7          # Copy 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
pand      mmx_00ffw, %%mm6      # get Y even      00 Y6 00 Y4 00 Y2 00 Y0   \n\
psrlw     $8, %%mm7             # get Y odd       00 Y7 00 Y5 00 Y3 00 Y1   \n\
psllw     $3, %%mm6             # Promote precision                         \n\
psllw     $3, %%mm7             # Promote precision                         \n\
pmulhw    mmx_Y_coeff, %%mm6    # Mul 4 Y even    00 y6 00 y4 00 y2 00 y0   \n\
pmulhw    mmx_Y_coeff, %%mm7    # Mul 4 Y odd     00 y7 00 y5 00 y3 00 y1   \n\
"

/*
 * Do the addition part of the conversion for even and odd pixels.
 *
 * On entry (as left by MMX_YUV_MUL):
 *   mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen, mm6 -> Y even, mm7 -> Y odd.
 * Working registers:
 *   mm0/mm1/mm2 receive the even-pixel sums, while mm3/mm4/mm5 receive
 *   copies of Cblue/Cred/Cgreen that are summed with the odd luma.
 *   mm4 is therefore overwritten and no longer zero afterwards.
 * On exit:
 *   mm0 -> B7..B0, mm1 -> R7..R0, mm2 -> G7..G0, one byte per pixel.
 *
 * The sums use signed saturating word adds; packuswb then saturates each
 * word to an unsigned byte, and punpcklbw interleaves the even and odd
 * results back into pixel order.  The first inline note mentions scaling,
 * but the instructions under it only copy the chroma terms and add luma.
 */

#define MMX_YUV_ADD "                                                       \n\
                                                                            \n\
# Do horizontal and vertical scaling                                        \n\
movq      %%mm0, %%mm3          # Copy Cblue                                \n\
movq      %%mm1, %%mm4          # Copy Cred                                 \n\
movq      %%mm2, %%mm5          # Copy Cgreen                               \n\
paddsw    %%mm6, %%mm0          # Y even + Cblue  00 B6 00 B4 00 B2 00 B0   \n\
paddsw    %%mm7, %%mm3          # Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1   \n\
paddsw    %%mm6, %%mm1          # Y even + Cred   00 R6 00 R4 00 R2 00 R0   \n\
paddsw    %%mm7, %%mm4          # Y odd  + Cred   00 R7 00 R5 00 R3 00 R1   \n\
paddsw    %%mm6, %%mm2          # Y even + Cgreen 00 G6 00 G4 00 G2 00 G0   \n\
paddsw    %%mm7, %%mm5          # Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1   \n\
                                                                            \n\
# Limit RGB even to 0..255                                                  \n\
packuswb  %%mm0, %%mm0          # B6 B4 B2 B0 | B6 B4 B2 B0                 \n\
packuswb  %%mm1, %%mm1          # R6 R4 R2 R0 | R6 R4 R2 R0                 \n\
packuswb  %%mm2, %%mm2          # G6 G4 G2 G0 | G6 G4 G2 G0                 \n\
                                                                            \n\
# Limit RGB odd to 0..255                                                   \n\
packuswb  %%mm3, %%mm3          # B7 B5 B3 B1 | B7 B5 B3 B1                 \n\
packuswb  %%mm4, %%mm4          # R7 R5 R3 R1 | R7 R5 R3 R1                 \n\
packuswb  %%mm5, %%mm5          # G7 G5 G3 G1 | G7 G5 G3 G1                 \n\
                                                                            \n\
# Interleave RGB even and odd                                               \n\
punpcklbw %%mm3, %%mm0          #                 B7 B6 B5 B4 B3 B2 B1 B0   \n\
punpcklbw %%mm4, %%mm1          #                 R7 R6 R5 R4 R3 R2 R1 R0   \n\
punpcklbw %%mm5, %%mm2          #                 G7 G6 G5 G4 G3 G2 G1 G0   \n\
"

/*
 * Grayscale case, only use Y.
 *
 * On entry: mm6 = 8 Y bytes.
 * On exit:  mm6 = 8 converted luma bytes in pixel order; mm7 is scratch.
 *
 * Same luma arithmetic as MMX_YUV_MUL (subtract mmx_10w with unsigned
 * saturation, split into even/odd words, shift left by 3, pmulhw with
 * mmx_Y_coeff), followed by packuswb saturation to bytes and a punpcklbw
 * that interleaves the even and odd results.
 */

#define MMX_YUV_GRAY "                                                      \n\
                                                                            \n\
# convert the luma part                                                     \n\
psubusb   mmx_10w, %%mm6                                                    \n\
movq      %%mm6, %%mm7                                                      \n\
pand      mmx_00ffw, %%mm6                                                  \n\
psrlw     $8, %%mm7                                                         \n\
psllw     $3, %%mm6                                                         \n\
psllw     $3, %%mm7                                                         \n\
pmulhw    mmx_Y_coeff, %%mm6                                                \n\
pmulhw    mmx_Y_coeff, %%mm7                                                \n\
packuswb  %%mm6, %%mm6                                                      \n\
packuswb  %%mm7, %%mm7                                                      \n\
punpcklbw %%mm7, %%mm6                                                      \n\
"

/*
 * Pack 8 gray bytes into eight 16-bit pixels.
 *
 * On entry: mm6 = 8 gray bytes (as left by MMX_YUV_GRAY).
 * The same gray value feeds all three channels: it is masked with
 * mmx_redmask for the red and blue fields and with mmx_grnmask for the
 * green field.  Pixels 0-3 are stored at (%3) and pixels 4-7 at 8(%3).
 * Before the second store, mm6 is reloaded with the 8 bytes at 8(%0).
 * mm0, mm2, mm3, mm5 and mm7 are overwritten.
 *
 * NOTE(review): mmx_blueshift is used as the shift count for the green
 * field as well; mmx_grnshift is not referenced.  Both are 3 in this
 * header -- confirm whether that is intended.
 */
#define MMX_UNPACK_16_GRAY "                                                \n\
movq      %%mm6, %%mm5                                                      \n\
pand      mmx_redmask, %%mm6                                                \n\
pand      mmx_grnmask, %%mm5                                                \n\
movq      %%mm6, %%mm7                                                      \n\
psrlw     mmx_blueshift, %%mm7                                              \n\
pxor      %%mm3, %%mm3                                                      \n\
movq      %%mm7, %%mm2                                                      \n\
movq      %%mm5, %%mm0                                                      \n\
punpcklbw %%mm3, %%mm5                                                      \n\
punpcklbw %%mm6, %%mm7                                                      \n\
psllw     mmx_blueshift, %%mm5                                              \n\
por       %%mm5, %%mm7                                                      \n\
movq      %%mm7, (%3)                                                       \n\
punpckhbw %%mm3, %%mm0                                                      \n\
punpckhbw %%mm6, %%mm2                                                      \n\
psllw     mmx_blueshift, %%mm0                                              \n\
movq      8(%0), %%mm6                                                      \n\
por       %%mm0, %%mm2                                                      \n\
movq      %%mm2, 8(%3)                                                      \n\
"


/*
 * Convert the RGB planes to packed 16-bit pixels.
 *
 * On entry: mm0 -> B, mm1 -> R, mm2 -> G (8 bytes each, as left by
 *           MMX_YUV_ADD).
 * Working registers: mm4 is zeroed, mm5 holds a copy of the shifted blue
 *           bytes and mm7 a copy of the masked green bytes for pixels 4-7.
 * Output:   pixels 0-3 are stored at (%3), pixels 4-7 at 8(%3).  With the
 *           masks and shift counts defined in this header each pixel word
 *           is R[7:3] << 11 | G[7:2] << 5 | B[7:3].
 *
 * The loads for the next group are interleaved with the packing: mm6 is
 * reloaded from 8(%0), mm0 from 4(%1) and mm1 from 4(%2).
 *
 * NOTE(review): mmx_blueshift is used as the shift count for the green
 * field as well; mmx_grnshift is not referenced.  Both are 3 in this
 * header -- confirm whether that is intended.
 */

#define MMX_UNPACK_16 "                                                     \n\
                                                                            \n\
# mask unneeded bits off                                                    \n\
pand      mmx_redmask, %%mm0    # b7b6b5b4 b3______ b7b6b5b4 b3______       \n\
pand      mmx_grnmask, %%mm2    # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____       \n\
pand      mmx_redmask, %%mm1    # r7r6r5r4 r3______ r7r6r5r4 r3______       \n\
psrlw     mmx_blueshift,%%mm0   # ______b7 b6b5b4b3 ______b7 b6b5b4b3       \n\
pxor      %%mm4, %%mm4          # zero mm4                                  \n\
movq      %%mm0, %%mm5          # Copy B7-B0                                \n\
movq      %%mm2, %%mm7          # Copy G7-G0                                \n\
                                                                            \n\
# convert rgb24 plane to rgb16 pack for pixel 0-3                           \n\
punpcklbw %%mm4, %%mm2          # ________ ________ g7g6g5g4 g3g2____       \n\
punpcklbw %%mm1, %%mm0          # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
psllw     mmx_blueshift,%%mm2   # ________ __g7g6g5 g4g3g2__ ________       \n\
por       %%mm2, %%mm0          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
movq      8(%0), %%mm6          # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0   \n\
movq      %%mm0, (%3)           # store pixel 0-3                           \n\
                                                                            \n\
# convert rgb24 plane to rgb16 pack for pixel 4-7                           \n\
punpckhbw %%mm4, %%mm7          # ________ ________ g7g6g5g4 g3g2____       \n\
punpckhbw %%mm1, %%mm5          # r7r6r5r4 r3______ ______b7 b6b5b4b3       \n\
psllw     mmx_blueshift,%%mm7   # ________ __g7g6g5 g4g3g2__ ________       \n\
movd      4(%1), %%mm0          # Load 4 Cb       __ __ __ __ u3 u2 u1 u0   \n\
por       %%mm7, %%mm5          # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3       \n\
movd      4(%2), %%mm1          # Load 4 Cr       __ __ __ __ v3 v2 v1 v0   \n\
movq      %%mm5, 8(%3)          # store pixel 4-7                           \n\
"

/*
 * convert RGB plane to RGB packed format,
 * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
 * mm4 -> GB, mm5 -> AR pixel 4-7,
 * mm6 -> GB, mm7 -> AR pixel 0-3
 */

#define MMX_UNPACK_32 "                                                     \n\
                                                                            \n\
pxor      %%mm3, %%mm3  # zero mm3                                          \n\
movq      %%mm0, %%mm6  #                 B7 B6 B5 B4 B3 B2 B1 B0           \n\
movq      %%mm1, %%mm7  #                 R7 R6 R5 R4 R3 R2 R1 R0           \n\
movq      %%mm0, %%mm4  #                 B7 B6 B5 B4 B3 B2 B1 B0           \n\
movq      %%mm1, %%mm5  #                 R7 R6 R5 R4 R3 R2 R1 R0           \n\
punpcklbw %%mm2, %%mm6  #                 G3 B3 G2 B2 G1 B1 G0 B0           \n\
punpcklbw %%mm3, %%mm7  #                 00 R3 00 R2 00 R1 00 R0           \n\
punpcklwd %%mm7, %%mm6  #                 00 R1 B1 G1 00 R0 B0 G0           \n\
movq      %%mm6, (%3)   # Store ARGB1 ARGB0                                 \n\
movq      %%mm0, %%mm6  #                 B7 B6 B5 B4 B3 B2 B1 B0           \n\
punpcklbw %%mm2, %%mm6  #                 G3 B3 G2 B2 G1 B1 G0 B0           \n\
punpckhwd %%mm7, %%mm6  #                 00 R3 G3 B3 00 R2 B3 G2           \n\
movq      %%mm6, 8(%3)  # Store ARGB3 ARGB2                                 \n\
punpckhbw %%mm2, %%mm4  #                 G7 B7 G6 B6 G5 B5 G4 B4           \n\
punpckhbw %%mm3, %%mm5  #                 00 R7 00 R6 00 R5 00 R4           \n\
punpcklwd %%mm5, %%mm4  #                 00 R5 B5 G5 00 R4 B4 G4           \n\
movq      %%mm4, 16(%3) # Store ARGB5 ARGB4                                 \n\
movq      %%mm0, %%mm4  #                 B7 B6 B5 B4 B3 B2 B1 B0           \n\
punpckhbw %%mm2, %%mm4  #                 G7 B7 G6 B6 G5 B5 G4 B4           \n\
punpckhwd %%mm5, %%mm4  #                 00 R7 G7 B7 00 R6 B6 G6           \n\
movq      %%mm4, 24(%3) # Store ARGB7 ARGB6                                 \n\
251 252 253 254 255
                                                                            \n\
#movd      4(%1), %%mm0  # Load 4 Cb       00 00 00 00 u3 u2 u1 u0           \n\
#movd      4(%2), %%mm1  # Load 4 Cr       00 00 00 00 v3 v2 v1 v0           \n\
#pxor      %%mm4, %%mm4  # zero mm4                                          \n\
#movq      8(%0), %%mm6  # Load 8 Y        Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0           \n\
256
"