/*****************************************************************************
 * copy.c: Fast YV12/NV12 copy
 *****************************************************************************
 * Copyright (C) 2010 Laurent Aimar
 * $Id$
 *
 * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>

#include "copy.h"

35 36 37
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
#ifdef CAN_COMPILE_SSE2
38 39
    cache->size = __MAX((width + 0x3f) & ~ 0x3f, 4096);
    cache->buffer = vlc_memalign(64, cache->size);
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
    if (!cache->buffer)
        return VLC_EGENERIC;
#else
    (void) cache; (void) width;
#endif
    return VLC_SUCCESS;
}

/* Release the bounce buffer allocated by CopyInitCache and reset the
 * descriptor so a double clean is harmless. */
void CopyCleanCache(copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    cache->size = 0;
    vlc_free(cache->buffer);
    cache->buffer = NULL;
#else
    (void) cache;
#endif
}

#ifdef CAN_COMPILE_SSE2

/* Copy 16 bytes from srcp to dstp, loading with the SSE>=2 instruction
 * "load" and storing with the SSE>=2 instruction "store".
 * Clobbers xmm1; alignment requirements depend on the instructions chosen
 * by the caller (e.g. movdqu tolerates unaligned, movdqa/movntdqa do not). */
#define COPY16(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1")

/* Copy 64 bytes (4 x 16) from srcp to dstp through xmm1-xmm4, using the
 * caller-supplied "load" and "store" SSE>=2 instruction mnemonics.
 * Clobbers xmm1-xmm4. */
#define COPY64(dstp, srcp, load, store) \
    asm volatile (                      \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")

/* When the compiler is NOT already targeting a given instruction set
 * (no __SSE4_1__/__SSSE3__/__SSE2__ predefine), redefine the vlc_CPU_*()
 * checks to test the "cpu" capability flags local to the calling function
 * at run time.  When the compiler does target the set, the stock macros
 * are kept (they can be constant-folded to true). */
#ifndef __SSE4_1__
# undef vlc_CPU_SSE4_1
# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
#endif

#ifndef __SSSE3__
# undef vlc_CPU_SSSE3
# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
#endif

#ifndef __SSE2__
# undef vlc_CPU_SSE2
# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
#endif

/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available (movntdqa
 * streaming loads); otherwise plain SSE2 loads are used.
 */
VLC_SSE
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
#if defined (__SSE4_1__) || !defined(CAN_COMPILE_SSSE3)
    VLC_UNUSED(cpu);
#endif
    /* Aligned stores (movdqa) below require a 16-byte aligned destination */
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    /* Fence before touching the (write-combining) source */
    asm volatile ("mfence");

    for (unsigned y = 0; y < height; y++) {
        /* Byte offset until src is 16-byte aligned (0 if already aligned) */
        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
        unsigned x = unaligned;

#ifdef CAN_COMPILE_SSE4_1
        if (vlc_CPU_SSE4_1()) {
            if (!unaligned) {
                /* movntdqa streaming load requires an aligned source */
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                /* Unaligned copy of the first 16 bytes; the 64-byte loop
                 * then starts at the aligned offset (bytes [unaligned,16)
                 * are written twice with identical data — harmless) */
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
        {
            /* SSE2 fallback: regular (non-streaming) loads */
            if (!unaligned) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                COPY16(dst, src, "movdqu", "movdqa");
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }

        /* Scalar tail for whatever the 64-byte loop left over */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    /* Fence so the copied data is globally visible before returning */
    asm volatile ("mfence");
}

/* Copy a 2D byte array from cacheable memory (in practice the bounce
 * buffer filled by CopyFromUswc; must be 16-byte aligned) to dst,
 * using non-temporal stores when the destination is aligned. */
VLC_SSE
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height)
{
    /* Aligned loads (movdqa) require a 16-byte aligned source */
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
        if (!unaligned) {
            /* movntdq: non-temporal store, avoids polluting the cache */
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
        } else {
            for (; x+63 < width; x += 64)
                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
        }

        /* Scalar tail */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}

/* De-interleave a packed byte-pair source (e.g. the NV12 UV plane) into two
 * separate planes: even bytes go to dstu, odd bytes to dstv.
 * src must be 16-byte aligned (it is the bounce buffer in practice). */
VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned width, unsigned height, unsigned cpu)
{
#if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
    VLC_UNUSED(cpu);
#endif
    /* pshufb control: even bytes to the low half, odd bytes to the high */
    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                1, 3, 5, 7, 9, 11, 13, 15 };
    /* Mask selecting the even (low) byte of every 16-bit lane */
    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

/* Load 64 source bytes (32 byte pairs) into xmm0-xmm3 */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Store the low quadword of each register to dst1 and the high one to
 * dst2 — i.e. 32 bytes to each destination */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

#ifdef CAN_COMPILE_SSSE3
        if (vlc_CPU_SSSE3())
        {
            /* pshufb gathers even bytes into the low quadword directly,
             * so dst1 = dstu (even) and dst2 = dstv (odd) */
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
        {
            /* SSE2 fallback: mask out even bytes, shift down odd bytes,
             * then packuswb.  packuswb leaves the ODD bytes in the low
             * quadword, which is why dst1/dst2 are bound swapped here
             * (dst1 = dstv, dst2 = dstu) relative to the SSSE3 path. */
            for (x = 0; x < (width & ~31); x += 32) {
                asm volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#undef STORE2X32
#undef LOAD64

        /* Scalar tail for the last (width % 32) pairs */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

/* Copy one plane from a (possibly USWC) source to dst, going through the
 * cacheable bounce buffer in batches of lines: stream the source into the
 * buffer with CopyFromUswc(), then write it out with Copy2d(). */
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
                          const uint8_t *src, size_t src_pitch,
                          uint8_t *cache, size_t cache_size,
                          unsigned height, unsigned cpu)
{
    /* Bounce-buffer line pitch, rounded up to 16 bytes */
    const unsigned cache_pitch = (src_pitch + 15) & ~15;
    const unsigned lines_per_pass = cache_size / cache_pitch;
    assert(lines_per_pass > 0);

    if (dst_pitch == src_pitch) {
        /* Same stride on both sides: the plane is contiguous.
         * NOTE(review): this shortcut bypasses the USWC bounce buffer —
         * presumably the source is cacheable memory when this path is
         * taken; confirm against the callers. */
        memcpy(dst, src, src_pitch * height);
        return;
    }

    for (unsigned y = 0; y < height; y += lines_per_pass) {
        const unsigned lines = __MIN(lines_per_pass, height - y);

        /* Stream a batch of lines into the cacheable bounce buffer */
        CopyFromUswc(cache, cache_pitch,
                     src, src_pitch,
                     src_pitch, lines, cpu);

        /* Then copy from the bounce buffer to the destination */
        Copy2d(dst, dst_pitch,
               cache, cache_pitch,
               src_pitch, lines);

        src += src_pitch * lines;
        dst += dst_pitch * lines;
    }
}
/* De-interleave a packed UV plane from a (possibly USWC) source into two
 * destination planes, batching lines through the cacheable bounce buffer
 * before handing them to SSE_SplitUV(). */
static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                            uint8_t *dstv, size_t dstv_pitch,
                            const uint8_t *src, size_t src_pitch,
                            uint8_t *cache, size_t cache_size,
                            unsigned height, unsigned cpu)
{
    /* Each source line holds 2*src_pitch bytes of interleaved samples;
     * round the bounce-buffer pitch up to 16 bytes */
    const unsigned cache_pitch = (2*src_pitch + 15) & ~15;
    const unsigned lines_per_pass = cache_size / cache_pitch;
    assert(lines_per_pass > 0);

    for (unsigned y = 0; y < height; y += lines_per_pass) {
        const unsigned lines = __MIN(lines_per_pass, height - y);

        /* Stream a batch of interleaved lines into the bounce buffer */
        CopyFromUswc(cache, cache_pitch, src, src_pitch,
                     2*src_pitch, lines, cpu);

        /* Split them from the bounce buffer into the two planes */
        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
                    cache, cache_pitch, src_pitch, lines, cpu);

        src  += src_pitch  * lines;
        dstu += dstu_pitch * lines;
        dstv += dstv_pitch * lines;
    }
}

/* SSE2 path: copy an NV12 source (Y plane + interleaved UV plane) into a
 * planar destination picture, de-interleaving chroma into p[2]/p[1]. */
static void SSE_CopyFromNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    /* Luma plane, copied verbatim */
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  height, cpu);
    /* Interleaved UV, split over ceil(height/2) chroma lines.
     * Even bytes land in dst->p[2], odd bytes in dst->p[1]. */
    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                    dst->p[1].p_pixels, dst->p[1].i_pitch,
                    src[1], src_pitch[1],
                    cache->buffer, cache->size,
                    (height+1)/2, cpu);
    /* emms: reset the x87/MMX register state for subsequent FPU users */
    asm volatile ("emms");
}
339

/* SSE2 path: copy a 3-plane YV12 source into the destination picture,
 * plane by plane (chroma planes span ceil(height/2) lines). */
static void SSE_CopyFromYv12(picture_t *dst,
                             uint8_t *src[3], size_t src_pitch[3],
                             unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    for (unsigned n = 0; n < 3; n++) {
        /* d = vertical subsampling: 1 for luma (n == 0), 2 for chroma */
        const unsigned d = n > 0 ? 2 : 1;
        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
                      src[n], src_pitch[n],
                      cache->buffer, cache->size,
                      (height+d-1)/d, cpu);
    }
    /* emms: reset the x87/MMX register state for subsequent FPU users */
    asm volatile ("emms");
}
354 355 356 357


/* SSE2 path: straight NV12 -> NV12 copy (both planes kept interleaved). */
static void SSE_CopyFromNv12ToNv12(picture_t *dst,
                             uint8_t *src[2], size_t src_pitch[2],
                             unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    /* Luma plane */
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  height, cpu);
    /* Interleaved UV plane.
     * NOTE(review): uses height/2 while SSE_CopyFromNv12 uses (height+1)/2;
     * for odd heights the last chroma line would be skipped here — confirm
     * whether odd heights can reach this path. */
    SSE_CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
                  src[1], src_pitch[1],
                  cache->buffer, cache->size,
                  height/2, cpu);
    /* emms: reset the x87/MMX register state for subsequent FPU users */
    asm volatile ("emms");
}
371 372

/* SSE2 path: I420 -> NV12. Luma is copied with the SSE plane copy; the U
 * and V planes are interleaved into the NV12 UV plane with a scalar loop. */
static void SSE_CopyFromI420ToNv12(picture_t *dst,
                             uint8_t *src[3], size_t src_pitch[3],
                             unsigned height,
                             copy_cache_t *cache, unsigned cpu)
{
    /* Luma plane */
    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
                  src[0], src_pitch[0],
                  cache->buffer, cache->size,
                  height, cpu);

    /* TODO optimise the plane merging */
    const unsigned copy_lines = height / 2;
    const unsigned copy_pitch = src_pitch[1];

    /* Bytes to skip at the end of each line to reach the next one */
    const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;

    uint8_t *dstUV = dst->p[1].p_pixels;
    uint8_t *srcU  = src[U_PLANE];
    uint8_t *srcV  = src[V_PLANE];
    for ( unsigned int line = 0; line < copy_lines; line++ )
    {
        /* Interleave one U byte and one V byte per output pair */
        for ( unsigned int col = 0; col < copy_pitch; col++ )
        {
            *dstUV++ = *srcU++;
            *dstUV++ = *srcV++;
        }
        dstUV += i_extra_pitch_uv;
        srcU  += i_extra_pitch_u;
        srcV  += i_extra_pitch_v;
    }
    /* emms: reset the x87/MMX register state for subsequent FPU users */
    asm volatile ("emms");
}

#undef COPY64
#endif /* CAN_COMPILE_SSE2 */
408

409 410
/* Plain C plane copy: src_pitch bytes per line, for "height" lines.
 * When both strides match, the plane is contiguous and copied in one go. */
static void CopyPlane(uint8_t *dst, size_t dst_pitch,
                      const uint8_t *src, size_t src_pitch,
                      unsigned height)
{
    if (dst_pitch == src_pitch) {
        /* Contiguous: a single memcpy covers the whole plane */
        memcpy(dst, src, src_pitch * height);
        return;
    }

    for (unsigned y = 0; y < height; y++)
        memcpy(&dst[y * dst_pitch], &src[y * src_pitch], src_pitch);
}
422 423 424 425

/* Plain C de-interleave: even source bytes go to dstu, odd bytes to dstv,
 * src_pitch/2 samples per plane per line, for "height" lines. */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        unsigned height)
{
    const size_t pairs = src_pitch / 2;

    for (unsigned y = 0; y < height; y++) {
        for (size_t i = 0; i < pairs; i++) {
            dstu[i] = src[2*i + 0];
            dstv[i] = src[2*i + 1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }
}

/* Copy an NV12 source into a planar destination picture, de-interleaving
 * chroma into dst->p[2]/dst->p[1].  Dispatches to the SSE2 implementation
 * when available, otherwise uses the plain C helpers. */
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned height, copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromNv12(dst, src, src_pitch, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    /* Plain C fallback */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height);
    /* NOTE(review): the fallback uses height/2 while the SSE path uses
     * (height+1)/2 — behavior differs for odd heights; confirm intent. */
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1], height/2);
}
457

458
/* Straight NV12 -> NV12 copy (both planes kept as-is).  Dispatches to the
 * SSE2 implementation when available. */
void CopyFromNv12ToNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned height, copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromNv12ToNv12(dst, src, src_pitch, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    /* Plain C fallback: luma plane, then the interleaved UV plane */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height);
    CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
              src[1], src_pitch[1], height/2);
}

476
/* NV12 -> I420: copy luma verbatim, then de-interleave the packed UV
 * plane into separate U (p[1]) and V (p[2]) planes.  No SSE fast path. */
void CopyFromNv12ToI420(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                        unsigned height)
{
    plane_t *luma = &dst->p[0];
    plane_t *u    = &dst->p[1];
    plane_t *v    = &dst->p[2];

    CopyPlane(luma->p_pixels, luma->i_pitch,
              src[0], src_pitch[0], height);
    SplitPlanes(u->p_pixels, u->i_pitch,
                v->p_pixels, v->i_pitch,
                src[1], src_pitch[1], height/2);
}

486
/* I420 -> NV12: copy luma, then interleave the U and V planes into the
 * NV12 UV plane.  Dispatches to the SSE2 implementation when available;
 * the scalar merge loop below mirrors SSE_CopyFromI420ToNv12. */
void CopyFromI420ToNv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                        unsigned height, copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromI420ToNv12(dst, src, src_pitch, height,
                                cache, cpu);
#else
    (void) cache;
#endif

    /* Luma plane */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0], height);

    const unsigned copy_lines = height / 2;
    const unsigned copy_pitch = src_pitch[1];

    /* Bytes to skip at the end of each line to reach the next one */
    const int i_extra_pitch_uv = dst->p[1].i_pitch - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] - copy_pitch;

    uint8_t *dstUV = dst->p[1].p_pixels;
    uint8_t *srcU  = src[U_PLANE];
    uint8_t *srcV  = src[V_PLANE];
    for ( unsigned int line = 0; line < copy_lines; line++ )
    {
        /* Interleave one U byte and one V byte per output pair */
        for ( unsigned int col = 0; col < copy_pitch; col++ )
        {
            *dstUV++ = *srcU++;
            *dstUV++ = *srcV++;
        }
        dstUV += i_extra_pitch_uv;
        srcU  += i_extra_pitch_u;
        srcV  += i_extra_pitch_v;
    }
}

524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563
/* I420 10-bit -> P010: widen each 10-bit sample into the top 10 bits of a
 * 16-bit word (<< 6) and interleave U/V into the P010 UV plane.
 * Fix: the source/destination byte pointers are now explicitly cast to
 * uint16_t* — the original assigned uint8_t* to uint16_t* without a cast,
 * which is a C constraint violation (incompatible pointer types).
 * NOTE(review): assumes the planes are 2-byte aligned and little-endian
 * 16-bit samples — matches how the buffers are produced upstream, confirm. */
void CopyFromI420_10ToP010(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                        unsigned height, copy_cache_t *cache)
{
    (void) cache;

    /* Pitches are in bytes; the loops walk 16-bit samples, hence the /2 */
    const int i_extra_pitch_dst_y = (dst->p[0].i_pitch  - src_pitch[0]) / 2;
    const int i_extra_pitch_src_y = (src_pitch[Y_PLANE] - src_pitch[0]) / 2;
    uint16_t *dstY = (uint16_t *)dst->p[0].p_pixels;
    const uint16_t *srcY = (const uint16_t *)src[Y_PLANE];
    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < (src_pitch[0] / 2); x++) {
            /* 10-bit value shifted into the top bits, as P010 requires */
            *dstY++ = *srcY++ << 6;
        }
        dstY += i_extra_pitch_dst_y;
        srcY += i_extra_pitch_src_y;
    }

    /* Interleave U and V while widening each sample */
    const unsigned copy_lines = height / 2;
    const unsigned copy_pitch = src_pitch[1] / 2;

    /* 16-bit samples to skip at the end of each line */
    const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
    const int i_extra_pitch_u  = src_pitch[U_PLANE] / 2 - copy_pitch;
    const int i_extra_pitch_v  = src_pitch[V_PLANE] / 2 - copy_pitch;

    uint16_t *dstUV = (uint16_t *)dst->p[1].p_pixels;
    const uint16_t *srcU  = (const uint16_t *)src[U_PLANE];
    const uint16_t *srcV  = (const uint16_t *)src[V_PLANE];
    for ( unsigned int line = 0; line < copy_lines; line++ )
    {
        for ( unsigned int col = 0; col < copy_pitch; col++ )
        {
            *dstUV++ = *srcU++ << 6;
            *dstUV++ = *srcV++ << 6;
        }
        dstUV += i_extra_pitch_uv;
        srcU  += i_extra_pitch_u;
        srcV  += i_extra_pitch_v;
    }
}

564

565
/* Copy a 3-plane YV12 source into the destination picture, plane by plane.
 * Dispatches to the SSE2 implementation when available.
 * NOTE(review): the fallback uses height/2 for chroma while the SSE path
 * uses (height+d-1)/d — behavior differs for odd heights; confirm intent. */
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned height, copy_cache_t *cache)
{
#ifdef CAN_COMPILE_SSE2
    unsigned cpu = vlc_CPU();
    if (vlc_CPU_SSE2())
        return SSE_CopyFromYv12(dst, src, src_pitch, height,
                                cache, cpu);
#else
    (void) cache;
#endif

     /* Plain C fallback: luma full height, both chroma planes half height */
     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
               src[0], src_pitch[0], height);
     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
               src[1], src_pitch[1], height / 2);
     CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
               src[2], src_pitch[2], height / 2);
}