Commit 0777bcbf authored by Laurent Aimar
Browse files

Moved out optimized VA nv12/yv12 copy functions from dxva2.

parent 287ccf14
......@@ -10,6 +10,8 @@ libavcodec_plugin_la_SOURCES = \
chroma.c \
vaapi.c \
dxva2.c \
copy.c \
copy.h \
va.h \
$(NULL)
if ENABLE_SOUT
......
/*****************************************************************************
* copy.c: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2010 Laurent Aimar
* $Id$
*
* Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
#include <assert.h>
#include "copy.h"
/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 *
 * load and store are string literals holding the mnemonics (the callers in
 * this file use "movdqa", "movdqu", "movntdqa" and "movntdq"). The caller has
 * to pick variants whose alignment requirements match dstp and srcp.
 *
 * xmm1-xmm4 are used as temporaries and are listed as clobbered so that the
 * compiler never keeps a live value in them across this statement.
 * __asm__ is used instead of asm so the macro also builds in ISO -std modes.
 */
#define COPY64(dstp, srcp, load, store) \
    __asm__ volatile (                  \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) \
        : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
/* Execute the instruction op only if SSE2 is supported, i.e. when the build
 * can compile SSE2 (CAN_COMPILE_SSE2) and the CPU_CAPABILITY_SSE2 bit is set
 * in cpu.
 *
 * The "memory" clobber makes the statement a compiler-level barrier as well,
 * so that a fence such as "mfence" cannot be reordered by the compiler
 * against the surrounding loads and stores.
 * Both variants expand to a single statement, which keeps the macro safe in
 * unbraced if/else bodies.
 */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do { \
        if ((cpu) & CPU_CAPABILITY_SSE2) \
            __asm__ volatile (op : : : "memory"); \
    } while (0)
#else
# define ASM_SSE2(cpu, op) do { (void)(cpu); } while (0)
#endif
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * Copies height lines of width bytes from src (line stride src_pitch) to dst
 * (line stride dst_pitch). dst and dst_pitch must be multiples of 16
 * (asserted). src may have any alignment.
 *
 * For every line, the bytes that precede the next 16-byte boundary of src are
 * copied one at a time, so the 16-byte loads that follow (movdqa/movntdqa,
 * which need aligned addresses) only ever read from aligned source addresses.
 * When such a head exists the destination is no longer aligned at that
 * point, hence the unaligned store variant.
 *
 * The unaligned parameter is kept so that existing callers keep compiling,
 * but its value is ignored: the head length is derived from src itself on
 * every line, because it changes from line to line whenever src_pitch is not
 * a multiple of 16.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    (void)unaligned;

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Number of bytes before src reaches a 16-byte boundary (0..15),
         * never more than the line itself. */
        unsigned head = (unsigned)(-(uintptr_t)src & 0x0f);
        if (head > width)
            head = width;

        unsigned x;
        for (x = 0; x < head; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        /* Remaining bytes (everything when no SSE path is usable) */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
/* Copy height lines of width bytes from src (line stride src_pitch) to dst
 * (line stride dst_pitch).
 *
 * src and src_pitch must be multiples of 16 (asserted). The destination
 * alignment is tested on every line: an aligned line is written with
 * movntdq, any other line with movdqu. Bytes that do not fill a 64-byte
 * chunk, or all bytes when SSE2 cannot be used, are copied one by one.
 */
static void Copy2d(uint8_t *dst, size_t dst_pitch,
                   const uint8_t *src, size_t src_pitch,
                   unsigned width, unsigned height,
                   unsigned cpu)
{
    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

    for (unsigned line = 0; line != height; line++) {
        unsigned col = 0;

#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            const bool dst_aligned = ((intptr_t)dst & 0x0f) == 0;

            if (dst_aligned) {
                while (col + 63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movntdq");
                    col += 64;
                }
            } else {
                while (col + 63 < width) {
                    COPY64(&dst[col], &src[col], "movdqa", "movdqu");
                    col += 64;
                }
            }
        }
#endif

        /* Scalar tail */
        while (col < width) {
            dst[col] = src[col];
            col++;
        }

        dst += dst_pitch;
        src += src_pitch;
    }
}
/* De-interleave a plane of byte pairs: for each of the height lines, the
 * even bytes of src go to dstu and the odd bytes go to dstv. width is the
 * number of pairs per line, so 2*width source bytes are read per line.
 *
 * src and src_pitch must be multiples of 16 (asserted); the destinations are
 * written with 8-byte stores (movq/movhpd) and need no particular alignment.
 * 32 pairs are handled per SIMD iteration; what is left, or everything when
 * no SIMD path is usable, is handled by the scalar loop.
 *
 * Every xmm register written by the assembly is declared as clobbered so the
 * compiler does not keep live values in them across the statements.
 */
static void SplitUV(uint8_t *dstu, size_t dstu_pitch,
                    uint8_t *dstv, size_t dstv_pitch,
                    const uint8_t *src, size_t src_pitch,
                    unsigned width, unsigned height, unsigned cpu)
{
    /* pshufb control: even bytes into the low quadword, odd bytes into the
     * high quadword. */
    static const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
                                       1, 3, 5, 7, 9, 11, 13, 15 };
    /* Keeps the low (even) byte of every 16-bit word. */
    static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
                                    0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };

    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);

    ASM_SSE2(cpu, "mfence");

/* Load 64 aligned source bytes into xmm0-xmm3 */
#define LOAD64 \
    "movdqa  0(%[src]), %%xmm0\n" \
    "movdqa 16(%[src]), %%xmm1\n" \
    "movdqa 32(%[src]), %%xmm2\n" \
    "movdqa 48(%[src]), %%xmm3\n"

/* Store the low quadwords of xmm0-xmm3 to dst1 and the high ones to dst2 */
#define STORE2X32 \
    "movq   %%xmm0,   0(%[dst1])\n" \
    "movq   %%xmm1,   8(%[dst1])\n" \
    "movhpd %%xmm0,   0(%[dst2])\n" \
    "movhpd %%xmm1,   8(%[dst2])\n" \
    "movq   %%xmm2,  16(%[dst1])\n" \
    "movq   %%xmm3,  24(%[dst1])\n" \
    "movhpd %%xmm2,  16(%[dst2])\n" \
    "movhpd %%xmm3,  24(%[dst2])\n"

    for (unsigned y = 0; y < height; y++) {
        unsigned x = 0;

#ifdef CAN_COMPILE_SSSE3
        if (cpu & CPU_CAPABILITY_SSSE3) {
            for (x = 0; x < (width & ~31); x += 32) {
                __asm__ volatile (
                    "movdqu (%[shuffle]), %%xmm7\n"
                    LOAD64
                    "pshufb  %%xmm7, %%xmm0\n"
                    "pshufb  %%xmm7, %%xmm1\n"
                    "pshufb  %%xmm7, %%xmm2\n"
                    "pshufb  %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            for (x = 0; x < (width & ~31); x += 32) {
                /* After packuswb the odd bytes sit in the low quadword and
                 * the even bytes in the high one, which is why dst1/dst2 are
                 * bound the other way round compared to the SSSE3 path. */
                __asm__ volatile (
                    "movdqu (%[mask]), %%xmm7\n"
                    LOAD64
                    "movdqa   %%xmm0, %%xmm4\n"
                    "movdqa   %%xmm1, %%xmm5\n"
                    "movdqa   %%xmm2, %%xmm6\n"
                    "psrlw    $8,     %%xmm0\n"
                    "psrlw    $8,     %%xmm1\n"
                    "pand     %%xmm7, %%xmm4\n"
                    "pand     %%xmm7, %%xmm5\n"
                    "pand     %%xmm7, %%xmm6\n"
                    "packuswb %%xmm4, %%xmm0\n"
                    "packuswb %%xmm5, %%xmm1\n"
                    "pand     %%xmm3, %%xmm7\n"
                    "psrlw    $8,     %%xmm2\n"
                    "psrlw    $8,     %%xmm3\n"
                    "packuswb %%xmm6, %%xmm2\n"
                    "packuswb %%xmm7, %%xmm3\n"
                    STORE2X32
                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask)
                    : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
                      "xmm4", "xmm5", "xmm6", "xmm7");
            }
        }
#endif

        /* Scalar tail */
        for (; x < width; x++) {
            dstu[x] = src[2*x+0];
            dstv[x] = src[2*x+1];
        }
        src  += src_pitch;
        dstu += dstu_pitch;
        dstv += dstv_pitch;
    }

#undef STORE2X32
#undef LOAD64
}
/* Copy a width x height plane from src to dst through the intermediate
 * buffer cache (cache_size bytes).
 *
 * The plane is processed in groups of hstep lines, hstep being the number of
 * lines (each padded to a multiple of 16 bytes) that fit in the cache: every
 * group is first copied from src into the cache with CopyFromUswc(), then
 * from the cache to dst with Copy2d().
 *
 * Nothing is copied when the plane is empty, or when the cache cannot hold
 * even one line (a programming error that is asserted).
 */
static void CopyPlane(uint8_t *dst, size_t dst_pitch, const uint8_t *src, size_t src_pitch,
                      uint8_t *cache, size_t cache_size,
                      unsigned width, unsigned height,
                      unsigned cpu)
{
    const unsigned w16 = (width+15) & ~15;

    /* Empty plane: nothing to do, and w16 must not be used as a divisor */
    if (w16 == 0 || height == 0)
        return;

    const unsigned hstep = cache_size / w16;
    assert(hstep > 0);
    /* With assertions compiled out, a zero step would loop forever */
    if (hstep == 0)
        return;

    for (unsigned y = 0; y < height; y += hstep) {
        /* Distance from src to the next 16-byte boundary (0 when aligned) */
        const unsigned unaligned = (unsigned)(-(uintptr_t)src & 0x0f);
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w16,
                     src, src_pitch,
                     unaligned,
                     width, hblock, cpu);

        /* Copy from our cache to the destination */
        Copy2d(dst, dst_pitch,
               cache, w16,
               width, hblock, cpu);

        /* Next group of lines */
        src += src_pitch * hblock;
        dst += dst_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
/* Split a plane of interleaved byte pairs into two planes, going through the
 * intermediate buffer cache (cache_size bytes).
 *
 * width and height are expressed in pairs and lines, so each source line
 * holds 2*width bytes. The plane is processed in groups of hstep lines, hstep
 * being the number of lines (each padded to a multiple of 16 bytes) that fit
 * in the cache: every group is first copied from src into the cache with
 * CopyFromUswc(), then de-interleaved into dstu/dstv with SplitUV().
 *
 * Nothing is copied when the plane is empty, or when the cache cannot hold
 * even one line (a programming error that is asserted).
 */
static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
                        uint8_t *dstv, size_t dstv_pitch,
                        const uint8_t *src, size_t src_pitch,
                        uint8_t *cache, size_t cache_size,
                        unsigned width, unsigned height,
                        unsigned cpu)
{
    const unsigned w2_16 = (2*width+15) & ~15;

    /* Empty plane: nothing to do, and w2_16 must not be used as a divisor */
    if (w2_16 == 0 || height == 0)
        return;

    const unsigned hstep = cache_size / w2_16;
    assert(hstep > 0);
    /* With assertions compiled out, a zero step would loop forever */
    if (hstep == 0)
        return;

    for (unsigned y = 0; y < height; y += hstep) {
        /* Distance from src to the next 16-byte boundary (0 when aligned) */
        const unsigned unaligned = (unsigned)(-(uintptr_t)src & 0x0f);
        const unsigned hblock = __MIN(hstep, height - y);

        /* Copy a bunch of line into our cache */
        CopyFromUswc(cache, w2_16,
                     src, src_pitch,
                     unaligned,
                     2*width, hblock, cpu);

        /* Copy from our cache to the destination */
        SplitUV(dstu, dstu_pitch,
                dstv, dstv_pitch,
                cache, w2_16,
                width, hblock, cpu);

        /* Next group of lines */
        src  += src_pitch  * hblock;
        dstu += dstu_pitch * hblock;
        dstv += dstv_pitch * hblock;
    }

    ASM_SSE2(cpu, "mfence");
}
/* Allocate the intermediate buffer of a copy cache.
 *
 * The usable size is width rounded up to a multiple of 16, with a minimum of
 * 4096 bytes. 16 extra bytes are allocated so that cache->buffer can be
 * placed on a 16-byte boundary inside the allocation (1 to 16 bytes after
 * cache->base).
 *
 * Returns VLC_SUCCESS, or VLC_EGENERIC when the allocation fails. On failure
 * the cache is left in the same empty state that CopyCleanCache() produces
 * (NULL pointers, size 0).
 */
int CopyInitCache(copy_cache_t *cache, unsigned width)
{
    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
    cache->base = malloc(16 + cache->size);
    if (cache->base == NULL) {
        cache->buffer = NULL;
        cache->size = 0;
        return VLC_EGENERIC;
    }
    cache->buffer = &cache->base[16 - ((uintptr_t)cache->base & 0x0f)];
    return VLC_SUCCESS;
}
/* Release the buffer owned by the cache and reset every field, so the
 * structure no longer refers to the freed memory. */
void CopyCleanCache(copy_cache_t *cache)
{
    uint8_t *const allocation = cache->base;

    cache->size   = 0;
    cache->buffer = NULL;
    cache->base   = NULL;

    free(allocation);
}
/* Copy a two-plane source (src[0]: full-size plane, src[1]: plane of
 * interleaved byte pairs) into the three planes of dst.
 *
 * Plane 0 is copied as is. The interleaved plane is split at half the width
 * and half the height: the first byte of each pair goes to dst plane 2 and
 * the second one to dst plane 1.
 * cache provides the intermediate buffer used by the copy helpers.
 */
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();
    const unsigned chroma_width  = width / 2;
    const unsigned chroma_height = height / 2;
    uint8_t *const scratch = cache->buffer;
    const size_t scratch_size = cache->size;

    /* Full-size plane */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              scratch, scratch_size,
              width, height, cpu);

    /* Interleaved plane, split into dst planes 2 and 1 */
    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
                dst->p[1].p_pixels, dst->p[1].i_pitch,
                src[1], src_pitch[1],
                scratch, scratch_size,
                chroma_width, chroma_height, cpu);

    ASM_SSE2(cpu, "emms");
}
/* Copy a three-plane source into the three planes of dst, plane n of src
 * going to plane n of dst.
 *
 * Plane 0 is copied at width x height, planes 1 and 2 at half the width and
 * half the height. cache provides the intermediate buffer used by the copy
 * helper.
 */
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
                  unsigned width, unsigned height,
                  copy_cache_t *cache)
{
    const unsigned cpu = vlc_CPU();

    /* Full-size plane */
    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
              src[0], src_pitch[0],
              cache->buffer, cache->size,
              width, height, cpu);

    /* The two half-size planes */
    for (unsigned plane = 1; plane <= 2; plane++)
        CopyPlane(dst->p[plane].p_pixels, dst->p[plane].i_pitch,
                  src[plane], src_pitch[plane],
                  cache->buffer, cache->size,
                  width / 2, height / 2, cpu);

    ASM_SSE2(cpu, "emms");
}
#undef ASM_SSE2
#undef COPY64
/*****************************************************************************
* copy.h: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2009 Laurent Aimar
* $Id$
*
* Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
*****************************************************************************/
#ifndef _VLC_AVCODEC_COPY_H
#define _VLC_AVCODEC_COPY_H 1
/* Intermediate buffer used by the copy functions.
 * Filled in by CopyInitCache() and released by CopyCleanCache(). */
typedef struct {
/* Start of the allocation; this is the pointer that gets freed. */
uint8_t *base;
/* 16-byte aligned start of the usable area, inside the allocation.
 * NULL when the cache holds no buffer. */
uint8_t *buffer;
/* Usable size in bytes, counted from buffer. */
size_t size;
} copy_cache_t;
/* Allocate the buffer of cache. Its usable size is width rounded up to a
 * multiple of 16, with a minimum of 4096 bytes.
 * Returns VLC_SUCCESS, or VLC_EGENERIC if the allocation fails (cache->buffer
 * is then NULL). */
int CopyInitCache(copy_cache_t *cache, unsigned width);
/* Free the buffer of cache and reset its fields (NULL pointers, size 0). */
void CopyCleanCache(copy_cache_t *cache);
/* Copy a two-plane source into the three planes of dst: src[0] is copied to
 * plane 0 at width x height; src[1], made of interleaved byte pairs, is split
 * at width/2 x height/2 into plane 2 (first byte of each pair) and plane 1
 * (second byte). src_pitch[n] is the line stride in bytes of src[n].
 * cache must have been set up with CopyInitCache(). */
void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
unsigned width, unsigned height,
copy_cache_t *cache);
/* Copy a three-plane source into the three planes of dst, src[n] going to
 * plane n: plane 0 at width x height, planes 1 and 2 at width/2 x height/2.
 * src_pitch[n] is the line stride in bytes of src[n].
 * cache must have been set up with CopyInitCache(). */
void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
unsigned width, unsigned height,
copy_cache_t *cache);
#endif
......@@ -47,6 +47,7 @@
#include "avcodec.h"
#include "va.h"
#include "copy.h"
#ifdef HAVE_AVCODEC_DXVA2
......@@ -252,9 +253,7 @@ typedef struct
/* Option conversion */
D3DFORMAT output;
uint8_t *surface_cache_base;
uint8_t *surface_cache;
size_t surface_cache_size;
copy_cache_t surface_cache;
/* */
struct dxva_context hw;
......@@ -297,13 +296,6 @@ static int DxResetVideoDecoder(vlc_va_dxva2_t *);
static void DxCreateVideoConversion(vlc_va_dxva2_t *);
static void DxDestroyVideoConversion(vlc_va_dxva2_t *);
static void CopyFromNv12(picture_t *dst, const D3DLOCKED_RECT *src,
uint8_t *cache, size_t cache_size,
unsigned width, unsigned height);
static void CopyFromYv12(picture_t *dst, const D3DLOCKED_RECT *src,
uint8_t *cache, size_t cache_size,
unsigned width, unsigned height);
/* */
static int Setup(vlc_va_t *external, void **hw, vlc_fourcc_t *chroma,
int width, int height)
......@@ -356,7 +348,7 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
vlc_va_dxva2_t *va = vlc_va_dxva2_Get(external);
LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)ff->data[3];
if (!va->surface_cache)
if (!va->surface_cache.buffer)
return VLC_EGENERIC;
/* */
......@@ -370,14 +362,33 @@ static int Extract(vlc_va_t *external, picture_t *picture, AVFrame *ff)
}
if (va->render == MAKEFOURCC('Y','V','1','2')) {
CopyFromYv12(picture, &lock,
va->surface_cache, va->surface_cache_size,
va->surface_width, va->surface_height);
uint8_t *plane[3] = {
lock.pBits,
(uint8_t*)lock.pBits + lock.Pitch * va->surface_height,
(uint8_t*)lock.pBits + lock.Pitch * va->surface_height
+ (lock.Pitch/2) * (va->surface_height/2)
};
size_t pitch[3] = {
lock.Pitch,
lock.Pitch / 2,
lock.Pitch / 2,
};
CopyFromYv12(picture, plane, pitch,
va->surface_width, va->surface_height,
&va->surface_cache);
} else {
assert(va->render == MAKEFOURCC('N','V','1','2'));
CopyFromNv12(picture, &lock,
va->surface_cache, va->surface_cache_size,
va->surface_width, va->surface_height);
uint8_t *plane[2] = {
lock.pBits,
(uint8_t*)lock.pBits + lock.Pitch * va->surface_height
};
size_t pitch[2] = {
lock.Pitch,
lock.Pitch / 2,
};
CopyFromNv12(picture, plane, pitch,
va->surface_width, va->surface_height,
&va->surface_cache);
}
/* */
......@@ -954,319 +965,12 @@ static void DxCreateVideoConversion(vlc_va_dxva2_t *va)
va->output = va->render;
break;
}
va->surface_cache_size = __MAX((va->surface_width + 0x0f) & ~ 0x0f, 4096);
va->surface_cache_base = malloc(16 + va->surface_cache_size);
va->surface_cache = &va->surface_cache_base[16 - ((intptr_t)va->surface_cache_base & 0x0f)];
CopyInitCache(&va->surface_cache, va->surface_width);
}
static void DxDestroyVideoConversion(vlc_va_dxva2_t *va)
{
free(va->surface_cache_base);
va->surface_cache_base = NULL;
va->surface_cache = NULL;
va->surface_cache_size = 0;
CopyCleanCache(&va->surface_cache);
}
/* Copy 64 bytes from srcp to dstp, loading data with the SSE>=2 instruction
 * load and storing data with the SSE>=2 instruction store.
 *
 * load and store are string literals holding the mnemonics. The caller has
 * to pick variants whose alignment requirements match dstp and srcp.
 *
 * xmm1-xmm4 are used as temporaries and are listed as clobbered so that the
 * compiler never keeps a live value in them across this statement.
 * __asm__ is used instead of asm so the macro also builds in ISO -std modes.
 */
#define COPY64(dstp, srcp, load, store) \
    __asm__ volatile (                  \
        load "  0(%[src]), %%xmm1\n"    \
        load " 16(%[src]), %%xmm2\n"    \
        load " 32(%[src]), %%xmm3\n"    \
        load " 48(%[src]), %%xmm4\n"    \
        store " %%xmm1,    0(%[dst])\n" \
        store " %%xmm2,   16(%[dst])\n" \
        store " %%xmm3,   32(%[dst])\n" \
        store " %%xmm4,   48(%[dst])\n" \
        : : [dst]"r"(dstp), [src]"r"(srcp) \
        : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
/* Execute the instruction op only if SSE2 is supported, i.e. when the build
 * can compile SSE2 (CAN_COMPILE_SSE2) and the CPU_CAPABILITY_SSE2 bit is set
 * in cpu.
 *
 * The "memory" clobber makes the statement a compiler-level barrier as well,
 * so that a fence such as "mfence" cannot be reordered by the compiler
 * against the surrounding loads and stores.
 * Both variants expand to a single statement, which keeps the macro safe in
 * unbraced if/else bodies.
 */
#ifdef CAN_COMPILE_SSE2
# define ASM_SSE2(cpu, op) do { \
        if ((cpu) & CPU_CAPABILITY_SSE2) \
            __asm__ volatile (op : : : "memory"); \
    } while (0)
#else
# define ASM_SSE2(cpu, op) do { (void)(cpu); } while (0)
#endif
/* Optimized copy from "Uncacheable Speculative Write Combining" memory
 * as used by some video surface.
 * XXX It is really efficient only when SSE4.1 is available.
 *
 * Copies height lines of width bytes from src (line stride src_pitch) to dst
 * (line stride dst_pitch). dst and dst_pitch must be multiples of 16
 * (asserted). src may have any alignment.
 *
 * For every line, the bytes that precede the next 16-byte boundary of src are
 * copied one at a time, so the 16-byte loads that follow (movdqa/movntdqa,
 * which need aligned addresses) only ever read from aligned source addresses.
 * When such a head exists the destination is no longer aligned at that
 * point, hence the unaligned store variant.
 *
 * The unaligned parameter is kept so that existing callers keep compiling,
 * but its value is ignored: the head length is derived from src itself on
 * every line, because it changes from line to line whenever src_pitch is not
 * a multiple of 16.
 */
static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
                         const uint8_t *src, size_t src_pitch,
                         unsigned unaligned,
                         unsigned width, unsigned height,
                         unsigned cpu)
{
    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);

    (void)unaligned;

    ASM_SSE2(cpu, "mfence");
    for (unsigned y = 0; y < height; y++) {
        /* Number of bytes before src reaches a 16-byte boundary (0..15),
         * never more than the line itself. */
        unsigned head = (unsigned)(-(uintptr_t)src & 0x0f);
        if (head > width)
            head = width;

        unsigned x;
        for (x = 0; x < head; x++)
            dst[x] = src[x];

#ifdef CAN_COMPILE_SSE4_1
        if (cpu & CPU_CAPABILITY_SSE4_1) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
            }
        } else
#endif
#ifdef CAN_COMPILE_SSE2
        if (cpu & CPU_CAPABILITY_SSE2) {
            if (!head) {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
            } else {
                for (; x+63 < width; x += 64)
                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
            }
        }
#endif

        /* Remaining bytes (everything when no SSE path is usable) */
        for (; x < width; x++)
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
}
static void Copy2d(uint8_t *dst, size_t dst_pitch,
const uint8_t *src, size_t src_pitch,
unsigned width, unsigned height,
unsigned cpu)
{
assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
ASM_SSE2(cpu, "mfence");
for (unsigned y = 0; y < height; y++) {
unsigned x = 0;
bool unaligned = ((intptr_t)dst & 0x0f) != 0;
#ifdef CAN_COMPILE_SSE2
if (cpu & CPU_CAPABILITY_SSE2) {
if (!unaligned) {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movdqa", "movntdq");
} else {