Commit 299d3ed4 authored by Loren Merritt

mmx/prefetch implementation of plane_copy


git-svn-id: svn://svn.videolan.org/x264/trunk@582 df754926-b1dd-0310-bc7b-ec298dee348c
parent 9f05a7fd
......@@ -107,6 +107,7 @@ SECTION .text
cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext
cglobal x264_plane_copy_mmxext
;-----------------------------------------------------------------------------
;
......@@ -366,3 +367,57 @@ loophx:
jnz loophy
ret
;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h)
;
; Copies h rows from src to dst; i_dst / i_src are the byte distances between
; the starts of consecutive rows. w is rounded up to a multiple of 4, so up
; to 3 bytes past w are read and written on every row. Each row is moved in
; 64-byte chunks (MMX loads + non-temporal movntq stores); whatever is left
; is copied as dwords with rep movsd. Nothing is copied when h <= 0.
;-----------------------------------------------------------------------------
ALIGN 16
x264_plane_copy_mmxext:
    cmp     parm6d, 0
    jle     .done                   ; h <= 0: no rows to copy
    movsxd  parm2q, parm2d          ; widen the int strides to 64 bit (sign-extended)
    movsxd  parm4q, parm4d
    add     parm5d, 3               ; w = (w + 3) & ~3
    and     parm5d, ~3
    sub     parm2q, parm5q          ; stride - w: step from the end of one row
    sub     parm4q, parm5q          ;             to the start of the next
    ; shuffle regs because movsd needs dst=rdi, src=rsi, w=ecx
    ; NOTE(review): assumes parm1..parm4 map to rdi, rsi, rdx, rcx; the parm*
    ; macros are defined outside this view - confirm.
    xchg    rsi, rdx
    mov     rax, parm4q             ; keep the src row step out of the way of ecx
.loopy:
    mov     ecx, parm5d
    sub     ecx, 64
    jl      .endx                   ; row shorter than 64 bytes: dword tail only
.loopx:
    prefetchnta [rsi+256]
    movq    mm0, [rsi   ]
    movq    mm1, [rsi+ 8]
    movq    mm2, [rsi+16]
    movq    mm3, [rsi+24]
    movq    mm4, [rsi+32]
    movq    mm5, [rsi+40]
    movq    mm6, [rsi+48]
    movq    mm7, [rsi+56]
    movntq  [rdi   ], mm0
    movntq  [rdi+ 8], mm1
    movntq  [rdi+16], mm2
    movntq  [rdi+24], mm3
    movntq  [rdi+32], mm4
    movntq  [rdi+40], mm5
    movntq  [rdi+48], mm6
    movntq  [rdi+56], mm7
    add     rsi, 64
    add     rdi, 64
    sub     ecx, 64
    jge     .loopx
.endx:
    prefetchnta [rsi+256]
    add     ecx, 64                 ; undo the last subtraction: bytes left in this row
    shr     ecx, 2                  ; bytes -> dwords (w is a multiple of 4)
    rep movsd
    add     rdi, rdx                ; advance both pointers to the next row
    add     rsi, rax
    sub     parm6d, 1
    jg      .loopy                  ; jg, not jge: jge ran the body once more
                                    ; after the counter hit 0 (h+1 rows copied)
.done:
    emms                            ; MMX registers were used: reset the x87 tag
                                    ; word before returning to the caller
    ret
......@@ -26,20 +26,11 @@
#include "common.h"
/* Copy a plane of h rows, w bytes each, from src to dst.
 * i_dst and i_src are the byte strides between consecutive rows of the
 * destination and source; nothing is copied when h <= 0. */
static inline void plane_copy( uint8_t *dst, int i_dst,
                               uint8_t *src, int i_src, int w, int h)
{
    int rows_left = h;

    while( rows_left > 0 )
    {
        memcpy( dst, src, w );
        dst += i_dst;
        src += i_src;
        rows_left--;
    }
}
/* Vertically flipped plane copy: the source pointer is moved to the last row
 * (src + (h-1)*i_src) and the source stride is negated, so rows are read
 * bottom-up while dst is written top-down.
 * NOTE(review): this span looks like a diff hunk with the +/- markers lost -
 * the signature and the call each appear twice, old line first. The second
 * of each pair takes an x264_mc_functions_t *mc and calls mc->plane_copy. */
static inline void plane_copy_vflip( uint8_t *dst, int i_dst,
static inline void plane_copy_vflip( x264_mc_functions_t *mc,
uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h)
{
plane_copy( dst, i_dst, src + (h -1)*i_src, -i_src, w, h );
mc->plane_copy( dst, i_dst, src + (h -1)*i_src, -i_src, w, h );
}
static inline void plane_subsamplev2( uint8_t *dst, int i_dst,
......@@ -90,70 +81,73 @@ static inline void plane_subsamplehv2_vlip( uint8_t *dst, int i_dst,
plane_subsamplehv2( dst, i_dst, src + (2*h-1)*i_src, -i_src, w, h );
}
/* Copies the three planes of img into frm without conversion.
 * Plane 0 is copied at i_width x i_height, planes 1 and 2 at
 * (i_width / 2) x (i_height / 2). When X264_CSP_VFLIP is set in img->i_csp
 * the copies go through plane_copy_vflip, otherwise through plane_copy.
 * NOTE(review): this span looks like a diff hunk with the +/- markers lost -
 * each changed line appears twice, old form first; the newer form threads an
 * x264_mc_functions_t *mc through and calls mc->plane_copy. */
static void i420_to_i420( x264_frame_t *frm, x264_image_t *img,
static void i420_to_i420( x264_mc_functions_t *mc,
x264_frame_t *frm, x264_image_t *img,
int i_width, int i_height )
{
if( img->i_csp & X264_CSP_VFLIP )
{
plane_copy_vflip( frm->plane[0], frm->i_stride[0],
plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
plane_copy_vflip( frm->plane[1], frm->i_stride[1],
plane_copy_vflip( mc, frm->plane[1], frm->i_stride[1],
img->plane[1], img->i_stride[1],
i_width / 2, i_height / 2 );
plane_copy_vflip( frm->plane[2], frm->i_stride[2],
plane_copy_vflip( mc, frm->plane[2], frm->i_stride[2],
img->plane[2], img->i_stride[2],
i_width / 2, i_height / 2 );
}
else
{
/* old (removed) direct calls, followed by their mc->plane_copy replacements */
plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
plane_copy( frm->plane[1], frm->i_stride[1],
img->plane[1], img->i_stride[1],
i_width / 2, i_height / 2 );
plane_copy( frm->plane[2], frm->i_stride[2],
img->plane[2], img->i_stride[2],
i_width / 2, i_height / 2 );
mc->plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
mc->plane_copy( frm->plane[1], frm->i_stride[1],
img->plane[1], img->i_stride[1],
i_width / 2, i_height / 2 );
mc->plane_copy( frm->plane[2], frm->i_stride[2],
img->plane[2], img->i_stride[2],
i_width / 2, i_height / 2 );
}
}
/* Copies img into frm with the two half-size planes swapped:
 * img->plane[1] is written to frm->plane[2] and img->plane[2] to
 * frm->plane[1]; plane 0 is copied straight across. Plane 0 uses
 * i_width x i_height, the other two (i_width / 2) x (i_height / 2).
 * X264_CSP_VFLIP in img->i_csp selects the plane_copy_vflip path.
 * NOTE(review): this span looks like a diff hunk with the +/- markers lost -
 * each changed line appears twice, old form first; the newer form threads an
 * x264_mc_functions_t *mc through and calls mc->plane_copy. */
static void yv12_to_i420( x264_frame_t *frm, x264_image_t *img,
static void yv12_to_i420( x264_mc_functions_t *mc,
x264_frame_t *frm, x264_image_t *img,
int i_width, int i_height )
{
if( img->i_csp & X264_CSP_VFLIP )
{
plane_copy_vflip( frm->plane[0], frm->i_stride[0],
plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
plane_copy_vflip( frm->plane[2], frm->i_stride[2],
plane_copy_vflip( mc, frm->plane[2], frm->i_stride[2],
img->plane[1], img->i_stride[1],
i_width / 2, i_height / 2 );
plane_copy_vflip( frm->plane[1], frm->i_stride[1],
plane_copy_vflip( mc, frm->plane[1], frm->i_stride[1],
img->plane[2], img->i_stride[2],
i_width / 2, i_height / 2 );
}
else
{
/* old (removed) direct calls, followed by their mc->plane_copy replacements */
plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
plane_copy( frm->plane[2], frm->i_stride[2],
img->plane[1], img->i_stride[1],
i_width / 2, i_height / 2 );
plane_copy( frm->plane[1], frm->i_stride[1],
img->plane[2], img->i_stride[2],
i_width / 2, i_height / 2 );
mc->plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
mc->plane_copy( frm->plane[2], frm->i_stride[2],
img->plane[1], img->i_stride[1],
i_width / 2, i_height / 2 );
mc->plane_copy( frm->plane[1], frm->i_stride[1],
img->plane[2], img->i_stride[2],
i_width / 2, i_height / 2 );
}
}
static void i422_to_i420( x264_frame_t *frm, x264_image_t *img,
static void i422_to_i420( x264_mc_functions_t *mc,
x264_frame_t *frm, x264_image_t *img,
int i_width, int i_height )
{
if( img->i_csp & X264_CSP_VFLIP )
{
plane_copy_vflip( frm->plane[0], frm->i_stride[0],
plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
......@@ -166,9 +160,9 @@ static void i422_to_i420( x264_frame_t *frm, x264_image_t *img,
}
else
{
plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
mc->plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
plane_subsamplev2( frm->plane[1], frm->i_stride[1],
img->plane[1], img->i_stride[1],
......@@ -179,12 +173,13 @@ static void i422_to_i420( x264_frame_t *frm, x264_image_t *img,
}
}
static void i444_to_i420( x264_frame_t *frm, x264_image_t *img,
static void i444_to_i420( x264_mc_functions_t *mc,
x264_frame_t *frm, x264_image_t *img,
int i_width, int i_height )
{
if( img->i_csp & X264_CSP_VFLIP )
{
plane_copy_vflip( frm->plane[0], frm->i_stride[0],
plane_copy_vflip( mc, frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
......@@ -197,9 +192,9 @@ static void i444_to_i420( x264_frame_t *frm, x264_image_t *img,
}
else
{
plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
mc->plane_copy( frm->plane[0], frm->i_stride[0],
img->plane[0], img->i_stride[0],
i_width, i_height );
plane_subsamplehv2( frm->plane[1], frm->i_stride[1],
img->plane[1], img->i_stride[1],
......@@ -209,7 +204,8 @@ static void i444_to_i420( x264_frame_t *frm, x264_image_t *img,
i_width / 2, i_height / 2 );
}
}
static void yuyv_to_i420( x264_frame_t *frm, x264_image_t *img,
static void yuyv_to_i420( x264_mc_functions_t *mc,
x264_frame_t *frm, x264_image_t *img,
int i_width, int i_height )
{
uint8_t *src = img->plane[0];
......@@ -280,7 +276,8 @@ static void yuyv_to_i420( x264_frame_t *frm, x264_image_t *img,
#define V_B FIX(0.071)
#define V_ADD 128
#define RGB_TO_I420( name, POS_R, POS_G, POS_B, S_RGB ) \
static void name( x264_frame_t *frm, x264_image_t *img, \
static void name( x264_mc_functions_t *mc, \
x264_frame_t *frm, x264_image_t *img, \
int i_width, int i_height ) \
{ \
uint8_t *src = img->plane[0]; \
......@@ -357,14 +354,14 @@ void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf )
switch( i_csp )
{
case X264_CSP_I420:
pf->i420 = i420_to_i420;
pf->i422 = i422_to_i420;
pf->i444 = i444_to_i420;
pf->yv12 = yv12_to_i420;
pf->yuyv = yuyv_to_i420;
pf->rgb = rgb_to_i420;
pf->bgr = bgr_to_i420;
pf->bgra = bgra_to_i420;
pf->convert[X264_CSP_I420] = i420_to_i420;
pf->convert[X264_CSP_I422] = i422_to_i420;
pf->convert[X264_CSP_I444] = i444_to_i420;
pf->convert[X264_CSP_YV12] = yv12_to_i420;
pf->convert[X264_CSP_YUYV] = yuyv_to_i420;
pf->convert[X264_CSP_RGB ] = rgb_to_i420;
pf->convert[X264_CSP_BGR ] = bgr_to_i420;
pf->convert[X264_CSP_BGRA] = bgra_to_i420;
break;
default:
......
......@@ -24,19 +24,14 @@
#ifndef _CSP_H
#define _CSP_H 1
/* Signature of a colourspace converter: receives the mc function table, the
 * destination frame, the source image, and the width and height to convert. */
typedef void (*x264_csp_t) ( x264_mc_functions_t *, x264_frame_t *, x264_image_t *,
int i_width, int i_height );
/* Table of colourspace converters.
 * NOTE(review): this span looks like a diff hunk with the +/- markers lost -
 * the eight per-format pointers below appear to be the removed members and
 * convert[] their replacement, an array of X264_CSP_MAX entries indexed by
 * the X264_CSP_* value. */
typedef struct
{
void (*i420)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*i422)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*i444)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*yv12)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*yuyv)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*rgb )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*bgr )( x264_frame_t *, x264_image_t *, int i_width, int i_height );
void (*bgra)( x264_frame_t *, x264_image_t *, int i_width, int i_height );
x264_csp_t convert[X264_CSP_MAX];
} x264_csp_function_t;
void x264_csp_init( int cpu, int i_csp, x264_csp_function_t *pf );
#endif
......
......@@ -159,41 +159,15 @@ void x264_frame_delete( x264_frame_t *frame )
/* Fills dst from the user-supplied picture src: copies i_type, i_qpplus1 and
 * i_pts, then converts the image data into dst with the converter selected by
 * src->img.i_csp & X264_CSP_MASK, at h->param.i_width x h->param.i_height.
 * NOTE(review): this span looks like a diff hunk with the +/- markers lost -
 * it shows both the removed switch dispatch and the added table dispatch. */
void x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
{
int i_csp = src->img.i_csp & X264_CSP_MASK;
dst->i_type = src->i_type;
dst->i_qpplus1 = src->i_qpplus1;
dst->i_pts = src->i_pts;
/* old dispatch: one case per colourspace, each calling its own h->csp member */
switch( src->img.i_csp & X264_CSP_MASK )
{
case X264_CSP_I420:
h->csp.i420( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_YV12:
h->csp.yv12( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_I422:
h->csp.i422( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_I444:
h->csp.i444( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_YUYV:
h->csp.yuyv( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_RGB:
h->csp.rgb( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_BGR:
h->csp.bgr( dst, &src->img, h->param.i_width, h->param.i_height );
break;
case X264_CSP_BGRA:
h->csp.bgra( dst, &src->img, h->param.i_width, h->param.i_height );
break;
default:
x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
break;
}
/* new dispatch: range-check the masked value, then index h->csp.convert[];
 * an out-of-range value is logged and no conversion is performed */
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
x264_log( h, X264_LOG_ERROR, "Arg invalid CSP\n" );
else
h->csp.convert[i_csp]( &h->mc, dst, &src->img, h->param.i_width, h->param.i_height );
}
......
......@@ -117,6 +117,7 @@ SECTION .text
cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext
cglobal x264_plane_copy_mmxext
;-----------------------------------------------------------------------------
;
......@@ -374,3 +375,63 @@ loophx:
pop edi
ret
;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h)
;
; Copies h rows from src to dst; i_dst / i_src are the byte distances between
; the starts of consecutive rows. w is rounded up to a multiple of 4, so up
; to 3 bytes past w are read and written on every row. Each row is moved in
; 64-byte chunks (MMX loads + non-temporal movntq stores); whatever is left
; is copied as dwords with rep movsd. Nothing is copied when h <= 0.
; Arguments are read from the stack; the h slot at [esp+36] is used as the
; row counter and is modified in place.
;-----------------------------------------------------------------------------
ALIGN 16
x264_plane_copy_mmxext:
    push    edi
    push    esi
    push    ebx
    cmp     dword [esp+36], 0 ; h
    jle     .done           ; h <= 0: no rows to copy
    mov     edi, [esp+16]   ; dst
    mov     ebx, [esp+20]   ; i_dst
    mov     esi, [esp+24]   ; src
    mov     eax, [esp+28]   ; i_src
    mov     edx, [esp+32]   ; w
    add     edx, 3          ; w = (w + 3) & ~3
    and     edx, ~3
    sub     ebx, edx        ; stride - w: step from the end of one row
    sub     eax, edx        ;             to the start of the next
.loopy:
    mov     ecx, edx
    sub     ecx, 64
    jl      .endx           ; row shorter than 64 bytes: dword tail only
.loopx:
    prefetchnta [esi+256]
    movq    mm0, [esi   ]
    movq    mm1, [esi+ 8]
    movq    mm2, [esi+16]
    movq    mm3, [esi+24]
    movq    mm4, [esi+32]
    movq    mm5, [esi+40]
    movq    mm6, [esi+48]
    movq    mm7, [esi+56]
    movntq  [edi   ], mm0
    movntq  [edi+ 8], mm1
    movntq  [edi+16], mm2
    movntq  [edi+24], mm3
    movntq  [edi+32], mm4
    movntq  [edi+40], mm5
    movntq  [edi+48], mm6
    movntq  [edi+56], mm7
    add     esi, 64
    add     edi, 64
    sub     ecx, 64
    jge     .loopx
.endx:
    prefetchnta [esi+256]
    add     ecx, 64         ; undo the last subtraction: bytes left in this row
    shr     ecx, 2          ; bytes -> dwords (w is a multiple of 4)
    rep movsd
    add     edi, ebx        ; advance both pointers to the next row
    add     esi, eax
    sub     dword [esp+36], 1
    jg      .loopy          ; jg, not jge: jge ran the body once more after
                            ; the counter hit 0 (h+1 rows copied)
.done:
    emms                    ; MMX registers were used: reset the x87 tag word
                            ; before returning to the caller
    pop     ebx
    pop     esi
    pop     edi
    ret
......@@ -39,6 +39,7 @@ extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
#define AVG(W,H) \
static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
......@@ -158,6 +159,8 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->plane_copy = x264_plane_copy_mmxext;
}
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{
......
......@@ -316,6 +316,17 @@ MC_COPY( 16 )
MC_COPY( 8 )
MC_COPY( 4 )
/* Reference C implementation of plane_copy.
 *
 * Copies h rows of w bytes from src to dst. i_dst and i_src are the byte
 * strides between consecutive rows of the destination and source.
 * Does nothing when w <= 0 or h <= 0: a negative h would otherwise keep
 * `while( h-- )` running until the counter wraps, and a negative w would be
 * converted to a huge size_t by memcpy. */
static void plane_copy( uint8_t *dst, int i_dst,
                        uint8_t *src, int i_src, int w, int h)
{
    if( w <= 0 )
        return;

    for( ; h > 0; h-- )
    {
        memcpy( dst, src, w );
        dst += i_dst;
        src += i_src;
    }
}
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
......@@ -348,6 +359,8 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
pf->plane_copy = plane_copy;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT ) {
x264_mc_mmxext_init( pf );
......
......@@ -52,6 +52,9 @@ typedef struct
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
void (*plane_copy)( uint8_t *dst, int i_dst,
uint8_t *src, int i_src, int w, int h);
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
......
......@@ -98,6 +98,7 @@ static const char * const x264_colmatrix_names[] = { "GBR", "bt709", "undef", ""
#define X264_CSP_RGB 0x0006 /* rgb 24bits */
#define X264_CSP_BGR 0x0007 /* bgr 24bits */
#define X264_CSP_BGRA 0x0008 /* bgr 32bits */
#define X264_CSP_MAX 0x0009 /* end of list */
#define X264_CSP_VFLIP 0x1000 /* */
/* Slice type
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment