Commit 6ecfa83c authored by Daniel Kang, committed by Fiona Glaser

MMX/SSE2 high bit depth interleave functions

Patch from Google Code-In.
parent 15595e6d
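
For orientation: plane_copy_interleave packs two separate chroma planes (U and V) into one interleaved, NV12-style plane. The sketch below is an editor's illustration based on the x264_plane_copy_interleave_c reference in this diff, not part of the patch; it assumes HIGH_BIT_DEPTH, where x264's pixel type is uint16_t and strides are passed in pixels (the frame copy hunk below divides the byte strides by sizeof(pixel)). The name plane_copy_interleave_ref is hypothetical.

    #include <stdint.h>

    typedef uint16_t pixel;  /* x264's pixel type when HIGH_BIT_DEPTH is defined */

    /* Interleave w U samples and w V samples per row into dst as U0 V0 U1 V1 ...
     * All strides are in pixels, matching the new plane_copy_interleave signature. */
    static void plane_copy_interleave_ref( pixel *dst, int i_dst,
                                           const pixel *srcu, int i_srcu,
                                           const pixel *srcv, int i_srcv, int w, int h )
    {
        for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
            for( int x = 0; x < w; x++ )
            {
                dst[2*x]   = srcu[x];
                dst[2*x+1] = srcv[x];
            }
    }
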
@@ -300,7 +300,8 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
         get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
         get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
         h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
-                                     pix[1], stride[1], pix[2], stride[2],
+                                     (pixel*)pix[1], stride[1]/sizeof(pixel),
+                                     (pixel*)pix[2], stride[2]/sizeof(pixel),
                                      h->param.i_width>>1, h->param.i_height>>1 );
     }
     return 0;
@@ -291,14 +291,14 @@ void x264_plane_copy_c( pixel *dst, int i_dst,
 }
 
 void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
-                                   uint8_t *srcu, int i_srcu,
-                                   uint8_t *srcv, int i_srcv, int w, int h )
+                                   pixel *srcu, int i_srcu,
+                                   pixel *srcv, int i_srcv, int w, int h )
 {
     for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
         for( int x=0; x<w; x++ )
         {
-            dst[2*x]   = ((pixel*)srcu)[x];
-            dst[2*x+1] = ((pixel*)srcv)[x];
+            dst[2*x]   = srcu[x];
+            dst[2*x+1] = srcv[x];
         }
 }
@@ -90,8 +90,8 @@ typedef struct
     void (*plane_copy)( pixel *dst, int i_dst,
                         uint8_t *src, int i_src, int w, int h );
     void (*plane_copy_interleave)( pixel *dst, int i_dst,
-                                   uint8_t *srcu, int i_srcu,
-                                   uint8_t *srcv, int i_srcv, int w, int h );
+                                   pixel *srcu, int i_srcu,
+                                   pixel *srcv, int i_srcv, int w, int h );
     /* may write up to 15 pixels off the end of each plane */
     void (*plane_copy_deinterleave)( pixel *dstu, int i_dstu,
                                      pixel *dstv, int i_dstv,
@@ -881,56 +881,21 @@ cglobal plane_copy_core_mmxext, 6,7
     emms
     RET
-%ifdef HIGH_BIT_DEPTH
 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
-%if mmsize==16
-    mov%4     m0, [%2]
-    mov%4     m1, [%3]
-    SBUTTERFLY wd, 0, 1, 2
-    mov%5a    [%1+ 0], m0
-    mov%5a    [%1+16], m1
+%ifdef HIGH_BIT_DEPTH
+%assign x 0
+%rep 16/mmsize
+    mov%4     m0, [%2+(x/2)*mmsize]
+    mov%4     m1, [%3+(x/2)*mmsize]
+    mova      m2, m0
+    punpcklwd m0, m1
+    punpckhwd m2, m1
+    mov%5a    [%1+(x+0)*mmsize], m0
+    mov%5a    [%1+(x+1)*mmsize], m2
+%assign x (x+2)
+%endrep
 %else
-    movq      m0, [%2+0]
-    movq      m1, [%3+0]
-    SBUTTERFLY wd, 0, 1, 2
-    mov%5q    [%1+ 0], m0
-    mov%5q    [%1+ 8], m1
-    movq      m0, [%2+8]
-    movq      m1, [%3+8]
-    SBUTTERFLY wd, 0, 1, 2
-    mov%5q    [%1+16], m0
-    mov%5q    [%1+24], m1
-%endif
-%endmacro
-%macro PLANE_INTERLEAVE 1
-;-----------------------------------------------------------------------------
-; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
-;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2_%1, 4,5
-    mov    r4d, 16
-    FIX_STRIDES r1
-.loop:
-    INTERLEAVE r0, r2, r3, a
-    add    r2, FDEC_STRIDEB
-    add    r3, FDEC_STRIDEB
-    add    r0, r1
-    dec    r4d
-    jg .loop
-    REP_RET
-%endmacro ; PLANE_INTERLEAVE
-INIT_MMX
-PLANE_INTERLEAVE mmxext
-INIT_XMM
-PLANE_INTERLEAVE sse2
-%endif ; HIGH_BIT_DEPTH
-%ifndef HIGH_BIT_DEPTH
-%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
     movq   m0, [%2]
 %if mmsize==16
 %ifidn %4, a
@@ -945,11 +910,11 @@ PLANE_INTERLEAVE sse2
     mova      m2, m0
     punpcklbw m0, m1
     punpckhbw m2, m1
-    mov%5a [%1], m0
+    mov%5a [%1+0], m0
     mov%5a [%1+8], m2
 %endif
+%endif ; HIGH_BIT_DEPTH
 %endmacro
-%endif
 
 %macro DEINTERLEAVE 7 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant, is aligned
 %ifdef HIGH_BIT_DEPTH
@@ -1003,7 +968,6 @@ PLANE_INTERLEAVE sse2
 %endif ; HIGH_BIT_DEPTH
 %endmacro
 
-%ifndef HIGH_BIT_DEPTH
 %macro PLANE_INTERLEAVE 1
 ;-----------------------------------------------------------------------------
 ; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
@@ -1011,11 +975,17 @@ PLANE_INTERLEAVE sse2
 ;                                  uint8_t *srcv, int i_srcv, int w, int h )
 ;-----------------------------------------------------------------------------
 ; assumes i_dst and w are multiples of 16, and i_dst>2*w
-cglobal plane_copy_interleave_core_%1, 6,7
-    mov    r6d, r6m
+cglobal plane_copy_interleave_core_%1, 7,7
+    FIX_STRIDES r1d, r3d, r5d, r6d
+%ifdef HIGH_BIT_DEPTH
+    mov    r1m, r1d
+    mov    r3m, r3d
+    mov    r6m, r6d
+%endif
     movsxdifnidn r1, r1d
     movsxdifnidn r3, r3d
     movsxdifnidn r5, r5d
+    movsxdifnidn r6, r6d
     lea    r0, [r0+r6*2]
     add    r2, r6
     add    r4, r6
@@ -1024,10 +994,10 @@ cglobal plane_copy_interleave_core_%1, 6,7
 %else
     DECLARE_REG_TMP 1,3
 %endif
+    mov    t1, r1
+    shr    t1, SIZEOF_PIXEL
+    sub    t1, r6
     mov    t0d, r7m
-    mov    t1d, r1d
-    shr    t1d, 1
-    sub    t1d, r6d
 .loopy:
     mov    r6d, r6m
     neg    r6
@@ -1039,21 +1009,25 @@ cglobal plane_copy_interleave_core_%1, 6,7
     mov    r6d, r6m
     neg    r6
 .loopx:
-    INTERLEAVE r0+r6*2, r2+r6, r4+r6, u, nt
-    INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
-    add    r6, 16
+    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
+    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
+    add    r6, 16*SIZEOF_PIXEL
     jl .loopx
 .pad:
+%assign n 0
+%rep SIZEOF_PIXEL
 %if mmsize==8
-    movntq [r0+r6*2], m0
-    movntq [r0+r6*2+8], m0
-    movntq [r0+r6*2+16], m0
-    movntq [r0+r6*2+24], m0
+    movntq [r0+r6*2+(n+ 0)], m0
+    movntq [r0+r6*2+(n+ 8)], m0
+    movntq [r0+r6*2+(n+16)], m0
+    movntq [r0+r6*2+(n+24)], m0
 %else
-    movntdq [r0+r6*2], m0
-    movntdq [r0+r6*2+16], m0
+    movntdq [r0+r6*2+(n+ 0)], m0
+    movntdq [r0+r6*2+(n+16)], m0
 %endif
-    add    r6, 16
+%assign n n+32
+%endrep
+    add    r6, 16*SIZEOF_PIXEL
     cmp    r6, t1
     jl .pad
     add    r0, r1mp
@@ -1070,17 +1044,17 @@ cglobal plane_copy_interleave_core_%1, 6,7
 ;-----------------------------------------------------------------------------
 cglobal store_interleave_8x8x2_%1, 4,5
     mov    r4d, 4
+    FIX_STRIDES r1d
 .loop:
-    INTERLEAVE r0, r2, r3, a
-    INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
-    add    r2, FDEC_STRIDE*2
-    add    r3, FDEC_STRIDE*2
+    INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
+    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
+    add    r2, FDEC_STRIDEB*2
+    add    r3, FDEC_STRIDEB*2
     lea    r0, [r0+r1*2]
     dec    r4d
     jg .loop
     REP_RET
 %endmacro ; PLANE_INTERLEAVE
-%endif ; !HIGH_BIT_DEPTH
 
 %macro DEINTERLEAVE_START 1
 %ifdef HIGH_BIT_DEPTH
@@ -1161,8 +1135,10 @@ cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
 %ifdef HIGH_BIT_DEPTH
 INIT_MMX
+PLANE_INTERLEAVE mmxext
 PLANE_DEINTERLEAVE mmx
 INIT_XMM
+PLANE_INTERLEAVE sse2
 PLANE_DEINTERLEAVE sse2
 %else
 INIT_MMX
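
Editor's aside (not part of the patch): the HIGH_BIT_DEPTH branch of the INTERLEAVE macro above is a plain word interleave, the same operation SSE2 exposes as punpcklwd/punpckhwd. A rough intrinsics equivalent for one 8-sample chunk, with a hypothetical function name and unaligned loads/stores assumed:

    #include <emmintrin.h>  /* SSE2 intrinsics */
    #include <stdint.h>

    /* Interleave eight 16-bit U samples with eight 16-bit V samples:
     * dst receives U0 V0 U1 V1 ... U7 V7 (16 words, 32 bytes). */
    static void interleave8_u16_sse2( uint16_t *dst, const uint16_t *u, const uint16_t *v )
    {
        __m128i mu = _mm_loadu_si128( (const __m128i *)u );
        __m128i mv = _mm_loadu_si128( (const __m128i *)v );
        _mm_storeu_si128( (__m128i *)(dst + 0), _mm_unpacklo_epi16( mu, mv ) );
        _mm_storeu_si128( (__m128i *)(dst + 8), _mm_unpackhi_epi16( mu, mv ) );
    }
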
@@ -86,15 +86,15 @@ void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
 void x264_prefetch_ref_mmxext( uint8_t *, int, int );
 void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
 void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h );
-void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst,
-                                             uint8_t *srcu, int i_srcu,
-                                             uint8_t *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst,
-                                           uint8_t *srcu, int i_srcu,
-                                           uint8_t *srcv, int i_srcv, int w, int h );
-void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst,
-                                   uint8_t *srcu, int i_srcu,
-                                   uint8_t *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_mmxext( pixel *dst, int i_dst,
+                                             pixel *srcu, int i_srcu,
+                                             pixel *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_sse2( pixel *dst, int i_dst,
+                                           pixel *srcu, int i_srcu,
+                                           pixel *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
+                                   pixel *srcu, int i_srcu,
+                                   pixel *srcv, int i_srcv, int w, int h );
 void x264_plane_copy_deinterleave_mmx( pixel *dstu, int i_dstu,
                                        pixel *dstv, int i_dstv,
                                        pixel *src, int i_src, int w, int h );
@@ -442,11 +442,12 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i
         x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
     }
 }
+#endif // HIGH_BIT_DEPTH
 
 #define PLANE_INTERLEAVE(cpu) \
-static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\
-                                              uint8_t *srcu, int i_srcu,\
-                                              uint8_t *srcv, int i_srcv, int w, int h )\
+static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
+                                              pixel *srcu, int i_srcu,\
+                                              pixel *srcv, int i_srcv, int w, int h )\
 {\
     if( !(w&15) ) {\
         x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
@@ -463,7 +464,6 @@ static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\
 PLANE_INTERLEAVE(mmxext)
 PLANE_INTERLEAVE(sse2)
-#endif // HIGH_BIT_DEPTH
 
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
@@ -488,6 +488,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
 
+    pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
+
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
     pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_mmxext;
@@ -513,6 +515,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
         pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
+        pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
         pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
 
         if( cpu&X264_CPU_SSE2_IS_FAST )
@@ -560,7 +563,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx;
 
     pf->plane_copy = x264_plane_copy_mmxext;
-    pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
 
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;