Commit 6f4d6fe9, authored by Daniel Alexandru Morie, committed by Fiona Glaser

MMX/SSE2 versions of high bit depth store_interleave

Patch from Google Code-In.
parent 898579cc
......@@ -880,11 +880,58 @@ cglobal plane_copy_core_mmxext, 6,7
emms
RET
%ifdef HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; INTERLEAVE dst, srcu, srcv, is_aligned [, nt_hint]      (high bit depth)
;
; Loads 16 bytes from srcu and 16 bytes from srcv, runs them through
; SBUTTERFLY wd, and stores the resulting 32 bytes at dst.
;   is_aligned : suffix pasted onto "mov" for the 16-byte loads; it is only
;                used when mmsize is 16 (the 8-byte path always loads via movq).
;   nt_hint    : optional infix pasted into the store mnemonic; when omitted
;                the stores are plain mova / movq.
; Overwrites m0 and m1; m2 is handed to SBUTTERFLY as its third register
; (presumably scratch - SBUTTERFLY is defined elsewhere, confirm there).
;-----------------------------------------------------------------------------
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
%if mmsize != 16
    ; 8-byte registers: the 16 source bytes are handled as two 8-byte halves.
    ; First half: source bytes 0..7 of each plane -> dst bytes 0..15.
    movq        m0, [%2+0]
    movq        m1, [%3+0]
    SBUTTERFLY  wd, 0, 1, 2
    mov%5q      [%1+ 0], m0
    mov%5q      [%1+ 8], m1
    ; Second half: source bytes 8..15 of each plane -> dst bytes 16..31.
    movq        m0, [%2+8]
    movq        m1, [%3+8]
    SBUTTERFLY  wd, 0, 1, 2
    mov%5q      [%1+16], m0
    mov%5q      [%1+24], m1
%else
    ; 16-byte registers: one load per plane covers all 16 source bytes.
    mov%4       m0, [%2]
    mov%4       m1, [%3]
    SBUTTERFLY  wd, 0, 1, 2
    mov%5a      [%1+ 0], m0
    mov%5a      [%1+16], m1
%endif
%endmacro
%macro PLANE_INTERLEAVE 1
;-----------------------------------------------------------------------------
; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
;
; Writes an 8x8 block of interleaved U/V samples to dst, one row per loop
; iteration.
;   r0  = dst    (advanced by r1 after every row)
;   r1  = i_dst  (passed through FIX_STRIDES first; presumably this converts
;                 a stride in pixels to bytes - confirm against its definition)
;   r2  = srcu   (advanced by FDEC_STRIDEB after every row)
;   r3  = srcv   (advanced by FDEC_STRIDEB after every row)
;   r4d = rows remaining
; The macro argument is the cpu suffix appended to the exported symbol name.
; Loads use the aligned (a) form on 16-byte registers, so srcu and srcv must
; be 16-byte aligned there; stores are plain (no non-temporal hint).
;-----------------------------------------------------------------------------
cglobal store_interleave_8x8x2_%1, 4,5
    ; Each INTERLEAVE call consumes 16 bytes (8 uint16_t samples) from each of
    ; srcu and srcv, i.e. one complete 8-sample row, so an 8x8 block needs
    ; exactly 8 iterations. A count of 16 would read and write 8 rows past
    ; the end of the block.
    mov         r4d, 8
    FIX_STRIDES r1
.loop:
    INTERLEAVE  r0, r2, r3, a
    add         r2, FDEC_STRIDEB        ; next srcu row
    add         r3, FDEC_STRIDEB        ; next srcv row
    add         r0, r1                  ; next dst row
    dec         r4d
    jg          .loop                   ; loop while rows remain
    REP_RET
%endmacro ; PLANE_INTERLEAVE
; Instantiate the high bit depth store_interleave_8x8x2 once per register set.
; INIT_MMX / INIT_XMM are defined elsewhere; presumably they select 8-byte and
; 16-byte vector registers (mmsize), which INTERLEAVE branches on - confirm.
; Emits store_interleave_8x8x2_mmxext:
INIT_MMX
PLANE_INTERLEAVE mmxext
; Emits store_interleave_8x8x2_sse2:
INIT_XMM
PLANE_INTERLEAVE sse2
%else ;!HIGH_BIT_DEPTH
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
movq m0, [%2]
%if mmsize==16
%if %4
%ifidn %4, a
punpcklbw m0, [%3]
%else
movq m1, [%3]
......@@ -969,8 +1016,8 @@ cglobal plane_copy_interleave_core_%1, 6,7
mov r6d, r6m
neg r6
.loopx:
INTERLEAVE r0+r6*2, r2+r6, r4+r6, 0, nt
INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt
INTERLEAVE r0+r6*2, r2+r6, r4+r6, u, nt
INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
add r6, 16
jl .loopx
.pad:
......@@ -1001,8 +1048,8 @@ cglobal plane_copy_interleave_core_%1, 6,7
cglobal store_interleave_8x8x2_%1, 4,5
mov r4d, 4
.loop:
INTERLEAVE r0, r2, r3, 1
INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1
INTERLEAVE r0, r2, r3, a
INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
add r2, FDEC_STRIDE*2
add r3, FDEC_STRIDE*2
lea r0, [r0+r1*2]
......@@ -1088,6 +1135,7 @@ PLANE_INTERLEAVE sse2
PLANE_DEINTERLEAVE sse2
PLANE_DEINTERLEAVE ssse3
%endif ; HIGH_BIT_DEPTH
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size or a size less than 64.
......
......@@ -101,8 +101,8 @@ void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu,
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
uint8_t *dstv, int i_dstv,
uint8_t *src, int i_src, int w, int h );
void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
void x264_store_interleave_8x8x2_mmxext( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_load_deinterleave_8x8x2_fenc_mmx( uint8_t *dst, uint8_t *src, int i_src );
void x264_load_deinterleave_8x8x2_fenc_sse2( uint8_t *dst, uint8_t *src, int i_src );
void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
......@@ -448,6 +448,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
if( !(cpu&X264_CPU_MMXEXT) )
return;
......@@ -474,6 +475,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment