Commit 25a1ffb2, authored by Daniel Kang, committed by Fiona Glaser

MMX/SSE2 high bit depth weight_cache/offset(sub|add) functions

Patch from Google Code-In.
parent fd8cfd44
......@@ -188,18 +188,10 @@ AVG_WEIGHT ssse3, 16, 7
%ifdef HIGH_BIT_DEPTH
%macro WEIGHT_START 1 ; (width)
mova m0, [r4+ 0] ; 1<<denom
mova m3, [r4+16]
movd m2, [r4+32] ; denom
movd m3, [r4+36] ; scale
mov TMP_REG, [r4+40] ; offset
mova m0, [pw_1]
shl TMP_REG, BIT_DEPTH-7
mova m4, [pw_pixel_max]
add TMP_REG, 1
psllw m0, m2 ; 1<<denom
movd m1, TMP_REG ; 1+(offset<<(BIT_DEPTH-8+1))
psllw m3, 1 ; scale<<1
punpcklwd m3, m1
SPLATD m3, m3
paddw m2, [sq_1] ; denom+1
%endmacro
......@@ -354,7 +346,7 @@ AVG_WEIGHT ssse3, 16, 7
%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
;void mc_weight_wX( uint8_t *dst, int i_dst_stride, uint8_t *src, int i_src_stride, weight_t *weight, int h )
;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
......@@ -415,8 +407,17 @@ WEIGHTER 20, ssse3
;-----------------------------------------------------------------------------
; OFFSET_OP src0, src1, dst0, dst1, op, load_suffix, store_suffix
;
; Applies a saturating offset to two vectors of pixels.
;   %1, %2 : addresses the two input vectors are loaded from
;   %3, %4 : addresses the two results are stored to
;   %5     : pasted into the mnemonic as p<%5>usw / p<%5>usb; the add-only
;            clamp below is selected when it is exactly "add"
;   %6, %7 : suffix of the mov used for the loads / stores (the invocations in
;            OFFSET_TWO_ROW pass u/a, h/h or d/d)
;
; Register expectations (set up by the OFFSET macro before its loop):
;   m2 = offset vector loaded from [r4]
;   m3 = [pw_pixel_max], loaded only for the add variant in HIGH_BIT_DEPTH
; Overwrites m0 and m1.
;-----------------------------------------------------------------------------
%macro OFFSET_OP 7
mov%6 m0, [%1]
mov%6 m1, [%2]
%ifdef HIGH_BIT_DEPTH
; High bit depth: word-sized unsigned saturating add/sub of m2.
p%5usw m0, m2
p%5usw m1, m2
%ifidn %5,add
; Unsigned word saturation only caps at 0xFFFF, so the add variant also takes
; the per-word minimum with m3 (pw_pixel_max) to clamp the result.
pminsw m0, m3
pminsw m1, m3
%endif
%else
; 8-bit build: byte-sized unsigned saturating add/sub of m2.
p%5usb m0, m2
p%5usb m1, m2
%endif
mov%7 [%3], m0
mov%7 [%4], m1
%endmacro
......@@ -424,25 +425,35 @@ WEIGHTER 20, ssse3
%macro OFFSET_TWO_ROW 4
%assign x 0
%rep %3
%if (%3-x) >= mmsize
%if (%3*SIZEOF_PIXEL-x) >= mmsize
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
%assign x (x+mmsize)
%else
OFFSET_OP (%1+x),(%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%ifdef HIGH_BIT_DEPTH
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
%else
OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%endif
%exitrep
%endif
%if x >= %3
%if x >= %3*SIZEOF_PIXEL
%exitrep
%endif
%endrep
%endmacro
;-----------------------------------------------------------------------------
;void mc_offset_wX( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, weight_t *w, int h )
;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
%macro OFFSET 3
cglobal mc_offset%3_w%1_%2, NUMREGS, NUMREGS
FIX_STRIDES r1, r3
mova m2, [r4]
%ifdef HIGH_BIT_DEPTH
%ifidn %3,add
mova m3, [pw_pixel_max]
%endif
%endif
LOAD_HEIGHT
.loop:
OFFSET_TWO_ROW r2, r0, %1, %3
......@@ -467,6 +478,9 @@ INIT_XMM
OFFSETPN 12, sse2
OFFSETPN 16, sse2
OFFSETPN 20, sse2
%ifdef HIGH_BIT_DEPTH
OFFSETPN 8, sse2
%endif
%undef LOAD_HEIGHT
%undef HEIGHT_REG
%undef NUMREGS
......
......@@ -50,8 +50,8 @@ DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int
void x264_mc_weight_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int );
#define MC_WEIGHT_OFFSET(w,type) \
void x264_mc_offsetadd_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
void x264_mc_offsetsub_w##w##_##type( uint8_t *,int, uint8_t *,int, const x264_weight_t *,int ); \
void x264_mc_offsetadd_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
void x264_mc_offsetsub_w##w##_##type( pixel *,int, pixel *,int, const x264_weight_t *,int ); \
MC_WEIGHT(w,type)
MC_WEIGHT_OFFSET( 4, mmxext )
......@@ -62,6 +62,9 @@ MC_WEIGHT_OFFSET( 20, mmxext )
MC_WEIGHT_OFFSET( 12, sse2 )
MC_WEIGHT_OFFSET( 16, sse2 )
MC_WEIGHT_OFFSET( 20, sse2 )
#if HIGH_BIT_DEPTH
MC_WEIGHT_OFFSET( 8, sse2 )
#endif
MC_WEIGHT( 8, sse2 )
MC_WEIGHT( 4, ssse3 )
MC_WEIGHT( 8, ssse3 )
......@@ -220,7 +223,34 @@ MC_COPY_WTAB(sse2,mmx,mmx,sse2)
#if HIGH_BIT_DEPTH
MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(offsetsub,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(weight,sse2,mmxext,sse2,12)
MC_WEIGHT_WTAB(offsetadd,sse2,mmxext,sse2,16)
MC_WEIGHT_WTAB(offsetsub,sse2,mmxext,sse2,16)
/* Fill the cached weight vectors in *w for the high-bit-depth MMX/SSE2
 * weighted-prediction functions and pick the function that will consume them.
 *
 * h: encoder context; only h->mc.{offsetadd,offsetsub,weight} are read.
 * w: weight descriptor; i_scale, i_denom and i_offset are read, and
 *    weightfn, cachea[] and (for the full weight path) cacheb[] are written.
 *
 * When i_scale == 1<<i_denom only the offset matters, so an offset-only
 * function is selected and cachea[] holds the offset magnitude scaled to
 * BIT_DEPTH.  Otherwise cachea[] holds 1<<denom and cacheb[] interleaves
 * scale*2 (even lanes) with 1 + offset*2^(BIT_DEPTH-7) (odd lanes). */
static void x264_weight_cache_mmxext( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale == 1<<w->i_denom )
    {
        if( w->i_offset < 0 )
            w->weightfn = h->mc.offsetsub;
        else
            w->weightfn = h->mc.offsetadd;
        /* Take the magnitude before shifting: left-shifting a negative int is
         * undefined behaviour in C, and i_offset is negative on the
         * offsetsub path. */
        int offset_mag = abs( w->i_offset ) << (BIT_DEPTH-8);
        for( int i = 0; i < 8; i++ )
            w->cachea[i] = offset_mag;
        return;
    }
    w->weightfn = h->mc.weight;
    int den1 = 1<<w->i_denom;
    /* Multiplication rather than << so that negative values stay well defined. */
    int den2 = w->i_scale * 2;
    int den3 = 1 + w->i_offset * (1<<(BIT_DEPTH-8+1));
    for( int i = 0; i < 8; i++ )
    {
        w->cachea[i] = den1;
        w->cacheb[i] = i&1 ? den3 : den2;
    }
}
#else
MC_WEIGHT_WTAB(weight,mmxext,mmxext,mmxext,12)
MC_WEIGHT_WTAB(offsetadd,mmxext,mmxext,mmxext,12)
......@@ -268,7 +298,7 @@ static void x264_weight_cache_ssse3( x264_t *h, x264_weight_t *w )
}
w->weightfn = h->mc.weight;
den1 = w->i_scale << (8 - w->i_denom);
for(i = 0;i<8;i++)
for( i = 0; i < 8; i++ )
{
w->cachea[i] = den1;
w->cacheb[i] = w->i_offset;
......@@ -458,6 +488,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->mc_chroma = x264_mc_chroma_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->weight = x264_mc_weight_wtab_mmxext;
pf->weight_cache = x264_weight_cache_mmxext;
pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
#if HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_SSE2) )
......@@ -476,6 +509,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init8v = x264_integral_init8v_sse2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
......@@ -492,10 +527,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
pf->integral_init4v = x264_integral_init4v_ssse3;
#else // !HIGH_BIT_DEPTH
pf->offsetadd = x264_mc_offsetadd_wtab_mmxext;
pf->offsetsub = x264_mc_offsetsub_wtab_mmxext;
pf->weight_cache = x264_weight_cache_mmxext;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment