Commit cb173c50 authored by Loren Merritt

avg_weight_ssse3

parent 3e5b130a
......@@ -1501,7 +1501,12 @@ void x264_macroblock_bipred_init( x264_t *h )
if( h->param.analyse.b_weighted_bipred
&& dist_scale_factor >= -64
&& dist_scale_factor <= 128 )
{
h->mb.bipred_weight[i_ref0][i_ref1] = 64 - dist_scale_factor;
// ssse3 implementation of biweight doesn't support the extrema.
// if we ever generate them, we'll have to drop that optimization.
assert( dist_scale_factor >= -63 && dist_scale_factor <= 127 );
}
else
h->mb.bipred_weight[i_ref0][i_ref1] = 32;
}
......
......@@ -47,8 +47,11 @@ SECTION .text
%define t3 r3
%define t4 r4
%define t5 r5
%macro AVG_START 1
cglobal %1, 6,7
%define t6d r10d
%define t7d r11d
%macro AVG_START 0
PROLOGUE 6,7
.height_loop:
%endmacro
%else
%define t0 r1
......@@ -57,14 +60,17 @@ SECTION .text
%define t3 r4
%define t4 r5
%define t5 r6
%macro AVG_START 1
cglobal %1, 0,7
%define t6d r1d
%define t7d r2d
%macro AVG_START 0
PROLOGUE 0,7
mov t0, r0m
mov t1, r1m
mov t2, r2m
mov t3, r3m
mov t4, r4m
mov t5, r5m
.height_loop:
%endmacro
%endif
......@@ -77,9 +83,9 @@ SECTION .text
%endif
%endmacro
%macro BIWEIGHT 3
movh m0, %2
movh m1, %3
%macro BIWEIGHT_MMX 2
movh m0, %1
movh m1, %2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
......@@ -87,35 +93,72 @@ SECTION .text
paddw m0, m1
paddw m0, m6
psraw m0, 6
pmaxsw m0, m7
packuswb m0, m0
movh %1, m0
%endmacro
%macro BIWEIGHT_START 0
%macro BIWEIGHT_START_MMX 0
movd m4, r6m
SPLATW m4, m4 ; weight_dst
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
pxor m7, m7
.height_loop:
%endmacro
INIT_MMX
; BIWEIGHT_SSSE3 src_a, src_b
; Weighted sum of two pixel operands using pmaddubsw.
;   %1, %2 : operands to load pixels from (callers pass memory references).
; Expects m5 to hold the packed byte weight pairs and m6 the rounding term,
; as loaded by BIWEIGHT_START_SSSE3.
; Leaves word results in m0 (not yet packed to bytes); clobbers m1.
%macro BIWEIGHT_SSSE3 2
movh m0, %1
movh m1, %2
; interleave: bytes of %1 land in the even byte lanes, bytes of %2 in the odd
punpcklbw m0, m1
; per word: (unsigned pixel pair) * (signed weight pair in m5), pair summed
pmaddubsw m0, m5
; add rounding term, then scale down by 64
paddw m0, m6
psraw m0, 6
%endmacro
; BIWEIGHT_START_SSSE3
; Prepares the constants consumed by BIWEIGHT_SSSE3:
;   m5 = 16-bit pattern with the weight byte w in the low byte and (64 - w)
;        in the high byte, passed through SPLATW
;        (presumably broadcast to every word lane -- TODO confirm against SPLATW)
;   m6 = contents of pw_32 (rounding term added before the >> 6)
; w is read as a single byte from r6m; the prototype comment in this file
; names the last argument i_weight -- NOTE(review): confirm r6m is that argument.
; Uses t6d/t7d as scratch.
%macro BIWEIGHT_START_SSSE3 0
movzx t6d, byte r6m ; FIXME x86_64
; t7d = (64 - w) << 8, then merged into t6d so the low word is (64-w):w
mov t7d, 64
sub t7d, t6d
shl t7d, 8
add t6d, t7d
movd m5, t6d
mova m6, [pw_32 GLOBAL]
SPLATW m5, m5 ; weight_dst,src
%endmacro
; BIWEIGHT_ROW dst, src1, src2, width
; Computes one row segment of the weighted average and stores it as bytes.
;   %1 : destination address expression (used as [%1])
;   %2 : first source address expression
;   %3 : second source address expression
;   %4 : block width, compared against mmsize/2 to pick the store size
; BIWEIGHT is whichever implementation is currently %define'd
; (BIWEIGHT_MMX or BIWEIGHT_SSSE3); it leaves word results in m0.
%macro BIWEIGHT_ROW 4
BIWEIGHT [%2], [%3]
%if %4==mmsize/2
; narrow case: a single BIWEIGHT covers the row; pack words to bytes with
; unsigned saturation and store with movh
packuswb m0, m0
movh [%1], m0
%else
; wide case: keep the first result under the name m2 (SWAP presumably
; renames registers rather than moving data -- TODO confirm), compute the
; next mmsize/2 bytes, then pack both halves and store a full register
SWAP 0, 2
BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
packuswb m2, m0
mova [%1], m2
%endif
%endmacro
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2
AVG_START x264_pixel_avg_weight_w%2_%1
cglobal x264_pixel_avg_weight_w%2_%1, 0,0
BIWEIGHT_START
AVG_START
%if %2==8 && mmsize==16
BIWEIGHT [t2], [t4]
SWAP 0, 2
BIWEIGHT [t2+t3], [t4+t5]
packuswb m2, m0
movlps [t0], m2
movhps [t0+t1], m2
%else
%assign x 0
%rep %2*2/mmsize
BIWEIGHT [t0+x], [t2+x], [t4+x]
BIWEIGHT [t0+x+t1], [t2+x+t3], [t4+x+t5]
%assign x x+mmsize/2
%rep 1+%2/(mmsize*2)
BIWEIGHT_ROW t0+x, t2+x, t4+x, %2
BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%assign x x+mmsize
%endrep
%endif
lea t0, [t0+t1*2]
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
......@@ -124,12 +167,23 @@ AVG_START x264_pixel_avg_weight_w%2_%1
REP_RET
%endmacro
; Instantiate the weighted-average functions.
; BIWEIGHT / BIWEIGHT_START are re-%define'd before each group so that the
; AVG_WEIGHT expansions that follow pick up the matching implementation.
; INIT_MMX / INIT_XMM presumably select the register set (and mmsize) used by
; the expansions after them -- TODO confirm against their definitions.

; --- pmullw-based implementation: mmxext and sse2 variants ---
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX
AVG_WEIGHT mmxext, 4
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
; no separate sse2 w4 function is generated; the sse2 name aliases the mmxext one
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8
AVG_WEIGHT sse2, 16
; --- pmaddubsw-based implementation: ssse3 variants ---
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX
AVG_WEIGHT ssse3, 4
INIT_XMM
AVG_WEIGHT ssse3, 8
AVG_WEIGHT ssse3, 16
......@@ -145,7 +199,7 @@ AVG_WEIGHT sse2, 16
cglobal x264_pixel_avg_%1x%2_%3,0,0
mov eax, %2
cmp dword r6m, 32
jne x264_pixel_avg_weight_w%1_mmxext
jne x264_pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
test dword r4m, 15
jz x264_pixel_avg_w%1_sse2
......@@ -168,38 +222,31 @@ cglobal x264_pixel_avg_%1x%2_%3,0,0
REP_RET
%endmacro
INIT_MMX
AVG_START x264_pixel_avg_w4_mmxext
.height_loop:
movd mm0, [t2]
movd mm1, [t2+t3]
pavgb mm0, [t4]
pavgb mm1, [t4+t5]
movd [t0], mm0
movd [t0+t1], mm1
AVG_END
; AVG_FUNC name, load_insn, store_insn
; Emits an unweighted two-source average function.
;   %1 : symbol to declare with cglobal
;   %2 : instruction used to load the rows of the first source
;   %3 : instruction used to store the result rows
; The body handles two rows: [t2] and [t2+t3] are averaged (pavgb, rounded
; unsigned byte average) with [t4] and [t4+t5] and written to [t0] and
; [t0+t1]. AVG_START emits the prologue and the .height_loop label; AVG_END
; is defined outside this view and presumably advances the pointers and
; loops -- TODO confirm.
%macro AVG_FUNC 3
cglobal %1
AVG_START
%2 m0, [t2]
%2 m1, [t2+t3]
pavgb m0, [t4]
pavgb m1, [t4+t5]
%3 [t0], m0
%3 [t0+t1], m1
AVG_END
%endmacro
INIT_MMX
AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext
AVG_START x264_pixel_avg_w8_mmxext
.height_loop:
movq mm0, [t2]
movq mm1, [t2+t3]
pavgb mm0, [t4]
pavgb mm1, [t4+t5]
movq [t0], mm0
movq [t0+t1], mm1
AVG_END
AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8, mmxext
AVGH 8, 4, mmxext
AVG_START x264_pixel_avg_w16_mmxext
.height_loop:
cglobal x264_pixel_avg_w16_mmxext
AVG_START
movq mm0, [t2 ]
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
......@@ -212,27 +259,27 @@ AVG_START x264_pixel_avg_w16_mmxext
movq [t0+8], mm1
movq [t0+t1 ], mm2
movq [t0+t1+8], mm3
AVG_END
AVG_END
AVGH 16, 16, mmxext
AVGH 16, 8, mmxext
AVG_START x264_pixel_avg_w16_sse2
.height_loop:
movdqu xmm0, [t2]
movdqu xmm1, [t2+t3]
pavgb xmm0, [t4]
pavgb xmm1, [t4+t5]
movdqa [t0], xmm0
movdqa [t0+t1], xmm1
AVG_END
INIT_XMM
AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
AVGH 16, 8, sse2
AVGH 8, 16, sse2
AVGH 8, 8, sse2
AVGH 8, 4, sse2
AVGH 16, 8, sse2
AVGH 8, 16, sse2
AVGH 8, 8, sse2
AVGH 8, 4, sse2
AVGH 16, 16, ssse3
AVGH 16, 8, ssse3
AVGH 8, 16, ssse3
AVGH 8, 8, ssse3
AVGH 8, 4, ssse3
INIT_MMX
AVGH 4, 8, ssse3
AVGH 4, 4, ssse3
AVGH 4, 2, ssse3
......
......@@ -29,20 +29,19 @@
#include "common/common.h"
#include "mc.h"
/* NASM functions */
extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x4_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
/* Declare the mmxext, sse2 and ssse3 variants of one asm routine.
 * Expands to three prototypes of the form `void <name>_<suffix> <signature>;`,
 * where <signature> is the parenthesised parameter list. */
#define DECL_SUF( name, signature )\
    void name##_mmxext signature;\
    void name##_sse2   signature;\
    void name##_ssse3  signature;
DECL_SUF( x264_pixel_avg_16x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_16x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_8x16, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_8x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_8x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x8, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x4, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
DECL_SUF( x264_pixel_avg_4x2, ( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ))
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
......@@ -310,6 +309,15 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_SSSE3) )
return;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_ssse3;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_ssse3;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_ssse3;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_ssse3;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_ssse3;
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
......
......@@ -774,15 +774,15 @@ static int check_mc( int cpu_ref, int cpu_new )
#define MC_TEST_AVG( name, weight ) \
for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
{ \
memcpy( buf2, buf1, 1024 ); \
memcpy( buf4, buf3, 1024 ); \
memcpy( buf3, buf1+320, 320 ); \
memcpy( buf4, buf1+320, 320 ); \
if( mc_a.name[i] != mc_ref.name[i] ) \
{ \
set_func_name( "%s_%s", #name, pixel_names[i] );\
used_asm = 1; \
call_c1( mc_c.name[i], buf3, 16, buf2+1, 16, buf1+18, 16, weight ); \
call_a1( mc_a.name[i], buf4, 16, buf2+1, 16, buf1+18, 16, weight ); \
if( memcmp( buf3, buf4, 1024 ) ) \
if( memcmp( buf3, buf4, 320 ) ) \
{ \
ok = 0; \
fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
......@@ -792,7 +792,7 @@ static int check_mc( int cpu_ref, int cpu_new )
} \
}
ok = 1; used_asm = 0;
for( w = -64; w <= 128 && ok; w++ )
for( w = -63; w <= 127 && ok; w++ )
MC_TEST_AVG( avg, w );
report( "mc wpredb :" );
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment