Commit 458e63ca authored by Loren Merritt

mmx avg (already existed but not used for bipred)

mmx biweighted avg (3x faster than C)
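For reference, the per-pixel operation the new MMX routines implement is the implicit-bipred blend sketched below in C (a minimal sketch; the function name is illustrative, not part of this commit):

#include <stdint.h>

/* Reference biweighted average under the asm's assumptions:
 * log2_denom = 5, offset = 0, weight_src = 64 - weight_dst. */
static inline uint8_t biweight_1p( uint8_t dst, uint8_t src, int weight_dst )
{
    int v = ( dst*weight_dst + src*(64 - weight_dst) + 32 ) >> 6;
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}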



git-svn-id: svn://svn.videolan.org/x264/trunk@307 df754926-b1dd-0310-bc7b-ec298dee348c
parent 31126194
@@ -74,6 +74,10 @@ cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w16_sse2
cglobal x264_pixel_avg_weight_4x4_mmxext
cglobal x264_pixel_avg_weight_w8_mmxext
cglobal x264_pixel_avg_weight_w16_mmxext
cglobal x264_mc_copy_w4_mmxext
cglobal x264_mc_copy_w8_mmxext
cglobal x264_mc_copy_w16_mmxext
@@ -247,6 +251,98 @@ ALIGN 4
;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
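; per pixel: dst = clip( ( dst*i_weight_dst + src*(64 - i_weight_dst) + 32 ) >> 6 )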
%macro BIWEIGHT_4P_MMX 2 ; %1 = 4 dst pixels, %2 = 4 src pixels
movd mm0, %1
movd mm1, %2
punpcklbw mm0, mm7 ; unpack bytes to words
punpcklbw mm1, mm7
pmullw mm0, mm4 ; dst * weight_dst
pmullw mm1, mm5 ; src * weight_src
paddw mm0, mm1
paddw mm0, mm6 ; + 32 (rounding)
psraw mm0, 6 ; >> (log2_denom+1)
pmaxsw mm0, mm7 ; clip negative results to 0
packuswb mm0, mm0 ; saturate to [0,255], repack to bytes
movd %1, mm0 ; store back to dst
%endmacro
%macro BIWEIGHT_START_MMX 0
; mov rdi, rdi ; dst
movsxd rsi, esi ; i_dst
; mov rdx, rdx ; src
movsxd rcx, ecx ; i_src
; movsxd r8, r8d ; i_weight_dst
; movsxd r9, r9d ; i_height
movd mm4, r8d
pshufw mm4, mm4, 0 ; weight_dst
movq mm5, [pw_64]
psubw mm5, mm4 ; weight_src
movq mm6, [pw_32] ; rounding
pxor mm7, mm7
ALIGN 4
.height_loop
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_w16_mmxext:
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [rdi ], [rdx ]
BIWEIGHT_4P_MMX [rdi+ 4], [rdx+ 4]
BIWEIGHT_4P_MMX [rdi+ 8], [rdx+ 8]
BIWEIGHT_4P_MMX [rdi+12], [rdx+12]
add rdi, rsi
add rdx, rcx
dec r9d
jnz .height_loop
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_w8_mmxext:
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [rdi ], [rdx ]
BIWEIGHT_4P_MMX [rdi+4 ], [rdx+4 ]
BIWEIGHT_4P_MMX [rdi+rsi ], [rdx+rcx ]
BIWEIGHT_4P_MMX [rdi+rsi+4], [rdx+rcx+4]
lea rdi, [rdi+rsi*2]
lea rdx, [rdx+rcx*2]
sub r9d, byte 2
jnz .height_loop
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_4x4_mmxext:
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [rdi ], [rdx ] ; row 0
BIWEIGHT_4P_MMX [rdi+rsi ], [rdx+rcx ] ; row 1
BIWEIGHT_4P_MMX [rdi+rsi*2], [rdx+rcx*2] ; row 2
add rdi, rsi
add rdx, rcx
BIWEIGHT_4P_MMX [rdi+rsi*2], [rdx+rcx*2] ; row 3 (base advanced by one stride)
ret
;=============================================================================
; pixel copy
;=============================================================================
@@ -72,6 +72,10 @@ cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w16_sse2
cglobal x264_pixel_avg_weight_4x4_mmxext
cglobal x264_pixel_avg_weight_w8_mmxext
cglobal x264_pixel_avg_weight_w16_mmxext
cglobal x264_mc_copy_w4_mmxext
cglobal x264_mc_copy_w8_mmxext
cglobal x264_mc_copy_w16_mmxext
@@ -244,6 +248,105 @@ ALIGN 4
ret
;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
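; per pixel: dst = clip( ( dst*i_weight_dst + src*(64 - i_weight_dst) + 32 ) >> 6 )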
%macro BIWEIGHT_4P_MMX 2 ; %1 = 4 dst pixels, %2 = 4 src pixels
movd mm0, %1
movd mm1, %2
punpcklbw mm0, mm7 ; unpack bytes to words
punpcklbw mm1, mm7
pmullw mm0, mm4 ; dst * weight_dst
pmullw mm1, mm5 ; src * weight_src
paddw mm0, mm1
paddw mm0, mm6 ; + 32 (rounding)
psraw mm0, 6 ; >> (log2_denom+1)
pmaxsw mm0, mm7 ; clip negative results to 0
packuswb mm0, mm0 ; saturate to [0,255], repack to bytes
movd %1, mm0 ; store back to dst
%endmacro
%macro BIWEIGHT_START_MMX 0
push edi
push esi
mov edi, [esp+12] ; dst
mov esi, [esp+16] ; i_dst
mov edx, [esp+20] ; src
mov ecx, [esp+24] ; i_src
pshufw mm4, [esp+28], 0 ; weight_dst
movq mm5, [pw_64]
psubw mm5, mm4 ; weight_src
movq mm6, [pw_32] ; rounding
pxor mm7, mm7
%endmacro
%macro BIWEIGHT_END_MMX 0
pop esi
pop edi
ret
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_w16_mmxext:
BIWEIGHT_START_MMX
mov eax, [esp+32] ; i_height
ALIGN 4
.height_loop
BIWEIGHT_4P_MMX [edi ], [edx ]
BIWEIGHT_4P_MMX [edi+ 4], [edx+ 4]
BIWEIGHT_4P_MMX [edi+ 8], [edx+ 8]
BIWEIGHT_4P_MMX [edi+12], [edx+12]
add edi, esi
add edx, ecx
dec eax
jnz .height_loop
BIWEIGHT_END_MMX
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int )
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_w8_mmxext:
BIWEIGHT_START_MMX
mov eax, [esp+32] ; i_height
ALIGN 4
.height_loop
BIWEIGHT_4P_MMX [edi ], [edx ]
BIWEIGHT_4P_MMX [edi+4 ], [edx+4 ]
BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ]
BIWEIGHT_4P_MMX [edi+esi+4], [edx+ecx+4]
lea edi, [edi+esi*2]
lea edx, [edx+ecx*2]
sub eax, byte 2
jnz .height_loop
BIWEIGHT_END_MMX
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int )
;-----------------------------------------------------------------------------
x264_pixel_avg_weight_4x4_mmxext:
BIWEIGHT_START_MMX
BIWEIGHT_4P_MMX [edi ], [edx ] ; row 0
BIWEIGHT_4P_MMX [edi+esi ], [edx+ecx ] ; row 1
BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2] ; row 2
add edi, esi
add edx, ecx
BIWEIGHT_4P_MMX [edi+esi*2], [edx+ecx*2] ; row 3 (base advanced by one stride)
BIWEIGHT_END_MMX
;=============================================================================
; pixel copy
;=============================================================================
@@ -42,11 +42,39 @@ extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *
extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_mc_copy_w4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
#define AVG(W,H) \
static void x264_pixel_avg_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src ) \
{ \
x264_pixel_avg_w ## W ## _mmxext( dst, i_dst, dst, i_dst, src, i_src, H ); \
}
AVG(16,16)
AVG(16,8)
AVG(8,16)
AVG(8,8)
AVG(8,4)
AVG(4,8)
AVG(4,4)
AVG(4,2)
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
x264_pixel_avg_weight_w ## W ## _mmxext( dst, i_dst, src, i_src, i_weight_dst, H ); \
}
AVG_WEIGHT(16,16)
AVG_WEIGHT(16,8)
AVG_WEIGHT(8,16)
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
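For instance, AVG_WEIGHT(16,16) expands to a thin wrapper that fixes the height argument:

void x264_pixel_avg_weight_16x16_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst )
{
    x264_pixel_avg_weight_w16_mmxext( dst, i_dst, src, i_src, i_weight_dst, 16 );
}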
#if 0
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
@@ -1128,6 +1156,23 @@ void x264_mc_mmxext_init( x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma_mmx;
pf->get_ref = get_ref_mmx;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmxext;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_mmxext;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_mmxext;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_mmxext;
pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_mmxext;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
// avg_weight_4x8 is rare and 4x2 is not used
}
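A sketch of how these table entries might be invoked (the wrapper and variable names below are assumptions for illustration, not code from this commit):

/* Hypothetical caller: blend a 16x16 list-1 prediction in 'src' into the
 * list-0 prediction already held in 'dst'; dst is weighted by i_weight_dst/64. */
static void blend_16x16( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst )
{
    x264_mc_functions_t mc;
    x264_mc_mmxext_init( &mc );
    mc.avg_weight[PIXEL_16x16]( dst, i_dst, src, i_src, i_weight_dst );
}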
void x264_mc_sse2_init( x264_mc_functions_t *pf )
{