Commit 0df24cf9 authored by Loren Merritt's avatar Loren Merritt

convert mc's inline asm to nasm (slight speedup and msvc compatibility).

patch by Mathieu Monnier.


git-svn-id: svn://svn.videolan.org/x264/trunk@180 df754926-b1dd-0310-bc7b-ec298dee348c
parent 48c34d0b
......@@ -32,7 +32,8 @@ ifeq ($(ARCH),X86)
CFLAGS+=-DHAVE_MMXEXT -DHAVE_SSE2
SRCS+= common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC= common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm
common/i386/pixel-a.asm common/i386/mc-a.asm \
common/i386/mc-a2.asm common/i386/predict-a.asm
OBJASM= $(ASMSRC:%.asm=%.o)
endif
......
......@@ -22,7 +22,8 @@ SRC_C= common/mc.c common/predict.c common/pixel.c common/macroblock.c \
encoder/encoder.c encoder/eval.c \
common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm common/i386/mc-a.asm
SRC_ASM= common/i386/dct-a.asm common/i386/cpu-a.asm common/i386/pixel-a.asm \
common/i386/mc-a.asm common/i386/mc-a2.asm common/i386/predict-a.asm
# Alias
RM= rm -rf
......
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 32
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
;-----------------------------------------------------------------------------
; cglobal name
; Export `name` as a global symbol. When PREFIX is defined (targets whose C
; ABI decorates symbols with a leading underscore, e.g. Win32/Mach-O), the
; exported symbol is _name and the %define lets the rest of this file keep
; using the plain name.
;-----------------------------------------------------------------------------
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=============================================================================
; Read only data
;=============================================================================
SECTION .rodata data align=16
ALIGN 16
; NOTE(review): the "*_one" labels hold rounding biases, not the value 1:
; mmx_dw_one = 16 (bias for the >>5 normalization), mmx_dd_one = 512
; (bias for the >>10 center normalization).
mmx_dw_one:
times 4 dw 16 ; 4 x word 16: rounding for the single 6-tap pass
mmx_dd_one:
times 2 dd 512 ; 2 x dword 512: rounding for the 2-D (center) pass
mmx_dw_20:
times 4 dw 20 ; 6-tap filter coefficient +20
mmx_dw_5:
times 4 dw -5 ; 6-tap filter coefficient -5 (label says 5, value is -5)
SECTION .data
; Writable statics used by x264_center_filter_mmxext to hold its arguments
; and pointers while esp is repurposed as a scratch-buffer cursor.
; NOTE(review): static storage makes that function non-reentrant and
; thread-unsafe -- confirm callers never run it concurrently.
width:
dd 0
height:
dd 0
dstp1:
dd 0 ; dst1 stride
dstp2:
dd 0 ; dst2 stride
buffer:
dd 0 ; base of the 2*src_stride-byte scratch buffer (below original esp)
dst1:
dd 0 ; current dst1 row pointer
dst2:
dd 0 ; current dst2 row pointer
src:
dd 0 ; current src row pointer (already offset by -2*stride)
;=============================================================================
; Macros
;=============================================================================
;-----------------------------------------------------------------------------
; LOAD_4 dst1, dst2, dst3, dst4, mem1, mem2, mem3, mem4, zero
; Load four groups of 4 pixels (one dword each) and widen each u8 to a
; 16-bit word in the destination registers. %9 must be an all-zero mmreg.
;-----------------------------------------------------------------------------
%macro LOAD_4 9
movd %1, %5
movd %2, %6
movd %3, %7
movd %4, %8
punpcklbw %1, %9
punpcklbw %2, %9
punpcklbw %3, %9
punpcklbw %4, %9
%endmacro
;-----------------------------------------------------------------------------
; FILT_2 acc, x
; acc -= 5*x, computed as acc - x - 4*x. Clobbers x (left holding 4*x).
;-----------------------------------------------------------------------------
%macro FILT_2 2
psubw %1, %2
psllw %2, 2
psubw %1, %2
%endmacro
;-----------------------------------------------------------------------------
; FILT_4 acc, a, b
; acc += 20*(a+b), computed as 4*(a+b) + 16*(a+b). Clobbers a.
;-----------------------------------------------------------------------------
%macro FILT_4 3
paddw %2, %3
psllw %2, 2
paddw %1, %2
psllw %2, 2
paddw %1, %2
%endmacro
;-----------------------------------------------------------------------------
; FILT_6 acc, a, b, round
; acc = (acc - 5*a + b + round) >> 5 (arithmetic shift). Completes the
; 6-tap filter; round is normally mmx_dw_one (= 16). Clobbers a.
;-----------------------------------------------------------------------------
%macro FILT_6 4
psubw %1, %2
psllw %2, 2
psubw %1, %2
paddw %1, %3
paddw %1, %4
psraw %1, 5
%endmacro
;-----------------------------------------------------------------------------
; FILT_ALL base
; Vertical 6-tap [1,-5,20,20,-5,1] over 6 rows of 4 pixels starting at
; [base]. Leaves the raw (unrounded, unshifted) 16-bit sums in mm1.
; Requires: mm0 = 0, ecx = stride, ebx = 3*stride, edx = 5*stride.
; Clobbers mm2-mm6.
;-----------------------------------------------------------------------------
%macro FILT_ALL 1
LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
FILT_2 mm1, mm2 ; mm1 = row0 - 5*row1
movd mm5, [%1 + 4 * ecx]
movd mm6, [%1 + edx]
FILT_4 mm1, mm3, mm4 ; mm1 += 20*(row2 + row3)
punpcklbw mm5, mm0
punpcklbw mm6, mm0
psubw mm1, mm5 ; mm1 -= 5*row4 (FILT_2 inlined)
psllw mm5, 2
psubw mm1, mm5
paddw mm1, mm6 ; mm1 += row5
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal x264_vertical_filter_mmxext
cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext
;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
; uint8_t *dst2, int i_dst2_stride,
; uint8_t *src, int i_src_stride,
; int i_width, int i_height );
;
;-----------------------------------------------------------------------------
ALIGN 16
;-----------------------------------------------------------------------------
; Computes the "center" (diagonal) half-pel plane: a vertical 6-tap pass
; whose rounded (>>5) results go to dst1 and whose raw 16-bit sums are kept
; in a scratch row buffer, followed by a horizontal 6-tap pass over that
; buffer into dst2 (rounded >>10).
; Register roles inside the loops:
;   ecx = src_stride, ebx = 3*src_stride, edx = 5*src_stride,
;   mm0 = 0 (for unpacking), ebp = remaining row count.
; NOTE(review): esp is repurposed as a cursor into a scratch buffer placed
; 2*src_stride bytes below the original stack pointer; a signal/interrupt
; delivered on this stack while the function runs would corrupt it.
; Arguments live in non-reentrant statics (SECTION .data above).
; Assumes width is a multiple of 4 and width >= 8, height > 0 -- inferred
; from the loop structure, confirm against callers.
;-----------------------------------------------------------------------------
x264_center_filter_mmxext :
push edi
push esi
push ebx
push ebp
; spill the 8 cdecl arguments to statics (offsets include the 4 pushes)
mov esi, [esp + 36] ; src
mov edx, [esp + 20] ; dst1
mov [dst1], edx
mov edi, [esp + 28] ; dst2
mov [dst2], edi
mov eax, [esp + 44] ; width
mov [width], eax
mov eax, [esp + 48] ; height
mov [height], eax
mov eax, [esp + 24] ; dst1_stride
mov [dstp1], eax
mov eax, [esp + 32] ; dst2_stride
mov [dstp2], eax
mov ecx, [esp + 40] ; src_stride
sub esp, ecx
sub esp, ecx ; esp is now at the beginning of the buffer
mov [buffer], esp
;sub esi, 2
sub esi, ecx
sub esi, ecx ; esi <- src - 2*stride (top of the 6-tap vertical window)
mov [src], esi
;sub edi, 2
mov ebx, ecx
shl ebx, 1
add ebx, ecx ; ebx <- 3 * src_stride
mov edx, ecx
shl edx, 1
add edx, ebx ; edx <- 5 * src_stride
pxor mm0, mm0 ; mm0 <- 0, required by FILT_ALL's unpacks
movq mm7, [mmx_dd_one] ; for rounding
; NOTE(review): the mm7 load above appears dead -- loopcx2 zeroes mm7 and
; re-reads the rounding constant straight from memory.
mov ebp, [height]

loopcy: ; one iteration per output row
dec ebp
mov eax, [width]
mov edi, [dst1]
mov esp, [buffer] ; rewind the scratch-buffer write cursor
mov esi, [src]
; --- vertical pass: first 4 columns; also pre-store a replicated left
; border word so the horizontal pass can read past the left edge ---
FILT_ALL esi
pshufw mm2, mm1, 0 ; broadcast the leftmost raw result
movq [esp], mm2
add esp, 8
movq [esp], mm1
add esp, 8
paddw mm1, [mmx_dw_one] ; +16 then >>5: normalize for dst1
psraw mm1, 5
packuswb mm1, mm1 ; clip to u8
movd [edi], mm1
sub eax, 8 ; 4 columns done here + 4 reserved for the tail
add edi, 4
add esi, 4
loopcx1: ; interior columns, 4 at a time
sub eax, 4
FILT_ALL esi
movq [esp], mm1 ; keep raw sums for the horizontal pass
paddw mm1, [mmx_dw_one]
psraw mm1, 5
packuswb mm1, mm1
movd [edi], mm1
add esp, 8
add esi, 4
add edi, 4
test eax, eax
jnz loopcx1
; last 4 columns: also store a right-border extension after the results
FILT_ALL esi
pshufw mm2, mm1, 7 ; right-edge extension (word 0 = rightmost raw result)
movq [esp], mm1
add esp, 8
movq [esp], mm2
paddw mm1, [mmx_dw_one]
psraw mm1, 5
packuswb mm1, mm1
movd [edi], mm1
; advance src and dst1 by one row
mov esi, [src]
add esi, ecx
mov [src], esi
mov edi, [dst1]
add edi, [dstp1]
mov [dst1], edi
; --- horizontal pass over the 16-bit buffer, into dst2 ---
mov eax, [width]
mov edi, [dst2]
mov esp, [buffer]
add esp, 4 ; skip part of the left border
loopcx2: ; 4 output pixels per iteration, walking right to left
sub eax, 4
movq mm2, [esp + 2 * eax + 2]
movq mm3, [esp + 2 * eax + 4]
movq mm4, [esp + 2 * eax + 6]
movq mm5, [esp + 2 * eax + 8]
movq mm1, [esp + 2 * eax]
movq mm6, [esp + 2 * eax + 10]
paddw mm2, mm5 ; pair sharing the -5 tap
paddw mm3, mm4 ; pair sharing the +20 tap
paddw mm1, mm6 ; pair sharing the +1 tap
movq mm5, [mmx_dw_20]
movq mm4, [mmx_dw_5]
movq mm6, mm1
pxor mm7, mm7
punpckhwd mm5, mm2 ; interleave weights with pixel pairs so that
punpcklwd mm4, mm3 ; pmaddwd yields 20*pair20 - 5*pair5 per dword
punpcklwd mm2, [mmx_dw_20]
punpckhwd mm3, [mmx_dw_5]
pcmpgtw mm7, mm1 ; sign mask of the +1 pair for widening
pmaddwd mm2, mm4 ; low two dwords: 20*pair20 - 5*pair5
pmaddwd mm3, mm5 ; high two dwords
punpcklwd mm1, mm7 ; sign-extend the +1 pair to dwords
punpckhwd mm6, mm7
paddd mm2, mm1
paddd mm3, mm6
paddd mm2, [mmx_dd_one] ; +512 then >>10: full 2-D normalization
paddd mm3, [mmx_dd_one]
psrad mm2, 10
psrad mm3, 10
packssdw mm2, mm3
packuswb mm2, mm0 ; clip to u8
movd [edi + eax], mm2
test eax, eax
jnz loopcx2
add edi, [dstp2]
mov [dst2], edi
test ebp, ebp
jnz loopcy
mov esp, [buffer] ; restore the true stack pointer:
shl ecx, 1
add esp, ecx ; esp = buffer + 2*src_stride = original esp
pop ebp
pop ebx
pop esi
pop edi
ret
;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride,
; int i_width, int i_height );
;
;-----------------------------------------------------------------------------
ALIGN 16
;-----------------------------------------------------------------------------
; Horizontal 6-tap half-pel filter:
;   dst[x] = clip8(( src[x-2] -5*src[x-1] +20*src[x] +20*src[x+1]
;                    -5*src[x+2] + src[x+3] + 16 ) >> 5)
; Processes 8 output pixels per inner iteration (two MMX groups of 4),
; walking columns right to left. cdecl args after the 2 pushes:
;   [esp+12]=dst, [esp+16]=dst_stride, [esp+20]=src, [esp+24]=src_stride,
;   [esp+28]=width, [esp+32]=height.
; Assumes width is a multiple of 8 and width,height > 0 -- inferred from
; the loop structure, confirm against callers.
;-----------------------------------------------------------------------------
x264_horizontal_filter_mmxext :
push edi
push esi
mov edi, [esp + 12] ; dst
mov esi, [esp + 20] ; src
pxor mm0, mm0 ; zero register for byte->word unpacks
movq mm7, [mmx_dw_one] ; +16 rounding bias consumed by FILT_6
mov ecx, [esp + 32] ; height
sub esi, 2 ; bias the pointer so [esi+x] is tap src[x-2]
loophy: ; one iteration per row
dec ecx
mov eax, [esp + 28] ; width
loophx: ; 8 output pixels per iteration
sub eax, 8
; low 4 outputs: taps at [esi+eax+0..5], accumulated into mm1
LOAD_4 mm1, mm2, mm3, mm4, [esi + eax], [esi + eax + 1], [esi + eax + 2], [esi + eax + 3], mm0
FILT_2 mm1, mm2 ; mm1 -= 5*tap1
movd mm5, [esi + eax + 4]
movd mm6, [esi + eax + 5]
FILT_4 mm1, mm3, mm4 ; mm1 += 20*(tap2 + tap3)
; high 4 outputs: taps at [esi+eax+4..9]; loads interleaved with the
; tail of the low-half computation to hide latency
movd mm2, [esi + eax + 4]
movd mm3, [esi + eax + 6]
punpcklbw mm5, mm0
punpcklbw mm6, mm0
FILT_6 mm1, mm5, mm6, mm7 ; mm1 = (mm1 - 5*tap4 + tap5 + 16) >> 5
movd mm4, [esi + eax + 7]
movd mm5, [esi + eax + 8]
punpcklbw mm2, mm0
punpcklbw mm3, mm0 ; mm2(1), mm3(20), mm6(-5) ready
FILT_2 mm2, mm6
movd mm6, [esi + eax + 9]
punpcklbw mm4, mm0
punpcklbw mm5, mm0 ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
FILT_4 mm2, mm3, mm4
punpcklbw mm6, mm0
FILT_6 mm2, mm5, mm6, mm7
packuswb mm1, mm2 ; clip both halves to u8
movq [edi + eax], mm1
test eax, eax
jnz loophx
add esi, [esp + 24] ; src_pitch
add edi, [esp + 16] ; dst_pitch
test ecx, ecx
jnz loophy
pop esi
pop edi
ret
......@@ -36,6 +36,8 @@
#include "common/clip1.h"
#include "mc.h"
#if 0
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3)
#define USED_UINT64(foo) \
static const uint64_t foo __asm__ (#foo) __attribute__((used))
......@@ -1021,6 +1023,7 @@ static void motion_compensation_luma_sse2( uint8_t *src, int i_src_stride,
MOTION_COMPENSATION_LUMA
}
#endif
void mc_luma_mmx( uint8_t *src[4], int i_src_stride,
uint8_t *dst, int i_dst_stride,
......@@ -1141,6 +1144,7 @@ void x264_mc_sse2_init( x264_mc_functions_t *pf )
pf->get_ref = get_ref_mmx;
}
#if 0
void get_funcs_mmx(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
{
*int_h = mc_hh_w16;
......@@ -1154,3 +1158,4 @@ void get_funcs_sse2(pf_mc_t *int_h, pf_mc_t *int_v, pf_mc_t *int_hv)
*int_v = mc_hv_w16;
*int_hv = mc_hc_w16;
}
#endif
;*****************************************************************************
;* predict-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 32
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
;-----------------------------------------------------------------------------
; cglobal name
; Export `name` as a global symbol, adding the leading-underscore C-symbol
; decoration when PREFIX is defined (Win32/Mach-O style ABIs). Duplicated
; from mc-a2.asm because each .asm file is assembled standalone.
;-----------------------------------------------------------------------------
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=============================================================================
; Read only data
;=============================================================================
SECTION .rodata data align=16
SECTION .data
;=============================================================================
; Macros
;=============================================================================
;-----------------------------------------------------------------------------
; SAVE_0_1 addr
; Store mm0:mm1 as one 16-byte row at [addr] (two 8-byte movq stores).
;-----------------------------------------------------------------------------
%macro SAVE_0_1 1
movq [%1] , mm0
movq [%1 + 8] , mm1
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal predict_8x8_v_mmx
cglobal predict_16x16_v_mmx
;-----------------------------------------------------------------------------
;
; void predict_8x8_v_mmx( uint8_t *src, int i_stride )
;
;-----------------------------------------------------------------------------
ALIGN 16
;-----------------------------------------------------------------------------
; void predict_8x8_v_mmx( uint8_t *src, int i_stride )
; Vertical intra prediction: replicate the 8 pixels of the row just above
; the block (line -1) into all 8 rows of the block.
; cdecl: [esp+4] = src, [esp+8] = i_stride. Clobbers edx, ecx, mm0.
;-----------------------------------------------------------------------------
predict_8x8_v_mmx :
mov edx , [esp + 4] ; edx <- src
mov ecx , [esp + 8] ; ecx <- i_stride
sub edx , ecx ; edx <- line -1 (the predictor row)
movq mm0 , [edx] ; load the 8 predictor pixels
%rep 8 ; copy the predictor into lines 0..7
add edx , ecx
movq [edx] , mm0
%endrep
ret
;-----------------------------------------------------------------------------
;
; void predict_16x16_v_mmx( uint8_t *src, int i_stride )
;
;-----------------------------------------------------------------------------
ALIGN 16
;-----------------------------------------------------------------------------
; void predict_16x16_v_mmx( uint8_t *src, int i_stride )
; Vertical intra prediction: replicate the 16 pixels of the row just above
; the block (line -1) into all 16 rows of the block.
; cdecl: [esp+4] = src, [esp+8] = i_stride. Clobbers edx, ecx, mm0, mm1.
;-----------------------------------------------------------------------------
predict_16x16_v_mmx :
mov edx, [esp + 4] ; edx <- src
mov ecx, [esp + 8] ; ecx <- i_stride
sub edx, ecx ; edx <- line -1 (the predictor row)
movq mm0, [edx] ; predictor pixels 0..7
movq mm1, [edx + 8] ; predictor pixels 8..15
%rep 16 ; copy the predictor into lines 0..15
add edx, ecx
movq [edx] , mm0
movq [edx + 8] , mm1
%endrep
ret
......@@ -152,6 +152,10 @@ static void predict_16x16_h( uint8_t *src, int i_stride )
}
}
/* NASM implementation (common/i386/predict-a.asm). Fixed: the declaration
 * was missing its `void` return type (implicit int is invalid in C99 and
 * inconsistent with the predict_8x8_v_mmx declaration). */
extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
#if 0
static void predict_16x16_v( uint8_t *src, int i_stride )
{
int i;
......@@ -168,6 +172,7 @@ static void predict_16x16_v( uint8_t *src, int i_stride )
src += i_stride;
}
}
#endif
/****************************************************************************
* 8x8 prediction for intra chroma block DC, H, V, P
......@@ -301,6 +306,10 @@ static void predict_8x8_h( uint8_t *src, int i_stride )
src += i_stride;
}
}
extern void predict_8x8_v_mmx( uint8_t *src, int i_stride );
#if 0
static void predict_8x8_v( uint8_t *src, int i_stride )
{
int i;
......@@ -313,6 +322,7 @@ static void predict_8x8_v( uint8_t *src, int i_stride )
src += i_stride;
}
}
#endif
/****************************************************************************
......@@ -404,7 +414,7 @@ static void predict_4x4_v( uint8_t *src, int i_stride )
****************************************************************************/
void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
{
pf[I_PRED_16x16_V ] = predict_16x16_v;