Commit 02713c24 authored by Min Chen

update & SSE2 support


git-svn-id: svn://svn.videolan.org/x264/trunk@10 df754926-b1dd-0310-bc7b-ec298dee348c
parent 77bce7d1
......@@ -4,8 +4,9 @@
# Defines: HAVE_ALTIVEC
# CFLAGS: -faltivec
#
PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2
CC=gcc
CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
CFLAGS=-g -Wall -I. -DDEBUG -O4 -funroll-loops -D__X264__ -DHAVE_MALLOC_H $(PFLAGS)
SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
core/frame.c core/dct.c core/cpu.c core/cabac.c \
......@@ -18,7 +19,7 @@ SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
AS= nasm
# for linux
ASFLAGS=-f elf
ASFLAGS=-f elf $(PFLAGS)
# for cygwin
#ASFLAGS=-f gnuwin32 -DPREFIX
......
# Makefile: tuned for i386/MMX cygwin system only
#
PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -DHAVE_SSE2
CC=gcc
CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H -DHAVE_MMXEXT -DARCH_X86
CFLAGS=-g -Wall -I. -mno-cygwin -DDEBUG -O4 -funroll-loops -D__X264__ -UHAVE_MALLOC_H $(PFLAGS)
SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
core/frame.c core/dct.c core/cpu.c core/cabac.c \
......@@ -14,8 +15,8 @@ SRCS= core/mc.c core/predict.c core/pixel.c core/macroblock.c \
AS= nasm
#for cygwin
ASFLAGS=-f win32 -DPREFIX
ASFLAGS=-f win32 -DPREFIX $(PFLAGS)
ASMSRC= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.asm
OBJASM= $(ASMSRC:%.asm=%.o)
......
......@@ -4,7 +4,7 @@
#
# Author: x264 by Laurent Aimar <fenrir@via.ecp.fr>
#
# $Id: Makefile,v 1.3 2004/06/14 05:47:51 chenm001 Exp $
# $Id: Makefile,v 1.4 2004/06/18 02:00:40 chenm001 Exp $
##############################################################################
# Current dir
......@@ -27,6 +27,11 @@ SRC_ASM= core/i386/dct.asm core/i386/cpu.asm core/i386/pixel.asm core/i386/mc.as
# Alias
RM= rm -rf
##############################################################################
# PFLAGS
##############################################################################
PFLAGS=-DARCH_X86 -DHAVE_MMXEXT -UHAVE_SSE2
##############################################################################
# CFLAGS
##############################################################################
......@@ -35,7 +40,8 @@ RM= rm -rf
# The `mingw-runtime` package is required when building with -mno-cygwin
CFLAGS += -I$(DIR_SRC)
CFLAGS += -mno-cygwin
CFLAGS += -D__X264__ -DARCH_X86 -DHAVE_MMXEXT -D_CYGWIN
CFLAGS += -D__X264__ -D_CYGWIN
CFLAGS += $(PFLAGS)
# Optional Compiler options
CFLAGS += -g -Wall -DDEBUG
......@@ -55,7 +61,7 @@ LDFLAGS += -L$(DIR_LIB) -lx264
# ASM
##############################################################################
AS= nasm
ASFLAGS= -f win32 -DPREFIX
ASFLAGS= -f win32 -DPREFIX $(PFLAGS)
##############################################################################
# Rules
##############################################################################
......
......@@ -2,7 +2,7 @@
* mc.c: h264 encoder library (Motion Compensation)
*****************************************************************************
* Copyright (C) 2003 Laurent Aimar
* $Id: mc-c.c,v 1.4 2004/06/17 09:01:19 chenm001 Exp $
* $Id: mc-c.c,v 1.5 2004/06/18 01:59:58 chenm001 Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
......@@ -26,6 +26,7 @@
#include <string.h>
#include <stdint.h>
#include "x264.h" /* DECLARE_ALIGNED */
#include "../mc.h"
#include "../clip1.h"
#include "mc.h"
......@@ -198,12 +199,6 @@ static inline void pixel_avg_w4( uint8_t *dst, int i_dst_stride,
src2 += i_src2_stride;
}
}
#else
extern void pixel_avg_w4( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
uint8_t *src2, int i_src2_stride,
int i_height );
#endif
static inline void pixel_avg_w8( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
......@@ -251,6 +246,20 @@ static inline void pixel_avg_w16( uint8_t *dst, int i_dst_stride,
src2 += i_src2_stride;
}
}
#else
extern void pixel_avg_w4( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
uint8_t *src2, int i_src2_stride,
int i_height );
extern void pixel_avg_w8( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
uint8_t *src2, int i_src2_stride,
int i_height );
extern void pixel_avg_w16( uint8_t *dst, int i_dst_stride,
uint8_t *src1, int i_src1_stride,
uint8_t *src2, int i_src2_stride,
int i_height );
#endif
typedef void (*pf_mc_t)(uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height );
......@@ -803,34 +812,34 @@ static inline void mc_hc_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int
/* mc I+H */
/* Fills the scratch block tmp (row stride 16) with mc_hh_w16( src ), then
 * hands src and tmp to pixel_avg_w16() which writes i_height rows to dst. */
static void mc_xy10_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
/* NOTE(review): tmp appears declared twice here (plain array and
 * DECLARE_ALIGNED); presumably only the aligned form is meant to stay - confirm. */
uint8_t tmp[16*16];
/* 16x16 byte scratch block, 16-byte alignment requested; it is the src2 argument below */
DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
}
/* Same as the src/tmp average built from mc_hh_w16( src ), except that the
 * first pixel_avg_w16() input starts one byte to the right (src+1). */
static void mc_xy30_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
/* NOTE(review): tmp appears declared twice here (plain array and
 * DECLARE_ALIGNED); presumably only the aligned form is meant to stay - confirm. */
uint8_t tmp[16*16];
/* 16x16 byte scratch block, 16-byte alignment requested; it is the src2 argument below */
DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
mc_hh_w16( src, i_src_stride, tmp, 16, i_height );
pixel_avg_w16( dst, i_dst_stride, src+1, i_src_stride, tmp, 16, i_height );
}
/* mc I+V */
/* Fills the scratch block tmp (row stride 16) with mc_hv_w16( src ), then
 * hands src and tmp to pixel_avg_w16() which writes i_height rows to dst. */
static void mc_xy01_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
/* NOTE(review): tmp appears declared twice here (plain array and
 * DECLARE_ALIGNED); presumably only the aligned form is meant to stay - confirm. */
uint8_t tmp[16*16];
/* 16x16 byte scratch block, 16-byte alignment requested; it is the src2 argument below */
DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
pixel_avg_w16( dst, i_dst_stride, src, i_src_stride, tmp, 16, i_height );
}
/* Same as the src/tmp average built from mc_hv_w16( src ), except that the
 * first pixel_avg_w16() input starts one row down (src+i_src_stride). */
static void mc_xy03_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
/* NOTE(review): tmp appears declared twice here (plain array and
 * DECLARE_ALIGNED); presumably only the aligned form is meant to stay - confirm. */
uint8_t tmp[16*16];
/* 16x16 byte scratch block, 16-byte alignment requested; it is the src2 argument below */
DECLARE_ALIGNED(uint8_t, tmp[16*16], 16);
mc_hv_w16( src, i_src_stride, tmp, 16, i_height );
pixel_avg_w16( dst, i_dst_stride, src+i_src_stride, i_src_stride, tmp, 16, i_height );
}
/* H+V */
static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
......@@ -838,8 +847,8 @@ static void mc_xy11_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
......@@ -847,8 +856,8 @@ static void mc_xy31_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hv_w16( src, i_src_stride, tmp1, 16, i_height );
mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
......@@ -856,8 +865,8 @@ static void mc_xy13_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hv_w16( src+1, i_src_stride, tmp1, 16, i_height );
mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
......@@ -865,8 +874,8 @@ static void mc_xy33_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
mc_hh_w16( src, i_src_stride, tmp2, 16, i_height );
......@@ -874,8 +883,8 @@ static void mc_xy21_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
mc_hv_w16( src, i_src_stride, tmp2, 16, i_height );
......@@ -883,8 +892,8 @@ static void mc_xy12_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
mc_hv_w16( src+1, i_src_stride, tmp2, 16, i_height );
......@@ -892,8 +901,8 @@ static void mc_xy32_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst
}
static void mc_xy23_w16( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_height )
{
uint8_t tmp1[16*16];
uint8_t tmp2[16*16];
DECLARE_ALIGNED(uint8_t, tmp1[16*16], 16);
DECLARE_ALIGNED(uint8_t, tmp2[16*16], 16);
mc_hc_w16( src, i_src_stride, tmp1, 16, i_height );
mc_hh_w16( src+i_src_stride, i_src_stride, tmp2, 16, i_height );
......
......@@ -2,7 +2,7 @@
;* mc.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: mc.asm,v 1.2 2004/06/17 09:01:19 chenm001 Exp $
;* $Id: mc.asm,v 1.3 2004/06/18 01:59:58 chenm001 Exp $
;*
;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
;* Laurent Aimar <fenrir@via.ecp.fr> (init algorithm)
......@@ -111,6 +111,95 @@ ALIGN 4
ret
cglobal pixel_avg_w8

ALIGN 16
;-----------------------------------------------------------------------------
;   void pixel_avg_w8( uint8_t *dst,  int i_dst_stride,
;                      uint8_t *src1, int i_src1_stride,
;                      uint8_t *src2, int i_src2_stride,
;                      int i_height );
;
;   Syntax: NASM, 32-bit, all arguments read from the stack.
;   Per row: loads 8 bytes from src1, applies pavgb against 8 bytes at src2
;   (unsigned byte average, rounded up) and stores the 8 results at dst;
;   each pointer then advances by its own stride. Runs i_height rows.
;   Saves/restores ebp, ebx, esi, edi. Uses eax, ecx, edx and mm0 as scratch.
;   No emms is executed in this routine.
;   NOTE(review): the loop is entered before the counter is tested, so
;   i_height is presumably always >= 1 - confirm against callers.
;-----------------------------------------------------------------------------
pixel_avg_w8:
    push    ebp
    push    ebx
    push    esi
    push    edi

    ; 4 saved registers + return address = 20 bytes, so arg #1 sits at [esp+20]
    mov     edi, [esp+20]       ; dst
    mov     esi, [esp+24]       ; i_dst_stride
    mov     ebx, [esp+28]       ; src1
    mov     eax, [esp+32]       ; i_src1_stride
    mov     ecx, [esp+36]       ; src2
    mov     edx, [esp+40]       ; i_src2_stride
    mov     ebp, [esp+44]       ; i_height, used as the row counter

ALIGN 4
.height_loop:
    movq    mm0, [ebx]          ; 8 bytes of src1
    pavgb   mm0, [ecx]          ; average with 8 bytes of src2
    movq    [edi], mm0          ; write one 8-byte row of dst

    add     ebx, eax            ; src1 += i_src1_stride
    add     ecx, edx            ; src2 += i_src2_stride
    add     edi, esi            ; dst  += i_dst_stride
    dec     ebp                 ; one row done; ZF set when none remain
    jnz     .height_loop

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret
cglobal pixel_avg_w16

ALIGN 16
;-----------------------------------------------------------------------------
;   void pixel_avg_w16( uint8_t *dst,  int i_dst_stride,
;                       uint8_t *src1, int i_src1_stride,
;                       uint8_t *src2, int i_src2_stride,
;                       int i_height );
;
;   Syntax: NASM, 32-bit, all arguments read from the stack.
;   Per row: stores at dst the pavgb (unsigned byte average, rounded up) of
;   16 bytes from src1 and 16 bytes from src2; each pointer then advances by
;   its own stride. Runs i_height rows.
;   Build variants:
;     HAVE_SSE2 defined   - one 128-bit average per row (xmm0, xmm1)
;     HAVE_SSE2 undefined - two 64-bit MMX averages per row (mm0, mm1)
;   None of dst, src1, src2 needs any particular alignment in either variant.
;   Saves/restores ebp, ebx, esi, edi. Uses eax, ecx, edx as scratch.
;   No emms is executed in this routine.
;   NOTE(review): the loop is entered before the counter is tested, so
;   i_height is presumably always >= 1 - confirm against callers.
;-----------------------------------------------------------------------------
pixel_avg_w16:
    push    ebp
    push    ebx
    push    esi
    push    edi

    ; 4 saved registers + return address = 20 bytes, so arg #1 sits at [esp+20]
    mov     edi, [esp+20]       ; dst
    mov     ebx, [esp+28]       ; src1
    mov     ecx, [esp+36]       ; src2
    mov     esi, [esp+24]       ; i_dst_stride
    mov     eax, [esp+32]       ; i_src1_stride
    mov     edx, [esp+40]       ; i_src2_stride
    mov     ebp, [esp+44]       ; i_height, used as the row counter

ALIGN 4
.height_loop:
%ifndef HAVE_SSE2
    movq    mm0, [ebx  ]        ; left  8 bytes of src1
    movq    mm1, [ebx+8]        ; right 8 bytes of src1
    pavgb   mm0, [ecx  ]        ; MMX memory operands carry no alignment rule
    pavgb   mm1, [ecx+8]
    movq    [edi  ], mm0
    movq    [edi+8], mm1
%else
    movdqu  xmm0, [ebx]         ; 16 bytes of src1, any alignment
    ; src2 is loaded with movdqu as well: "pavgb xmm0, [ecx]" would fault
    ; (#GP) whenever ecx is not 16-byte aligned, because 128-bit memory
    ; operands of legacy SSE instructions must be aligned.
    movdqu  xmm1, [ecx]
    pavgb   xmm0, xmm1
    movdqu  [edi], xmm0
%endif

    dec     ebp                 ; sets ZF once the last row is done
    lea     ebx, [ebx+eax]      ; pointer bumps use lea so the flags from
    lea     ecx, [ecx+edx]      ; "dec ebp" are still intact at the jne
    lea     edi, [edi+esi]
    jne     .height_loop

    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret
cglobal mc_copy_w4
ALIGN 16
......@@ -201,6 +290,7 @@ mc_copy_w16:
mov ecx, [esp+32] ; i_height
ALIGN 4
.height_loop
%ifndef HAVE_SSE2
movq mm0, [esi]
movq mm1, [esi+8]
movq [edi], mm0
......@@ -221,10 +311,20 @@ ALIGN 4
movq [edi+edx+8], mm7
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
sub ecx, byte 4
jnz .height_loop
%else
movdqu xmm0, [esi]
movdqu xmm1, [esi+ebx]
movdqu [edi], xmm0
movdqu [edi+edx], xmm1
dec ecx
dec ecx
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
jnz .height_loop
%endif
pop edi
pop esi
pop ebx
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment