Commit 41f9b813 authored by Loren Merritt

amd64 asm patch, part2.

by Josef Zlomek ( josef dot zlomek at xeris dot cz )


git-svn-id: svn://svn.videolan.org/x264/trunk@213 df754926-b1dd-0310-bc7b-ec298dee348c
parent 413d8fa9
@@ -18,6 +18,15 @@ ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
OBJASM = $(ASMSRC:%.asm=%.o)
endif
# MMX/SSE optims
ifeq ($(ARCH),X86_64)
SRCS += common/amd64/mc-c.c common/amd64/dct-c.c common/amd64/predict.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
endif
# AltiVec optims
ifeq ($(ARCH),PPC)
SRCS += common/ppc/mc.c common/ppc/pixel.c
@@ -21,7 +21,7 @@
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 32
BITS 64
;=============================================================================
; Macros and other preprocessor constants
@@ -51,27 +51,23 @@ ALIGN 16
; int __cdecl x264_cpu_cpuid_test( void ) return 0 if unsupported
;-----------------------------------------------------------------------------
x264_cpu_cpuid_test:
pushfd
push ebx
push ebp
push esi
push edi
pushfq
push rbx
push rbp
pushfd
pop eax
pushfq
pop rax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
push rax
popfq
pushfq
pop rax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
pop rbp
pop rbx
popfq
ret
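Two things change here beyond register widths: pushfd/popfd are not encodable in 64-bit mode, so the RFLAGS save/restore becomes pushfq/popfq, and the saves of esi/edi disappear because the System V AMD64 ABI treats rsi/rdi (and r8-r11) as caller-saved. A minimal sketch of that save/restore pattern, using a hypothetical label rather than anything from the patch:

sysv_save_restore_demo:            ; hypothetical, for illustration only
    push    rbx                    ; rbx, rbp and r12-r15 are the only GPRs a callee must preserve
    pushfq                         ; 64-bit RFLAGS push; the 32-bit pushfd form is invalid in long mode
    ; ... body may clobber rax, rcx, rdx, rsi, rdi and r8-r11 freely ...
    popfq
    pop     rbx
    ret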
ALIGN 16
@@ -80,31 +76,21 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_cpu_cpuid:
push ebp
mov ebp, esp
push ebx
push esi
push edi
push rbp
push rbx
mov r10, rcx
mov r11, rdx
mov eax, [ebp + 8]
mov eax, edi
cpuid
mov esi, [ebp + 12]
mov [esi], eax
mov [rsi], eax
mov [r11], ebx
mov [r10], ecx
mov [r8], edx
mov esi, [ebp + 16]
mov [esi], ebx
mov esi, [ebp + 20]
mov [esi], ecx
mov esi, [ebp + 24]
mov [esi], edx
pop edi
pop esi
pop ebx
pop ebp
pop rbx
pop rbp
ret
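x264_cpu_cpuid takes the CPUID leaf plus four result pointers, which under the System V AMD64 convention arrive in edi, rsi, rdx, rcx and r8 instead of at [ebp+8]..[ebp+24]. Because the cpuid instruction itself overwrites eax, ebx, ecx and edx, the two pointers held in rcx/rdx are parked in the scratch registers r10/r11 first. An annotated restatement of the new body, assuming that argument order:

    ; leaf -> edi, p_eax -> rsi, p_ebx -> rdx, p_ecx -> rcx, p_edx -> r8
    mov     r10, rcx       ; cpuid clobbers ecx/edx, so move the pointers out of the way
    mov     r11, rdx
    mov     eax, edi       ; the leaf number is the first (32-bit) argument
    cpuid
    mov     [rsi], eax     ; write the four results through the pointer arguments
    mov     [r11], ebx
    mov     [r10], ecx
    mov     [r8],  edx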
ALIGN 16
@@ -30,7 +30,7 @@
;* *
;*****************************************************************************
BITS 32
BITS 64
;=============================================================================
; Macros and other preprocessor constants
@@ -83,8 +83,8 @@ BITS 32
%macro MMX_SUMSUBD2_AB 4
movq %4, %1
movq %3, %2
psraw %2, $1
psraw %4, $1
psraw %2, 1
psraw %4, 1
paddw %1, %2
psubw %4, %3
%endmacro
@@ -113,7 +113,7 @@ BITS 32
%macro MMX_STORE_DIFF_4P 5
paddw %1, %3
psraw %1, $6
psraw %1, 6
movd %2, %5
punpcklbw %2, %4
paddsw %1, %2
@@ -129,9 +129,9 @@ BITS 32
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata data
SECTION .rodata
%else
SECTION .rodata data align=16
SECTION .rodata
%endif
;-----------------------------------------------------------------------------
@@ -158,11 +158,10 @@ ALIGN 16
; void __cdecl dct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
x264_dct4x4dc_mmxext:
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
movq mm0, [rdi+ 0]
movq mm1, [rdi+ 8]
movq mm2, [rdi+16]
movq mm3, [rdi+24]
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
@@ -178,15 +177,15 @@ x264_dct4x4dc_mmxext:
paddw mm0, mm6
paddw mm4, mm6
psraw mm0, 1
movq [eax+ 0], mm0
movq [rdi+ 0], mm0
psraw mm4, 1
movq [eax+ 8], mm4
movq [rdi+ 8], mm4
paddw mm1, mm6
paddw mm3, mm6
psraw mm1, 1
movq [eax+16], mm1
movq [rdi+16], mm1
psraw mm3, 1
movq [eax+24], mm3
movq [rdi+24], mm3
ret
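For the single-argument DCT helpers the conversion is purely mechanical: the int16_t d[4][4] pointer is already in rdi on entry, so the old mov eax, [esp+4] load disappears and every [eax+off] access becomes [rdi+off], as in this annotated excerpt of the new code:

    ; argument: int16_t d[4][4] -> rdi, no stack access needed
    movq    mm0, [rdi+ 0]      ; row 0
    movq    mm1, [rdi+ 8]      ; row 1
    movq    mm2, [rdi+16]      ; row 2
    movq    mm3, [rdi+24]      ; row 3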
cglobal x264_idct4x4dc_mmxext
@@ -196,11 +195,10 @@ ALIGN 16
; void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
;-----------------------------------------------------------------------------
x264_idct4x4dc_mmxext:
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
movq mm0, [rdi+ 0]
movq mm1, [rdi+ 8]
movq mm2, [rdi+16]
movq mm3, [rdi+24]
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
@@ -212,10 +210,10 @@ x264_idct4x4dc_mmxext:
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq [eax+ 0], mm0
movq [eax+ 8], mm4
movq [eax+16], mm1
movq [eax+24], mm3
movq [rdi+ 0], mm0
movq [rdi+ 8], mm4
movq [rdi+16], mm1
movq [rdi+24], mm3
ret
cglobal x264_sub4x4_dct_mmxext
@@ -225,21 +223,21 @@ ALIGN 16
; void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;-----------------------------------------------------------------------------
x264_sub4x4_dct_mmxext:
push ebx
mov eax, [esp+12] ; pix1
mov ebx, [esp+16] ; i_pix1
mov ecx, [esp+20] ; pix2
mov edx, [esp+24] ; i_pix2
push rbx
mov rax, rsi ; pix1
movsxd rbx, edx ; i_pix1
; mov rcx, rcx ; pix2
movsxd rdx, r8d ; i_pix2
MMX_ZERO mm7
; Load 4 lines
MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax ], [ecx]
MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx ], [ecx+edx]
MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
add eax, ebx
add ecx, edx
MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
MMX_LOAD_DIFF_4P mm0, mm6, mm7, [rax ], [rcx]
MMX_LOAD_DIFF_4P mm1, mm6, mm7, [rax+rbx ], [rcx+rdx]
MMX_LOAD_DIFF_4P mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
add rax, rbx
add rcx, rdx
MMX_LOAD_DIFF_4P mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
@@ -257,13 +255,12 @@ x264_sub4x4_dct_mmxext:
; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4
mov eax, [esp+ 8] ; dct
movq [eax+ 0], mm1
movq [eax+ 8], mm0
movq [eax+16], mm4
movq [eax+24], mm3
movq [rdi+ 0], mm1 ; dct
movq [rdi+ 8], mm0
movq [rdi+16], mm4
movq [rdi+24], mm3
pop ebx
pop rbx
ret
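Here the int stride arguments need care: i_pix1 and i_pix2 arrive as 32-bit values in edx and r8d, and the ABI guarantees nothing about the upper halves of those registers, so they are widened with movsxd before being used in 64-bit address arithmetic (sign extension also keeps negative strides working). rbx is callee-saved, hence the push rbx/pop rbx pair around its use for i_pix1. A hedged sketch of the argument shuffle at the top of the new function:

    ; dct -> rdi, pix1 -> rsi, i_pix1 -> edx, pix2 -> rcx, i_pix2 -> r8d
    mov     rax, rsi       ; pix1: pointers are already full 64-bit values
    movsxd  rbx, edx       ; i_pix1: sign-extend the int before indexing with it
    movsxd  rdx, r8d       ; i_pix2: likewise (pix2 simply stays in rcx)
    ; rdi is never clobbered below, so the final stores write straight to [rdi+0..24]
    ; instead of reloading the dct pointer from [esp+8] as the 32-bit code did.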
cglobal x264_add4x4_idct_mmxext
@@ -273,17 +270,15 @@ ALIGN 16
; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
x264_add4x4_idct_mmxext:
; Load dct coeffs
mov eax, [esp+12] ; dct
movq mm0, [eax+ 0]
movq mm4, [eax+ 8]
movq mm3, [eax+16]
movq mm1, [eax+24]
movq mm0, [rdx+ 0] ; dct
movq mm4, [rdx+ 8]
movq mm3, [rdx+16]
movq mm1, [rdx+24]
mov eax, [esp+ 4] ; p_dst
mov ecx, [esp+ 8] ; i_dst
lea edx, [ecx+ecx*2]
mov rax, rdi ; p_dst
movsxd rcx, esi ; i_dst
lea rdx, [rcx+rcx*2]
; out:mm0, mm1, mm2, mm3
MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
@@ -304,10 +299,10 @@ x264_add4x4_idct_mmxext:
MMX_ZERO mm7
movq mm6, [x264_mmx_32]
MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax]
MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+ecx]
MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2]
MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx]
MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [rax]
MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [rax+rcx]
MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [rax+rcx*2]
MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [rax+rdx]
ret
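The destination rows are covered with a single extra lea: once the coefficients have been loaded from [rdx+...], rdx is free to hold 3*i_dst, so all four rows of the destination block can be addressed without updating the base pointer. A short sketch of that addressing setup:

    mov     rax, rdi            ; p_dst
    movsxd  rcx, esi            ; i_dst, sign-extended from the 32-bit int argument
    lea     rdx, [rcx+rcx*2]    ; rdx = 3*i_dst
    ; the four rows are then [rax], [rax+rcx], [rax+rcx*2], [rax+rdx]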
@@ -30,7 +30,7 @@
;* *
;*****************************************************************************
BITS 32
BITS 64
;=============================================================================
; Macros and other preprocessor constants
@@ -45,16 +45,6 @@ BITS 32
%endif
%endmacro
;=============================================================================
; Local Data (Read Only)
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif
;-----------------------------------------------------------------------------
; Various memory constants (trigonometric values or rounding values)
;-----------------------------------------------------------------------------
@@ -88,37 +78,37 @@ ALIGN 16
; int i_height );
;-----------------------------------------------------------------------------
x264_pixel_avg_w4_mmxext:
push ebp
push ebx
push esi
push edi
mov edi, [esp+20] ; dst
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
mov eax, [esp+32] ; i_src1_stride
mov edx, [esp+40] ; i_src2_stride
mov ebp, [esp+44] ; i_height
push rbp
mov rbp, rsp
push r12
push r13
mov r12, r8 ; src2
movsxd r13, r9d ; i_src2_stride
mov r10, rdx ; src1
movsxd r11, ecx ; i_src1_stride
mov r8, rdi ; dst
movsxd r9, esi ; i_dst_stride
movsxd rax, dword [rbp+16] ; i_height
ALIGN 4
.height_loop
movd mm0, [ebx]
pavgb mm0, [ecx]
movd mm1, [ebx+eax]
pavgb mm1, [ecx+edx]
movd [edi], mm0
movd [edi+esi], mm1
dec ebp
dec ebp
lea ebx, [ebx+eax*2]
lea ecx, [ecx+edx*2]
lea edi, [edi+esi*2]
movd mm0, [r10]
pavgb mm0, [r12]
movd mm1, [r10+r11]
pavgb mm1, [r12+r13]
movd [r8], mm0
movd [r8+r9], mm1
dec rax
dec rax
lea r10, [r10+r11*2]
lea r12, [r12+r13*2]
lea r8, [r8+r9*2]
jne .height_loop
pop edi
pop esi
pop ebx
pop ebp
pop r13
pop r12
pop rbp
ret
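x264_pixel_avg_w4 takes seven arguments, one more than fit in the six integer argument registers, so i_height is the only value still fetched from the stack; after push rbp / mov rbp, rsp it sits at [rbp+16]. r12/r13 are callee-saved and must be pushed, while r8-r11 may be clobbered freely. A sketch of the new prologue, assuming the argument order implied by the old [esp+20]..[esp+44] loads:

    ; dst -> rdi, i_dst_stride -> esi, src1 -> rdx, i_src1_stride -> ecx,
    ; src2 -> r8, i_src2_stride -> r9d, i_height -> stack
    push    rbp
    mov     rbp, rsp            ; the 7th argument is now at [rbp+16]
    push    r12                 ; r12/r13 are callee-saved, unlike r10/r11
    push    r13
    movsxd  rax, dword [rbp+16] ; i_height, the only argument left on the stack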
@@ -131,33 +121,33 @@ ALIGN 16
; int i_height );
;-----------------------------------------------------------------------------
x264_pixel_avg_w8_mmxext:
push ebp
push ebx
push esi
push edi
mov edi, [esp+20] ; dst
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
mov eax, [esp+32] ; i_src1_stride
mov edx, [esp+40] ; i_src2_stride
mov ebp, [esp+44] ; i_height
push rbp
mov rbp, rsp
push r12
push r13
mov r12, r8 ; src2
movsxd r13, r9d ; i_src2_stride
mov r10, rdx ; src1
movsxd r11, ecx ; i_src1_stride
mov r8, rdi ; dst
movsxd r9, esi ; i_dst_stride
movsxd rax, dword [rbp+16] ; i_height
ALIGN 4
.height_loop
movq mm0, [ebx]
pavgb mm0, [ecx]
movq [edi], mm0
dec ebp
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
movq mm0, [r10]
pavgb mm0, [r12]
movq [r8], mm0
dec rax
lea r10, [r10+r11]
lea r12, [r12+r13]
lea r8, [r8+r9]
jne .height_loop
pop edi
pop esi
pop ebx
pop ebp
pop r13
pop r12
pop rbp
ret
@@ -170,36 +160,36 @@ ALIGN 16
; int i_height );
;-----------------------------------------------------------------------------
x264_pixel_avg_w16_mmxext:
push ebp
push ebx
push esi
push edi
mov edi, [esp+20] ; dst
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
mov eax, [esp+32] ; i_src1_stride
mov edx, [esp+40] ; i_src2_stride
mov ebp, [esp+44] ; i_height
push rbp
mov rbp, rsp
push r12
push r13
mov r12, r8 ; src2
movsxd r13, r9d ; i_src2_stride
mov r10, rdx ; src1
movsxd r11, ecx ; i_src1_stride
mov r8, rdi ; dst
movsxd r9, esi ; i_dst_stride
movsxd rax, dword [rbp+16] ; i_height
ALIGN 4
.height_loop
movq mm0, [ebx ]
movq mm1, [ebx+8]
pavgb mm0, [ecx ]
pavgb mm1, [ecx+8]
movq [edi ], mm0
movq [edi+8], mm1
dec ebp
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
movq mm0, [r10 ]
movq mm1, [r10+8]
pavgb mm0, [r12 ]
pavgb mm1, [r12+8]
movq [r8 ], mm0
movq [r8+8], mm1
dec rax
lea r10, [r10+r11]
lea r12, [r12+r13]
lea r8, [r8+r9]
jne .height_loop
pop edi
pop esi
pop ebx
pop ebp
pop r13
pop r12
pop rbp
ret
ALIGN 16
@@ -210,34 +200,34 @@ ALIGN 16
; int i_height );
;-----------------------------------------------------------------------------
x264_pixel_avg_w16_sse2:
push ebp
push ebx
push esi
push edi
mov edi, [esp+20] ; dst
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
mov eax, [esp+32] ; i_src1_stride
mov edx, [esp+40] ; i_src2_stride
mov ebp, [esp+44] ; i_height
push rbp
mov rbp, rsp
push r12
push r13
mov r12, r8 ; src2
movsxd r13, r9d ; i_src2_stride
mov r10, rdx ; src1
movsxd r11, ecx ; i_src1_stride
mov r8, rdi ; dst
movsxd r9, esi ; i_dst_stride
movsxd rax, dword [rbp+16] ; i_height
ALIGN 4
.height_loop
movdqu xmm0, [ebx]
pavgb xmm0, [ecx]
movdqu [edi], xmm0
dec ebp
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
movdqu xmm0, [r10]
pavgb xmm0, [r12]
movdqu [r8], xmm0
dec rax
lea r10, [r10+r11]
lea r12, [r12+r13]
lea r8, [r8+r9]
jne .height_loop
pop edi
pop esi
pop ebx
pop ebp
pop r13
pop r12
pop rbp
ret
@@ -248,30 +238,24 @@ ALIGN 16
; uint8_t *dst, int i_dst_stride, int i_height )
;-----------------------------------------------------------------------------
x264_mc_copy_w4_mmxext:
push ebx
push esi
push edi
mov esi, [esp+16] ; src
mov edi, [esp+24] ; dst
mov ebx, [esp+20] ; i_src_stride
mov edx, [esp+28] ; i_dst_stride
mov ecx, [esp+32] ; i_height
mov eax, r8d ; i_height
mov r8, rdi ; src
movsxd r9, esi ; i_src_stride
mov r10, rdx ; dst
movsxd r11, ecx ; i_dst_stride
ALIGN 4
.height_loop
mov eax, [esi]
mov [edi], eax
mov eax, [esi+ebx]
mov [edi+edx], eax
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
dec ecx
dec ecx
mov ecx, [r8]
mov edx, [r8+r9]
mov [r10], ecx
mov [r10+r11], edx
lea r8, [r8+r9*2]
lea r10, [r10+r11*2]
dec eax
dec eax
jne .height_loop
pop edi
pop esi
pop ebx
ret
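In the w4 copy the 64-bit version needs no callee-saved registers at all: the five arguments plus the loop temporaries fit entirely in caller-saved registers, so the push/pop of ebx/esi/edi from the 32-bit version simply goes away. A sketch of the resulting argument shuffle, assuming the (src, i_src_stride, dst, i_dst_stride, i_height) order implied by the old stack offsets:

    ; src -> rdi, i_src_stride -> esi, dst -> rdx, i_dst_stride -> ecx, i_height -> r8d
    mov     eax, r8d       ; loop counter
    mov     r8,  rdi       ; src
    movsxd  r9,  esi       ; i_src_stride
    mov     r10, rdx       ; dst
    movsxd  r11, ecx       ; i_dst_stride
    ; ecx/edx are then free to serve as the 32-bit load/store temporaries in the loop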
cglobal mc_copy_w8
@@ -282,36 +266,31 @@ ALIGN 16
; uint8_t *dst, int i_dst_stride, int i_height )
;-----------------------------------------------------------------------------
x264_mc_copy_w8_mmxext:
push ebx
push esi
push edi
mov esi, [esp+16] ; src
mov edi, [esp+24] ; dst
mov ebx, [esp+20] ; i_src_stride
mov edx, [esp+28] ; i_dst_stride
mov ecx, [esp+32] ; i_height
mov eax, r8d ; i_height
mov r8, rdi ; src
movsxd r9, esi ; i_src_stride
mov r10, rdx ; dst
movsxd r11, ecx ; i_dst_stride
lea rcx, [r9+r9*2] ; 3 * i_src_stride
lea rdx, [r11+r11*2] ; 3 * i_dst_stride
ALIGN 4
.height_loop
movq mm0, [esi]
movq [edi], mm0