Commit 58e12b0e authored by Loren Merritt

tweak motion compensation amd64 asm. 0.3% overall speedup.


git-svn-id: svn://svn.videolan.org/x264/trunk@575 df754926-b1dd-0310-bc7b-ec298dee348c
parent 6de50f51
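The patterns recurring in the diff below: the height argument is loaded as a 32-bit value (mov eax, parm7d) instead of being sign-extended into rax, paired dec instructions collapse into a single sub where a loop handles two rows per iteration, loads are issued ahead of the pavgb instructions that consume them in the w4/w8 loops, and each bare ret that follows a conditional jump becomes rep ret, the two-byte return commonly used to work around an AMD K8 branch-prediction quirk. A minimal sketch of the loop-tail idiom, with placeholder registers and labels rather than code taken from the patch:

; Illustration only: the counter/return idiom used throughout this commit,
; with placeholder registers and labels (rsi/rdi/rdx/rcx are not the
; parmNq names used in the real file).
ALIGN 4
.height_loop
    ; ... average and store two rows here ...
    sub     eax, 2              ; one sub replaces two dec instructions
    lea     rsi, [rsi+rdx*2]    ; advance source by two strides
    lea     rdi, [rdi+rcx*2]    ; advance destination by two strides
    jne     .height_loop        ; flags are still set by the sub (lea does not touch them)
    rep ret                     ; 2-byte return: avoids the K8 penalty for a ret right after a jcc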
@@ -86,24 +86,22 @@ ALIGN 16
 x264_pixel_avg_w4_mmxext:
     mov     r10, parm5q   ; src2
     movsxd  r11, parm6d   ; i_src2_stride
-    movsxd  rax, parm7d   ; i_height
+    mov     eax, parm7d   ; i_height
 ALIGN 4
 .height_loop
     movd    mm0, [parm3q]
-    pavgb   mm0, [r10]
     movd    mm1, [parm3q+parm4q]
+    pavgb   mm0, [r10]
     pavgb   mm1, [r10+r11]
     movd    [parm1q], mm0
     movd    [parm1q+parm2q], mm1
-    dec     rax
-    dec     rax
+    sub     eax, 2
     lea     parm3q, [parm3q+parm4q*2]
     lea     r10, [r10+r11*2]
     lea     parm1q, [parm1q+parm2q*2]
     jne     .height_loop
-    ret
+    rep ret
@@ -115,23 +113,24 @@ ALIGN 16
 ; int i_height );
 ;-----------------------------------------------------------------------------
 x264_pixel_avg_w8_mmxext:
     mov     r10, parm5q   ; src2
     movsxd  r11, parm6d   ; i_src2_stride
-    movsxd  rax, parm7d   ; i_height
+    mov     eax, parm7d   ; i_height
 ALIGN 4
 .height_loop
     movq    mm0, [parm3q]
+    movq    mm1, [parm3q+parm4q]
     pavgb   mm0, [r10]
+    pavgb   mm1, [r10+r11]
     movq    [parm1q], mm0
-    dec     rax
-    lea     parm3q, [parm3q+parm4q]
-    lea     r10, [r10+r11]
-    lea     parm1q, [parm1q+parm2q]
+    movq    [parm1q+parm2q], mm1
+    sub     eax, 2
+    lea     parm3q, [parm3q+parm4q*2]
+    lea     r10, [r10+r11*2]
+    lea     parm1q, [parm1q+parm2q*2]
     jne     .height_loop
-    ret
+    rep ret
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -143,7 +142,7 @@ ALIGN 16
 x264_pixel_avg_w16_mmxext:
     mov     r10, parm5q   ; src2
     movsxd  r11, parm6d   ; i_src2_stride
-    movsxd  rax, parm7d   ; i_height
+    mov     eax, parm7d   ; i_height
 ALIGN 4
 .height_loop
@@ -153,13 +152,12 @@ ALIGN 4
     pavgb   mm1, [r10+8]
     movq    [parm1q  ], mm0
     movq    [parm1q+8], mm1
-    dec     rax
+    dec     eax
     lea     parm3q, [parm3q+parm4q]
     lea     r10, [r10+r11]
     lea     parm1q, [parm1q+parm2q]
     jne     .height_loop
-    ret
+    rep ret
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -171,21 +169,19 @@ ALIGN 16
 x264_pixel_avg_w16_sse2:
     mov     r10, parm5q   ; src2
     movsxd  r11, parm6d   ; i_src2_stride
-    movsxd  rax, parm7d   ; i_height
+    mov     eax, parm7d   ; i_height
 ALIGN 4
 .height_loop
     movdqu  xmm0, [parm3q]
     pavgb   xmm0, [r10]
     movdqu  [parm1q], xmm0
     dec     rax
     lea     parm3q, [parm3q+parm4q]
     lea     r10, [r10+r11]
     lea     parm1q, [parm1q+parm2q]
     jne     .height_loop
-    ret
+    rep ret
@@ -246,7 +242,7 @@ x264_pixel_avg_weight_w16_mmxext:
     add     parm3q, parm4q
     dec     r11d
     jnz     .height_loop
-    ret
+    rep ret
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -255,16 +251,14 @@ ALIGN 16
 x264_pixel_avg_weight_w8_mmxext:
     BIWEIGHT_START_MMX
-    BIWEIGHT_4P_MMX  [parm1q        ], [parm3q        ]
-    BIWEIGHT_4P_MMX  [parm1q+4      ], [parm3q+4      ]
-    BIWEIGHT_4P_MMX  [parm1q+parm2q ], [parm3q+parm4q ]
-    BIWEIGHT_4P_MMX  [parm1q+parm2q+4], [parm3q+parm4q+4]
+    BIWEIGHT_4P_MMX  [parm1q  ], [parm3q  ]
+    BIWEIGHT_4P_MMX  [parm1q+4], [parm3q+4]
-    lea     parm1q, [parm1q+parm2q*2]
-    lea     parm3q, [parm3q+parm4q*2]
-    sub     r11d, byte 2
+    add     parm1q, parm2q
+    add     parm3q, parm4q
+    dec     r11d
     jnz     .height_loop
-    ret
+    rep ret
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -305,8 +299,7 @@ ALIGN 4
     dec     eax
     dec     eax
     jne     .height_loop
-    ret
+    rep ret
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -334,8 +327,7 @@ ALIGN 4
     sub     eax, byte 4
     jnz     .height_loop
-    ret
+    rep ret
 ALIGN 16
 ;-----------------------------------------------------------------------------
@@ -370,8 +362,7 @@ ALIGN 4
     lea     parm1q, [parm1q+parm2q*4]
     sub     eax, byte 4
     jnz     .height_loop
-    ret
+    rep ret
 ALIGN 16
@@ -387,13 +378,11 @@ ALIGN 4
     movdqu  xmm1, [parm3q+parm4q]
     movdqu  [parm1q], xmm0
     movdqu  [parm1q+parm2q], xmm1
-    dec     eax
-    dec     eax
+    sub     eax, byte 2
     lea     parm3q, [parm3q+parm4q*2]
     lea     parm1q, [parm1q+parm2q*2]
     jnz     .height_loop
-    ret
+    rep ret