Commit 2701440c authored by Fiona Glaser

Optimize x86 asm for Intel macro-op fusion

That is, place all loop counter tests right before their conditional jumps.
parent 2d481bc0
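
To see the pattern the diff applies everywhere below, here is a minimal before/after sketch (illustrative only; the registers and the step of 2 are placeholders, not lines from this commit). Intel CPUs can decode a flag-setting instruction and an immediately following conditional jump as a single fused macro-op, but only if nothing sits between them:

; Before: sub and jg are separated by the pointer updates, so they
; cannot fuse. lea never modifies flags, so the branch is still
; correct; it just costs an extra decode slot every iteration.
.height_loop:
    sub eax, 2             ; decrement the row counter (sets flags)
    lea r0, [r0+r1*2]      ; advance the destination pointer
    lea r2, [r2+r3*2]      ; advance the source pointer
    jg  .height_loop

; After: the counter test sits directly before the jump, so sub+jg
; can fuse on CPUs that support it (cmp/test fuse since Core 2;
; add/sub/inc/dec fuse as well on Sandy Bridge and later).
.height_loop:
    lea r0, [r0+r1*2]
    lea r2, [r2+r3*2]
    sub eax, 2             ; test the loop counter...
    jg  .height_loop       ; ...immediately before its conditional jump

The or -> add swaps later in the diff serve the same goal: OR does not macro-fuse with a following jcc, while ADD does on Sandy Bridge and later.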
@@ -97,8 +97,8 @@ cglobal cabac_encode_decision_asm, 0,7
     LOAD_GLOBAL t3d, cabac_renorm_shift, 0, t3
     shl t4d, t3b
     shl t6d, t3b
-    add t3d, [t0+cb.queue]
     mov [t0+cb.range], t4d
+    add t3d, [t0+cb.queue]
     jge cabac_putbyte
 .update_queue_low:
     mov [t0+cb.low], t6d
......
@@ -80,10 +80,10 @@ cextern pd_32
 %endif
 %macro AVG_END 0
-    sub eax, 2
     lea t4, [t4+t5*2*SIZEOF_PIXEL]
     lea t2, [t2+t3*2*SIZEOF_PIXEL]
     lea t0, [t0+t1*2*SIZEOF_PIXEL]
+    sub eax, 2
     jg .height_loop
     REP_RET
 %endmacro
@@ -716,9 +716,9 @@ cglobal pixel_avg2_w%1, 6,7,4
 %endif
     mova [r0], m0
     mova [r0+r1*2], m1
-    sub r5d, 2
     lea r2, [r2+r3*4]
     lea r0, [r0+r1*4]
+    sub r5d, 2
     jg .height_loop
     REP_RET
 %endmacro
@@ -751,9 +751,9 @@ cglobal pixel_avg2_w%1, 6,7,8
     %3 [r0+mmsize], m1
     mova [r0+r1*2], m2
     %3 [r0+r1*2+mmsize], m3
-    sub r5d, 2
     lea r2, [r2+r3*4]
     lea r0, [r0+r1*4]
+    sub r5d, 2
     jg .height_loop
     REP_RET
 %endmacro
@@ -789,9 +789,9 @@ cglobal pixel_avg2_w10_mmx2, 6,7
     mova [r0+r1*2+ 0], m3
     mova [r0+r1*2+ 8], m4
     movh [r0+r1*2+16], m5
-    sub r5d, 2
     lea r2, [r2+r3*2*2]
     lea r0, [r0+r1*2*2]
+    sub r5d, 2
     jg .height_loop
     REP_RET
@@ -823,9 +823,9 @@ cglobal pixel_avg2_w16_mmx2, 6,7
     mova [r0+r1*2+ 8], m5
     mova [r0+r1*2+16], m6
     mova [r0+r1*2+24], m7
-    sub r5d, 2
     lea r2, [r2+r3*2*2]
     lea r0, [r0+r1*2*2]
+    sub r5d, 2
     jg .height_loop
     REP_RET
@@ -847,9 +847,9 @@ cglobal pixel_avg2_w18_mmx2, 6,7
     mova [r0+16], m2
     mova [r0+24], m3
     movh [r0+32], m4
-    sub r5d, 1
     lea r2, [r2+r3*2]
     lea r0, [r0+r1*2]
+    dec r5d
     jg .height_loop
     REP_RET
@@ -869,9 +869,9 @@ cglobal pixel_avg2_w18_sse2, 6,7,6
     mova [r0+ 0], m0
     mova [r0+16], m1
     movh [r0+32], m2
-    sub r5d, 1
     lea r2, [r2+r3*2]
     lea r0, [r0+r1*2]
+    dec r5d
     jg .height_loop
     REP_RET
 %endif ; HIGH_BIT_DEPTH
......
@@ -199,12 +199,12 @@ cglobal hpel_filter_v, 5,6,11
     mova [r2+r4+mmsize], m4
     paddw m1, s30
     paddw m4, s30
-    add r4, 2*mmsize
     FILT_PACK m1, m4, 5, m6, w, s10
     CLIPW m1, m0, m7
     CLIPW m4, m0, m7
-    mova [r0+r4-mmsize*2], m1
-    mova [r0+r4-mmsize*1], m4
+    mova [r0+r4], m1
+    mova [r0+r4+mmsize], m4
+    add r4, 2*mmsize
     jl .loop
     REP_RET
@@ -291,12 +291,12 @@ cglobal hpel_filter_h, 3,4,8
     FILT_H2 m1, m2, m3, m4, m5, m6
     mova m7, [pw_1]
     pxor m2, m2
-    add r2, mmsize*2
     FILT_PACK m1, m4, 1, m7, w
     CLIPW m1, m2, m0
     CLIPW m4, m2, m0
-    mova [r0+r2-mmsize*2], m1
-    mova [r0+r2-mmsize*1], m4
+    mova [r0+r2], m1
+    mova [r0+r2+mmsize], m4
+    add r2, mmsize*2
     jl .loop
     REP_RET
 %endmacro ; HPEL_FILTER
@@ -1139,24 +1139,24 @@ INIT_MMX
 cglobal memcpy_aligned_mmx, 3,3
     test r2d, 16
     jz .copy32start
-    sub r2d, 16
-    movq mm0, [r1 + r2 + 0]
-    movq mm1, [r1 + r2 + 8]
-    movq [r0 + r2 + 0], mm0
-    movq [r0 + r2 + 8], mm1
+    movq mm0, [r1 + r2 - 16]
+    movq mm1, [r1 + r2 - 8]
+    movq [r0 + r2 - 16], mm0
+    movq [r0 + r2 - 8], mm1
+    sub r2d, 16
 .copy32start
     test r2d, r2d
     jz .ret
 .copy32:
-    sub r2d, 32
-    movq mm0, [r1 + r2 + 0]
-    movq mm1, [r1 + r2 + 8]
-    movq mm2, [r1 + r2 + 16]
-    movq mm3, [r1 + r2 + 24]
-    movq [r0 + r2 + 0], mm0
-    movq [r0 + r2 + 8], mm1
-    movq [r0 + r2 + 16], mm2
-    movq [r0 + r2 + 24], mm3
+    movq mm0, [r1 + r2 - 32]
+    movq mm1, [r1 + r2 - 24]
+    movq mm2, [r1 + r2 - 16]
+    movq mm3, [r1 + r2 - 8]
+    movq [r0 + r2 - 32], mm0
+    movq [r0 + r2 - 24], mm1
+    movq [r0 + r2 - 16], mm2
+    movq [r0 + r2 - 8], mm3
+    sub r2d, 32
     jg .copy32
 .ret
     REP_RET
@@ -1167,30 +1167,30 @@ cglobal memcpy_aligned_mmx, 3,3
 cglobal memcpy_aligned_sse2, 3,3
     test r2d, 16
     jz .copy32
-    sub r2d, 16
-    movdqa xmm0, [r1 + r2]
-    movdqa [r0 + r2], xmm0
+    movdqa xmm0, [r1 + r2 - 16]
+    movdqa [r0 + r2 - 16], xmm0
+    sub r2d, 16
 .copy32:
     test r2d, 32
     jz .copy64start
-    sub r2d, 32
-    movdqa xmm0, [r1 + r2 + 0]
-    movdqa [r0 + r2 + 0], xmm0
-    movdqa xmm1, [r1 + r2 + 16]
-    movdqa [r0 + r2 + 16], xmm1
+    movdqa xmm0, [r1 + r2 - 32]
+    movdqa [r0 + r2 - 32], xmm0
+    movdqa xmm1, [r1 + r2 - 16]
+    movdqa [r0 + r2 - 16], xmm1
+    sub r2d, 32
 .copy64start
     test r2d, r2d
     jz .ret
 .copy64:
-    sub r2d, 64
-    movdqa xmm0, [r1 + r2 + 0]
-    movdqa [r0 + r2 + 0], xmm0
-    movdqa xmm1, [r1 + r2 + 16]
-    movdqa [r0 + r2 + 16], xmm1
-    movdqa xmm2, [r1 + r2 + 32]
-    movdqa [r0 + r2 + 32], xmm2
-    movdqa xmm3, [r1 + r2 + 48]
-    movdqa [r0 + r2 + 48], xmm3
+    movdqa xmm0, [r1 + r2 - 64]
+    movdqa [r0 + r2 - 64], xmm0
+    movdqa xmm1, [r1 + r2 - 48]
+    movdqa [r0 + r2 - 48], xmm1
+    movdqa xmm2, [r1 + r2 - 32]
+    movdqa [r0 + r2 - 32], xmm2
+    movdqa xmm3, [r1 + r2 - 16]
+    movdqa [r0 + r2 - 16], xmm3
+    sub r2d, 64
     jg .copy64
 .ret:
     REP_RET
@@ -1313,17 +1313,17 @@ cglobal integral_init4v_mmx, 3,5
     mova m0, [r0+r2]
     mova m4, [r4+r2]
 .loop:
-    sub r2, 8
     mova m1, m4
     psubw m1, m0
-    mova m4, [r4+r2]
-    mova m0, [r0+r2]
+    mova m4, [r4+r2-8]
+    mova m0, [r0+r2-8]
     paddw m1, m4
-    mova m3, [r3+r2]
+    mova m3, [r3+r2-8]
     psubw m1, m0
     psubw m3, m0
-    mova [r0+r2], m1
-    mova [r1+r2], m3
+    mova [r0+r2-8], m1
+    mova [r1+r2-8], m3
+    sub r2, 8
     jge .loop
     REP_RET
......
@@ -123,15 +123,15 @@ cglobal pixel_ssd_%1x%2, 4,5,6
 %define offset mmsize
 %define num_rows 1
 %endif
-    lea r0, [r0+r1*2*num_rows]
     psubw m1, [r2]
     psubw m3, [r2+offset]
-    lea r2, [r2+r3*2*num_rows]
     pmaddwd m1, m1
     pmaddwd m3, m3
-    dec r4
+    lea r0, [r0+r1*2*num_rows]
+    lea r2, [r2+r3*2*num_rows]
     paddd m0, m1
     paddd m0, m3
+    dec r4
     jg .loop
     HADDD m0, m5
     movd eax, m0
@@ -158,7 +158,6 @@ cglobal pixel_ssd_%1x%2, 4,5
     psubw m7, m2
     pmaddwd m3, m3
     pmaddwd m5, m5
-    dec r4
     lea r0, [r0+r1*2]
     lea r2, [r2+r3*2]
     pmaddwd m7, m7
@@ -166,6 +165,7 @@ cglobal pixel_ssd_%1x%2, 4,5
     paddd m5, m7
     paddd m0, m1
     paddd m0, m5
+    dec r4
     jg .loop
     HADDD m0, m7
     movd eax, m0
@@ -540,12 +540,12 @@ cglobal pixel_ssd_nv12_core, 6,7
     psubusb m1, [r0+r6]
     por m0, m1
     psrlw m2, m0, 8
-    add r6, mmsize
     pand m0, m5
     pmaddwd m2, m2
     pmaddwd m0, m0
     paddd m3, m0
     paddd m4, m2
+    add r6, mmsize
     jl .loopx
     add r0, r1
     add r2, r3
@@ -646,8 +646,8 @@ SSD_NV12
     punpcklbw m3, m7
     punpckhbw m4, m7
 %endif ; !HIGH_BIT_DEPTH
-    dec r2d
     VAR_CORE
+    dec r2d
     jg .loop
 %endmacro
@@ -3382,7 +3382,7 @@ ALIGN 16
     test r2, r2
 %else
     mov r3, r2
-    or r3d, [r6+r1+4]
+    add r3d, [r6+r1+4]
 %endif
     jz .loopi0
     xor r3d, r3d
......
@@ -792,27 +792,27 @@ OPTIMIZE_CHROMA_2x2_DC
 cglobal denoise_dct, 4,4,8
     pxor m6, m6
 .loop:
-    sub r3, mmsize/2
-    mova m2, [r0+r3*4+0*mmsize]
-    mova m3, [r0+r3*4+1*mmsize]
+    mova m2, [r0+r3*4-2*mmsize]
+    mova m3, [r0+r3*4-1*mmsize]
     ABSD m0, m2
     ABSD m1, m3
     mova m4, m0
     mova m5, m1
-    psubd m0, [r2+r3*4+0*mmsize]
-    psubd m1, [r2+r3*4+1*mmsize]
+    psubd m0, [r2+r3*4-2*mmsize]
+    psubd m1, [r2+r3*4-1*mmsize]
     pcmpgtd m7, m0, m6
     pand m0, m7
     pcmpgtd m7, m1, m6
     pand m1, m7
     PSIGND m0, m2
     PSIGND m1, m3
-    mova [r0+r3*4+0*mmsize], m0
-    mova [r0+r3*4+1*mmsize], m1
-    paddd m4, [r1+r3*4+0*mmsize]
-    paddd m5, [r1+r3*4+1*mmsize]
-    mova [r1+r3*4+0*mmsize], m4
-    mova [r1+r3*4+1*mmsize], m5
+    mova [r0+r3*4-2*mmsize], m0
+    mova [r0+r3*4-1*mmsize], m1
+    paddd m4, [r1+r3*4-2*mmsize]
+    paddd m5, [r1+r3*4-1*mmsize]
+    mova [r1+r3*4-2*mmsize], m4
+    mova [r1+r3*4-1*mmsize], m5
+    sub r3, mmsize/2
     jg .loop
     REP_RET
 %endmacro
@@ -837,29 +837,29 @@ DENOISE_DCT
 cglobal denoise_dct, 4,4,7
     pxor m6, m6
 .loop:
-    sub r3, mmsize
-    mova m2, [r0+r3*2+0*mmsize]
-    mova m3, [r0+r3*2+1*mmsize]
+    mova m2, [r0+r3*2-2*mmsize]
+    mova m3, [r0+r3*2-1*mmsize]
     ABSW m0, m2, sign
     ABSW m1, m3, sign
-    psubusw m4, m0, [r2+r3*2+0*mmsize]
-    psubusw m5, m1, [r2+r3*2+1*mmsize]
+    psubusw m4, m0, [r2+r3*2-2*mmsize]
+    psubusw m5, m1, [r2+r3*2-1*mmsize]
     PSIGNW m4, m2
     PSIGNW m5, m3
-    mova [r0+r3*2+0*mmsize], m4
-    mova [r0+r3*2+1*mmsize], m5
+    mova [r0+r3*2-2*mmsize], m4
+    mova [r0+r3*2-1*mmsize], m5
     punpcklwd m2, m0, m6
     punpcklwd m3, m1, m6
     punpckhwd m0, m6
     punpckhwd m1, m6
-    paddd m2, [r1+r3*4+0*mmsize]
-    paddd m0, [r1+r3*4+1*mmsize]
-    paddd m3, [r1+r3*4+2*mmsize]
-    paddd m1, [r1+r3*4+3*mmsize]
-    mova [r1+r3*4+0*mmsize], m2
-    mova [r1+r3*4+1*mmsize], m0
-    mova [r1+r3*4+2*mmsize], m3
-    mova [r1+r3*4+3*mmsize], m1
+    paddd m2, [r1+r3*4-4*mmsize]
+    paddd m0, [r1+r3*4-3*mmsize]
+    paddd m3, [r1+r3*4-2*mmsize]
+    paddd m1, [r1+r3*4-1*mmsize]
+    mova [r1+r3*4-4*mmsize], m2
+    mova [r1+r3*4-3*mmsize], m0
+    mova [r1+r3*4-2*mmsize], m3
+    mova [r1+r3*4-1*mmsize], m1
+    sub r3, mmsize
     jg .loop
     REP_RET
 %endmacro
@@ -1038,7 +1038,7 @@ cglobal decimate_score64, 1,4
     or r1, r2
     xor r1, -1
     je .ret
-    or eax, r3d
+    add eax, r3d
     jne .ret9
 .loop:
     bsf rcx, r1
@@ -1074,7 +1074,7 @@ cglobal decimate_score64, 1,5
     je .tryret
     xor r4, -1
 .cont:
-    or r0, r2
+    add r0, r2
     jne .ret9 ;r0 is zero at this point, so we don't need to zero it
 .loop:
     bsf ecx, r3
......
@@ -364,8 +364,8 @@ cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
     SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
 .loop:
     SAD_X%1_INC_P
-    dec r6
     SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
+    dec r6
     jg .loop
 %if %1 == 4
     mov r6, r6m
......