Commit 144001ed authored by Loren Merritt

asm cosmetics

parent 95b2dd99
......@@ -386,17 +386,18 @@ AVG_CACHELINE_CHECK 20, 64, sse2
; pixel copy
;=============================================================================
%macro COPY4 3
%1 mm0, [r2]
%1 mm1, [r2+r3]
%1 mm2, [r2+r3*2]
%1 mm3, [r2+%3]
%1 [r0], mm0
%1 [r0+r1], mm1
%1 [r0+r1*2], mm2
%1 [r0+%2], mm3
; COPY4 store_insn, load_insn, dst_stride3, src_stride3
; Copies 4 rows of pixels from src (r2, stride r3) to dst (r0, stride r1)
; through registers m0-m3. %3/%4 hold 3*stride (callers set them up with
; lea, e.g. "lea r5, [r3*3]") so rows 0..3 can be addressed without
; modifying the base pointers. Load and store instructions are passed
; separately so an unaligned load can be paired with an aligned store.
%macro COPY4 4
%2 m0, [r2]
%2 m1, [r2+r3]
%2 m2, [r2+r3*2]
%2 m3, [r2+%4]
%1 [r0], m0
%1 [r0+r1], m1
%1 [r0+r1*2], m2
%1 [r0+%3], m3
%endmacro
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride, int i_height )
......@@ -406,18 +407,18 @@ cglobal x264_mc_copy_w4_mmx, 4,6
lea r5, [r3*3]
lea r4, [r1*3]
je .end
COPY4 movd, r4, r5
COPY4 movd, movd, r4, r5
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
.end:
COPY4 movd, r4, r5
COPY4 movd, movd, r4, r5
RET
cglobal x264_mc_copy_w8_mmx, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
COPY4 movq, r5, r6
COPY4 movq, movq, r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
......@@ -450,19 +451,13 @@ cglobal x264_mc_copy_w16_mmx, 5,7
jg .height_loop
REP_RET
INIT_XMM
%macro COPY_W16_SSE2 2
cglobal %1, 5,7
lea r6, [r3*3]
lea r5, [r1*3]
.height_loop:
%2 xmm0, [r2]
%2 xmm1, [r2+r3]
%2 xmm2, [r2+r3*2]
%2 xmm3, [r2+r6]
movdqa [r0], xmm0
movdqa [r0+r1], xmm1
movdqa [r0+r1*2], xmm2
movdqa [r0+r5], xmm3
COPY4 movdqa, %2, r5, r6
lea r2, [r2+r3*4]
lea r0, [r0+r1*4]
sub r4d, 4
......@@ -485,7 +480,7 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%macro SPLATW 2
%if regsize==16
%if mmsize==16
pshuflw %1, %2, 0
movlhps %1, %1
%else
......@@ -531,6 +526,7 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
.height_loop:
%endmacro
INIT_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
;-----------------------------------------------------------------------------
......@@ -548,9 +544,9 @@ cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4,1
cglobal x264_pixel_avg_weight_w%2_%1, 4,5
BIWEIGHT_START 1
%assign x 0
%rep %2*2/regsize
%rep %2*2/mmsize
BIWEIGHT [r0+x], [r2+x]
%assign x x+regsize/2
%assign x x+mmsize/2
%endrep
add r0, r1
add r2, r3
......@@ -559,7 +555,6 @@ cglobal x264_pixel_avg_weight_w%2_%1, 4,5
REP_RET
%endmacro
INIT_MMX
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
......@@ -677,7 +672,7 @@ cglobal x264_prefetch_ref_mmxext, 3,3
;-----------------------------------------------------------------------------
%macro MC_CHROMA 1
cglobal x264_mc_chroma_%1, 0,6,1
%if regsize == 16
%if mmsize == 16
cmp dword r6m, 4
jle x264_mc_chroma_mmxext %+ .skip_prologue
%endif
......@@ -745,7 +740,7 @@ cglobal x264_mc_chroma_%1, 0,6,1
dec r4d
jnz .loop2d
%if regsize == 8
%if mmsize == 8
sub dword r6m, 8
jnz .finish ; width != 8 so assume 4
%ifdef ARCH_X86_64
......@@ -760,7 +755,7 @@ cglobal x264_mc_chroma_%1, 0,6,1
jmp .loop2d
%else
REP_RET
%endif ; regsize
%endif ; mmsize
.mc1dy:
and r5d, 7
......@@ -778,7 +773,7 @@ cglobal x264_mc_chroma_%1, 0,6,1
movifnidn r0d, r0m
movifnidn r1d, r1m
mov r4d, r7m
%if regsize == 8
%if mmsize == 8
cmp dword r6m, 8
je .loop1d_w8
%endif
......@@ -802,7 +797,7 @@ cglobal x264_mc_chroma_%1, 0,6,1
.finish:
REP_RET
%if regsize == 8
%if mmsize == 8
.loop1d_w8:
movu m0, [r2+r5]
mova m1, [r2]
......@@ -829,7 +824,7 @@ cglobal x264_mc_chroma_%1, 0,6,1
dec r4d
jnz .loop1d_w8
REP_RET
%endif ; regsize
%endif ; mmsize
%endmacro ; MC_CHROMA
INIT_MMX
......
......@@ -91,7 +91,7 @@ SECTION .text
%ifnidn %4, %2
mova %4, %2
%endif
%if regsize == 8
%if mmsize == 8
psllq %1, (8-%3)*8
psrlq %4, %3*8
%else
......@@ -124,22 +124,22 @@ cglobal x264_hpel_filter_v_%1, 5,6,1
LOAD_ADD m1, [r1 ], [r5+r3*2] ; a0
LOAD_ADD m2, [r1+r3 ], [r5+r3 ] ; b0
LOAD_ADD m3, [r1+r3*2], [r5 ] ; c0
LOAD_ADD m4, [r1 +regsize/2], [r5+r3*2+regsize/2] ; a1
LOAD_ADD m5, [r1+r3 +regsize/2], [r5+r3 +regsize/2] ; b1
LOAD_ADD m6, [r1+r3*2+regsize/2], [r5 +regsize/2] ; c1
LOAD_ADD m4, [r1 +mmsize/2], [r5+r3*2+mmsize/2] ; a1
LOAD_ADD m5, [r1+r3 +mmsize/2], [r5+r3 +mmsize/2] ; b1
LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5 +mmsize/2] ; c1
FILT_V2
mova m7, [pw_16 GLOBAL]
mova [r2+r4*2], m1
mova [r2+r4*2+regsize], m4
mova [r2+r4*2+mmsize], m4
paddw m1, m7
paddw m4, m7
psraw m1, 5
psraw m4, 5
packuswb m1, m4
movnt [r0+r4], m1
add r1, regsize
add r5, regsize
add r4, regsize
add r1, mmsize
add r5, mmsize
add r4, mmsize
jl .loop
REP_RET
%endmacro
......@@ -460,11 +460,11 @@ cglobal x264_memcpy_aligned_sse2, 3,3
cglobal x264_memzero_aligned_%1, 2,2
pxor m0, m0
.loop:
sub r1d, regsize*8
sub r1d, mmsize*8
%assign i 0
%rep 8
mova [r0 + r1 + i], m0
%assign i i+regsize
%assign i i+mmsize
%endrep
jg .loop
REP_RET
......@@ -495,9 +495,9 @@ MEMZERO sse2
%endmacro
%macro FILT16x2 4
mova m3, [r0+%4+regsize]
mova m3, [r0+%4+mmsize]
mova m2, [r0+%4]
pavgb m3, [r0+%4+r5+regsize]
pavgb m3, [r0+%4+r5+mmsize]
pavgb m2, [r0+%4+r5]
PALIGNR %1, m3, 1, m6
pavgb %1, m3
......@@ -564,13 +564,13 @@ cglobal x264_frame_init_lowres_core_%1, 6,7
mov r6d, r6m
sub r6d, r7m
PUSH r6
%define dst_gap [rsp+push_size]
%define dst_gap [rsp+gprsize]
mov r6d, r5d
sub r6d, r7m
shl r6d, 1
PUSH r6
%define src_gap [rsp]
%if regsize == 16
%if mmsize == 16
; adjust for the odd end case
mov r6d, r7m
and r6d, 8
......@@ -579,7 +579,7 @@ cglobal x264_frame_init_lowres_core_%1, 6,7
sub r3, r6
sub r4, r6
add dst_gap, r6d
%endif ; regsize
%endif ; mmsize
pcmpeqb m7, m7
psrlw m7, 8
.vloop:
......@@ -590,7 +590,7 @@ cglobal x264_frame_init_lowres_core_%1, 6,7
pavgb m0, m1
pavgb m1, [r0+r5*2]
%endif
%if regsize == 16
%if mmsize == 16
test r6d, 8
jz .hloop
sub r0, 16
......@@ -604,15 +604,15 @@ cglobal x264_frame_init_lowres_core_%1, 6,7
mova m0, m2
mova m1, m3
sub r6d, 8
%endif ; regsize
%endif ; mmsize
.hloop:
sub r0, regsize*2
sub r1, regsize
sub r2, regsize
sub r3, regsize
sub r4, regsize
sub r0, mmsize*2
sub r1, mmsize
sub r2, mmsize
sub r3, mmsize
sub r4, mmsize
%ifdef m8
FILT8x4 m0, m1, m2, m3, m10, m11, regsize
FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
mova m8, m0
mova m9, m1
FILT8x4 m2, m3, m0, m1, m4, m5, 0
......@@ -631,7 +631,7 @@ cglobal x264_frame_init_lowres_core_%1, 6,7
FILT16x2 m0, r1, r2, 0
FILT16x2 m1, r3, r4, r5
%endif
sub r6d, regsize
sub r6d, mmsize
jg .hloop
.skip:
mov r6, dst_gap
......@@ -642,7 +642,7 @@ cglobal x264_frame_init_lowres_core_%1, 6,7
sub r4, r6
dec dword r8m
jg .vloop
ADD rsp, 2*push_size
ADD rsp, 2*gprsize
emms
RET
%endmacro ; FRAME_INIT_LOWRES
......
......@@ -35,7 +35,7 @@ mask_ff: times 16 db 0xff
SECTION .text
%macro HADDD 2 ; sum junk
%if regsize == 16
%if mmsize == 16
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
......@@ -131,10 +131,10 @@ cglobal x264_pixel_ssd_%1x%2_%3, 4,4
pxor m7, m7
%assign i 0
%rep %2/2
%if %1 > regsize
SSD_FULL 0, 0, regsize, regsize, i, 0
SSD_FULL r1, r3, r1+regsize, r3+regsize, 1, i<%2/2-1
%elif %1 == regsize
%if %1 > mmsize
SSD_FULL 0, 0, mmsize, mmsize, i, 0
SSD_FULL r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/2-1
%elif %1 == mmsize
SSD_FULL 0, 0, r1, r3, i, i<%2/2-1
%else
SSD_HALF 0, 0, r1, r3, i, i<%2/2-1
......
......@@ -112,7 +112,7 @@ cglobal %1, 1,1
%assign x 0
%rep %2
QUANT_ONE [r0+x], m6, m7
%assign x x+regsize
%assign x x+mmsize
%endrep
RET
%endmacro
......@@ -125,7 +125,7 @@ cglobal %1, 3,3
%assign x 0
%rep %2
QUANT_ONE [r0+x], [r1+x], [r2+x]
%assign x x+regsize
%assign x x+mmsize
%endrep
RET
%endmacro
......@@ -338,33 +338,33 @@ cglobal x264_denoise_dct_core_%1, 4,5
movzx r4d, word [r0] ; backup DC coefficient
pxor m7, m7
.loop:
sub r3, regsize
mova m2, [r0+r3*2+0*regsize]
mova m3, [r0+r3*2+1*regsize]
sub r3, mmsize
mova m2, [r0+r3*2+0*mmsize]
mova m3, [r0+r3*2+1*mmsize]
PABSW m0, m2
PABSW m1, m3
mova m4, m0
mova m5, m1
psubusw m0, [r2+r3*2+0*regsize]
psubusw m1, [r2+r3*2+1*regsize]
psubusw m0, [r2+r3*2+0*mmsize]
psubusw m1, [r2+r3*2+1*mmsize]
PSIGNW m0, m2
PSIGNW m1, m3
mova [r0+r3*2+0*regsize], m0
mova [r0+r3*2+1*regsize], m1
mova [r0+r3*2+0*mmsize], m0
mova [r0+r3*2+1*mmsize], m1
mova m2, m4
mova m3, m5
punpcklwd m4, m7
punpckhwd m2, m7
punpcklwd m5, m7
punpckhwd m3, m7
paddd m4, [r1+r3*4+0*regsize]
paddd m2, [r1+r3*4+1*regsize]
paddd m5, [r1+r3*4+2*regsize]
paddd m3, [r1+r3*4+3*regsize]
mova [r1+r3*4+0*regsize], m4
mova [r1+r3*4+1*regsize], m2
mova [r1+r3*4+2*regsize], m5
mova [r1+r3*4+3*regsize], m3
paddd m4, [r1+r3*4+0*mmsize]
paddd m2, [r1+r3*4+1*mmsize]
paddd m5, [r1+r3*4+2*mmsize]
paddd m3, [r1+r3*4+3*mmsize]
mova [r1+r3*4+0*mmsize], m4
mova [r1+r3*4+1*mmsize], m2
mova [r1+r3*4+2*mmsize], m5
mova [r1+r3*4+3*mmsize], m3
jg .loop
mov [r0], r4w ; restore DC coefficient
RET
......
......@@ -37,7 +37,7 @@
%endif
%endmacro
; PIC support macros. All these macros are totally harmless when __PIC__ is
; PIC support macros. All these macros are totally harmless when PIC is
; not defined but can ruin everything if misused in PIC mode. On x86_32, shared
; objects cannot directly access global variables by address, they need to
; go through the GOT (global offset table). Most OSes do not care about it
......@@ -55,7 +55,7 @@
; Before in both execution order and compiled code order (so GLOBAL knows
; which register the GOT is in).
%ifndef __PIC__
%ifndef PIC
%define GLOBAL
%macro picgetgot 1
%endmacro
......@@ -99,8 +99,13 @@
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used, not including PIC. pushes callee-saved regs if needed.
; %3 = whether global constants are used in this function. inits x86_32 PIC if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal
; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src), one local variable (tmp), and not using globals
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need more flexible macro.
......@@ -144,19 +149,19 @@ DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
%ifdef ARCH_X86_64
%define push_size 8
%define gprsize 8
%else
%define push_size 4
%define gprsize 4
%endif
%macro PUSH 1
push %1
%assign stack_offset stack_offset+push_size
%assign stack_offset stack_offset+gprsize
%endmacro
%macro POP 1
pop %1
%assign stack_offset stack_offset-push_size
%assign stack_offset stack_offset-gprsize
%endmacro
%macro SUB 2
......@@ -191,6 +196,32 @@ DECLARE_REG_SIZE bp, bpl
%endif
%endmacro
; DEFINE_ARGS name0, name1, ...
; Gives symbolic names to the argument registers: for each name passed,
; defines <name>q/<name>d/<name>w/<name>b as the 64/32/16/8-bit forms of
; r0, r1, ... in order. Aliases created by a previous invocation are torn
; down first via CAT_UNDEF (defined elsewhere); n_arg_names remembers how
; many aliases the last invocation created so they can all be removed.
%macro DEFINE_ARGS 0-*
%ifdef n_arg_names
    ; undo the aliases from the previous DEFINE_ARGS, if any
%assign %%i 0
%rep n_arg_names
CAT_UNDEF arg_name %+ %%i, q
CAT_UNDEF arg_name %+ %%i, d
CAT_UNDEF arg_name %+ %%i, w
CAT_UNDEF arg_name %+ %%i, b
CAT_UNDEF arg_name, %%i
%assign %%i %%i+1
%endrep
%endif
    ; define <name>{q,d,w,b} -> r<i>{q,d,w,b} for each name given
%assign %%i 0
%rep %0
%xdefine %1q r %+ %%i %+ q
%xdefine %1d r %+ %%i %+ d
%xdefine %1w r %+ %%i %+ w
%xdefine %1b r %+ %%i %+ b
CAT_XDEFINE arg_name, %%i, %1
%assign %%i %%i+1
%rotate 1
%endrep
%assign n_arg_names %%i
%endmacro
%ifdef ARCH_X86_64 ;========================================================
DECLARE_REG 0, rdi, edi, di, dil, edi
......@@ -209,11 +240,12 @@ DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%endif
%endmacro
%macro PROLOGUE 3
; PROLOGUE #args, #regs [, pic [, arg_names...]]  -- x86-64 variant.
; Checks that enough registers were requested for the declared argument
; count (at most 7 are mapped), resets the stack-offset tracker, loads
; arg 7 from the stack if the function uses it (r6's memory form is
; [rsp + stack_offset + 8] per DECLARE_REG 6 above; args 0-5 arrive in
; registers), and creates named register aliases via DEFINE_ARGS.
; %3 (pic) is accepted but unused here -- presumably because x86-64 code
; needs no GOT register; confirm against the x86-32 PROLOGUE.
%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
ASSERT %2 >= %1
ASSERT %2 <= 7
%assign stack_offset 0
LOAD_IF_USED 6, %1 ; fetch the stack-resident 7th arg if declared
DEFINE_ARGS %4
%endmacro
%macro RET 0
......@@ -256,11 +288,11 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%endif
%endmacro
%macro PROLOGUE 3
%macro PROLOGUE 2-4+ 0 ; #args, #regs, pic, arg_names...
ASSERT %2 >= %1
%assign stack_offset 0
%assign regs_used %2
%ifdef __PIC__
%ifdef PIC
%if %3
%assign regs_used regs_used+1
%endif
......@@ -280,6 +312,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%if %3
picgetgot r%2
%endif
DEFINE_ARGS %4
%endmacro
%macro RET 0
......@@ -309,7 +342,7 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%assign function_align 16
; Symbol prefix for C linkage
%macro cglobal 1
%macro cglobal 1-2+
%ifidn __OUTPUT_FORMAT__,elf
%ifdef PREFIX
global _%1:function hidden
......@@ -328,16 +361,9 @@ DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
align function_align
%1:
RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
%endmacro
%macro cglobal 3
cglobal %1
PROLOGUE %2, %3, 0
%endmacro
%macro cglobal 4
cglobal %1
PROLOGUE %2, %3, %4
%if %0 > 1
PROLOGUE %2
%endif
%endmacro
%macro cextern 1
......@@ -360,10 +386,6 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
; merge mmx and sse*
; CAT_DEFINE name, idx, val
; Token-pastes %1 and %2 into a single identifier and %defines it to %3
; (lazy expansion; contrast with the %xdefine-based CAT_XDEFINE).
%macro CAT_DEFINE 3
%define %1%2 %3
%endmacro
; CAT_XDEFINE name, idx, val
; Token-pastes %1 and %2 into a single identifier and %xdefines it to %3.
; %xdefine expands %3 eagerly at definition time, so the alias captures
; the current value rather than re-expanding on every use.
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
%endmacro
......@@ -374,7 +396,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%macro INIT_MMX 0
%define RESET_MM_PERMUTATION INIT_MMX
%define regsize 8
%define mmsize 8
%define num_mmregs 8
%define mova movq
%define movu movq
......@@ -382,8 +404,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define movnt movntq
%assign %%i 0
%rep 8
CAT_DEFINE m, %%i, mm %+ %%i
CAT_DEFINE nmm, %%i, %%i
CAT_XDEFINE m, %%i, mm %+ %%i
CAT_XDEFINE nmm, %%i, %%i
%assign %%i %%i+1
%endrep
%rep 8
......@@ -395,7 +417,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%macro INIT_XMM 0
%define RESET_MM_PERMUTATION INIT_XMM
%define regsize 16
%define mmsize 16
%define num_mmregs 8
%ifdef ARCH_X86_64
%define num_mmregs 16
......@@ -406,8 +428,8 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define movnt movntdq
%assign %%i 0
%rep num_mmregs
CAT_DEFINE m, %%i, xmm %+ %%i
CAT_DEFINE nxmm, %%i, %%i
CAT_XDEFINE m, %%i, xmm %+ %%i
CAT_XDEFINE nxmm, %%i, %%i
%assign %%i %%i+1
%endrep
%endmacro
......
......@@ -258,7 +258,7 @@ case $host_cpu in
ARCH="X86_64"
AS="yasm"
if [ "$SYS" = MACOSX ];then
ASFLAGS="-f macho64 -m amd64 -D__PIC__ -DPREFIX"
ASFLAGS="-f macho64 -m amd64 -DPIC -DPREFIX"
CFLAGS="$CFLAGS -arch x86_64"
LDFLAGS="$LDFLAGS -arch x86_64"
else
......@@ -390,7 +390,7 @@ fi
if [ "$pic" = "yes" ] ; then
CFLAGS="$CFLAGS -fPIC"
ASFLAGS="$ASFLAGS -D__PIC__"
ASFLAGS="$ASFLAGS -DPIC"
# resolve textrels in the x86 asm
cc_check stdio.h -Wl,-Bsymbolic && LDFLAGS="$LDFLAGS -Wl,-Bsymbolic"
fi
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment