;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2022 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif
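
; Example (illustrative): a project named "foo" would typically do
;     %define private_prefix foo
; before %include-ing this file, so that symbols declared with cglobal are
; mangled as foo_<name> (symbols declared with cvisible use public_prefix).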

%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif
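
; Example (illustrative): if the build environment guarantees 32-byte stack
; alignment, defining
;     %define STACK_ALIGNMENT 32
; before including this file lets 32-byte (AVX2-sized) stack allocations skip
; the manual realignment path in ALLOC_STACK.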

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%define FORMAT_MACHO 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,macho
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho32
    %define FORMAT_MACHO 1
%elifidn __OUTPUT_FORMAT__,macho64
    %define FORMAT_MACHO 1
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Use VEX-encoding even in non-AVX functions
%ifndef FORCE_VEX_ENCODING
    %define FORCE_VEX_ENCODING 0
%endif

%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
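
; Example (illustrative):
;     SECTION_RODATA 32
; switches to a read-only data section (.rdata on Windows, .rodata elsewhere)
; aligned to 32 bytes; with no argument the default alignment is 16.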

%if ARCH_X86_64
    %define PIC 1 ; always use PIC on x86-64
    default rel
%elifidn __OUTPUT_FORMAT__,win32
    %define PIC 0 ; PIC isn't used on 32-bit Windows
%elifndef PIC
    %define PIC 0
%endif

%define HAVE_PRIVATE_EXTERN 1
%ifdef __NASM_VER__
    %use smartalign
    %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14
        %define HAVE_PRIVATE_EXTERN 0
    %endif
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
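; e.g. (illustrative): "mov r0d, r1d" copies the low 32 bits of arg 1 into
; arg 0's register, while "mov r2, r3mp" fetches arg 3 at native size from
; wherever the calling convention originally placed it.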

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro
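
; Example (illustrative):
;     DECLARE_REG_TMP 2, 0, 1
; makes t0 an alias for r2, t1 for r0 and t2 for r1 (with sized variants such
; as t0d/t0w provided by DECLARE_REG_TMP_SIZE below).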

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro LEA 2
%if ARCH_X86_64
    lea %1, [%2]
%elif PIC
    call $+5 ; special-cased to not affect the RSB on most CPUs
    pop %1
    add %1, (%2)-$+1
%else
    mov %1, %2
%endif
%endmacro
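
; Example (illustrative; "pw_table" is a hypothetical symbol):
;     LEA r5, pw_table
; loads the address of pw_table into r5 using a rip-relative lea on x86-64,
; a call/pop sequence for 32-bit PIC, and a plain mov otherwise.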

; Repeats an instruction/operation for multiple arguments.
; Example usage: "REPX {psrlw x, 8}, m0, m1, m2, m3"
%macro REPX 2-* ; operation, args
    %xdefine %%f(x) %1
    %rep %0 - 1
        %rotate 1
        %%f(%1)
    %endrep
%endmacro
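
; The example above expands to:
;     psrlw m0, 8
;     psrlw m1, 8
;     psrlw m2, 8
;     psrlw m3, 8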

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%if ARCH_X86_64 == 0
    %define movsxd movifnidn
%endif

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
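
; Example (illustrative): in a function whose first three arguments are a
; destination pointer, a source pointer and a length,
;     DEFINE_ARGS dst, src, len
; makes dstq/dstd/dstm etc. aliases for r0's variants, srcq for r1 and lenq
; for r2, replacing any names set up by a previous DEFINE_ARGS.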

%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

; Large stack allocations on Windows need to use stack probing in order
; to guarantee that all stack memory is committed before accessing it.
; This is done by ensuring that the guard page(s) at the end of the
; currently committed pages are touched prior to any pages beyond that.
%if WIN64
    %assign STACK_PROBE_SIZE 8192
%elifidn __OUTPUT_FORMAT__, win32
    %assign STACK_PROBE_SIZE 4096
%else
    %assign STACK_PROBE_SIZE 0
%endif

%macro PROBE_STACK 1 ; stack_size
    %if STACK_PROBE_SIZE
        %assign %%i STACK_PROBE_SIZE
        %rep %1 / STACK_PROBE_SIZE
            mov eax, [rsp-%%i]
            %assign %%i %%i+STACK_PROBE_SIZE
        %endrep
    %endif
%endmacro

%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                PROBE_STACK stack_size_padded
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                PROBE_STACK stack_size_padded
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 0-1 0
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6 + high_mm_regs
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i 8
        %rep %%xmm_regs_on_stack
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16 + high_mm_regs
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad %%xmm_regs_on_stack*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    %assign xmm_regs_used %3
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    %if %0 > 4
        %ifnum %4
            DEFINE_ARGS %5
        %else
            DEFINE_ARGS %4, %5
        %endif
    %elifnnum %4
        DEFINE_ARGS %4
    %endif
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM_INTERNAL 0
    %endmacro
    %macro WIN64_RESTORE_XMM 0
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 1-2 1 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro
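
; Example (illustrative; "name_suffixed" is a hypothetical cglobal'd function):
;     TAIL_CALL name_suffixed, 1
; emits a plain jmp when the current function has no epilogue to run, and a
; call followed by RET otherwise.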

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %ifndef cglobaled_%2
        %if %1
            %xdefine %2 mangle(private_prefix %+ _ %+ %2)
        %else
            %xdefine %2 mangle(public_prefix %+ _ %+ %2)
        %endif
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        %if %1
            global %2:function hidden
        %else
            global %2:function
        %endif
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN && %1
        global %2:private_extern
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global current_function %+ %1:private_extern
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %elif FORMAT_MACHO && HAVE_PRIVATE_EXTERN
        global %1:private_extern
    %else
        global %1
    %endif
    %1: %2
%endmacro
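
; Example (illustrative):
;     const pw_16, times 8 dw 16
; emits a 16-byte constant whose symbol is mangled to <private_prefix>_pw_16
; and exported with hidden/private visibility where the object format supports it.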

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
%assign cpuflags_gfni     (1<<13)| cpuflags_sse42
%assign cpuflags_avx      (1<<14)| cpuflags_sse42
%assign cpuflags_xop      (1<<15)| cpuflags_avx
%assign cpuflags_fma4     (1<<16)| cpuflags_avx
%assign cpuflags_fma3     (1<<17)| cpuflags_avx
%assign cpuflags_bmi1     (1<<18)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<19)| cpuflags_bmi1
%assign cpuflags_avx2     (1<<20)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512   (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL

%assign cpuflags_cache32  (1<<22)
%assign cpuflags_cache64  (1<<23)
%assign cpuflags_aligned  (1<<24) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<25)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
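
; Example (illustrative): in a function declared with INIT_XMM sse4 (or any
; superset such as avx2), "%if cpuflag(sse4)" evaluates to 1, so instructions
; requiring SSE4 can be guarded with it.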

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VER__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VER__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %if %%prev_mmregs > num_mmregs
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i