;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2018 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Henrik Gramner <henrik@gramner.com>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible.  Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well.  Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%define FORMAT_ELF 0
%ifidn __OUTPUT_FORMAT__,elf
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf32
    %define FORMAT_ELF 1
%elifidn __OUTPUT_FORMAT__,elf64
    %define FORMAT_ELF 1
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,win32
        SECTION .rdata align=%1
    %elif WIN64
        SECTION .rdata align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%ifdef __NASM_VER__
    %use smartalign
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.
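; An illustrative body for that declaration (hypothetical code, not taken from
; x264) might be:
; cglobal foo, 2,3,7,0x40, dst, src, tmp
;     mov   tmpd, [srcq]  ; named args expand to the ABI-correct registers
;     add   tmpd, tmpd
;     mov   [dstq], tmpd
;     RET                 ; releases the 0x40-byte stack area and pops saved regs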

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
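;
; e.g. (illustrative): in a function declared with "cglobal f, 3,4",
;     mov r3d, r1d   ; dword views of a temporary and an argument register
;     mov r2,  r2mp  ; reload argument 2 from wherever the caller passed it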

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments
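; e.g. (illustrative, not taken from x264):
;     DECLARE_REG_TMP 4,1,0  ; t0=r4, t1=r1, t2=r0 for this code path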

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assertion ``%1'' failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                ; Reserve an additional register for storing the original stack pointer, but avoid using
                ; eax/rax for this purpose since it can potentially get overwritten as a return value.
                %assign regs_used (regs_used + 1)
                %if ARCH_X86_64 && regs_used == 7
                    %assign regs_used 8
                %elif ARCH_X86_64 == 0 && regs_used == 1
                    %assign regs_used 2
                %endif
            %endif
            %if ARCH_X86_64 && regs_used < 5 + UNIX64 * 3
                ; Ensure that we don't clobber any registers containing arguments. For UNIX64 we also preserve r6 (rax)
                ; since it's used as a hidden argument in vararg functions to specify the number of vector registers used.
                %assign regs_used 5 + UNIX64 * 3
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6 + high_mm_regs
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i 8
        %rep %%xmm_regs_on_stack
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16 + high_mm_regs
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad %%xmm_regs_on_stack*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 0
    %assign %%pad_size 0
    %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
    %if %%xmm_regs_on_stack > 0
        %assign %%i xmm_regs_used - high_mm_regs
        %rep %%xmm_regs_on_stack
            %assign %%i %%i-1
            movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7 + high_mm_regs
        movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6 + high_mm_regs
        movaps xmm6, [rsp + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 0
    WIN64_RESTORE_XMM_INTERNAL
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    %assign xmm_regs_used %3
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required

%macro RET 0
    %if stack_size_padded > 0
        %if required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if vzeroupper_required
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 0
    %endmacro
    %macro WIN64_PUSH_XMM 0
    %endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
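; A ret that is itself a branch target should therefore be written as REP_RET
; explicitly, e.g. (illustrative):
;     .fast_path:
;         REP_RET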
%macro REP_RET 0
    %if has_epilogue || cpuflag(ssse3)
        RET
    %else
        rep ret
    %endif
    annotate_function_size
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %if notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr.
    %endif
    ret
    annotate_function_size
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %if notcpuflag(ssse3)
                %%branch_instr equ $
                %xdefine last_branch_adr %%branch_instr
            %endif
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
    annotate_function_size
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
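; e.g. (illustrative): after "INIT_XMM sse2", "cglobal myfunc, 2,2" emits a
; function whose assembled symbol is mangle(<private_prefix>_myfunc_sse2)
; (x264_myfunc_sse2 with the default prefix), and "myfunc_sse2" is redefined
; to refer to that mangled symbol.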
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    annotate_function_size
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %xdefine current_function_section __SECT__
    %if FORMAT_ELF
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

; Create a global symbol from a local label with the correct name mangling and type
%macro cglobal_label 1
    %if FORMAT_ELF
        global current_function %+ %1:function hidden
    %else
        global current_function %+ %1
    %endif
    %1:
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %ifdef PREFIX
        %xdefine %1 mangle(%1)
    %endif
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %if FORMAT_ELF
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default.
%if FORMAT_ELF
    [SECTION .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Tell debuggers how large the function was.
; This may be invoked multiple times per function; we rely on later instances overriding earlier ones.
; This is invoked by RET and similar macros, and also cglobal does it for the previous function,
; but if the last function in a source file doesn't use any of the standard macros for its epilogue,
; then its size might be unspecified.
%macro annotate_function_size 0
    %ifdef __YASM_VER__
        %ifdef current_function
            %if FORMAT_ELF
                current_function_section
                %%ecf equ $
                size current_function %%ecf - current_function
                __SECT__
            %endif
        %endif
    %endif
%endmacro

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_lzcnt    (1<<7) | cpuflags_sse2
%assign cpuflags_sse3     (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<9) | cpuflags_sse3
%assign cpuflags_sse4     (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42    (1<<11)| cpuflags_sse4
%assign cpuflags_aesni    (1<<12)| cpuflags_sse42
%assign cpuflags_avx      (1<<13)| cpuflags_sse42
%assign cpuflags_xop      (1<<14)| cpuflags_avx
%assign cpuflags_fma4     (1<<15)| cpuflags_avx
%assign cpuflags_fma3     (1<<16)| cpuflags_avx
%assign cpuflags_bmi1     (1<<17)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<18)| cpuflags_bmi1
%assign cpuflags_avx2     (1<<19)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512   (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL

%assign cpuflags_cache32  (1<<21)
%assign cpuflags_cache64  (1<<22)
%assign cpuflags_aligned  (1<<23) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<24)

; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define    cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
%define notcpuflag(x) (cpuflag(x) ^ 1)
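; e.g. (illustrative):
;     %if cpuflag(ssse3)
;         pshufb m0, m1   ; only assembled into the ssse3 and later variants
;     %endif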

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if ARCH_X86_64 || cpuflag(sse2)
        %ifdef __NASM_VER__
            ALIGNMODE p6
        %else
            CPU amdnop
        %endif
    %else
        %ifdef __NASM_VER__
            ALIGNMODE nop
        %else
            CPU basicnop
        %endif
    %endif
%endmacro

; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
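; e.g. (illustrative): after "INIT_YMM avx2", "mova m0, [r0]" operates on ymm0,
; while "movd xm0, r1d" addresses xmm0, the low half of the same register.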

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro DEFINE_MMREGS 1 ; mmtype
    %assign %%prev_mmregs 0
    %ifdef num_mmregs
        %assign %%prev_mmregs num_mmregs
    %endif

    %assign num_mmregs 8
    %if ARCH_X86_64 && mmsize >= 16
        %assign num_mmregs 16
        %if cpuflag(avx512) || mmsize == 64
            %assign num_mmregs 32
        %endif
    %endif

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1 %+ %%i
        CAT_XDEFINE nn%1, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %if %%prev_mmregs > num_mmregs
        %rep %%prev_mmregs - num_mmregs
            CAT_UNDEF m, %%i
            CAT_UNDEF nn %+ mmtype, %%i
            %assign %%i %%i+1
        %endrep
    %endif
    %xdefine mmtype %1
%endmacro

; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
    %if ARCH_X86_64 && cpuflag(avx512)
        %assign %%i %1
        %rep 16-%1
            %assign %%i_high %%i+16
            SWAP %%i, %%i_high
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS mm
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS xmm
    %if WIN64
        AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers
    %endif
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS ymm
    AVX512_MM_PERMUTATION
%endmacro

%macro INIT_ZMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_ZMM %1
    %define mmsize 64
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    INIT_CPUFLAGS %1
    DEFINE_MMREGS zmm
    AVX512_MM_PERMUTATION
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define  mmzmm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define xmmzmm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define ymmzmm%1 ymm%1
    %define zmmmm%1   mm%1
    %define zmmxmm%1 xmm%1
    %define zmmymm%1 ymm%1
    %define zmmzmm%1 zmm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
    %define zm%1 zmm %+ m%1
%endmacro

%assign i 0
%rep 32
    DECLARE_MMCAST i
    %assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.
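;
; e.g. (illustrative):
;     mova m0, [r0]
;     mova m1, [r1]
;     SWAP 0, 1      ; from here on, "m0" names the register that holds [r1]
;     mova [r2], m0  ; stores the data loaded from [r1], with no extra moves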

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
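;
; e.g. (illustrative, hypothetical function name):
;     cglobal helper
;         ...
;         SWAP 0, 2
;         SAVE_MM_PERMUTATION
;         RET
; A later "call helper" then picks the permutation back up, so the caller's m0
; names the register in which the callee left its result.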
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
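; e.g. (illustrative): in a function built with SUFFIX "_sse2", "call filter"
; assembles as "call filter_sse2" if filter_sse2 was declared with cglobal and
; a plain "filter" wasn't; otherwise the name is used unchanged.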
%macro call 1
    %ifid %1
        call_internal %1 %+ SUFFIX, %1
    %else
        call %1
    %endif
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif