;*****************************************************************************
;* dct-a.asm: x86 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2018 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001@163.com>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 64
; AVX-512 permutation indices are bit-packed to save cache
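; (each field is extracted at runtime by shifting right by its bit offset)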
%if HIGH_BIT_DEPTH
scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3:   4x4_frame
                   dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8:   8x8_frame1
                   dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13:  8x8_frame2
                   dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
                                                                     ; bits 19-23: 8x8_frame4
scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4:   8x8_field1
                   dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9:   8x8_field2
                   dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
                   dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4:   interleave1
                   dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9:   interleave2
                   dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
                   dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
dct_avx512:        dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits    0-4:   dct8x8_fenc    bits    5-9:   dct8x8_fdec
                   dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits    10-13: dct16x16_fenc  bits    14-18: dct16x16_fdec
                   dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1  bits(e) 28-31: idct8x8_idct2
                   dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3:   4x4_frame
                   dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9:   8x8_frame1
                   dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
                   dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5:   8x8_field1
                   dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11:  8x8_field2
                   dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
                   dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5:   interleave1
                   dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11:  interleave2
                   dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
                   dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
%endif

pw_ppmmmmpp:    dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field:   db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask:   dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15

pb_scan8framet1: SHUFFLE_MASK_W 0,  1,  6,  7,  8,  9, 13, 14
pb_scan8framet2: SHUFFLE_MASK_W 2,  3,  4,  7,  9, 15, 10, 14
pb_scan8framet3: SHUFFLE_MASK_W 0,  1,  5,  6,  8, 11, 12, 13
pb_scan8framet4: SHUFFLE_MASK_W 0,  3,  4,  5,  8, 11, 12, 15
pb_scan8framet5: SHUFFLE_MASK_W 1,  2,  6,  7,  9, 10, 13, 14
pb_scan8framet6: SHUFFLE_MASK_W 0,  3,  4,  5, 10, 11, 12, 15
pb_scan8framet7: SHUFFLE_MASK_W 1,  2,  6,  7,  8,  9, 14, 15
pb_scan8framet8: SHUFFLE_MASK_W 0,  1,  2,  7,  8, 10, 11, 14
pb_scan8framet9: SHUFFLE_MASK_W 1,  4,  5,  7,  8, 13, 14, 15

pb_scan8frame1: SHUFFLE_MASK_W  0,  8,  1,  2,  9, 12,  4, 13
pb_scan8frame2: SHUFFLE_MASK_W  4,  0,  1,  5,  8, 10, 12, 14
pb_scan8frame3: SHUFFLE_MASK_W 12, 10,  8,  6,  2,  3,  7,  9
pb_scan8frame4: SHUFFLE_MASK_W  0,  1,  8, 12,  4, 13,  9,  2
pb_scan8frame5: SHUFFLE_MASK_W  5, 14, 10,  3, 11, 15,  6,  7
pb_scan8frame6: SHUFFLE_MASK_W  6,  8, 12, 13,  9,  7,  5,  3
pb_scan8frame7: SHUFFLE_MASK_W  1,  3,  5,  7, 10, 14, 15, 11
pb_scan8frame8: SHUFFLE_MASK_W 10,  3, 11, 14,  5,  6, 15,  7

pb_scan8field1 : SHUFFLE_MASK_W    0,   1,   2,   8,   9,   3,   4,  10
pb_scan8field2a: SHUFFLE_MASK_W 0x80,  11,   5,   6,   7,  12,0x80,0x80
pb_scan8field2b: SHUFFLE_MASK_W    0,0x80,0x80,0x80,0x80,0x80,   1,   8
pb_scan8field3a: SHUFFLE_MASK_W   10,   5,   6,   7,  11,0x80,0x80,0x80
pb_scan8field3b: SHUFFLE_MASK_W 0x80,0x80,0x80,0x80,0x80,   1,   8,   2
pb_scan8field4a: SHUFFLE_MASK_W    4,   5,   6,   7,  11,0x80,0x80,0x80
pb_scan8field6 : SHUFFLE_MASK_W    4,   5,   6,   7,  11,0x80,0x80,  12
pb_scan8field7 : SHUFFLE_MASK_W    5,   6,   7,  11,0x80,0x80,  12,  13

SECTION .text

cextern pw_32_0
cextern pw_32
cextern pw_512
cextern pw_8000
cextern pw_pixel_max
cextern hsub_mul
cextern pb_1
cextern pw_1
cextern pd_1
cextern pd_32
cextern pw_ppppmmmm
cextern pw_pmpmpmpm
cextern deinterleave_shufd
cextern pb_unpackbd1
cextern pb_unpackbd2

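; one dimension of a 4-point Walsh-Hadamard transform (%1: element size, %2-%5: rows, %6: tmp)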
%macro WALSH4_1D 6
    SUMSUB_BADC %1, %5, %4, %3, %2, %6
    SUMSUB_BADC %1, %5, %3, %4, %2, %6
    SWAP %2, %5, %4
%endmacro

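; %1 = (%1+%2+1)>>1, %2 = (%1-%2+1)>>1; the 0x8000 bias keeps the 17-bit
; sum/difference representable while pavgw does the halving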
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
    movq  m%3, m%4
    pxor  m%1, m%4
    psubw m%3, m%2
    pxor  m%2, m%4
    pavgw m%3, m%1
    pavgw m%2, m%1
    pxor  m%3, m%4
    pxor  m%2, m%4
    SWAP %1, %2, %3
%endmacro

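; sign-extend packed words in %1 into dwords: low half -> %1, high half -> %2;
; interleaving into the high 16 bits then psrad 16 performs the extension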
%macro DCT_UNPACK 3
    punpcklwd %3, %1
    punpckhwd %2, %1
    psrad     %3, 16
    psrad     %2, 16
    SWAP      %1, %3
%endmacro

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void dct4x4dc( dctcoef d[4][4] )
;-----------------------------------------------------------------------------
%macro DCT4x4_DC 0
cglobal dct4x4dc, 1,1,5
    mova   m0, [r0+ 0]
    mova   m1, [r0+16]
    mova   m2, [r0+32]
    mova   m3, [r0+48]
    WALSH4_1D  d, 0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
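    ; the +1 bias on row 0 reaches every output of the second pass, rounding the final >>1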
    paddd  m0, [pd_1]
    WALSH4_1D  d, 0,1,2,3,4
    psrad  m0, 1
    psrad  m1, 1
    psrad  m2, 1
    psrad  m3, 1
    mova [r0+ 0], m0
    mova [r0+16], m1
    mova [r0+32], m2
    mova [r0+48], m3
    RET
%endmacro ; DCT4x4_DC

INIT_XMM sse2
DCT4x4_DC
INIT_XMM avx
DCT4x4_DC
%else

INIT_MMX mmx2
cglobal dct4x4dc, 1,1
    movq   m3, [r0+24]
    movq   m2, [r0+16]
    movq   m1, [r0+ 8]
    movq   m0, [r0+ 0]
    movq   m7, [pw_8000] ; convert to unsigned and back, so that pavgw works
    WALSH4_1D  w, 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    SUMSUB_BADC w, 1, 0, 3, 2, 4
    SWAP 0, 1
    SWAP 2, 3
    SUMSUB_17BIT 0,2,4,7
    SUMSUB_17BIT 1,3,5,7
    movq  [r0+0], m0
    movq  [r0+8], m2
    movq [r0+16], m3
    movq [r0+24], m1
    RET
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void idct4x4dc( int32_t d[4][4] )
;-----------------------------------------------------------------------------
%macro IDCT4x4DC 0
cglobal idct4x4dc, 1,1
    mova   m3, [r0+48]
    mova   m2, [r0+32]
    mova   m1, [r0+16]
    mova   m0, [r0+ 0]
    WALSH4_1D  d,0,1,2,3,4
    TRANSPOSE4x4D 0,1,2,3,4
    WALSH4_1D  d,0,1,2,3,4
    mova  [r0+ 0], m0
    mova  [r0+16], m1
    mova  [r0+32], m2
    mova  [r0+48], m3
    RET
%endmacro ; IDCT4x4DC

INIT_XMM sse2
IDCT4x4DC
INIT_XMM avx
IDCT4x4DC
%else

;-----------------------------------------------------------------------------
; void idct4x4dc( int16_t d[4][4] )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal idct4x4dc, 1,1
    movq   m3, [r0+24]
    movq   m2, [r0+16]
    movq   m1, [r0+ 8]
    movq   m0, [r0+ 0]
    WALSH4_1D  w,0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D  w,0,1,2,3,4
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
    RET
%endif ; HIGH_BIT_DEPTH

;-----------------------------------------------------------------------------
; void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
;-----------------------------------------------------------------------------
%if WIN64
    DECLARE_REG_TMP 6 ; Avoid some REX prefixes to reduce code size
%else
    DECLARE_REG_TMP 2
%endif

%macro INSERT_COEFF 3 ; dst, src, imm
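    ; insert one coefficient into lane %3 of %1 (staged via m2 on some paths),
    ; then store t0 (zeroed by the caller) to clear the DC in the dct4x4 array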
    %if %3
        %if HIGH_BIT_DEPTH
            %if cpuflag(sse4)
                pinsrd %1, %2, %3
            %elif %3 == 2
                movd       m2, %2
            %elif %3 == 1
                punpckldq  %1, %2
            %else
                punpckldq  m2, %2
                punpcklqdq %1, m2
            %endif
        %else
            %if %3 == 2
                punpckldq  %1, %2
            %else
                pinsrw %1, %2, %3
            %endif
        %endif
    %else
        movd %1, %2
    %endif
    %if HIGH_BIT_DEPTH
        mov %2, t0d
    %else
        mov %2, t0w
    %endif
%endmacro

%macro DCT2x4DC 2
cglobal dct2x4dc, 2,3
    xor          t0d, t0d
    INSERT_COEFF  m0, [r1+0*16*SIZEOF_DCTCOEF], 0
    INSERT_COEFF  m0, [r1+1*16*SIZEOF_DCTCOEF], 2
    add           r1, 4*16*SIZEOF_DCTCOEF
    INSERT_COEFF  m0, [r1-2*16*SIZEOF_DCTCOEF], 1
    INSERT_COEFF  m0, [r1-1*16*SIZEOF_DCTCOEF], 3
    INSERT_COEFF  m1, [r1+0*16*SIZEOF_DCTCOEF], 0
    INSERT_COEFF  m1, [r1+1*16*SIZEOF_DCTCOEF], 2
    INSERT_COEFF  m1, [r1+2*16*SIZEOF_DCTCOEF], 1
    INSERT_COEFF  m1, [r1+3*16*SIZEOF_DCTCOEF], 3
    SUMSUB_BA     %1, 1, 0, 2
    SBUTTERFLY    %2, 1, 0, 2
    SUMSUB_BA     %1, 0, 1, 2
    SBUTTERFLY    %2, 0, 1, 2
    SUMSUB_BA     %1, 1, 0, 2
    pshuf%1       m0, m0, q1032
    mova        [r0], m1
    mova [r0+mmsize], m0
    RET
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
DCT2x4DC d, dq
INIT_XMM avx
DCT2x4DC d, dq
%else
INIT_MMX mmx2
DCT2x4DC w, wd
%endif

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void sub4x4_dct( dctcoef dct[4][4], pixel *pix1, pixel *pix2 )
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal sub4x4_dct, 3,3
.skip_prologue:
    LOAD_DIFF  m0, m4, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF  m3, m4, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
    LOAD_DIFF  m1, m4, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF  m2, m4, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4

    SUMSUB_BADC w, 3, 0, 2, 1
    SUMSUB_BA   w, 2, 3, 4
    DCT_UNPACK m2, m4, m5
    DCT_UNPACK m3, m6, m7
    mova  [r0+ 0], m2 ; s03 + s12
    mova  [r0+ 8], m4
    mova  [r0+32], m3 ; s03 - s12
    mova  [r0+40], m6

    DCT_UNPACK m0, m2, m4
    DCT_UNPACK m1, m3, m5
    SUMSUB2_AB  d, 0, 1, 4
    SUMSUB2_AB  d, 2, 3, 5
    mova  [r0+16], m0 ; d03*2 + d12
    mova  [r0+24], m2
    mova  [r0+48], m4 ; d03 - 2*d12
    mova  [r0+56], m5
    RET
%else

%macro SUB_DCT4 0
cglobal sub4x4_dct, 3,3
.skip_prologue:
%if cpuflag(ssse3)
    mova m5, [hsub_mul]
%endif
    LOAD_DIFF8x4 0, 3, 1, 2, 4, 5, r1, r2
    DCT4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    DCT4_1D 0,1,2,3,4
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
    RET
%endmacro

INIT_MMX mmx
SUB_DCT4
INIT_MMX ssse3
SUB_DCT4
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add4x4_idct( pixel *p_dst, dctcoef dct[4][4] )
;-----------------------------------------------------------------------------
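; %1,%2: coeff rows (dwords), %3: tmp, %4: zero, %5,%6: pixel rows;
; >>6, add to the pixels, then clip to [0,pixel_max]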
%macro STORE_DIFFx2 6
    psrad     %1, 6
    psrad     %2, 6
    packssdw  %1, %2
    movq      %3, %5
    movhps    %3, %6
    paddsw    %1, %3
    CLIPW     %1, %4, [pw_pixel_max]
    movq      %5, %1
    movhps    %6, %1
%endmacro

%macro ADD4x4_IDCT 0
cglobal add4x4_idct, 2,2,6
    add   r0, 2*FDEC_STRIDEB
.skip_prologue:
    mova  m1, [r1+16]
    mova  m3, [r1+48]
    mova  m2, [r1+32]
    mova  m0, [r1+ 0]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    STORE_DIFFx2 m0, m1, m4, m5, [r0-2*FDEC_STRIDEB], [r0-1*FDEC_STRIDEB]
    STORE_DIFFx2 m2, m3, m4, m5, [r0+0*FDEC_STRIDEB], [r0+1*FDEC_STRIDEB]
    RET
%endmacro

INIT_XMM sse2
ADD4x4_IDCT
INIT_XMM avx
ADD4x4_IDCT

%else ; !HIGH_BIT_DEPTH

INIT_MMX mmx
cglobal add4x4_idct, 2,2
    pxor m7, m7
.skip_prologue:
    movq  m1, [r1+ 8]
    movq  m3, [r1+24]
    movq  m2, [r1+16]
    movq  m0, [r1+ 0]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
    RET

%macro ADD4x4 0
cglobal add4x4_idct, 2,2,6
    mova      m1, [r1+0x00]     ; row1/row0
    mova      m3, [r1+0x10]     ; row3/row2
    psraw     m0, m1, 1         ; row1>>1/...
    psraw     m2, m3, 1         ; row3>>1/...
    movsd     m0, m1            ; row1>>1/row0
    movsd     m2, m3            ; row3>>1/row2
    psubw     m0, m3            ; row1>>1-row3/row0-2
    paddw     m2, m1            ; row3>>1+row1/row0+2
    SBUTTERFLY2 wd, 0, 2, 1
    SUMSUB_BA w, 2, 0, 1
    pshuflw   m1, m2, q2301
    pshufhw   m2, m2, q2301
    punpckldq m1, m0
    punpckhdq m2, m0
    SWAP       0, 1

    mova      m1, [pw_32_0]
    paddw     m1, m0            ; row1/row0 corrected
    psraw     m0, 1             ; row1>>1/...
    psraw     m3, m2, 1         ; row3>>1/...
    movsd     m0, m1            ; row1>>1/row0
    movsd     m3, m2            ; row3>>1/row2
    psubw     m0, m2            ; row1>>1-row3/row0-2
    paddw     m3, m1            ; row3>>1+row1/row0+2
    SBUTTERFLY2 qdq, 0, 3, 1
    SUMSUB_BA w, 3, 0, 1

    movd      m4, [r0+FDEC_STRIDE*0]
    movd      m1, [r0+FDEC_STRIDE*1]
    movd      m2, [r0+FDEC_STRIDE*2]
    movd      m5, [r0+FDEC_STRIDE*3]
    punpckldq m1, m4            ; row0/row1
    pxor      m4, m4
    punpckldq m2, m5            ; row3/row2
    punpcklbw m1, m4
    psraw     m3, 6
    punpcklbw m2, m4
    psraw     m0, 6
    paddsw    m3, m1
    paddsw    m0, m2
    packuswb  m0, m3            ; row0/row1/row3/row2
    pextrd   [r0+FDEC_STRIDE*0], m0, 3
    pextrd   [r0+FDEC_STRIDE*1], m0, 2
    movd     [r0+FDEC_STRIDE*2], m0
    pextrd   [r0+FDEC_STRIDE*3], m0, 1
    RET
%endmacro ; ADD4x4

INIT_XMM sse4
ADD4x4
INIT_XMM avx
ADD4x4

%macro STOREx2_AVX2 9
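    ; %1,%2: coeff rows, %3,%4: tmp, %5-%8: row offsets, %9: zero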
    movq      xm%3, [r0+%5*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r0+%6*FDEC_STRIDE], 1
    movq      xm%4, [r0+%7*FDEC_STRIDE]
    vinserti128 m%4, m%4, [r0+%8*FDEC_STRIDE], 1
    punpcklbw  m%3, m%9
    punpcklbw  m%4, m%9
    psraw      m%1, 6
    psraw      m%2, 6
    paddsw     m%1, m%3
    paddsw     m%2, m%4
    packuswb   m%1, m%2
    vextracti128 xm%2, m%1, 1
    movq   [r0+%5*FDEC_STRIDE], xm%1
    movq   [r0+%6*FDEC_STRIDE], xm%2
    movhps [r0+%7*FDEC_STRIDE], xm%1
    movhps [r0+%8*FDEC_STRIDE], xm%2
%endmacro

INIT_YMM avx2
cglobal add8x8_idct, 2,3,8
    add    r0, 4*FDEC_STRIDE
    pxor   m7, m7
    TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
    ; TRANSPOSE4x4Q
    mova       xm0, [r1+ 0]
    mova       xm1, [r1+32]
    mova       xm2, [r1+16]
    mova       xm3, [r1+48]
    vinserti128 m0, m0, [r1+ 64], 1
    vinserti128 m1, m1, [r1+ 96], 1
    vinserti128 m2, m2, [r1+ 80], 1
    vinserti128 m3, m3, [r1+112], 1
    SBUTTERFLY qdq, 0, 1, 4
    SBUTTERFLY qdq, 2, 3, 4
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    STOREx2_AVX2 0, 1, 4, 5, -4, 0, -3, 1, 7
    STOREx2_AVX2 2, 3, 4, 5, -2, 2, -1, 3, 7
    ret

; 2xdst, 2xtmp, 4xsrcrow, 1xzero
%macro LOAD_DIFF8x2_AVX2 9
    movq    xm%1, [r1+%5*FENC_STRIDE]
    movq    xm%2, [r1+%6*FENC_STRIDE]
    vinserti128 m%1, m%1, [r1+%7*FENC_STRIDE], 1
    vinserti128 m%2, m%2, [r1+%8*FENC_STRIDE], 1
    punpcklbw m%1, m%9
    punpcklbw m%2, m%9
    movq    xm%3, [r2+(%5-4)*FDEC_STRIDE]
    movq    xm%4, [r2+(%6-4)*FDEC_STRIDE]
    vinserti128 m%3, m%3, [r2+(%7-4)*FDEC_STRIDE], 1
    vinserti128 m%4, m%4, [r2+(%8-4)*FDEC_STRIDE], 1
    punpcklbw m%3, m%9
    punpcklbw m%4, m%9
    psubw    m%1, m%3
    psubw    m%2, m%4
%endmacro

; 4x src, 1x tmp
%macro STORE8_DCT_AVX2 5
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+  0], xm%1
    mova [r0+ 16], xm%3
    mova [r0+ 32], xm%2
    mova [r0+ 48], xm%4
    vextracti128 [r0+ 64], m%1, 1
    vextracti128 [r0+ 80], m%3, 1
    vextracti128 [r0+ 96], m%2, 1
    vextracti128 [r0+112], m%4, 1
%endmacro

%macro STORE16_DCT_AVX2 5
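    ; 4x src, 1x tmp; the two halves are stored to r0-128 and r0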
    SBUTTERFLY qdq, %1, %2, %5
    SBUTTERFLY qdq, %3, %4, %5
    mova [r0+ 0-128], xm%1
    mova [r0+16-128], xm%3
    mova [r0+32-128], xm%2
    mova [r0+48-128], xm%4
    vextracti128 [r0+ 0], m%1, 1
    vextracti128 [r0+16], m%3, 1
    vextracti128 [r0+32], m%2, 1
    vextracti128 [r0+48], m%4, 1
%endmacro

INIT_YMM avx2
cglobal sub8x8_dct, 3,3,7
    pxor m6, m6
    add r2, 4*FDEC_STRIDE
    LOAD_DIFF8x2_AVX2 0, 1, 4, 5, 0, 1, 4, 5, 6
    LOAD_DIFF8x2_AVX2 2, 3, 4, 5, 2, 3, 6, 7, 6
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE8_DCT_AVX2 0, 1, 2, 3, 4
    RET

INIT_YMM avx2
cglobal sub16x16_dct, 3,3,6
    add r0, 128
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 256-64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    add r0, 64
    add r1, 4*FENC_STRIDE
    add r2, 4*FDEC_STRIDE
    call .sub16x4_dct
    RET
.sub16x4_dct:
    LOAD_DIFF16x2_AVX2 0, 1, 4, 5, 0, 1
    LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
    DCT4_1D 0, 1, 2, 3, 4
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    DCT4_1D 0, 1, 2, 3, 4
    STORE16_DCT_AVX2 0, 1, 2, 3, 4
    ret

%macro DCT4x4_AVX512 0
    psubw      m0, m2            ; 0 1
    psubw      m1, m3            ; 3 2
    SUMSUB_BA   w, 1, 0, 2
    SBUTTERFLY wd, 1, 0, 2
    paddw      m2, m1, m0
    psubw      m3, m1, m0
    vpaddw     m2 {k1}, m1       ; 0+1+2+3 0<<1+1-2-3<<1
    vpsubw     m3 {k1}, m0       ; 0-1-2+3 0-1<<1+2<<1-3
    shufps     m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
    punpcklqdq m2, m3            ; a0 b0 a1 b1 c0 d0 c1 d1
    SUMSUB_BA   w, 1, 2, 3
    shufps     m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 c1-c2 d1-d2
    shufps     m1, m2, q2020     ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
    paddw      m2, m1, m3
    psubw      m0, m1, m3
    vpaddw     m2 {k2}, m1       ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
    vpsubw     m0 {k2}, m3       ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
%endmacro

INIT_XMM avx512
cglobal sub4x4_dct
    mov         eax, 0xf0aa
    kmovw        k1, eax
    PROLOGUE 3,3
    movd         m0,      [r1+0*FENC_STRIDE]
    movd         m2,      [r2+0*FDEC_STRIDE]
    vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE]
    vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE]
    movd         m1,      [r1+3*FENC_STRIDE]
    movd         m3,      [r2+3*FDEC_STRIDE]
    vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE]
    vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE]
    kshiftrw     k2, k1, 8
    pxor         m4, m4
    punpcklbw    m0, m4
    punpcklbw    m2, m4
    punpcklbw    m1, m4
    punpcklbw    m3, m4
    DCT4x4_AVX512
    mova       [r0], m2
    mova    [r0+16], m0
    RET

INIT_ZMM avx512
cglobal dct4x4x4_internal
    punpcklbw  m0, m1, m4
    punpcklbw  m2, m3, m4
    punpckhbw  m1, m4
    punpckhbw  m3, m4
    DCT4x4_AVX512
    mova       m1, m2
    vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0
    vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1
    ret

%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2
    movu     %1,     [r1+%3*FENC_STRIDE]
    vpermt2d %1, %2, [r1+%4*FENC_STRIDE]
%endmacro

%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2
    movu     %1,      [r2+(%4  )*FDEC_STRIDE]
    vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE]
    movu     %3,      [r2+(%5  )*FDEC_STRIDE]
    vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE]
    vpermt2d %1, %2, %3
%endmacro

cglobal sub8x8_dct, 3,3
    mova       m0, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
    mov       r1d, 0xaaaaaaaa
    kmovd      k1, r1d
    psrld      m0, 5
    DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4
    mov       r1d, 0xf0f0f0f0
    kmovd      k2, r1d
    pxor      xm4, xm4
    knotw      k3, k2
    call dct4x4x4_internal_avx512
    mova     [r0], m0
    mova  [r0+64], m1
    RET

%macro SUB4x16_DCT_AVX512 2 ; dst, src
    vpermd   m1, m5, [r1+1*%2*64]
    mova     m3,     [r2+2*%2*64]
    vpermt2d m3, m6, [r2+2*%2*64+64]
    call dct4x4x4_internal_avx512
    mova [r0+%1*64    ], m0
    mova [r0+%1*64+128], m1
%endmacro

cglobal sub16x16_dct
    psrld    m5, [dct_avx512], 10
    mov     eax, 0xaaaaaaaa
    kmovd    k1, eax
    mov     eax, 0xf0f0f0f0
    kmovd    k2, eax
    PROLOGUE 3,3
    pxor    xm4, xm4
    knotw    k3, k2
    psrld    m6, m5, 4
    SUB4x16_DCT_AVX512 0, 0
    SUB4x16_DCT_AVX512 1, 1
    SUB4x16_DCT_AVX512 4, 2
    SUB4x16_DCT_AVX512 5, 3
    RET

cglobal sub8x8_dct_dc, 3,3
    mova         m3, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
    mov         r1d, 0xaa
    kmovb        k1, r1d
    psrld        m3, 5
    DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
    pxor        xm3, xm3
    psadbw       m0, m3
    psadbw       m1, m3
    psubw        m0, m1
    vpmovqw    xmm0, m0
    vprold     xmm1, xmm0, 16
    paddw      xmm0, xmm1       ; 0 0 2 2 1 1 3 3
    punpckhqdq xmm2, xmm0, xmm0
    psubw      xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
    paddw      xmm0, xmm2       ; 0+1 0+1 2+3 2+3
    punpckldq  xmm0, xmm1       ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
    punpcklqdq xmm1, xmm0, xmm0
    vpsubw     xmm0 {k1}, xm3, xmm0
    paddw      xmm0, xmm1       ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
    movhps     [r0], xmm0
    RET

cglobal sub8x16_dct_dc, 3,3
    mova         m5, [dct_avx512]
    DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8  ; 0 4 1 5
    DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
    mov         r1d, 0xaa
    kmovb        k1, r1d
    psrld        m5, 5
    DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8
    DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12
    pxor        xm4, xm4
    psadbw       m0, m4
    psadbw       m1, m4
    psadbw       m2, m4
    psadbw       m3, m4
    psubw        m0, m2
    psubw        m1, m3
    SBUTTERFLY  qdq, 0, 1, 2
    paddw        m0, m1
    vpmovqw    xmm0, m0         ; 0 2 4 6 1 3 5 7
    psrlq      xmm2, xmm0, 32
    psubw      xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
    paddw      xmm0, xmm2       ; 0+4 2+6 1+5 3+7
    punpckhdq  xmm2, xmm0, xmm1
    punpckldq  xmm0, xmm1
    psubw      xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
    paddw      xmm0, xmm2       ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
    punpcklwd  xmm0, xmm1
    psrlq      xmm2, xmm0, 32
    psubw      xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
    paddw      xmm0, xmm2       ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
    shufps     xmm0, xmm1, q0220
    mova       [r0], xmm0
    RET

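; masked >>1 (k1 selects the odd-coefficient lanes) folds the x and x>>1
; terms of the idct butterfly into single psubw/paddw ops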
%macro SARSUMSUB 3 ; a, b, tmp
    mova    m%3, m%1
    vpsraw  m%1 {k1}, 1
    psubw   m%1, m%2    ; 0-2 1>>1-3
    vpsraw  m%2 {k1}, 1
    paddw   m%2, m%3    ; 0+2 1+3>>1
%endmacro

cglobal add8x8_idct, 2,2
    mova            m1, [r1]
    mova            m2, [r1+64]
    mova            m3, [dct_avx512]
    vbroadcasti32x4 m4, [pw_32]
    mov            r1d, 0xf0f0f0f0
    kxnorb          k2, k2, k2
    kmovd           k1, r1d
    kmovb           k3, k2
    vshufi32x4      m0, m1, m2, q2020 ; 0 1   4 5   8 9   c d
    vshufi32x4      m1, m2, q3131     ; 2 3   6 7   a b   e f
    psrlq           m5, m3, 56        ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
    vpgatherqq      m6 {k2}, [r0+m5]
    SARSUMSUB        0, 1, 2
    SBUTTERFLY      wd, 1, 0, 2
    psrlq           m7, m3, 28
    SUMSUB_BA        w, 0, 1, 2       ; 0+1+2+3>>1 0+1>>1-2-3
    vprold          m1, 16            ; 0-1>>1-2+3 0-1+2-3>>1
    SBUTTERFLY      dq, 0, 1, 2
    psrlq           m3, 24
    SARSUMSUB        0, 1, 2
    vpermi2q        m3, m1, m0
    vpermt2q        m1, m7, m0
    paddw           m3, m4            ; += 32
    SUMSUB_BA        w, 1, 3, 0
    psraw           m1, 6             ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
    psraw           m3, 6             ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
    pxor           xm0, xm0
    SBUTTERFLY      bw, 6, 0, 2
    paddsw          m1, m6
    paddsw          m3, m0
    packuswb        m1, m3
    vpscatterqq [r0+m5] {k3}, m1
    RET
%endif ; HIGH_BIT_DEPTH

INIT_MMX
;-----------------------------------------------------------------------------
; void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 7
cglobal %1, 3,3,%7
%if HIGH_BIT_DEPTH == 0
%if mmsize == 8
    pxor m7, m7
%else
    add r2, 4*FDEC_STRIDE
    mova m7, [hsub_mul]
%endif
%endif ; !HIGH_BIT_DEPTH
.skip_prologue:
    call %2.skip_prologue
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    call %2.skip_prologue
    add  r0, %3
    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
    call %2.skip_prologue
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    TAIL_CALL %2.skip_prologue, 1
%endmacro

;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6-7
%if HIGH_BIT_DEPTH
cglobal %1, 2,2,%7
%if %3==256
    add r1, 128
%endif
%else
cglobal %1, 2,2,11
    pxor m7, m7
%endif
%if mmsize>=16 && %3!=256
    add  r0, 4*FDEC_STRIDE
%endif
.skip_prologue:
    call %2.skip_prologue
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    call %2.skip_prologue
    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r1, %3
    call %2.skip_prologue
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    TAIL_CALL %2.skip_prologue, 1
%endmacro

%if HIGH_BIT_DEPTH
INIT_MMX
SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   64,  8, 0, 0, 0
SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   64, 16, 8, 8, 0
INIT_XMM
ADD_NxN_IDCT add8x8_idct_sse2,   add4x4_idct_sse2, 64,  8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 64, 16, 8, 8, 6
ADD_NxN_IDCT add8x8_idct_avx,    add4x4_idct_avx,  64,  8, 0, 0, 6
ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  64, 16, 8, 8, 6
cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 256, 16, 0, 0, 16
ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  256, 16, 0, 0, 16
cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_sse4.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_sse2, sub8x8_dct8_sse2, 256, 16, 0, 0, 14
SUB_NxN_DCT  sub16x16_dct8_sse4, sub8x8_dct8_sse4, 256, 16, 0, 0, 14
SUB_NxN_DCT  sub16x16_dct8_avx,  sub8x8_dct8_avx,  256, 16, 0, 0, 14
%else ; !HIGH_BIT_DEPTH
%if ARCH_X86_64 == 0
INIT_MMX
SUB_NxN_DCT  sub8x8_dct_mmx,     sub4x4_dct_mmx,   32, 4, 0, 0, 0
ADD_NxN_IDCT add8x8_idct_mmx,    add4x4_idct_mmx,  32, 4, 0, 0
SUB_NxN_DCT  sub16x16_dct_mmx,   sub8x8_dct_mmx,   32, 8, 4, 4, 0
ADD_NxN_IDCT add16x16_idct_mmx,  add8x8_idct_mmx,  32, 8, 4, 4

cextern sub8x8_dct8_mmx.skip_prologue
cextern add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_mmx,  sub8x8_dct8_mmx,  128, 8, 0, 0, 0
ADD_NxN_IDCT add16x16_idct8_mmx, add8x8_idct8_mmx, 128, 8, 0, 0
%endif

INIT_XMM
cextern sub8x8_dct_sse2.skip_prologue
cextern sub8x8_dct_ssse3.skip_prologue
cextern sub8x8_dct_avx.skip_prologue
cextern sub8x8_dct_xop.skip_prologue
SUB_NxN_DCT  sub16x16_dct_sse2,  sub8x8_dct_sse2,  128, 8, 0, 0, 10
SUB_NxN_DCT  sub16x16_dct_ssse3, sub8x8_dct_ssse3, 128, 8, 0, 0, 10
SUB_NxN_DCT  sub16x16_dct_avx,   sub8x8_dct_avx,   128, 8, 0, 0, 10
SUB_NxN_DCT  sub16x16_dct_xop,   sub8x8_dct_xop,   128, 8, 0, 0, 10

cextern add8x8_idct_sse2.skip_prologue
cextern add8x8_idct_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct_sse2, add8x8_idct_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct_avx,  add8x8_idct_avx,  128, 8, 0, 0

cextern add8x8_idct8_sse2.skip_prologue
cextern add8x8_idct8_avx.skip_prologue
ADD_NxN_IDCT add16x16_idct8_sse2, add8x8_idct8_sse2, 128, 8, 0, 0
ADD_NxN_IDCT add16x16_idct8_avx,  add8x8_idct8_avx,  128, 8, 0, 0

cextern sub8x8_dct8_sse2.skip_prologue
cextern sub8x8_dct8_ssse3.skip_prologue
cextern sub8x8_dct8_avx.skip_prologue
SUB_NxN_DCT  sub16x16_dct8_sse2,  sub8x8_dct8_sse2,  128, 8, 0, 0, 11
SUB_NxN_DCT  sub16x16_dct8_ssse3, sub8x8_dct8_ssse3, 128, 8, 0, 0, 11
SUB_NxN_DCT  sub16x16_dct8_avx,   sub8x8_dct8_avx,   128, 8, 0, 0, 11

INIT_YMM
ADD_NxN_IDCT add16x16_idct_avx2, add8x8_idct_avx2, 128, 8, 0, 0
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void add8x8_idct_dc( pixel *p_dst, dctcoef *dct2x2 )
;-----------------------------------------------------------------------------
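; %1: pixel row base, %2: broadcast dc (words); adds dc to four rows and
; clips to [0,pixel_max] (callers set m5 = 0, m6 = pw_pixel_max)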
%macro ADD_DC 2
    mova    m0, [%1+FDEC_STRIDEB*0] ; 8pixels
    mova    m1, [%1+FDEC_STRIDEB*1]
    mova    m2, [%1+FDEC_STRIDEB*2]
    paddsw  m0, %2
    paddsw  m1, %2
    paddsw  m2, %2
    paddsw  %2, [%1+FDEC_STRIDEB*3]
    CLIPW   m0, m5, m6
    CLIPW   m1, m5, m6
    CLIPW   m2, m5, m6
    CLIPW   %2, m5, m6
    mova    [%1+FDEC_STRIDEB*0], m0
    mova    [%1+FDEC_STRIDEB*1], m1
    mova    [%1+FDEC_STRIDEB*2], m2
    mova    [%1+FDEC_STRIDEB*3], %2
%endmacro

%macro ADD_IDCT_DC 0
cglobal add8x8_idct_dc, 2,2,7
    mova        m6, [pw_pixel_max]
    pxor        m5, m5
    mova        m3, [r1]
    paddd       m3, [pd_32]
    psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
    pshuflw     m4, m3, q2200 ; dc0 dc0 dc1 dc1   _   _   _   _
    pshufhw     m3, m3, q2200 ;   _   _   _   _ dc2 dc2 dc3 dc3
    pshufd      m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd      m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+FDEC_STRIDEB*4, m3
    RET

cglobal add16x16_idct_dc, 2,3,8
    mov         r2, 4
    mova        m6, [pw_pixel_max]
    mova        m7, [pd_32]
    pxor        m5, m5
.loop:
    mova        m3, [r1]
    paddd       m3, m7
    psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
    pshuflw     m4, m3, q2200 ; dc0 dc0 dc1 dc1   _   _   _   _
    pshufhw     m3, m3, q2200 ;   _   _   _   _ dc2 dc2 dc3 dc3
    pshufd      m4, m4, q1100 ; dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
    pshufd      m3, m3, q3322 ; dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
    ADD_DC r0+FDEC_STRIDEB*0, m4
    ADD_DC r0+SIZEOF_PIXEL*8, m3
    add         r1, 16
    add         r0, 4*FDEC_STRIDEB
    dec         r2
    jg .loop
    RET
%endmacro ; ADD_IDCT_DC

INIT_XMM sse2
ADD_IDCT_DC
INIT_XMM avx
ADD_IDCT_DC

%else ;!HIGH_BIT_DEPTH
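; %1: +dc, %2: -dc (packed bytes), %3: row base; saturating add then subtract
; applies a signed dc to unsigned pixels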
%macro ADD_DC 3
    mova    m4, [%3+FDEC_STRIDE*0]
    mova    m5, [%3+FDEC_STRIDE*1]
    mova    m6, [%3+FDEC_STRIDE*2]
    paddusb m4, %1
    paddusb m5, %1
    paddusb m6, %1
    paddusb %1, [%3+FDEC_STRIDE*3]
    psubusb m4, %2
    psubusb m5, %2
    psubusb m6, %2
    psubusb %1, %2
    mova [%3+FDEC_STRIDE*0], m4
    mova [%3+FDEC_STRIDE*1], m5
    mova [%3+FDEC_STRIDE*2], m6
    mova [%3+FDEC_STRIDE*3], %1
%endmacro

INIT_MMX mmx2
cglobal add8x8_idct_dc, 2,2
    mova      m0, [r1]
    pxor      m1, m1
    add       r0, FDEC_STRIDE*4
    paddw     m0, [pw_32]
    psraw     m0, 6
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q3322
    pshufw    m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC    m0, m1, r0-FDEC_STRIDE*4
    ADD_DC    m2, m3, r0
    RET

INIT_XMM ssse3
cglobal add8x8_idct_dc, 2,2
    movh     m0, [r1]
    pxor     m1, m1
    add      r0, FDEC_STRIDE*4
    pmulhrsw m0, [pw_512]
    psubw    m1, m0
    mova     m5, [pb_unpackbd1]
    packuswb m0, m0
    packuswb m1, m1
    pshufb   m0, m5
    pshufb   m1, m5
    movh     m2, [r0+FDEC_STRIDE*-4]
    movh     m3, [r0+FDEC_STRIDE*-3]
    movh     m4, [r0+FDEC_STRIDE*-2]
    movh     m5, [r0+FDEC_STRIDE*-1]
    movhps   m2, [r0+FDEC_STRIDE* 0]
    movhps   m3, [r0+FDEC_STRIDE* 1]
    movhps   m4, [r0+FDEC_STRIDE* 2]
    movhps   m5, [r0+FDEC_STRIDE* 3]
    paddusb  m2, m0
    paddusb  m3, m0
    paddusb  m4, m0
    paddusb  m5, m0
    psubusb  m2, m1
    psubusb  m3, m1
    psubusb  m4, m1
    psubusb  m5, m1
    movh   [r0+FDEC_STRIDE*-4], m2
    movh   [r0+FDEC_STRIDE*-3], m3
    movh   [r0+FDEC_STRIDE*-2], m4
    movh   [r0+FDEC_STRIDE*-1], m5
    movhps [r0+FDEC_STRIDE* 0], m2
    movhps [r0+FDEC_STRIDE* 1], m3
    movhps [r0+FDEC_STRIDE* 2], m4
    movhps [r0+FDEC_STRIDE* 3], m5
    RET

INIT_MMX mmx2
cglobal add16x16_idct_dc, 2,3
    mov       r2, 4
.loop:
    mova      m0, [r1]
    pxor      m1, m1
    paddw     m0, [pw_32]
    psraw     m0, 6
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    pshufw    m2, m0, q3322
    pshufw    m3, m1, q3322
    punpcklbw m0, m0
    punpcklbw m1, m1
    ADD_DC    m0, m1, r0
    ADD_DC    m2, m3, r0+8
    add       r1, 8
    add       r0, FDEC_STRIDE*4
    dec       r2
    jg .loop
    RET

INIT_XMM sse2
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add       r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add       r0, FDEC_STRIDE*4
    movq      m0, [r1+0]
    movq      m2, [r1+8]
    add       r1, 16
    punpcklwd m0, m0
    punpcklwd m2, m2
    pxor      m3, m3
    paddw     m0, [pw_32]
    paddw     m2, [pw_32]
    psraw     m0, 6
    psraw     m2, 6
    psubw     m1, m3, m0
    packuswb  m0, m1
    psubw     m3, m2
    punpckhbw m1, m0, m0
    packuswb  m2, m3
    punpckhbw m3, m2, m2
    punpcklbw m0, m0
    punpcklbw m2, m2
    ADD_DC    m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC    m2, m3, r0
    ret

%macro ADD16x16 0
cglobal add16x16_idct_dc, 2,2,8
    call .loop
    add      r0, FDEC_STRIDE*4
    TAIL_CALL .loop, 0
.loop:
    add      r0, FDEC_STRIDE*4
    mova     m0, [r1]
    add      r1, 16
    pxor     m1, m1
    pmulhrsw m0, [pw_512]
    psubw    m1, m0
    mova     m5, [pb_unpackbd1]
    mova     m6, [pb_unpackbd2]
    packuswb m0, m0
    packuswb m1, m1
    pshufb   m2, m0, m6
    pshufb   m0, m5
    pshufb   m3, m1, m6
    pshufb   m1, m5
    ADD_DC   m0, m1, r0+FDEC_STRIDE*-4
    ADD_DC   m2, m3, r0
    ret
%endmacro ; ADD16x16

INIT_XMM ssse3
ADD16x16
INIT_XMM avx
ADD16x16

%macro ADD_DC_AVX2 3
    mova   xm4, [r0+FDEC_STRIDE*0+%3]
    mova   xm5, [r0+FDEC_STRIDE*1+%3]
    vinserti128 m4, m4, [r2+FDEC_STRIDE*0+%3], 1
    vinserti128 m5, m5, [r2+FDEC_STRIDE*1+%3], 1
    paddusb m4, %1
    paddusb m5, %1
    psubusb m4, %2
    psubusb m5, %2
    mova [r0+FDEC_STRIDE*0+%3], xm4
    mova [r0+FDEC_STRIDE*1+%3], xm5
    vextracti128 [r2+FDEC_STRIDE*0+%3], m4, 1
    vextracti128 [r2+FDEC_STRIDE*1+%3], m5, 1
%endmacro

INIT_YMM avx2
cglobal add16x16_idct_dc, 2,3,6
    add      r0, FDEC_STRIDE*4
    mova     m0, [r1]
    pxor     m1, m1
    pmulhrsw m0, [pw_512]
    psubw    m1, m0
    mova     m4, [pb_unpackbd1]
    mova     m5, [pb_unpackbd2]