;*****************************************************************************
;* mc-a.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2003-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Dylan Yudaken <dyudaken@gmail.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Min Chen <chenm001.163.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

; Shuffle masks used by chroma MC code elsewhere in the project.
ch_shuf:     db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
ch_shuf_adj: times 8 db 0
             times 8 db 2
             times 8 db 4
             times 8 db 6
; Scalar constant 1 as a qword (used as an addend for denom+1 below).
sq_1:        times 1 dq 1

SECTION .text
; Constant tables defined elsewhere in the project (external symbols).
cextern pb_0
cextern pw_1
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
cextern pw_00ff
cextern pw_pixel_max
cextern sw_64
cextern pd_32
;=============================================================================
; implicit weighted biprediction
;=============================================================================
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
;
; AVG_START loads the six arguments (dst, dst_stride, src1, src1_stride,
; src2, src2_stride) into temporaries t0..t5 per-ABI; AVG_END advances all
; three pointers by two rows and loops on the height counter in eax.
%if WIN64
    DECLARE_REG_TMP 0,1,2,3,4,5,4,5
    %macro AVG_START 0-1 0
        PROLOGUE 5,7,%1
        movsxd r5, dword r5m    ; sign-extend src2_stride from the stack
    %endmacro
%elif UNIX64
    DECLARE_REG_TMP 0,1,2,3,4,5,7,8
    %macro AVG_START 0-1 0
        PROLOGUE 6,9,%1
    %endmacro
%else
    DECLARE_REG_TMP 1,2,3,4,5,6,1,2
    %macro AVG_START 0-1 0
        PROLOGUE 0,7,%1
        mov t0, r0m             ; x86_32: all args come from the stack
        mov t1, r1m
        mov t2, r2m
        mov t3, r3m
        mov t4, r4m
        mov t5, r5m
    %endmacro
%endif

%macro AVG_END 0
    lea  t4, [t4+t5*2*SIZEOF_PIXEL] ; src2 += 2*stride
    lea  t2, [t2+t3*2*SIZEOF_PIXEL] ; src1 += 2*stride
    lea  t0, [t0+t1*2*SIZEOF_PIXEL] ; dst  += 2*stride
    sub eax, 2                      ; two rows processed per iteration
    jg .height_loop
    REP_RET
%endmacro

%if HIGH_BIT_DEPTH

; out: m0 = (%1*weight1 + %2*weight2 + 32) >> 6, as dwords (unclipped)
%macro BIWEIGHT_MMX 2
    movh      m0, %1
    movh      m1, %2
    punpcklwd m0, m1            ; interleave src1/src2 words for pmaddwd
    pmaddwd   m0, m3            ; m3 = packed (weight1, weight2) pairs
    paddd     m0, m4            ; + 32 (rounding)
    psrad     m0, 6
%endmacro

; Build m3 = packed (64-weight, weight) word pairs from arg r6m;
; m4 = rounding constant, m5 = zero.
%macro BIWEIGHT_START_MMX 0
    movzx  t6d, word r6m
    mov    t7d, 64
    sub    t7d, t6d
    shl    t7d, 16              ; (64-weight) in the high word
    add    t6d, t7d
    movd    m3, t6d
    SPLATD  m3, m3
    mova    m4, [pd_32]
    pxor    m5, m5
%endmacro

%else ;!HIGH_BIT_DEPTH

; out: m0 = (%1*m2 + %2*m3 + 32) >> 6, as words (unclipped)
%macro BIWEIGHT_MMX 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m5            ; zero-extend bytes to words (m5 = 0)
    punpcklbw m1, m5
    pmullw    m0, m2
    pmullw    m1, m3
    paddw     m0, m1
    paddw     m0, m4            ; + 32 (rounding)
    psraw     m0, 6
%endmacro

%macro BIWEIGHT_START_MMX 0
    movd    m2, r6m
    SPLATW  m2, m2   ; weight_dst
    mova    m3, [pw_64]
    psubw   m3, m2   ; weight_src = 64 - weight_dst
    mova    m4, [pw_32] ; rounding
    pxor    m5, m5
%endmacro
%endif ;HIGH_BIT_DEPTH

; SSSE3 variant: pmaddubsw multiplies interleaved u8 pixels by the packed
; signed (weight1, weight2) byte pairs in m3 in one instruction.
%macro BIWEIGHT_SSSE3 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m1            ; interleave src1/src2 bytes
    pmaddubsw m0, m3
    paddw     m0, m4            ; + 32 (rounding)
    psraw     m0, 6
%endmacro

; Build m3 = packed (64-weight, weight) byte pairs; m4 = rounding.
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    mov    t7d, 64
    sub    t7d, t6d
    shl    t7d, 8               ; (64-weight) in the high byte
    add    t6d, t7d
    movd    m3, t6d
    mova    m4, [pw_32]
    SPLATW  m3, m3   ; weight_dst,src
%endmacro
%if HIGH_BIT_DEPTH
; dst, src1, src2, width: weight one row, clip to pixel range, store.
; m5 = 0 and m7 = pw_pixel_max are set up by the caller for CLIPW.
%macro BIWEIGHT_ROW 4
    BIWEIGHT   [%2], [%3]
%if %4==mmsize/4
    ; narrow row: one BIWEIGHT covers it, store half a register
    packssdw     m0, m0
    CLIPW        m0, m5, m7
    movh       [%1], m0
%else
    SWAP 0, 6
    BIWEIGHT   [%2+mmsize/2], [%3+mmsize/2]
    packssdw     m6, m0
    CLIPW        m6, m5, m7
    mova       [%1], m6
%endif
%endmacro

%else ;!HIGH_BIT_DEPTH
; dst, src1, src2, width: weight one row and store (packuswb saturates).
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2], [%3]
%if %4==mmsize/2
    packuswb   m0, m0
    movh     [%1], m0
%else
    SWAP 0, 6
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
    packuswb   m6, m0
    mova     [%1], m6
%endif
%endmacro

%endif ;HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int pixel_avg_weight_w16( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight )
;-----------------------------------------------------------------------------
; %1 = block width in pixels, %2 = xmm register count for PROLOGUE.
; Processes two rows per loop iteration; height counter is eax (set by AVGH).
%macro AVG_WEIGHT 1-2 0
cglobal pixel_avg_weight_w%1
    BIWEIGHT_START
    AVG_START %2
%if HIGH_BIT_DEPTH
    mova    m7, [pw_pixel_max]  ; clip bound for CLIPW in BIWEIGHT_ROW
%endif
.height_loop:
%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
    ; width is half an xmm register: do both rows in one register pair
    BIWEIGHT [t2], [t4]
    SWAP 0, 6
    BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
%if HIGH_BIT_DEPTH
    packssdw m6, m0
    CLIPW    m6, m5, m7
%else ;!HIGH_BIT_DEPTH
    packuswb m6, m0
%endif ;HIGH_BIT_DEPTH
    movlps   [t0], m6
    movhps   [t0+SIZEOF_PIXEL*t1], m6
%else
%assign x 0
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    BIWEIGHT_ROW   t0+x,                   t2+x,                   t4+x,                 %1
    BIWEIGHT_ROW   t0+x+SIZEOF_PIXEL*t1,   t2+x+SIZEOF_PIXEL*t3,   t4+x+SIZEOF_PIXEL*t5, %1
%assign x x+mmsize
%endrep
%endif
    AVG_END
%endmacro
; Instantiate pixel_avg_weight_wN for each ISA; BIWEIGHT/BIWEIGHT_START are
; rebound so the same AVG_WEIGHT body picks up the right per-ISA kernel.
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX mmx2
AVG_WEIGHT 4
AVG_WEIGHT 8
AVG_WEIGHT 16
%if HIGH_BIT_DEPTH
INIT_XMM sse2
AVG_WEIGHT 4,  8
AVG_WEIGHT 8,  8
AVG_WEIGHT 16, 8
%else ;!HIGH_BIT_DEPTH
INIT_XMM sse2
AVG_WEIGHT 8,  7
AVG_WEIGHT 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX ssse3
AVG_WEIGHT 4
INIT_XMM ssse3
AVG_WEIGHT 8,  7
AVG_WEIGHT 16, 7
%endif ;HIGH_BIT_DEPTH
;=============================================================================
; P frame explicit weighted prediction
;=============================================================================

%if HIGH_BIT_DEPTH
; width
; Load the weight struct at r4: [r4] = 1<<denom, [r4+16] = scale/offset
; words for pmaddwd, [r4+32] = denom.  m2 = denom+1 (shift count),
; m4 = clip bound.
%macro WEIGHT_START 1
    mova        m0, [r4+ 0]         ; 1<<denom
    mova        m3, [r4+16]
    movd        m2, [r4+32]         ; denom
    mova        m4, [pw_pixel_max]
    paddw       m2, [sq_1]          ; denom+1
%endmacro

; src1, src2
; out: m5 = packed weighted result of both half-rows (unclipped)
%macro WEIGHT 2
    movh        m5, [%1]
    movh        m6, [%2]
    punpcklwd   m5, m0              ; pair each pixel with 1<<denom
    punpcklwd   m6, m0
    pmaddwd     m5, m3              ; pixel*scale + (1<<denom)*offset-ish
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
    packssdw    m5, m6
%endmacro

; src, dst, width
%macro WEIGHT_TWO_ROW 3
    %assign x 0
%rep (%3+mmsize/2-1)/(mmsize/2)
%if %3-x/2 <= 4 && mmsize == 16
    ; narrow tail: both rows fit in one xmm register
    WEIGHT      %1+x, %1+r3+x
    CLIPW         m5, [pb_0], m4
    movh      [%2+x], m5
    movhps [%2+r1+x], m5
%else
    WEIGHT      %1+x, %1+x+mmsize/2
    SWAP           5,  7
    WEIGHT   %1+r3+x, %1+r3+x+mmsize/2
    CLIPW         m5, [pb_0], m4
    CLIPW         m7, [pb_0], m4
    mova      [%2+x], m7
    mova   [%2+r1+x], m5
%endif
    %assign x x+mmsize
%endrep
%endmacro
%else ; !HIGH_BIT_DEPTH

; Load weight constants from r4: m3 = scale, m4 = round/offset term,
; m5 = shift count (only needed by the non-pmulhrsw path), m2 = 0.
%macro WEIGHT_START 1
    mova     m3, [r4]
    mova     m4, [r4+16]
%if notcpuflag(ssse3) || cpuflag(xop)
    movd     m5, [r4+32]
%endif
    pxor     m2, m2
%endmacro

; src1, src2, dst1, dst2
; Weight two full rows (one register-width each) and store.
%macro WEIGHT_ROWx2 4
    movh      m0, [%1         ]
    movh      m1, [%1+mmsize/2]
    movh      m6, [%2         ]
    movh      m7, [%2+mmsize/2]
    punpcklbw m0, m2            ; zero-extend bytes to words
    punpcklbw m1, m2
    punpcklbw m6, m2
    punpcklbw m7, m2
%if cpuflag(ssse3)
    ; pmulhrsw path: pixels pre-shifted left 7 so the fixed >>15 with
    ; rounding implements the weighted scale
    psllw     m0, 7
    psllw     m1, 7
    psllw     m6, 7
    psllw     m7, 7
    pmulhrsw  m0, m3
    pmulhrsw  m1, m3
    pmulhrsw  m6, m3
    pmulhrsw  m7, m3
    paddw     m0, m4
    paddw     m1, m4
    paddw     m6, m4
    paddw     m7, m4
%else
    pmullw    m0, m3
    pmullw    m1, m3
    pmullw    m6, m3
    pmullw    m7, m3
    paddsw    m0, m4        ;1<<(denom-1)+(offset<<denom)
    paddsw    m1, m4
    paddsw    m6, m4
    paddsw    m7, m4
    psraw     m0, m5
    psraw     m1, m5
    psraw     m6, m5
    psraw     m7, m5
%endif
    packuswb  m0, m1
    packuswb  m6, m7
    mova    [%3], m0
    mova    [%4], m6
%endmacro

; src1, src2, dst1, dst2, width
; Tail handler: weight a partial (<= half-register) column of two rows.
%macro WEIGHT_COL 5
    movh      m0, [%1]
    movh      m1, [%2]
    punpcklbw m0, m2
    punpcklbw m1, m2
%if cpuflag(ssse3)
    psllw     m0, 7
    psllw     m1, 7
    pmulhrsw  m0, m3
    pmulhrsw  m1, m3
    paddw     m0, m4
    paddw     m1, m4
%else
    pmullw    m0, m3
    pmullw    m1, m3
    paddsw    m0, m4        ;1<<(denom-1)+(offset<<denom)
    paddsw    m1, m4
    psraw     m0, m5
    psraw     m1, m5
%endif
%if %5 == 8
    packuswb  m0, m1
    movh    [%3], m0
    movhps  [%4], m0
%else
    packuswb  m0, m0
    packuswb  m1, m1
    movd    [%3], m0    ; width 2 can write garbage for the last 2 bytes
    movd    [%4], m1
%endif
%endmacro

; src, dst, width
; Full-width rows via WEIGHT_ROWx2, remainder via WEIGHT_COL.
%macro WEIGHT_TWO_ROW 3
%assign x 0
%rep %3
%if (%3-x) >= mmsize
    WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x
    %assign x (x+mmsize)
%else
    WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, %3-x
    %exitrep
%endif
%if x >= %3
    %exitrep
%endif
%endrep
%endmacro

%endif ; HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
;void mc_weight_wX( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, weight_t *weight, int h )
;-----------------------------------------------------------------------------
; Driver: weights two rows per iteration using the depth-specific
; WEIGHT_START / WEIGHT_TWO_ROW macros above.
%macro WEIGHTER 1
cglobal mc_weight_w%1, 6,6,8
    FIX_STRIDES r1, r3
    WEIGHT_START %1
.loop:
    WEIGHT_TWO_ROW r2, r0, %1
    lea  r0, [r0+r1*2]
    lea  r2, [r2+r3*2]
    sub r5d, 2
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx2
WEIGHTER  4
WEIGHTER  8
WEIGHTER 12
WEIGHTER 16
WEIGHTER 20
INIT_XMM sse2
WEIGHTER  8
WEIGHTER 16
WEIGHTER 20
%if HIGH_BIT_DEPTH
WEIGHTER 12
%else
INIT_MMX ssse3
WEIGHTER  4
INIT_XMM ssse3
WEIGHTER  8
WEIGHTER 16
WEIGHTER 20
%endif
; %1/%2 = two source addresses, %3/%4 = two dest addresses,
; %5 = add|sub (expands to padd/psub saturating op), %6/%7 = load/store
; size suffixes.  Saturating add is additionally clamped to pixel_max
; in high bit depth (saturation alone would clip at 0xffff).
%macro OFFSET_OP 7
    mov%6        m0, [%1]
    mov%6        m1, [%2]
%if HIGH_BIT_DEPTH
    p%5usw       m0, m2
    p%5usw       m1, m2
%ifidn %5,add
    pminsw       m0, m3
    pminsw       m1, m3
%endif
%else
    p%5usb       m0, m2
    p%5usb       m1, m2
%endif
    mov%7      [%3], m0
    mov%7      [%4], m1
%endmacro

; src, dst, width, add|sub: offset two rows, full registers first,
; then a narrower tail op for the remainder.
%macro OFFSET_TWO_ROW 4
%assign x 0
%rep %3
%if (%3*SIZEOF_PIXEL-x) >= mmsize
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
    %assign x (x+mmsize)
%else
%if HIGH_BIT_DEPTH
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
%else
    OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
%endif
    %exitrep
%endif
%if x >= %3*SIZEOF_PIXEL
    %exitrep
%endif
%endrep
%endmacro
;-----------------------------------------------------------------------------
;void mc_offset_wX( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride, weight_t *w, int h )
;-----------------------------------------------------------------------------
; %1 = width, %2 = add|sub.  m2 = offset vector loaded from the weight
; struct; m3 = clip bound (saturating-add path, high bit depth only).
%macro OFFSET 2
cglobal mc_offset%2_w%1, 6,6
    FIX_STRIDES r1, r3
    mova m2, [r4]
%if HIGH_BIT_DEPTH
%ifidn %2,add
    mova m3, [pw_pixel_max]
%endif
%endif
.loop:
    OFFSET_TWO_ROW r2, r0, %1, %2
    lea  r0, [r0+r1*2]
    lea  r2, [r2+r3*2]
    sub r5d, 2
    jg .loop
    REP_RET
%endmacro

; Emit both the add and sub variant for a given width.
%macro OFFSETPN 1
       OFFSET %1, add
       OFFSET %1, sub
%endmacro
INIT_MMX mmx2
OFFSETPN  4
OFFSETPN  8
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
INIT_XMM sse2
OFFSETPN 12
OFFSETPN 16
OFFSETPN 20
%if HIGH_BIT_DEPTH
INIT_XMM sse2
OFFSETPN  8
%endif
;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void pixel_avg_4x4( pixel *dst, int dst_stride,
;                     pixel *src1, int src1_stride, pixel *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
; Dispatcher: weight == 32 means a plain average (pavg path); anything
; else tail-calls the weighted kernel.  eax carries the height into the
; tail-called function.
%macro AVGH 2
cglobal pixel_avg_%1x%2
    mov eax, %2
    cmp dword r6m, 32
    jne pixel_avg_weight_w%1 %+ SUFFIX
%if mmsize == 16 && %1 == 16
    test dword r4m, 15              ; src2 16-byte aligned?
    jz pixel_avg_w%1_sse2
%endif
    jmp pixel_avg_w%1_mmx2
%endmacro

;-----------------------------------------------------------------------------
; void pixel_avg_w4( pixel *dst, int dst_stride,
;                    pixel *src1, int src1_stride, pixel *src2, int src2_stride,
;                    int height, int weight );
;-----------------------------------------------------------------------------
; %1 = width, %2 = load instruction, %3 = store instruction.
; Plain (weight 32/32) average of two rows per iteration via pavgb/pavgw.
%macro AVG_FUNC 3
cglobal pixel_avg_w%1
    AVG_START
.height_loop:
%assign x 0
%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
    %2     m0, [t2+x]
    %2     m1, [t2+x+SIZEOF_PIXEL*t3]
%if HIGH_BIT_DEPTH
    pavgw  m0, [t4+x]
    pavgw  m1, [t4+x+SIZEOF_PIXEL*t5]
%else ;!HIGH_BIT_DEPTH
    pavgb  m0, [t4+x]
    pavgb  m1, [t4+x+SIZEOF_PIXEL*t5]
%endif
    %3     [t0+x], m0
    %3     [t0+x+SIZEOF_PIXEL*t1], m1
%assign x x+mmsize
%endrep
    AVG_END
%endmacro

; Instantiate pixel_avg_wN workers and pixel_avg_NxM dispatchers per ISA.
%if HIGH_BIT_DEPTH

INIT_MMX mmx2
AVG_FUNC 4, movq, movq
AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2

AVG_FUNC 8, movq, movq
AVGH 8, 16
AVGH 8,  8
AVGH 8,  4

AVG_FUNC 16, movq, movq
AVGH 16, 16
AVGH 16,  8

INIT_XMM sse2
AVG_FUNC 4, movq, movq
AVGH  4, 16
AVGH  4, 8
AVGH  4, 4
AVGH  4, 2

AVG_FUNC 8, movdqu, movdqa
AVGH  8, 16
AVGH  8,  8
AVGH  8,  4

AVG_FUNC 16, movdqu, movdqa
AVGH  16, 16
AVGH  16,  8

%else ;!HIGH_BIT_DEPTH

INIT_MMX mmx2
AVG_FUNC 4, movd, movd
AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2

AVG_FUNC 8, movq, movq
AVGH 8, 16
AVGH 8,  8
AVGH 8,  4

AVG_FUNC 16, movq, movq
AVGH 16, 16
AVGH 16, 8

INIT_XMM sse2
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 16
AVGH 16,  8
AVGH  8, 16
AVGH  8,  8
AVGH  8,  4
INIT_XMM ssse3
AVGH 16, 16
AVGH 16,  8
AVGH  8, 16
AVGH  8,  8
AVGH  8,  4
INIT_MMX ssse3
AVGH  4, 16
AVGH  4,  8
AVGH  4,  4
AVGH  4,  2

%endif ;HIGH_BIT_DEPTH
;=============================================================================
; pixel avg2
;=============================================================================

%if HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_avg2_wN( uint16_t *dst,  int dst_stride,
;                     uint16_t *src1, int src_stride,
;                     uint16_t *src2, int height );
;-----------------------------------------------------------------------------
; One register-width per row.  r4 is rebased to the src2-src1 delta so a
; single pointer walks both sources; two rows per iteration.
%macro AVG2_W_ONE 1
cglobal pixel_avg2_w%1, 6,7,4
    sub     r4, r2              ; r4 = src2 - src1
    lea     r6, [r4+r3*2]       ; r6 = delta + one src stride
.height_loop:
    movu    m0, [r2]
    movu    m1, [r2+r3*2]
%if mmsize == 8
    pavgw   m0, [r2+r4]
    pavgw   m1, [r2+r6]
%else
    movu    m2, [r2+r4]
    movu    m3, [r2+r6]
    pavgw   m0, m2
    pavgw   m1, m3
%endif
    mova   [r0], m0
    mova   [r0+r1*2], m1
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub    r5d, 2
    jg .height_loop
    REP_RET
%endmacro

; Two register-widths per row; %2/%3 = load/store for the second
; (possibly partial) register.
%macro AVG2_W_TWO 3
cglobal pixel_avg2_w%1, 6,7,8
    sub     r4, r2              ; r4 = src2 - src1
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2]
    %2      m1, [r2+mmsize]
    movu    m2, [r2+r3*2]
    %2      m3, [r2+r3*2+mmsize]
%if mmsize == 8
    pavgw   m0, [r2+r4]
    pavgw   m1, [r2+r4+mmsize]
    pavgw   m2, [r2+r6]
    pavgw   m3, [r2+r6+mmsize]
%else
    movu    m4, [r2+r4]
    %2      m5, [r2+r4+mmsize]
    movu    m6, [r2+r6]
    %2      m7, [r2+r6+mmsize]
    pavgw   m0, m4
    pavgw   m1, m5
    pavgw   m2, m6
    pavgw   m3, m7
%endif
    mova   [r0], m0
    %3     [r0+mmsize], m1
    mova   [r0+r1*2], m2
    %3     [r0+r1*2+mmsize], m3
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub    r5d, 2
    jg .height_loop
    REP_RET
%endmacro
INIT_MMX mmx2
AVG2_W_ONE  4
AVG2_W_TWO  8, movu, mova
INIT_XMM sse2
AVG2_W_ONE  8
AVG2_W_TWO 10, movd, movd
AVG2_W_TWO 16, movu, mova

INIT_MMX
; Width-10 (uint16_t) average: 8+2 pixels per row via two movq + one movd
; worth of mmx registers; two rows per iteration.
cglobal pixel_avg2_w10_mmx2, 6,7
    sub     r4, r2              ; r4 = src2 - src1
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+ 8]
    movh    m2, [r2+16]
    movu    m3, [r2+r3*2+ 0]
    movu    m4, [r2+r3*2+ 8]
    movh    m5, [r2+r3*2+16]
    pavgw   m0, [r2+r4+ 0]
    pavgw   m1, [r2+r4+ 8]
    pavgw   m2, [r2+r4+16]
    pavgw   m3, [r2+r6+ 0]
    pavgw   m4, [r2+r6+ 8]
    pavgw   m5, [r2+r6+16]
    mova   [r0+ 0], m0
    mova   [r0+ 8], m1
    movh   [r0+16], m2
    mova   [r0+r1*2+ 0], m3
    mova   [r0+r1*2+ 8], m4
    movh   [r0+r1*2+16], m5
    lea     r2, [r2+r3*2*2]
    lea     r0, [r0+r1*2*2]
    sub    r5d, 2
    jg .height_loop
    REP_RET
; Width-16 (uint16_t) average: 4 mmx registers per row, two rows per
; iteration (uses all 8 mmx registers).
cglobal pixel_avg2_w16_mmx2, 6,7
    sub     r4, r2              ; r4 = src2 - src1
    lea     r6, [r4+r3*2]
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+ 8]
    movu    m2, [r2+16]
    movu    m3, [r2+24]
    movu    m4, [r2+r3*2+ 0]
    movu    m5, [r2+r3*2+ 8]
    movu    m6, [r2+r3*2+16]
    movu    m7, [r2+r3*2+24]
    pavgw   m0, [r2+r4+ 0]
    pavgw   m1, [r2+r4+ 8]
    pavgw   m2, [r2+r4+16]
    pavgw   m3, [r2+r4+24]
    pavgw   m4, [r2+r6+ 0]
    pavgw   m5, [r2+r6+ 8]
    pavgw   m6, [r2+r6+16]
    pavgw   m7, [r2+r6+24]
    mova   [r0+ 0], m0
    mova   [r0+ 8], m1
    mova   [r0+16], m2
    mova   [r0+24], m3
    mova   [r0+r1*2+ 0], m4
    mova   [r0+r1*2+ 8], m5
    mova   [r0+r1*2+16], m6
    mova   [r0+r1*2+24], m7
    lea     r2, [r2+r3*2*2]
    lea     r0, [r0+r1*2*2]
    sub    r5d, 2
    jg .height_loop
    REP_RET
; Width-18 (uint16_t) average: 16+2 pixels, one row per iteration
; (odd width, so rows are not paired).
cglobal pixel_avg2_w18_mmx2, 6,7
    sub     r4, r2              ; r4 = src2 - src1
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+ 8]
    movu    m2, [r2+16]
    movu    m3, [r2+24]
    movh    m4, [r2+32]
    pavgw   m0, [r2+r4+ 0]
    pavgw   m1, [r2+r4+ 8]
    pavgw   m2, [r2+r4+16]
    pavgw   m3, [r2+r4+24]
    pavgw   m4, [r2+r4+32]
    mova   [r0+ 0], m0
    mova   [r0+ 8], m1
    mova   [r0+16], m2
    mova   [r0+24], m3
    movh   [r0+32], m4
    lea     r2, [r2+r3*2]
    lea     r0, [r0+r1*2]
    dec    r5d
    jg .height_loop
    REP_RET

INIT_XMM
; SSE2 version: two xmm loads + one movh per row.
cglobal pixel_avg2_w18_sse2, 6,7,6
    sub     r4, r2              ; r4 = src2 - src1
.height_loop:
    movu    m0, [r2+ 0]
    movu    m1, [r2+16]
    movh    m2, [r2+32]
    movu    m3, [r2+r4+ 0]
    movu    m4, [r2+r4+16]
    movh    m5, [r2+r4+32]
    pavgw   m0, m3
    pavgw   m1, m4
    pavgw   m2, m5
    mova   [r0+ 0], m0
    mova   [r0+16], m1
    movh   [r0+32], m2
    lea     r2, [r2+r3*2]
    lea     r0, [r0+r1*2]
    dec    r5d
    jg .height_loop
    REP_RET
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
; void pixel_avg2_w4( uint8_t *dst, int dst_stride,
;                     uint8_t *src1, int src_stride,
;                     uint8_t *src2, int height );
;-----------------------------------------------------------------------------
; Widths that fit one mmx register; %2 = movd/movq load+store.
%macro AVG2_W8 2
cglobal pixel_avg2_w%1_mmx2, 6,7
    sub    r4, r2               ; r4 = src2 - src1
    lea    r6, [r4+r3]
.height_loop:
    %2     mm0, [r2]
    %2     mm1, [r2+r3]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r6]
    lea    r2, [r2+r3*2]
    %2     [r0], mm0
    %2     [r0+r1], mm1
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

INIT_MMX
AVG2_W8 4, movd
AVG2_W8 8, movq

; Widths needing two mmx registers per row; here the delta is kept in r2
; (r2 = src1 - src2) and r4 walks src2.  %2 = load/store for the tail.
%macro AVG2_W16 2
cglobal pixel_avg2_w%1_mmx2, 6,7
    sub    r2, r4               ; r2 = src1 - src2
    lea    r6, [r2+r3]
.height_loop:
    movq   mm0, [r4]
    %2     mm1, [r4+8]
    movq   mm2, [r4+r3]
    %2     mm3, [r4+r3+8]
    pavgb  mm0, [r4+r2]
    pavgb  mm1, [r4+r2+8]
    pavgb  mm2, [r4+r6]
    pavgb  mm3, [r4+r6+8]
    lea    r4, [r4+r3*2]
    movq   [r0], mm0
    %2     [r0+8], mm1
    movq   [r0+r1], mm2
    %2     [r0+r1+8], mm3
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W16 12, movd
AVG2_W16 16, movq

; Width-20 (uint8_t) average: 8+8+4 bytes per row, two rows per iteration.
cglobal pixel_avg2_w20_mmx2, 6,7
    sub    r2, r4               ; r2 = src1 - src2
    lea    r6, [r2+r3]
.height_loop:
    movq   mm0, [r4]
    movq   mm1, [r4+8]
    movd   mm2, [r4+16]
    movq   mm3, [r4+r3]
    movq   mm4, [r4+r3+8]
    movd   mm5, [r4+r3+16]
    pavgb  mm0, [r4+r2]
    pavgb  mm1, [r4+r2+8]
    pavgb  mm2, [r4+r2+16]
    pavgb  mm3, [r4+r6]
    pavgb  mm4, [r4+r6+8]
    pavgb  mm5, [r4+r6+16]
    lea    r4, [r4+r3*2]
    movq   [r0], mm0
    movq   [r0+8], mm1
    movd   [r0+16], mm2
    movq   [r0+r1], mm3
    movq   [r0+r1+8], mm4
    movd   [r0+r1+16], mm5
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
; Width-16 (uint8_t) average, SSE2: one xmm register per row, two rows
; per iteration; unaligned loads, aligned stores.
cglobal pixel_avg2_w16_sse2, 6,7
    sub    r4, r2               ; r4 = src2 - src1
    lea    r6, [r4+r3]
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm2, [r2+r3]
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    lea    r2, [r2+r3*2]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm2
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
; Width-20 (uint8_t) SSE2 average: 16 bytes in xmm + 4 bytes in mmx per
; row.  The misalign variant uses pavgb with an unaligned memory operand
; directly (legal on CPUs where that is fast).
%macro AVG2_W20 1
cglobal pixel_avg2_w20_%1, 6,7
    sub    r2, r4               ; r2 = src1 - src2
    lea    r6, [r2+r3]
.height_loop:
    movdqu xmm0, [r4]
    movdqu xmm2, [r4+r3]
%ifidn %1, sse2_misalign
    movd   mm4,  [r4+16]
    movd   mm5,  [r4+r3+16]
    pavgb  xmm0, [r4+r2]
    pavgb  xmm2, [r4+r6]
%else
    movdqu xmm1, [r4+r2]
    movdqu xmm3, [r4+r6]
    movd   mm4,  [r4+16]
    movd   mm5,  [r4+r3+16]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
%endif
    pavgb  mm4,  [r4+r2+16]
    pavgb  mm5,  [r4+r6+16]
    lea    r4, [r4+r3*2]
    movdqa [r0], xmm0
    movd   [r0+16], mm4
    movdqa [r0+r1], xmm2
    movd   [r0+r1+16], mm5
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W20 sse2
AVG2_W20 sse2_misalign
; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check functions calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.

; Compute the left/right shift amounts for recombining a misaligned qword:
; %2 = (addr&7)*8 bits, %1 = 64 - that.
%macro INIT_SHIFT 2
    and    eax, 7
    shl    eax, 3
    movd   %1, [sw_64]
    movd   %2, eax
    psubw  %1, %2
%endmacro

; Set up shift registers for both sources (mm6/mm7 from the address already
; in eax, mm4/mm5 from r4m), then align both source pointers down to 8 and
; keep only the src2-src1 delta in r4.
%macro AVG_CACHELINE_START 0
    %assign stack_offset 0
    INIT_SHIFT mm6, mm7
    mov    eax, r4m
    INIT_SHIFT mm4, mm5
    PROLOGUE 6,6
    and    r2, ~7
    and    r4, ~7
    sub    r4, r2
.height_loop:
%endmacro
Fiona Glaser's avatar
Fiona Glaser committed
997

998 999
%macro AVG_CACHELINE_LOOP 2
    movq   mm1, [r2+%1]
1000
    movq   mm0, [r2+8+%1]
1001
    movq   mm3, [r2+r4+%1]
1002
    movq   mm2, [r2+r4+8+%1]
1003
    psrlq  mm1, mm7
1004
    psllq  mm0, mm6
1005
    psrlq  mm3, mm5
1006
    psllq  mm2, mm4
1007 1008
    por    mm0, mm1
    por    <