;*****************************************************************************
;* mc-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

; word-broadcast constants used for rounding/weighting
pw_4:  times 8 dw  4
pw_8:  times 8 dw  8
pw_32: times 8 dw 32
pw_64: times 8 dw 64
; single dword 64, used by INIT_SHIFT for cacheline-split shifts
sw_64: dd 64

SECTION .text

;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
; AVG_START sets up t0..t5 = dst, dst_stride, src1, src1_stride, src2,
; src2_stride and opens the .height_loop label that AVG_END jumps back to.
%ifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,10,11
    %macro AVG_START 0-1 0
        PROLOGUE 6,7,%1
%ifdef WIN64
        movsxd r5, r5d       ; Win64: arg 6 arrives as 32-bit; sign-extend stride
%endif
        .height_loop:
    %endmacro
%else
    DECLARE_REG_TMP 1,2,3,4,5,6,1,2
    %macro AVG_START 0-1 0
        PROLOGUE 0,7,%1
        ; x86_32: load all six args from the stack into the temp regs
        mov t0, r0m
        mov t1, r1m
        mov t2, r2m
        mov t3, r3m
        mov t4, r4m
        mov t5, r5m
        .height_loop:
    %endmacro
%endif

; SPLATW %1, %2: broadcast the low word of %2 to every word lane of %1
%macro SPLATW 2
%if mmsize==16
    pshuflw  %1, %2, 0
    punpcklqdq %1, %1
%else
    pshufw   %1, %2, 0
%endif
%endmacro

; BIWEIGHT_MMX %1, %2: weighted average of two pixel rows.
; Expects m2 = weight_dst, m3 = weight_src, m4 = rounding (32), m5 = 0.
; Result left as words in m0 (>> 6, not yet packed).
%macro BIWEIGHT_MMX 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m5
    punpcklbw m1, m5
    pmullw    m0, m2
    pmullw    m1, m3
    paddw     m0, m1
    paddw     m0, m4
    psraw     m0, 6
%endmacro

; Set up the constant registers consumed by BIWEIGHT_MMX.
%macro BIWEIGHT_START_MMX 0
    movd    m2, r6m
    SPLATW  m2, m2   ; weight_dst
    mova    m3, [pw_64 GLOBAL]
    psubw   m3, m2   ; weight_src = 64 - weight_dst
    mova    m4, [pw_32 GLOBAL] ; rounding
    pxor    m5, m5
%endmacro

; SSSE3 variant: both weights packed as bytes in m3, one pmaddubsw does
; the two multiplies and the add.  m4 = rounding.  Result as words in m0.
%macro BIWEIGHT_SSSE3 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m1
    pmaddubsw m0, m3
    paddw     m0, m4
    psraw     m0, 6
%endmacro

; Pack (64-weight):weight into each word of m3 for pmaddubsw.
%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    mov    t7d, 64
    sub    t7d, t6d
    shl    t7d, 8        ; weight_src in the high byte
    add    t6d, t7d
    movd    m3, t6d
    mova    m4, [pw_32 GLOBAL]
    SPLATW  m3, m3   ; weight_dst,src
%endmacro

; BIWEIGHT_ROW dst, src1, src2, width: weight one row and store it packed.
; When width is a full register of pixels, process both halves and merge.
%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2], [%3]
%if %4==mmsize/2
    packuswb   m0, m0
    movh     [%1], m0
%else
    SWAP 0, 6
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
    packuswb   m6, m0
    mova     [%1], m6
%endif
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
; %1 = cpu suffix, %2 = width, %3 = xmm register count for PROLOGUE.
; Processes two rows per iteration; eax holds the remaining height
; (set by the AVGH dispatcher before jumping here).
%macro AVG_WEIGHT 2-3 0
cglobal x264_pixel_avg_weight_w%2_%1
    BIWEIGHT_START
    AVG_START %3
%if %2==8 && mmsize==16
    ; w8/sse2: do two 8-pixel rows in one xmm register
    BIWEIGHT [t2], [t4]
    SWAP 0, 6
    BIWEIGHT [t2+t3], [t4+t5]
    packuswb m6, m0
    movlps   [t0], m6
    movhps   [t0+t1], m6
%else
%assign x 0
%rep 1+%2/(mmsize*2)
    BIWEIGHT_ROW t0+x,    t2+x,    t4+x,    %2
    BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%assign x x+mmsize
%endrep
%endif
    lea  t0, [t0+t1*2]
    lea  t2, [t2+t3*2]
    lea  t4, [t4+t5*2]
    sub  eax, 2
    jg   .height_loop
    REP_RET
%endmacro

; Instantiate the weighted-average functions for each cpu flavor.
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX
AVG_WEIGHT mmxext, 4
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
; w4 gains nothing from sse2; alias it to the mmxext version
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8,  7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX
AVG_WEIGHT ssse3, 4
INIT_XMM
AVG_WEIGHT ssse3, 8,  7
AVG_WEIGHT ssse3, 16, 7



;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
; Dispatcher: weight == 32 means a plain average, otherwise tail-call the
; weighted version.  eax carries the height to the callee.
%macro AVGH 3
cglobal x264_pixel_avg_%1x%2_%3
    mov eax, %2
    cmp dword r6m, 32
    jne x264_pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
    test dword r4m, 15       ; use sse2 only when src2 is 16-byte aligned
    jz x264_pixel_avg_w%1_sse2
%endif
    jmp x264_pixel_avg_w%1_mmxext
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
;                                uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
;                                int height, int weight );
;-----------------------------------------------------------------------------

; Advance all three pointers by two rows and loop while height (eax) > 0.
%macro AVG_END 0
    sub    eax, 2
    lea    t4, [t4+t5*2]
    lea    t2, [t2+t3*2]
    lea    t0, [t0+t1*2]
    jg     .height_loop
    REP_RET
%endmacro

; AVG_FUNC name, load-insn, store-insn: unweighted pavgb of two rows/iter.
%macro AVG_FUNC 3
cglobal %1
    AVG_START
    %2     m0, [t2]
    %2     m1, [t2+t3]
    pavgb  m0, [t4]
    pavgb  m1, [t4+t5]
    %3     [t0], m0
    %3     [t0+t1], m1
    AVG_END
%endmacro

INIT_MMX
AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext

AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8,  mmxext
AVGH 8, 4,  mmxext
; 16-wide average with mmx: two 8-byte halves per row, two rows per iter.
cglobal x264_pixel_avg_w16_mmxext
    AVG_START
    movq   mm0, [t2  ]
    movq   mm1, [t2+8]
    movq   mm2, [t2+t3  ]
    movq   mm3, [t2+t3+8]
    pavgb  mm0, [t4  ]
    pavgb  mm1, [t4+8]
    pavgb  mm2, [t4+t5  ]
    pavgb  mm3, [t4+t5+8]
    movq   [t0  ], mm0
    movq   [t0+8], mm1
    movq   [t0+t1  ], mm2
    movq   [t0+t1+8], mm3
    AVG_END

AVGH 16, 16, mmxext
AVGH 16, 8,  mmxext

INIT_XMM
AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
AVGH 16,  8, sse2
AVGH  8, 16, sse2
AVGH  8,  8, sse2
AVGH  8,  4, sse2
AVGH 16, 16, ssse3
AVGH 16,  8, ssse3
AVGH  8, 16, ssse3
AVGH  8,  8, ssse3
AVGH  8,  4, ssse3
INIT_MMX
AVGH  4,  8, ssse3
AVGH  4,  4, ssse3
AVGH  4,  2, ssse3



;=============================================================================
; pixel avg2
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src_stride,
;                                 uint8_t *src2, int height );
;-----------------------------------------------------------------------------
; %1 = width, %2 = load/store instruction (movd for w4, movq for w8).
%macro AVG2_W8 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
    sub    r4, r2            ; r4 = src2 - src1
    lea    r6, [r4+r3]       ; r6 = src2 - src1 + src_stride
.height_loop:
    %2     mm0, [r2]
    %2     mm1, [r2+r3]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r6]
    %2     [r0], mm0
    %2     [r0+r1], mm1
    sub    r5d, 2
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W8 4, movd
AVG2_W8 8, movq

; avg2 for widths 12 and 16: two rows per iteration, 16 bytes per row,
; with the upper 8 bytes loaded/stored via %2 (movd => only 4 of them
; are used, giving width 12; movq => full width 16).
%macro AVG2_W16 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
    sub    r4, r2            ; r4 = src2 - src1
    lea    r6, [r4+r3]       ; r6 = src2 - src1 + src_stride
.height_loop:
    movq   mm0, [r2]
    %2     mm1, [r2+8]
    movq   mm2, [r2+r3]
    %2     mm3, [r2+r3+8]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r4+8]
    pavgb  mm2, [r2+r6]
    pavgb  mm3, [r2+r6+8]
    movq   [r0], mm0
    %2     [r0+8], mm1
    movq   [r0+r1], mm2
    %2     [r0+r1+8], mm3
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W16 12, movd
AVG2_W16 16, movq

; avg2 for width 20: 8+8+4 bytes per row, two rows per iteration.
cglobal x264_pixel_avg2_w20_mmxext, 6,7
    sub    r4, r2            ; r4 = src2 - src1
    lea    r6, [r4+r3]       ; r6 = src2 - src1 + src_stride
.height_loop:
    movq   mm0, [r2]
    movq   mm1, [r2+8]
    movd   mm2, [r2+16]
    movq   mm3, [r2+r3]
    movq   mm4, [r2+r3+8]
    movd   mm5, [r2+r3+16]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r4+8]
    pavgb  mm2, [r2+r4+16]
    pavgb  mm3, [r2+r6]
    pavgb  mm4, [r2+r6+8]
    pavgb  mm5, [r2+r6+16]
    movq   [r0], mm0
    movq   [r0+8], mm1
    movd   [r0+16], mm2
    movq   [r0+r1], mm3
    movq   [r0+r1+8], mm4
    movd   [r0+r1+16], mm5
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

; avg2 w16 with sse2: unaligned loads, aligned stores (dst is aligned
; when this is reached via the AVGH alignment check).
cglobal x264_pixel_avg2_w16_sse2, 6,7
    sub    r4, r2            ; r4 = src2 - src1
    lea    r6, [r4+r3]       ; r6 = src2 - src1 + src_stride
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm2, [r2+r3]
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm2
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

; avg2 w20: 16 bytes via xmm plus 4 bytes via mmx per row.
; The sse2_misalign variant uses pavgb with an unaligned memory operand.
%macro AVG2_W20 1
cglobal x264_pixel_avg2_w20_%1, 6,7
    sub    r4, r2            ; r4 = src2 - src1
    lea    r6, [r4+r3]       ; r6 = src2 - src1 + src_stride
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm2, [r2+r3]
    movd   mm4,  [r2+16]
    movd   mm5,  [r2+r3+16]
%ifidn %1, sse2_misalign
    pavgb  xmm0, [r2+r4]
    pavgb  xmm2, [r2+r6]
%else
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
%endif
    pavgb  mm4,  [r2+r4+16]
    pavgb  mm5,  [r2+r6+16]
    movdqa [r0], xmm0
    movd   [r0+16], mm4
    movdqa [r0+r1], xmm2
    movd   [r0+r1+16], mm5
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W20 sse2
AVG2_W20 sse2_misalign

; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check functions call the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.

; INIT_SHIFT %1, %2: from eax = src address, derive the bit shifts
; needed to realign an 8-byte-aligned load pair: %2 = 8*(addr&7), %1 = 64-%2.
%macro INIT_SHIFT 2
    and    eax, 7
    shl    eax, 3
    movd   %1, [sw_64 GLOBAL]
    movd   %2, eax
    psubw  %1, %2
%endmacro

%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal x264_pixel_avg2_w%1_cache%2_%3
    mov    eax, r2m
    and    eax, 0x1f|(%2>>1)
    cmp    eax, (32-%1)|(%2>>1)
    jle x264_pixel_avg2_w%1_%3  ; no split: use the fast path
;w12 isn't needed because w16 is just as fast if there's no cacheline split
%if %1 == 12
    jmp x264_pixel_avg2_w16_cache_mmxext
%else
    jmp x264_pixel_avg2_w%1_cache_mmxext
%endif
%endmacro

; Compute the per-source realignment shifts (mm6/mm7 for src1, mm4/mm5
; for src2), then round both source pointers down to 8-byte alignment.
%macro AVG_CACHELINE_START 0
    %assign stack_offset 0
    INIT_SHIFT mm6, mm7
    mov    eax, r4m
    INIT_SHIFT mm4, mm5
    PROLOGUE 6,6
    and    r2, ~7
    and    r4, ~7
    sub    r4, r2            ; r4 = aligned src2 - aligned src1
.height_loop:
%endmacro

; One 8-byte column: reassemble each misaligned source qword from two
; aligned loads via shift/or, then average and store with %2.
%macro AVG_CACHELINE_LOOP 2
    movq   mm0, [r2+8+%1]
    movq   mm1, [r2+%1]
    movq   mm2, [r2+r4+8+%1]
    movq   mm3, [r2+r4+%1]
    psllq  mm0, mm6
    psrlq  mm1, mm7
    psllq  mm2, mm4
    psrlq  mm3, mm5
    por    mm0, mm1
    por    mm2, mm3
    pavgb  mm0, mm2
    %2 [r0+%1], mm0
%endmacro

x264_pixel_avg2_w8_cache_mmxext:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg     .height_loop
    REP_RET

x264_pixel_avg2_w16_cache_mmxext:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg .height_loop
    REP_RET

x264_pixel_avg2_w20_cache_mmxext:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    AVG_CACHELINE_LOOP 16, movd
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg .height_loop
    REP_RET

%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK  8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
%endif

AVG_CACHELINE_CHECK  8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2

;=============================================================================
; pixel copy
;=============================================================================

; COPY4 store, load, dst_stride3, src_stride3: copy four rows.
%macro COPY4 4
    %2  m0, [r2]
    %2  m1, [r2+r3]
    %2  m2, [r2+r3*2]
    %2  m3, [r2+%4]
    %1  [r0],      m0
    %1  [r0+r1],   m1
    %1  [r0+r1*2], m2
    %1  [r0+%3],   m3
%endmacro


INIT_MMX
;-----------------------------------------------------------------------------
; void x264_mc_copy_w4_mmx( uint8_t *dst, int i_dst_stride,
;                           uint8_t *src, int i_src_stride, int i_height )
;-----------------------------------------------------------------------------
; Height is either 4 or 8: copy one or two groups of four rows.
cglobal x264_mc_copy_w4_mmx, 4,6
    cmp     dword r4m, 4
    lea     r5, [r3*3]
    lea     r4, [r1*3]
    je .end
    COPY4 movd, movd, r4, r5
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
.end:
    COPY4 movd, movd, r4, r5
    RET

; Copy an 8-wide block, four rows per iteration.
cglobal x264_mc_copy_w8_mmx, 5,7
    lea     r6, [r3*3]
    lea     r5, [r1*3]
.height_loop:
    COPY4 movq, movq, r5, r6
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub     r4d, 4
    jg      .height_loop
    REP_RET

; Copy a 16-wide block with mmx: two 8-byte qwords per row,
; four rows per iteration.
cglobal x264_mc_copy_w16_mmx, 5,7
    lea     r6, [r3*3]       ; r6 = src_stride*3
    lea     r5, [r1*3]       ; r5 = dst_stride*3
.height_loop:
    movq    mm0, [r2]
    movq    mm1, [r2+8]
    movq    mm2, [r2+r3]
    movq    mm3, [r2+r3+8]
    movq    mm4, [r2+r3*2]
    movq    mm5, [r2+r3*2+8]
    movq    mm6, [r2+r6]
    movq    mm7, [r2+r6+8]
    movq    [r0], mm0
    movq    [r0+8], mm1
    movq    [r0+r1], mm2
    movq    [r0+r1+8], mm3
    movq    [r0+r1*2], mm4
    movq    [r0+r1*2+8], mm5
    movq    [r0+r5], mm6
    movq    [r0+r5+8], mm7
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub     r4d, 4
    jg      .height_loop
    REP_RET


INIT_XMM
; COPY_W16_SSE2 name, load-insn: 16-wide copy, four rows per iteration.
; Stores are always aligned (movdqa); the load instruction varies.
%macro COPY_W16_SSE2 2
cglobal %1, 5,7
    lea     r6, [r3*3]
    lea     r5, [r1*3]
.height_loop:
    COPY4 movdqa, %2, r5, r6
    lea     r2, [r2+r3*4]
    lea     r0, [r0+r1*4]
    sub     r4d, 4
    jg      .height_loop
    REP_RET
%endmacro

COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa


;=============================================================================
; prefetch
;=============================================================================
; FIXME assumes 64 byte cachelines

;-----------------------------------------------------------------------------
; void x264_prefetch_fenc_mmxext( uint8_t *pix_y, int stride_y,
;                                 uint8_t *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
cglobal x264_prefetch_fenc_mmxext, 5,5
    mov    eax, r4d
    and    eax, 3
    imul   eax, r1d
    lea    r0,  [r0+rax*4+64]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    lea    r0,  [r0+r1*2]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]

    and    r4d, 6
    imul   r4d, r3d
    lea    r2,  [r2+r4+64]
    prefetcht0  [r2]
    prefetcht0  [r2+r3]
    RET

%else
cglobal x264_prefetch_fenc_mmxext
    ; x86_32: load args manually from the stack (no PROLOGUE reg count)
    mov    r2, [esp+20]
    mov    r1, [esp+8]
    mov    r0, [esp+4]
    and    r2, 3
    imul   r2, r1
    lea    r0, [r0+r2*4+64]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]
    lea    r0, [r0+r1*2]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]

    mov    r2, [esp+20]
    mov    r1, [esp+16]
    mov    r0, [esp+12]
    and    r2, 6
    imul   r2, r1
    lea    r0, [r0+r2+64]
    prefetcht0 [r0]
    prefetcht0 [r0+r1]
    ret
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; void x264_prefetch_ref_mmxext( uint8_t *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal x264_prefetch_ref_mmxext, 3,3
    dec    r2d
    and    r2d, r1d
    lea    r0,  [r0+r2*8+64]
    lea    r2,  [r1*3]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    lea    r0,  [r0+r1*4]
    prefetcht0  [r0]
    prefetcht0  [r0+r1]
    prefetcht0  [r0+r1*2]
    prefetcht0  [r0+r2]
    RET



;=============================================================================
; chroma MC
;=============================================================================

682
    %define t0 rax
683
%ifdef ARCH_X86_64
684
    %define t1 r10
685
%else
686
    %define t1 r1
687
%endif
Loren Merritt's avatar
Loren Merritt committed
688 689

%macro MC_CHROMA_START 0
Anton Mitrofanov's avatar
Anton Mitrofanov committed
690
    movifnidn r2,  r2mp
691 692 693
    movifnidn r3d, r3m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
Loren Merritt's avatar
Loren Merritt committed
694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709
    mov       t0d, r5d
    mov       t1d, r4d
    sar       t0d, 3
    sar       t1d, 3
    imul      t0d, r3d
    add       t0d, t1d
    movsxdifnidn t0, t0d
    add       r2,  t0            ; src += (dx>>3) + (dy>>3) * src_stride
%endmacro

;-----------------------------------------------------------------------------
; void x264_mc_chroma_mmxext( uint8_t *dst, int dst_stride,
;                             uint8_t *src, int src_stride,
;                             int dx, int dy,
;                             int width, int height )
;-----------------------------------------------------------------------------
; Bilinear chroma interpolation.  Dispatches to 1-D special cases when
; dx or dy is a multiple of 8 (i.e. no fractional offset in that axis).
%macro MC_CHROMA 1-2 0
cglobal x264_mc_chroma_%1
%if mmsize == 16
    cmp dword r6m, 4
    jle x264_mc_chroma_mmxext  ; width <= 4: mmx version is sufficient
%endif
    PROLOGUE 0,6,%2
    MC_CHROMA_START
    pxor       m3, m3
    and       r4d, 7         ; dx &= 7
    jz .mc1dy
    and       r5d, 7         ; dy &= 7
    jz .mc1dx

    movd       m5, r4d
    movd       m6, r5d
    SPLATW     m5, m5        ; m5 = dx
    SPLATW     m6, m6        ; m6 = dy

    mova       m4, [pw_8 GLOBAL]
    mova       m0, m4
    psubw      m4, m5        ; m4 = 8-dx
    psubw      m0, m6        ; m0 = 8-dy

    mova       m7, m5
    pmullw     m5, m0        ; m5 = dx*(8-dy) =     cB
    pmullw     m7, m6        ; m7 = dx*dy =         cD
    pmullw     m6, m4        ; m6 = (8-dx)*dy =     cC
    pmullw     m4, m0        ; m4 = (8-dx)*(8-dy) = cA

    mov       r4d, r7m
%ifdef ARCH_X86_64
    mov       r10, r0
    mov       r11, r2
%else
    mov        r0, r0mp
    mov        r1, r1m
    mov        r5, r2
%endif

.loop2d:
    movh       m1, [r2+r3]
    movh       m0, [r2]
    punpcklbw  m1, m3        ; 00 px1 | 00 px2 | 00 px3 | 00 px4
    punpcklbw  m0, m3
    pmullw     m1, m6        ; 2nd line * cC
    pmullw     m0, m4        ; 1st line * cA
    paddw      m0, m1        ; m0 <- result

    movh       m2, [r2+1]
    movh       m1, [r2+r3+1]
    punpcklbw  m2, m3
    punpcklbw  m1, m3

    paddw      m0, [pw_32 GLOBAL]

    pmullw     m2, m5        ; line * cB
    pmullw     m1, m7        ; line * cD
    paddw      m0, m2
    paddw      m0, m1
    psrlw      m0, 6

    packuswb m0, m3          ; 00 00 00 00 px1 px2 px3 px4
    movh       [r0], m0

    add        r2,  r3
    add        r0,  r1       ; dst_stride
    dec        r4d
    jnz .loop2d

%if mmsize == 8
    sub dword r6m, 8
    jnz .finish              ; width != 8 so assume 4
%ifdef ARCH_X86_64
    lea        r0, [r10+4]   ; dst
    lea        r2, [r11+4]   ; src
%else
    mov        r0, r0mp
    lea        r2, [r5+4]
    add        r0, 4
%endif
    mov       r4d, r7m       ; height
    jmp .loop2d
%else
    REP_RET
%endif ; mmsize

.mc1dy:
    and       r5d, 7
    movd       m6, r5d
    mov        r5, r3        ; pel_offset = dx ? 1 : src_stride
    jmp .mc1d
.mc1dx:
    movd       m6, r4d
    mov       r5d, 1
.mc1d:
    mova       m5, [pw_8 GLOBAL]
    SPLATW     m6, m6
    mova       m7, [pw_4 GLOBAL]
    psubw      m5, m6
    movifnidn r0,  r0mp
    movifnidn r1d, r1m
    mov       r4d, r7m
%if mmsize == 8
    cmp dword r6m, 8
    je .loop1d_w8
%endif

.loop1d_w4:
    movh       m0, [r2+r5]
    movh       m1, [r2]
    punpcklbw  m0, m3
    punpcklbw  m1, m3
    pmullw     m0, m6
    pmullw     m1, m5
    paddw      m0, m7
    paddw      m0, m1
    psrlw      m0, 3
    packuswb   m0, m3
    movh     [r0], m0
    add        r2, r3
    add        r0, r1
    dec        r4d
    jnz .loop1d_w4
.finish:
    REP_RET

%if mmsize == 8
.loop1d_w8:
    movu       m0, [r2+r5]
    mova       m1, [r2]
    mova       m2, m0
    mova       m4, m1
    punpcklbw  m0, m3
    punpcklbw  m1, m3
    punpckhbw  m2, m3
    punpckhbw  m4, m3
    pmullw     m0, m6
    pmullw     m1, m5
    pmullw     m2, m6
    pmullw     m4, m5
    paddw      m0, m7
    paddw      m2, m7
    paddw      m0, m1
    paddw      m2, m4
    psrlw      m0, 3
    psrlw      m2, 3
    packuswb   m0, m2
    mova     [r0], m0
    add        r2, r3
    add        r0, r1
    dec        r4d
    jnz .loop1d_w8
    REP_RET
%endif ; mmsize
%endmacro ; MC_CHROMA

INIT_MMX
MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2, 8

INIT_MMX
; SSSE3 chroma MC: weights packed as bytes so pmaddubsw does the
; horizontal (dx) interpolation; the previous row is kept in m0 so each
; source row is loaded only once.
cglobal x264_mc_chroma_ssse3, 0,6,8
    MC_CHROMA_START
    and       r4d, 7
    and       r5d, 7
    mov       t0d, r4d
    shl       t0d, 8
    sub       t0d, r4d
    mov       r4d, 8
    add       t0d, 8
    sub       r4d, r5d
    imul      r5d, t0d ; (x*255+8)*y
    imul      r4d, t0d ; (x*255+8)*(8-y)
    cmp dword r6m, 4
    jg .width8
    ; width 4: mmx registers
    mova       m5, [pw_32 GLOBAL]
    movd       m6, r5d
    movd       m7, r4d
    movifnidn r0,  r0mp
    movifnidn r1d, r1m
    movifnidn r4d, r7m
    SPLATW     m6, m6
    SPLATW     m7, m7
    movh       m0, [r2]
    punpcklbw  m0, [r2+1]
    add r2, r3
.loop4:
    movh       m1, [r2]
    movh       m3, [r2+r3]
    punpcklbw  m1, [r2+1]
    punpcklbw  m3, [r2+r3+1]
    lea        r2, [r2+2*r3]
    mova       m2, m1
    mova       m4, m3
    pmaddubsw  m0, m7
    pmaddubsw  m1, m6
    pmaddubsw  m2, m7
    pmaddubsw  m3, m6
    paddw      m0, m5
    paddw      m2, m5
    paddw      m1, m0
    paddw      m3, m2
    mova       m0, m4        ; carry last row to the next iteration
    psrlw      m1, 6
    psrlw      m3, 6
    packuswb   m1, m1
    packuswb   m3, m3
    movh     [r0], m1
    movh  [r0+r1], m3
    sub       r4d, 2
    lea        r0, [r0+2*r1]
    jg .loop4
    REP_RET

INIT_XMM
.width8:
    ; width 8: same algorithm with xmm registers
    mova       m5, [pw_32 GLOBAL]
    movd       m6, r5d
    movd       m7, r4d
    movifnidn r0,  r0mp
    movifnidn r1d, r1m
    movifnidn r4d, r7m
    SPLATW     m6, m6
    SPLATW     m7, m7
    movh       m0, [r2]
    movh       m1, [r2+1]
    punpcklbw  m0, m1
    add r2, r3
.loop8:
    movh       m1, [r2]
    movh       m2, [r2+1]
    movh       m3, [r2+r3]
    movh       m4, [r2+r3+1]
    punpcklbw  m1, m2
    punpcklbw  m3, m4
    lea        r2, [r2+2*r3]
    mova       m2, m1
    mova       m4, m3
    pmaddubsw  m0, m7
    pmaddubsw  m1, m6
    pmaddubsw  m2, m7
    pmaddubsw  m3, m6
    paddw      m0, m5
    paddw      m2, m5
    paddw      m1, m0
    paddw      m3, m2
    mova       m0, m4        ; carry last row to the next iteration
    psrlw      m1, 6
    psrlw      m3, 6
    packuswb   m1, m3
    movh     [r0], m1
    movhps [r0+r1], m1
    sub       r4d, 2
    lea        r0, [r0+2*r1]
    jg .loop8
    REP_RET

; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size