;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pb_1:     times 16 db 1
pw_1:     times 8 dw 1
pd_1:     times 4 dd 1

%macro DQM4 3
    dw %1, %2, %1, %2, %2, %3, %2, %3
%endmacro
%macro DQM8 6
    dw %1, %4, %5, %4, %1, %4, %5, %4
    dw %4, %2, %6, %2, %4, %2, %6, %2
    dw %5, %6, %3, %6, %5, %6, %3, %6
    ; last line not used, just padding for power-of-2 stride
    times 8 dw 0
%endmacro

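; dequant scales for the default (flat 16) quant matrices, one entry per
; i_qp%6; used only by the *_flat16 dequant functions below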
dequant4_scale:
    DQM4 10, 13, 16
    DQM4 11, 14, 18
    DQM4 13, 16, 20
    DQM4 14, 18, 23
    DQM4 16, 20, 25
    DQM4 18, 23, 29

dequant8_scale:
    DQM8 20, 18, 32, 19, 25, 24
    DQM8 22, 19, 35, 21, 28, 26
    DQM8 26, 23, 42, 24, 33, 31
    DQM8 28, 25, 45, 26, 35, 33
    DQM8 32, 28, 51, 30, 40, 38
    DQM8 36, 32, 58, 34, 46, 43

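; decimate_mask_table4[mask] = decimate score of a block whose nonzero
; coefficients (all assumed to be +/-1) are given by the 8-bit mask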
decimate_mask_table4:
    db  0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
    db  3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
    db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
    db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
    db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
    db  6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
    db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
    db  9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
    db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24

SECTION .text

%macro QUANT_DC_START 0
    movd       m6, r1m     ; mf
    movd       m7, r2m     ; bias
%ifidn m0, mm0
    pshufw     m6, m6, 0
    pshufw     m7, m7, 0
%else
    pshuflw    m6, m6, 0
    pshuflw    m7, m7, 0
    punpcklqdq m6, m6
    punpcklqdq m7, m7
%endif
%endmacro

%macro PABSW_MMX 2
    pxor       %1, %1
    pcmpgtw    %1, %2
    pxor       %2, %1
    psubw      %2, %1
    SWAP       %1, %2
%endmacro

%macro PSIGNW_MMX 2
    pxor       %1, %2
    psubw      %1, %2
%endmacro

%macro PABSW_SSSE3 2
    pabsw      %1, %2
%endmacro

%macro PSIGNW_SSSE3 2
    psignw     %1, %2
%endmacro

%macro QUANT_ONE 3
;;; %1      (m64)       dct[y][x]
;;; %2      (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
;;; %3      (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
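;;; computes dct[y][x] = sign(dct) * (((|dct| + bias) * mf) >> 16)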
    mova       m1, %1   ; load dct coeffs
    PABSW      m0, m1
    paddusw    m0, %3   ; round
    pmulhuw    m0, %2   ; divide
    PSIGNW     m0, m1   ; restore sign
    mova       %1, m0   ; store
%endmacro

;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
%macro QUANT_DC 2
cglobal %1, 1,1
    QUANT_DC_START
%assign x 0
%rep %2
    QUANT_ONE [r0+x], m6, m7
%assign x x+mmsize
%endrep
    RET
%endmacro

;-----------------------------------------------------------------------------
; void x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
%macro QUANT_AC 2
cglobal %1, 3,3
%assign x 0
%rep %2
    QUANT_ONE [r0+x], [r1+x], [r2+x]
%assign x x+mmsize
%endrep
    RET
%endmacro

INIT_MMX
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
QUANT_DC x264_quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC x264_quant_4x4_dc_mmxext, 4
QUANT_AC x264_quant_4x4_mmx, 4
QUANT_AC x264_quant_8x8_mmx, 16
%endif

INIT_XMM
QUANT_DC x264_quant_4x4_dc_sse2, 2
QUANT_AC x264_quant_4x4_sse2, 2
QUANT_AC x264_quant_8x8_sse2, 8

%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
QUANT_DC x264_quant_4x4_dc_ssse3, 2
QUANT_AC x264_quant_4x4_ssse3, 2
QUANT_AC x264_quant_8x8_ssse3, 8

INIT_MMX
QUANT_DC x264_quant_2x2_dc_ssse3, 1



;=============================================================================
; dequant
;=============================================================================

%macro DEQUANT16_L 3
;;; %1      dct[y][x]
;;; %2,%3   dequant_mf[i_mf][y][x]
;;; m5      i_qbits
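;;; computes dct[y][x] = (dct * dequant_mf) << i_qbits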

    mova     m0, %2
    packssdw m0, %3
    pmullw   m0, %1
    psllw    m0, m5
    mova     %1, m0
%endmacro

%macro DEQUANT32_R 3
;;; %1      dct[y][x]
;;; %2,%3   dequant_mf[i_mf][y][x]
;;; m5      -i_qbits
;;; m6      f
;;; m7      0
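;;; computes dct[y][x] = (dct * dequant_mf + f) >> (-i_qbits)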

    mova      m0, %1
    mova      m1, m0
    punpcklwd m0, m7
    punpckhwd m1, m7
    pmaddwd   m0, %2
    pmaddwd   m1, %3
    paddd     m0, m6
    paddd     m1, m6
    psrad     m0, m5
    psrad     m1, m5
    packssdw  m0, m1
    mova      %1, m0
%endmacro

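; DEQUANT_LOOP op, number of coefficients / 4, mmsize/8:
; applies op to the whole block, two vector registers' worth per iteration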
%macro DEQUANT_LOOP 3
%if 8*(%2-2*%3)
    mov t0d, 8*(%2-2*%3)
%%loop:
    %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
    %1 [r0+t0     ], [r1+t0*2      ], [r1+t0*2+ 8*%3]
    sub t0d, 16*%3
    jge %%loop
    rep ret
%else
    %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
    %1 [r0     ], [r1      ], [r1+ 8*%3]
    ret
%endif
%endmacro

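; DEQUANT16_FLAT scale, offsets...:
; in place, dct[offset] = (dct[offset] * scale) << m7 for each listed byte offset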
%macro DEQUANT16_FLAT 2-8
    mova   m0, %1
%assign i %0-2
%rep %0-1
%if i
    mova   m %+ i, [r0+%2]
    pmullw m %+ i, m0
%else
    pmullw m0, [r0+%2]
%endif
    psllw  m %+ i, m7
    mova   [r0+%2], m %+ i
    %assign i i-1
    %rotate 1
%endrep
%endmacro

%ifdef ARCH_X86_64
    %define t0  r4
    %define t0d r4d
    %define t1  r3
    %define t1d r3d
    %define t2  r2
    %define t2d r2d
%else
    %define t0  r2
    %define t0d r2d
    %define t1  r0
    %define t1d r0d
    %define t2  r1
    %define t2d r1d
%endif

;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
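; i_qbits = i_qp/6 - 4 (4x4) or - 6 (8x8); non-negative i_qbits takes the
; left-shift path (DEQUANT16_L), negative i_qbits the rounded 32-bit
; right-shift path (DEQUANT32_R)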
%macro DEQUANT 4
cglobal x264_dequant_%2x%2_%1, 0,3
    movifnidn t2d, r2m
    imul t0d, t2d, 0x2b
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %3+2
%ifdef ARCH_X86_64
    add  r1, t2     ; dequant_mf[i_mf]
%else
    add  r1, r1m    ; dequant_mf[i_mf]
    mov  r0, r0m    ; dct
%endif
    sub  t0d, %3
    jl   .rshift32  ; negative qbits => rightshift

.lshift:
    movd m5, t0d
    DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4

.rshift32:
    neg   t0d
    movd  m5, t0d
    mova  m6, [pd_1 GLOBAL]
    pxor  m7, m7
    pslld m6, m5
    psrld m6, 1
    DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4

cglobal x264_dequant_%2x%2_flat16_%1, 0,3
    movifnidn t2d, r2m
%if %2 == 8
    cmp  t2d, 12
    jl x264_dequant_%2x%2_%1
    sub  t2d, 12
%endif
    imul t0d, t2d, 0x2b
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %3
%ifdef PIC
    lea  r1, [dequant%2_scale GLOBAL]
    add  r1, t2
%else
    lea  r1, [dequant%2_scale + t2 GLOBAL]
%endif
    movifnidn r0d, r0m
    movd m7, t0d
%if %2 == 4
%ifidn %1, mmx
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1+8], 8, 24
%else
    DEQUANT16_FLAT [r1], 0, 16
%endif
%elifidn %1, mmx
    DEQUANT16_FLAT [r1], 0, 8, 64, 72
    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
%else
    DEQUANT16_FLAT [r1], 0, 64
    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
    DEQUANT16_FLAT [r1+32], 32, 96
%endif
    ret
%endmacro ; DEQUANT

%ifndef ARCH_X86_64
INIT_MMX
DEQUANT mmx, 4, 4, 1
DEQUANT mmx, 8, 6, 1
%endif
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2



;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
;-----------------------------------------------------------------------------
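; per coefficient: sum[i] += |dct[i]|,
;                  dct[i] = sign(dct[i]) * max(|dct[i]| - offset[i], 0);
; the DC coefficient is saved and restored unmodified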
%macro DENOISE_DCT 1
cglobal x264_denoise_dct_%1, 4,5
    movzx     r4d, word [r0] ; backup DC coefficient
    pxor      m7, m7
.loop:
    sub       r3, mmsize
    mova      m2, [r0+r3*2+0*mmsize]
    mova      m3, [r0+r3*2+1*mmsize]
    PABSW     m0, m2
    PABSW     m1, m3
    mova      m4, m0
    mova      m5, m1
    psubusw   m0, [r2+r3*2+0*mmsize]
    psubusw   m1, [r2+r3*2+1*mmsize]
    PSIGNW    m0, m2
    PSIGNW    m1, m3
    mova      [r0+r3*2+0*mmsize], m0
    mova      [r0+r3*2+1*mmsize], m1
    mova      m2, m4
    mova      m3, m5
    punpcklwd m4, m7
    punpckhwd m2, m7
    punpcklwd m5, m7
    punpckhwd m3, m7
    paddd     m4, [r1+r3*4+0*mmsize]
    paddd     m2, [r1+r3*4+1*mmsize]
    paddd     m5, [r1+r3*4+2*mmsize]
    paddd     m3, [r1+r3*4+3*mmsize]
    mova      [r1+r3*4+0*mmsize], m4
    mova      [r1+r3*4+1*mmsize], m2
    mova      [r1+r3*4+2*mmsize], m5
    mova      [r1+r3*4+3*mmsize], m3
    jg .loop
    mov       [r0], r4w ; restore DC coefficient
    RET
%endmacro

%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
%ifndef ARCH_X86_64
INIT_MMX
DENOISE_DCT mmx
%endif
INIT_XMM
DENOISE_DCT sse2
%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3



;-----------------------------------------------------------------------------
; int x264_decimate_score( int16_t *dct )
;-----------------------------------------------------------------------------
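; roughly: scan nonzero coefficients from the highest down; if any |coeff| > 1
; the score is 9, otherwise add x264_decimate_table[run of zeros below each
; nonzero coefficient]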

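; DECIMATE_MASK %1, %2, %3, %4, %5, %6:
; %1 = bitmask of zero coefficients among the 16 int16s at %3
; %2 = bitmask of coefficients with |coeff| > 1
; (%4 = pb_1, %5 = cpu name, %6 = scratch gpr used by the mmx version)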
%macro DECIMATE_MASK_SSE2 6
%ifidn %5, ssse3
    pabsw    xmm0, [%3+ 0]
    pabsw    xmm1, [%3+16]
%else
    movdqa   xmm0, [%3+ 0]
    movdqa   xmm1, [%3+16]
    ABS2_MMX xmm0, xmm1, xmm3, xmm4
%endif
    packsswb xmm0, xmm1
    pxor     xmm2, xmm2
    pcmpeqb  xmm2, xmm0
    pcmpgtb  xmm0, %4
    pmovmskb %1, xmm2
    pmovmskb %2, xmm0
%endmacro

%macro DECIMATE_MASK_MMX 6
    movq      mm0, [%3+ 0]
    movq      mm1, [%3+ 8]
    movq      mm2, [%3+16]
    movq      mm3, [%3+24]
    ABS2_MMX  mm0, mm1, mm4, mm5
    ABS2_MMX  mm2, mm3, mm4, mm5
    packsswb  mm0, mm1
    packsswb  mm2, mm3
    pxor      mm4, mm4
    pxor      mm5, mm5
    pcmpeqb   mm4, mm0
    pcmpeqb   mm5, mm2
    pcmpgtb   mm0, %4
    pcmpgtb   mm2, %4
    pmovmskb   %6, mm4
    pmovmskb   %1, mm5
    shl        %1, 8
    or         %1, %6
    pmovmskb   %6, mm0
    pmovmskb   %2, mm2
    shl        %2, 8
    or         %2, %6
%endmacro

cextern x264_decimate_table4
cextern x264_decimate_table8

%macro DECIMATE4x4 2

; A LUT is faster than bsf on AMD processors, and no slower on Intel.
; This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
%ifdef PIC
    lea r10, [x264_decimate_table4 GLOBAL]
    lea r11, [decimate_mask_table4 GLOBAL]
    %define table r10
    %define mask_table r11
%else
    %define table x264_decimate_table4
    %define mask_table decimate_mask_table4
%endif
    DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
    xor   edx, 0xffff
    je   .ret
    test  eax, eax
    jne  .ret9
%if %1==15
    shr   edx, 1
%endif
    movzx ecx, dl
    movzx eax, byte [mask_table + rcx]
    cmp   edx, ecx
    je   .ret
    bsr   ecx, ecx
    shr   edx, 1
    shr   edx, cl
    bsf   ecx, edx
    shr   edx, 1
    shr   edx, cl
    add    al, byte [table + rcx]
    add    al, byte [mask_table + rdx]
.ret:
    REP_RET
.ret9:
    mov   eax, 9
    RET

%endmacro

%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext
DECIMATE4x4 16, mmxext
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2
DECIMATE4x4 15, ssse3
DECIMATE4x4 16, sse2
DECIMATE4x4 16, ssse3

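; score64: build a 64-bit nonzero-coefficient mask from four 16-coefficient
; groups, then walk it with bsf, adding x264_decimate_table8[run] per coefficient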
%macro DECIMATE8x8 1

%ifdef ARCH_X86_64
cglobal x264_decimate_score64_%1, 1,4
%ifdef PIC
    lea r10, [x264_decimate_table8 GLOBAL]
    %define table r10
%else
    %define table x264_decimate_table8
%endif
    mova  m7, [pb_1 GLOBAL]
    DECIMATE_MASK r1d, eax, r0, m7, %1, null
    test  eax, eax
    jne  .ret9
    DECIMATE_MASK r2d, eax, r0+32, m7, %1, null
    shl   r2d, 16
    or    r1d, r2d
    DECIMATE_MASK r2d, r3d, r0+64, m7, %1, null
    shl   r2, 32
    or    eax, r3d
    or    r1, r2
    DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
    shl   r2, 48
    or    r1, r2
    not   r1
    test  r1, r1
    je   .ret
    or    eax, r3d
    jne  .ret9
.loop:
    bsf   rcx, r1
    shr   r1, cl
    movzx ecx, byte [table + rcx]
    add   eax, ecx
    shr   r1, 1
    jne  .loop
.ret:
    REP_RET
.ret9:
    mov   eax, 9
    RET

%else ; ARCH
%ifidn %1, mmxext
cglobal x264_decimate_score64_%1, 1,6
%else
cglobal x264_decimate_score64_%1, 1,5
%endif
    mova  m7, [pb_1 GLOBAL]
    DECIMATE_MASK r3, r2, r0, m7, %1, r5
    test  r2, r2
    jne  .ret9
    DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
    shl   r4, 16
    or    r3, r4
    DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
    or    r2, r1
    DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
    shl   r1, 16
    or    r4, r1
    not   r3
    not   r4
    mov   r1, r3
    or    r1, r4
    je   .ret
    or    r0, r2
    jne  .ret9    ;r2 is zero at this point, so we don't need to zero it
.loop:
    bsf   ecx, r3
    test  r3, r3
    je   .largerun
    shrd  r3, r4, cl
    shr   r4, cl
    movzx ecx, byte [x264_decimate_table8 + ecx]
    add   r0, ecx
    shrd  r3, r4, 1
    shr   r4, 1
    mov   r2, r3
    or    r2, r4
    jne  .loop
.ret:
    REP_RET
.ret9:
    mov   eax, 9
    RET
.largerun:
    mov   r3, r4
    xor   r4, r4
    bsf   ecx, r3
    shr   r3, cl
    shr   r3, 1
    jne  .loop
    REP_RET
%endif ; ARCH

%endmacro

%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE8x8 mmxext
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE8x8 sse2
DECIMATE8x8 ssse3