dct-a.asm 16.1 KB
Newer Older
1 2 3 4 5 6 7
;*****************************************************************************
;* dct-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
8
;*          Min Chen <chenm001@163.com>
9 10 11 12 13 14 15 16 17 18 19 20 21
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
22
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
23 24 25
;*****************************************************************************

%include "x86inc.asm"
26
%include "x86util.asm"
27 28 29 30

SECTION_RODATA
; Word constants, eight 16-bit copies each (16 bytes).
pw_1:  times 8 dw 1     ; rounding term added before the >>1 in x264_dct4x4dc_mmx
pw_32: times 8 dw 32    ; added to row 0 before the second IDCT pass in ADD_IDCT4

; pshufb byte-index masks, 16 bytes each.
; pb_sub4frame is applied to 16 packed pixels in x264_zigzag_sub_4x4_frame_ssse3.
pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
; pb_scan4framea / pb_scan4frameb are applied to the first / second 16 bytes
; of the coefficient block in x264_zigzag_scan_4x4_frame_ssse3.
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
34 35 36

SECTION .text

37 38 39 40 41 42
; HADAMARD4_1D a, b, c, d
; One 1-D pass of the 4-point transform used by the 4x4 DC functions below,
; operating on registers m<a>, m<b>, m<c>, m<d>.
; It consists of two SUMSUB_BADC stages (helper macro from x86util.asm, not
; visible here; presumably paired sum/difference butterflies -- confirm).
; The final SWAP takes register numbers, not data; presumably it only renames
; the registers so that the results are again addressed in a..d order.
%macro HADAMARD4_1D 4
    SUMSUB_BADC m%2, m%1, m%4, m%3
    SUMSUB_BADC m%4, m%2, m%3, m%1
    SWAP %1, %4, %3
%endmacro

43 44 45
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;
; In-place transform of a 4x4 block of int16 values:
;   1-D HADAMARD4_1D pass, 4x4 word transpose, second HADAMARD4_1D pass,
;   then every value is replaced by (x + 1) >> 1 (arithmetic shift).
; In/out: r0 = d, 32 bytes read and then overwritten.
; Registers: m0-m3 hold the four rows, m4 is handed to TRANSPOSE4x4W,
;            m6 holds the pw_1 rounding constant.
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1
    ; load the four rows (4 words each)
    movq   m0, [r0+ 0]
    movq   m1, [r0+ 8]
    movq   m2, [r0+16]
    movq   m3, [r0+24]
    ; pass, transpose, pass: covers both dimensions of the block
    HADAMARD4_1D  0,1,2,3
    TRANSPOSE4x4W 0,1,2,3,4
    HADAMARD4_1D  0,1,2,3
    ; round: (x + 1) >> 1 on all 16 values
    movq   m6, [pw_1 GLOBAL]
    paddw  m0, m6
    paddw  m1, m6
    paddw  m2, m6
    paddw  m3, m6
    psraw  m0, 1
    psraw  m1, 1
    psraw  m2, 1
    psraw  m3, 1
    ; write the result back over the input
    movq  [r0+0], m0
    movq  [r0+8], m1
    movq [r0+16], m2
    movq [r0+24], m3
    RET

;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;
; In-place transform of a 4x4 block of int16 values: HADAMARD4_1D pass,
; 4x4 word transpose, second HADAMARD4_1D pass. Unlike x264_dct4x4dc_mmx
; there is no rounding or shift afterwards.
; In/out: r0 = d, 32 bytes read and then overwritten.
; Registers: m0-m3 hold the four rows, m4 is handed to TRANSPOSE4x4W.
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
    ; load the four rows (4 words each)
    movq  m0, [r0+ 0]
    movq  m1, [r0+ 8]
    movq  m2, [r0+16]
    movq  m3, [r0+24]
    HADAMARD4_1D  0,1,2,3
    TRANSPOSE4x4W 0,1,2,3,4
    HADAMARD4_1D  0,1,2,3
    ; write the result back over the input
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
    RET
85

86 87 88 89 90 91
; DCT4_1D a, b, c, d, t
; One 1-D pass of the 4-point forward transform used by SUB_DCT4.
; a..d are the register numbers holding the four inputs; t is a fifth
; register number that is passed to SUMSUB2_AB and takes part in the final
; SWAP, so its previous contents are not preserved.
; SUMSUB_BADC, SUMSUB_BA and SUMSUB2_AB come from x86util.asm (not visible
; here); their exact arithmetic should be checked there.
; The closing SWAP takes register numbers only; presumably it renames the
; registers so callers can keep using a..d for the outputs -- confirm.
%macro DCT4_1D 5
    SUMSUB_BADC m%4, m%1, m%3, m%2
    SUMSUB_BA   m%3, m%4
    SUMSUB2_AB  m%1, m%2, m%5
    SWAP %1, %3, %4, %5, %2
%endmacro
92

93 94 95 96 97 98
; IDCT4_1D a, b, c, d, t1, t2
; One 1-D pass of the 4-point inverse transform used by ADD_IDCT4.
; a..d are the register numbers holding the four inputs; t1 and t2 are two
; further register numbers handed to SUMSUBD2_AB, and t1 also takes part in
; the later butterfly and the final SWAP, so neither is preserved.
; SUMSUB_BA, SUMSUBD2_AB and SUMSUB_BADC come from x86util.asm (not visible
; here); their exact arithmetic should be checked there.
; The closing SWAP takes register numbers only; presumably it renames the
; registers so callers can keep using a..d for the outputs -- confirm.
%macro IDCT4_1D 6
    SUMSUB_BA   m%3, m%1
    SUMSUBD2_AB m%2, m%4, m%6, m%5
    SUMSUB_BADC m%2, m%3, m%5, m%1
    SWAP %1, %2, %5, %4, %3
%endmacro
99 100 101 102 103 104

;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;
; Builds four rows from pix1 and pix2 with LOAD_DIFF, runs DCT4_1D, a
; transpose and a second DCT4_1D, and stores four 8-byte rows to dct.
;   r0 = dct   (written at offsets 0, 8, 16, 24)
;   r1 = pix1  (rows FENC_STRIDE bytes apart)
;   r2 = pix2  (rows FDEC_STRIDE bytes apart)
; Registers: m0-m4 for data, m6/m7 are handed to LOAD_DIFF.
; The .skip_prologue label is the entry used by SUB_NxN_DCT, which calls in
; with r0-r2 already set up.
;
; SUB_DCT4 is defined here and reused by x264_sub8x8_dct_sse2. Its single
; argument is the suffix of the TRANSPOSE macro to use (4x4W here, 2x4x4W in
; the SSE2 caller). LOAD_DIFF is a helper from x86util.asm; presumably it
; yields the 16-bit differences pix1 - pix2 for one row -- confirm there.
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx, 3,3
.skip_prologue:
%macro SUB_DCT4 1
    LOAD_DIFF  m0, m6, m7, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
    LOAD_DIFF  m1, m6, m7, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
    LOAD_DIFF  m2, m6, m7, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
    LOAD_DIFF  m3, m6, m7, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
    DCT4_1D 0,1,2,3,4
    TRANSPOSE%1 0,1,2,3,4
    DCT4_1D 0,1,2,3,4
    ; movq stores the low 8 bytes of each register
    movq  [r0+ 0], m0
    movq  [r0+ 8], m1
    movq  [r0+16], m2
    movq  [r0+24], m3
%endmacro
    SUB_DCT4 4x4W
    RET

;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;
; Loads the four 8-byte coefficient rows from dct, runs IDCT4_1D, a
; transpose and a second IDCT4_1D, then hands each result row to STORE_DIFF
; together with the matching row of p_dst.
;   r0 = p_dst (rows FDEC_STRIDE bytes apart)
;   r1 = dct   (read at offsets 0, 8, 16, 24)
; Registers: m0-m5 for data, m7 is zeroed and handed to STORE_DIFF.
; The .skip_prologue label is the entry used by ADD_NxN_IDCT, which calls in
; with r0/r1 already set up.
;
; ADD_IDCT4 is defined here and reused by x264_add8x8_idct_sse2. Its single
; argument is the suffix of the TRANSPOSE macro to use (4x4W here, 2x4x4W in
; the SSE2 caller). It expects the coefficients to be in m0-m3 already.
; STORE_DIFF is a helper from x86util.asm; presumably it scales the residual,
; adds it to the destination pixels and stores them with saturation --
; confirm there. pw_32 is presumably the rounding term for that scaling.
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx, 2,2
.skip_prologue:
    movq  m0, [r1+ 0]
    movq  m1, [r1+ 8]
    movq  m2, [r1+16]
    movq  m3, [r1+24]
%macro ADD_IDCT4 1
    IDCT4_1D 0,1,2,3,4,5
    TRANSPOSE%1 0,1,2,3,4
    ; 32 is added to row 0 only, before the second pass
    paddw m0, [pw_32 GLOBAL]
    IDCT4_1D 0,1,2,3,4,5
    ; m7 = 0 for STORE_DIFF
    pxor  m7, m7
    STORE_DIFF  m0, m4, m7, [r0+0*FDEC_STRIDE]
    STORE_DIFF  m1, m4, m7, [r0+1*FDEC_STRIDE]
    STORE_DIFF  m2, m4, m7, [r0+2*FDEC_STRIDE]
    STORE_DIFF  m3, m4, m7, [r0+3*FDEC_STRIDE]
%endmacro
    ADD_IDCT4 4x4W
    RET

Loren Merritt's avatar
Loren Merritt committed
144
INIT_XMM

;-----------------------------------------------------------------------------
; x264_sub8x8_dct_sse2: r0 = coefficient output, r1 = pixels with
; FENC_STRIDE, r2 = pixels with FDEC_STRIDE (presumably the same prototype as
; x264_sub8x8_dct_mmx -- confirm against the C declaration).
;
; The 8x8 area is handled as two passes of .8x4, each covering four rows.
; In one pass SUB_DCT4 stores the low 8 bytes of m0-m3 at r0+0..24 and the
; movhps lines store the high 8 bytes at r0+32..56, i.e. 64 output bytes.
;
; Control flow: the first pass is reached with call, so its ret comes back
; here; the pointers are then advanced (64 output bytes, 4 pixel rows) and
; execution falls through into .8x4 again, whose ret returns to the caller.
; On exit r0/r1/r2 are therefore left advanced by 64 bytes / 4 rows, which
; the SUB_NxN_DCT instantiation for sse2 accounts for.
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct_sse2, 3,3
.skip_prologue:
    call .8x4
    add  r0, 64
    add  r1, 4*FENC_STRIDE
    add  r2, 4*FDEC_STRIDE
.8x4:
    SUB_DCT4 2x4x4W
    movhps [r0+32], m0
    movhps [r0+40], m1
    movhps [r0+48], m2
    movhps [r0+56], m3
    ret

Loren Merritt's avatar
Loren Merritt committed
160
;-----------------------------------------------------------------------------
; x264_add8x8_idct_sse2: r0 = destination pixels (FDEC_STRIDE), r1 =
; coefficients (presumably the same prototype as x264_add8x8_idct_mmx --
; confirm against the C declaration).
;
; The 8x8 area is handled as two passes of .8x4. Each pass reads 64 bytes of
; coefficients: the low halves of m0-m3 come from r1+0..24 and the high
; halves from r1+32..56, then ADD_IDCT4 processes the registers and writes
; four destination rows.
;
; Control flow: the first pass is reached with call, so its ret comes back
; here; the pointers are then advanced (64 coefficient bytes, 4 pixel rows)
; and execution falls through into .8x4 again, whose ret returns to the
; caller. On exit r0/r1 are left advanced by 4 rows / 64 bytes.
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_sse2, 2,2
.skip_prologue:
    call .8x4
    add  r1, 64
    add  r0, 4*FDEC_STRIDE
.8x4:
    movq   m0, [r1+ 0]
    movq   m1, [r1+ 8]
    movq   m2, [r1+16]
    movq   m3, [r1+24]
    movhps m0, [r1+32]
    movhps m1, [r1+40]
    movhps m2, [r1+48]
    movhps m3, [r1+56]
    ADD_IDCT4 2x4x4W
    ret
176 177 178 179 180 181 182 183 184

;-----------------------------------------------------------------------------
; void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
;
; SUB_NxN_DCT name, sub, coef_step, size, dx, dy
; Defines function <name> (3 arguments: r0 = coefficients, r1 = pixels with
; FENC_STRIDE, r2 = pixels with FDEC_STRIDE) by running the routine <sub> on
; the four quadrants of a block.
;   %1 name       function to define
;   %2 sub        label of the quadrant routine; called three times, then
;                 reached with jmp so that its own return ends this function
;   %3 coef_step  bytes added to r0 after each of the first three quadrants
;   %4 size       quadrant width/height in pixels
;   %5 dx         bytes subtracted from every pixel-pointer step
;   %6 dy         rows subtracted from every pixel-pointer step
; Ignoring dx/dy, the pixel pointers move +size bytes, then +size rows and
; -size bytes, then +size bytes: top-left, top-right, bottom-left,
; bottom-right. The instantiations pass non-zero dx/dy when <sub> itself
; returns with r1/r2 already advanced.
; The .skip_prologue label lets a larger instantiation use this function as
; its <sub>.
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1, 3,3
.skip_prologue:
    ; quadrant 1, then step right
    call %2
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    ; quadrant 2, then step down and back to the left edge
    call %2
    add  r0, %3
    add  r1, (%4-%6)*FENC_STRIDE-%5-%4
    add  r2, (%4-%6)*FDEC_STRIDE-%5-%4
    ; quadrant 3, then step right
    call %2
    add  r0, %3
    add  r1, %4-%5-%6*FENC_STRIDE
    add  r2, %4-%5-%6*FDEC_STRIDE
    ; quadrant 4 as a tail call
    jmp  %2
%endmacro

;-----------------------------------------------------------------------------
; void x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;
; ADD_NxN_IDCT name, sub, coef_step, size, dx, dy
; Defines function <name> (2 arguments: r0 = destination pixels with
; FDEC_STRIDE, r1 = coefficients) by running the routine <sub> on the four
; quadrants of a block.
;   %1 name       function to define
;   %2 sub        label of the quadrant routine; called three times, then
;                 reached with jmp so that its own return ends this function
;   %3 coef_step  bytes added to r1 after each of the first three quadrants
;   %4 size       quadrant width/height in pixels
;   %5 dx         bytes subtracted from every pixel-pointer step
;   %6 dy         rows subtracted from every pixel-pointer step
; Ignoring dx/dy, the pixel pointer moves +size bytes, then +size rows and
; -size bytes, then +size bytes: top-left, top-right, bottom-left,
; bottom-right. The instantiations pass non-zero dx/dy when <sub> itself
; returns with r0 already advanced.
; The .skip_prologue label lets a larger instantiation use this function as
; its <sub>.
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
cglobal %1, 2,2
.skip_prologue:
    ; quadrant 1, then step right
    call %2
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    ; quadrant 2, then step down and back to the left edge
    call %2
    add  r0, (%4-%6)*FDEC_STRIDE-%5-%4
    add  r1, %3
    ; quadrant 3, then step right
    call %2
    add  r0, %4-%5-%6*FDEC_STRIDE
    add  r1, %3
    ; quadrant 4 as a tail call
    jmp  %2
%endmacro

216
; Instantiations of SUB_NxN_DCT / ADD_NxN_IDCT.
; Argument order: name, quadrant routine, coef_step, size, dx, dy.

; The MMX versions are only assembled when ARCH_X86_64 is not defined.
%ifndef ARCH_X86_64
; 8x8 from four 4x4 blocks: 32 coefficient bytes per block, no correction.
SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx  %+ .skip_prologue, 32, 4, 0, 0
ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
; 16x16 from four 8x8 blocks. The 8x8 routines above return with the
; coefficient pointer advanced by 3*32 bytes and the pixel pointers by
; 4 bytes and 4 rows, hence coef_step 32 and dx = dy = 4.
SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx  %+ .skip_prologue, 32, 8, 4, 4
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4

; 8x8-transform variants: the quadrant routines are defined in another file.
cextern x264_sub8x8_dct8_mmx.skip_prologue
cextern x264_add8x8_idct8_mmx.skip_prologue
SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx  %+ .skip_prologue, 128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
; In this configuration the sse2 dct8 names used further down are redirected
; to the .skip_prologue entry points of the external routines.
%define x264_sub8x8_dct8_sse2 x264_sub8x8_dct8_sse2.skip_prologue
%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
%endif

; 16x16 from four SSE2 8x8 blocks. The SSE2 8x8 routines return with the
; coefficient pointer advanced by 64 bytes and the pixel pointers by 4 rows,
; hence coef_step 64, dx = 0 and dy = 4.
SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2  %+ .skip_prologue, 64, 8, 0, 4
ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4

; 8x8-transform variants for SSE2: the quadrant routines are external.
cextern x264_sub8x8_dct8_sse2
cextern x264_add8x8_idct8_sse2
SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0, 0
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 0
237

Holger Lubitz's avatar
Holger Lubitz committed
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
;
; SCAN_8x8 suffix
; Defines x264_zigzag_scan_8x8_frame_<suffix>; it is instantiated for both
; sse2 and ssse3 right after this macro.
;   r0 = level (64 words written), r1 = dct (64 words read)
; The eight 16-byte rows of dct are read with movdqa, so dct must be 16-byte
; aligned. Both xmm0-xmm7 and mm0-mm7 are used.
; PALIGNR is a macro chosen by the instantiating code (PALIGNR_MMX or
; PALIGNR_SSSE3, defined elsewhere). Here it is always given the same
; register for both sources, so presumably it rotates that register right by
; the given number of bytes -- confirm in x86util.asm.
; Several stores to level overlap (for example words 14-17 and then words
; 17-20), with the later store overwriting part of the earlier one, so the
; order of the store groups must be kept.
;-----------------------------------------------------------------------------
%macro SCAN_8x8 1
cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
    ; rows 0-1: load, PALIGNR row 1, copy the low 8 bytes into mm0/mm1
    movdqa    xmm0, [r1]
    movdqa    xmm1, [r1+16]
    movdq2q    mm0, xmm0
    PALIGNR   xmm1, xmm1, 14, xmm2
    movdq2q    mm1, xmm1

    ; rows 2-3: same pattern with byte counts 12 and 10
    movdqa    xmm2, [r1+32]
    movdqa    xmm3, [r1+48]
    PALIGNR   xmm2, xmm2, 12, xmm4
    movdq2q    mm2, xmm2
    PALIGNR   xmm3, xmm3, 10, xmm4
    movdq2q    mm3, xmm3

    ; interleave the high halves; xmm0/xmm2 are consumed near the end
    punpckhwd xmm0, xmm1
    punpckhwd xmm2, xmm3

    ; MMX shuffle of the low halves of rows 0-3
    movq       mm4, mm1
    movq       mm5, mm1
    movq       mm6, mm2
    movq       mm7, mm3
    punpckhwd  mm1, mm0
    psllq      mm0, 16
    psrlq      mm3, 16
    punpckhdq  mm1, mm1
    punpckhdq  mm2, mm0
    punpcklwd  mm0, mm4
    punpckhwd  mm4, mm3
    punpcklwd  mm4, mm2
    punpckhdq  mm0, mm2
    punpcklwd  mm6, mm3
    punpcklwd  mm5, mm7
    punpcklwd  mm5, mm6

    ; rows 4-7
    movdqa    xmm4, [r1+64]
    movdqa    xmm5, [r1+80]
    movdqa    xmm6, [r1+96]
    movdqa    xmm7, [r1+112]

    ; first store group: words 0-3, 4-7, 8-9, 36-39, 46-49
    movq [r0+2*00], mm0
    movq [r0+2*04], mm4
    movd [r0+2*08], mm1
    movq [r0+2*36], mm5
    movq [r0+2*46], mm6

    ; rows 4-6: PALIGNR by 14, 12, 10 and copy the low 8 bytes to mm4-mm6
    PALIGNR   xmm4, xmm4, 14, xmm3
    movdq2q    mm4, xmm4
    PALIGNR   xmm5, xmm5, 12, xmm3
    movdq2q    mm5, xmm5
    PALIGNR   xmm6, xmm6, 10, xmm3
    movdq2q    mm6, xmm6
    ; row 7: the non-ssse3 build replaces the 8-byte PALIGNR with
    ; movhlps/movlhps. That path puts the old high qword of xmm7 into mm7 and
    ; the old low qword into the high half of xmm7, which is the half read by
    ; the punpckhwd below.
%ifidn %1, ssse3
    PALIGNR   xmm7, xmm7, 8, xmm3
    movdq2q    mm7, xmm7
%else
    movhlps   xmm3, xmm7
    movlhps   xmm7, xmm7
    movdq2q    mm7, xmm3
%endif

    ; interleave the high halves; xmm4/xmm6 are consumed near the end
    punpckhwd xmm4, xmm5
    punpckhwd xmm6, xmm7

    ; MMX shuffle of the low halves of rows 4-7
    movq       mm0, mm4
    movq       mm1, mm5
    movq       mm3, mm7
    punpcklwd  mm7, mm6
    psrlq      mm6, 16
    punpcklwd  mm4, mm6
    punpcklwd  mm5, mm4
    punpckhdq  mm4, mm3
    punpcklwd  mm3, mm6
    punpckhwd  mm3, mm4
    punpckhwd  mm0, mm1
    punpckldq  mm4, mm0
    punpckhdq  mm0, mm6
    pshufw     mm4, mm4, 0x6c

    ; second store group: words 14-17, 25-28, 54-55, 56-59, 60-63
    movq [r0+2*14], mm4
    movq [r0+2*25], mm0
    movd [r0+2*54], mm7
    movq [r0+2*56], mm5
    movq [r0+2*60], mm3

    ; remaining words come from the interleaved high halves in xmm0/2/4/6
    movdqa    xmm3, xmm0
    movdqa    xmm7, xmm4
    punpckldq xmm0, xmm2
    punpckldq xmm4, xmm6
    punpckhdq xmm3, xmm2
    punpckhdq xmm7, xmm6
    ; 0x1b reverses the four words of the selected half
    pshufhw   xmm0, xmm0, 0x1b
    pshuflw   xmm4, xmm4, 0x1b
    pshufhw   xmm3, xmm3, 0x1b
    pshuflw   xmm7, xmm7, 0x1b

    ; last store group, 8 bytes each; must follow the groups above because
    ; these ranges overlap words written earlier
    movlps [r0+2*10], xmm0
    movhps [r0+2*17], xmm0
    movlps [r0+2*21], xmm3
    movlps [r0+2*28], xmm4
    movhps [r0+2*32], xmm3
    movhps [r0+2*39], xmm4
    movlps [r0+2*43], xmm7
    movhps [r0+2*50], xmm7

    RET
%endmacro

; Emit x264_zigzag_scan_8x8_frame_sse2 and x264_zigzag_scan_8x8_frame_ssse3.
; PALIGNR is redefined before each expansion so that the macro body picks up
; the implementation matching the instruction set in the function name.
INIT_XMM
%define PALIGNR PALIGNR_MMX
SCAN_8x8 sse2
%define PALIGNR PALIGNR_SSSE3
SCAN_8x8 ssse3

;-----------------------------------------------------------------------------
; void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] )
;
; MMX-register-only variant of the 8x8 frame scan.
;   r0 = level (64 words written), r1 = dct (64 words read)
; All accesses are 8-byte movq loads/stores at word offsets (written as
; 2*index), many of which are not multiples of 4 words. Loads are
; interleaved with the shuffles and stores, and all of mm0-mm7 are reused
; several times.
; Several stores to level overlap (for example words 8-11 are stored first
; and words 10-13 later), with the later store overwriting part of the
; earlier one, so the order of the stores must be kept.
; pshufw with 0x1b reverses the four words of a register.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_8x8_frame_mmxext, 2,2
    ; first batch of source words: 0-3, 8-11, 14-17, 21-24, 28-31
    movq       mm0, [r1]
    movq       mm1, [r1+2*8]
    movq       mm2, [r1+2*14]
    movq       mm3, [r1+2*21]
    movq       mm4, [r1+2*28]
    movq       mm5, mm0
    movq       mm6, mm1
    psrlq      mm0, 16
    punpckldq  mm1, mm1
    punpcklwd  mm5, mm6
    punpckhwd  mm1, mm3
    punpckhwd  mm6, mm0
    punpckldq  mm5, mm0
    ; source words 52-55 and 60-63
    movq       mm7, [r1+2*52]
    movq       mm0, [r1+2*60]
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm4
    punpckhwd  mm4, mm3
    punpckldq  mm3, mm3
    punpckhwd  mm3, mm2
    ; output words 0-3, 4-7, 8-11
    movq      [r0], mm5
    movq  [r0+2*4], mm1
    movq  [r0+2*8], mm6
    punpcklwd  mm6, mm0
    punpcklwd  mm6, mm7
    ; source words 32-35, 39-42, 46-49
    movq       mm1, [r1+2*32]
    movq       mm5, [r1+2*39]
    movq       mm2, [r1+2*46]
    ; output words 35-38, 47-50
    movq [r0+2*35], mm3
    movq [r0+2*47], mm4
    punpckhwd  mm7, mm0
    psllq      mm0, 16
    movq       mm3, mm5
    punpcklwd  mm5, mm1
    punpckhwd  mm1, mm2
    punpckhdq  mm3, mm3
    ; output words 52-55, 13-16
    movq [r0+2*52], mm6
    movq [r0+2*13], mm5
    ; source words 11-14, 25-28
    movq       mm4, [r1+2*11]
    movq       mm6, [r1+2*25]
    punpcklwd  mm5, mm7
    punpcklwd  mm1, mm3
    punpckhdq  mm0, mm7
    ; source words 4-7, 18-21
    movq       mm3, [r1+2*4]
    movq       mm7, [r1+2*18]
    punpcklwd  mm2, mm5
    ; output words 25-28
    movq [r0+2*25], mm1
    movq       mm1, mm4
    movq       mm5, mm6
    punpcklwd  mm4, mm3
    punpcklwd  mm6, mm7
    punpckhwd  mm1, mm3
    punpckhwd  mm5, mm7
    movq       mm3, mm6
    movq       mm7, mm5
    punpckldq  mm6, mm4
    punpckldq  mm5, mm1
    punpckhdq  mm3, mm4
    punpckhdq  mm7, mm1
    ; source words 35-38, 49-52
    movq       mm4, [r1+2*35]
    movq       mm1, [r1+2*49]
    pshufw     mm6, mm6, 0x1b
    pshufw     mm5, mm5, 0x1b
    ; output words 60-63, 56-59
    movq [r0+2*60], mm0
    movq [r0+2*56], mm2
    ; source words 42-45, 56-59
    movq       mm0, [r1+2*42]
    movq       mm2, [r1+2*56]
    ; output words 17-20, 32-35, 10-13, 21-24 (these overlap earlier stores)
    movq [r0+2*17], mm3
    movq [r0+2*32], mm7
    movq [r0+2*10], mm6
    movq [r0+2*21], mm5
    movq       mm3, mm0
    movq       mm7, mm2
    punpcklwd  mm0, mm4
    punpcklwd  mm2, mm1
    punpckhwd  mm3, mm4
    punpckhwd  mm7, mm1
    movq       mm4, mm2
    movq       mm1, mm7
    punpckhdq  mm2, mm0
    punpckhdq  mm7, mm3
    punpckldq  mm4, mm0
    punpckldq  mm1, mm3
    pshufw     mm2, mm2, 0x1b
    pshufw     mm7, mm7, 0x1b
    ; output words 28-31, 43-46, 39-42, 50-53 (these overlap earlier stores)
    movq [r0+2*28], mm4
    movq [r0+2*43], mm1
    movq [r0+2*39], mm2
    movq [r0+2*50], mm7
    RET

;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_frame_mmx( int16_t level[16], int16_t dct[4][4] )
;
; Reorders the 16 coefficients of a 4x4 block using MMX shifts and unpacks.
;   r0 = level (32 bytes written), r1 = dct (32 bytes read)
; All four rows are loaded before anything is stored, and the four output
; qwords are written once each at the end. Uses mm0-mm7.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_frame_mmx, 2,2
    ; mm0-mm3 = the four rows, mm4-mm7 = working copies of them
    movq       mm0, [r1]
    movq       mm1, [r1+8]
    movq       mm2, [r1+16]
    movq       mm3, [r1+24]
    movq       mm4, mm0
    movq       mm5, mm1
    movq       mm6, mm2
    movq       mm7, mm3
    ; 16-bit shifts move the words of rows 3 and 0 by one position
    psllq      mm3, 16
    psrlq      mm0, 16
    punpckldq  mm2, mm2
    punpckhdq  mm1, mm1
    punpcklwd  mm4, mm5
    punpcklwd  mm5, mm3
    punpckldq  mm4, mm0
    punpckhwd  mm5, mm2
    punpckhwd  mm0, mm6
    punpckhwd  mm6, mm7
    punpcklwd  mm1, mm0
    punpckhdq  mm3, mm6
    ; output words 0-3, 4-7, 8-11, 12-15
    movq      [r0], mm4
    movq    [r0+8], mm5
    movq   [r0+16], mm1
    movq   [r0+24], mm3
    RET
479

Holger Lubitz's avatar
Holger Lubitz committed
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[4][4] )
;
; Reorders the 16 coefficients of a 4x4 block with two pshufb lookups
; followed by byte shifts and palignr to mix the two halves.
;   r0 = level (32 bytes written), r1 = dct (32 bytes read)
; Loads and stores use movdqa, so both pointers must be 16-byte aligned.
; Uses xmm0-xmm2 and the pb_scan4framea / pb_scan4frameb masks.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_frame_ssse3, 2,2
    ; xmm0 = first 8 coefficients, xmm1 = last 8 coefficients
    movdqa    xmm1, [r1+16]
    movdqa    xmm0, [r1]
    ; pre-arrange the bytes of each half
    pshufb    xmm1, [pb_scan4frameb GLOBAL]
    pshufb    xmm0, [pb_scan4framea GLOBAL]
    movdqa    xmm2, xmm1
    psrldq    xmm1, 6
    ; xmm2 = top 10 bytes of xmm0 followed by the low 6 bytes of xmm1
    palignr   xmm2, xmm0, 6
    pslldq    xmm0, 10
    ; xmm1 = low 6 bytes of the original xmm0 followed by bytes 6-15 of the
    ; shuffled second half
    palignr   xmm1, xmm0, 10
    movdqa    [r0], xmm2
    movdqa [r0+16], xmm1
    RET
496

497
;-----------------------------------------------------------------------------
; void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] )
;
; Copies the 16 coefficients from dct to level, permuting only words 2-5:
;   level[0..7]  = dct[0], dct[1], dct[4], dct[2], dct[3], dct[5], dct[6], dct[7]
;   level[8..15] = dct[8..15] unchanged
;   r0 = level (32 bytes written), r1 = dct (32 bytes read)
;   r2 is a 32-bit scratch register; mm0-mm2 are used.
;-----------------------------------------------------------------------------
; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
    ; words 2-5 are loaded as one qword; 0xd2 selects source words 2,0,1,3,
    ; giving dct[4], dct[2], dct[3], dct[5]
    pshufw     mm0, [r1+4], 0xd2
    ; words 8-15 are copied straight through
    movq       mm1, [r1+16]
    movq       mm2, [r1+24]
    movq    [r0+4], mm0
    movq   [r0+16], mm1
    movq   [r0+24], mm2
    ; words 0-1 and 6-7 are copied as 32-bit pairs through r2
    mov        r2d, [r1]
    mov       [r0], r2d
    mov        r2d, [r1+12]
    mov    [r0+12], r2d
    RET

Loren Merritt's avatar
Loren Merritt committed
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;
; For a 4x4 block of bytes:
;   1. reads four 4-byte rows from src (FENC_STRIDE apart) and from dst
;      (FDEC_STRIDE apart),
;   2. overwrites the four dst rows with the src rows,
;   3. reorders both sets of 16 bytes with the pb_sub4frame mask,
;   4. widens them to 16 bits and stores src - old dst to level.
;   r0 = level (32 bytes written with movdqa, so it must be 16-byte aligned)
;   r1 = src, r2 = dst
; Uses xmm0-xmm7.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
    ; xmm0-xmm3 = src rows, xmm4-xmm7 = dst rows (read before dst is changed)
    movd      xmm0, [r1+0*FENC_STRIDE]
    movd      xmm1, [r1+1*FENC_STRIDE]
    movd      xmm2, [r1+2*FENC_STRIDE]
    movd      xmm3, [r1+3*FENC_STRIDE]
    movd      xmm4, [r2+0*FDEC_STRIDE]
    movd      xmm5, [r2+1*FDEC_STRIDE]
    movd      xmm6, [r2+2*FDEC_STRIDE]
    movd      xmm7, [r2+3*FDEC_STRIDE]
    ; copy the src rows into dst
    movd      [r2+0*FDEC_STRIDE], xmm0
    movd      [r2+1*FDEC_STRIDE], xmm1
    movd      [r2+2*FDEC_STRIDE], xmm2
    movd      [r2+3*FDEC_STRIDE], xmm3
    ; pack the four rows of each block into one register:
    ; xmm0 = 16 src bytes, xmm4 = 16 old dst bytes
    punpckldq xmm0, xmm1
    punpckldq xmm2, xmm3
    punpckldq xmm4, xmm5
    punpckldq xmm6, xmm7
    movlhps   xmm0, xmm2
    movlhps   xmm4, xmm6
    ; reorder both blocks with the same byte permutation
    movdqa    xmm7, [pb_sub4frame GLOBAL]
    pshufb    xmm0, xmm7
    pshufb    xmm4, xmm7
    ; zero-extend bytes to words: low 8 in xmm0/xmm4, high 8 in xmm1/xmm5
    pxor      xmm6, xmm6
    movdqa    xmm1, xmm0
    movdqa    xmm5, xmm4
    punpcklbw xmm0, xmm6
    punpckhbw xmm1, xmm6
    punpcklbw xmm4, xmm6
    punpckhbw xmm5, xmm6
    ; level = src - old dst
    psubw     xmm0, xmm4
    psubw     xmm1, xmm5
    movdqa    [r0], xmm0
    movdqa [r0+16], xmm1
    RET
551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567

INIT_MMX
;-----------------------------------------------------------------------------
; x264_zigzag_interleave_8x8_cavlc_mmx: r0 = destination, r1 = source.
; Each iteration reads 32 bytes (16 words) from the source at r1 + 4*r2,
; transposes them as a 4x4 word matrix and writes the four resulting 8-byte
; rows to r0 + r2 + {0, 32, 64, 96}.
; r2 counts down 24, 16, 8, 0, so 128 source bytes are consumed in total.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_interleave_8x8_cavlc_mmx, 2,3
    mov    r2d, 24
.next_group:
    ; the four loads are independent of each other
    movq   m3, [r1+r2*4+24]
    movq   m2, [r1+r2*4+16]
    movq   m1, [r1+r2*4+ 8]
    movq   m0, [r1+r2*4+ 0]
    TRANSPOSE4x4W 0,1,2,3,4
    ; the four stores go to disjoint 8-byte slots
    movq   [r0+r2+96], m3
    movq   [r0+r2+64], m2
    movq   [r0+r2+32], m1
    movq   [r0+r2+ 0], m0
    ; the loop continues while r2 is still >= 0 after the decrement
    sub    r2d, 8
    jge .next_group
    REP_RET