; mc-a2.asm (10.1 KB) -- committer shown by the source viewer: Loren Merritt
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata

ALIGN 16
; NOTE: the label names do not describe the stored values literally.
; mmx_dw_one: four words of 16; added before the "psraw 5" / ">> 5" steps
;             in this file (i.e. the rounding term for a 5-bit shift).
mmx_dw_one:
    times 4 dw 16
; mmx_dd_one: two dwords of 512; added before "psrad 10" in
;             x264_center_filter_mmxext (rounding term for a 10-bit shift).
mmx_dd_one:
    times 2 dd 512
; mmx_dw_20: four words of 20 (pmaddwd multiplier in the center filter).
mmx_dw_20:
    times 4 dw 20
; mmx_dw_5: four words of -5 (note the sign; pmaddwd multiplier in the
;           center filter).
mmx_dw_5:
    times 4 dw -5

; Byte offset from rsp of the temporary row buffer that
; x264_center_filter_mmxext allocates on the stack.
%assign tbuffer 0

;=============================================================================
; Macros
;=============================================================================

; LOAD_4 dst0, dst1, dst2, dst3, src0, src1, src2, src3, unpack
;   For each of the four (dst, src) pairs: movd 32 bits from src into dst,
;   then interleave the low bytes of dst with the low bytes of `unpack`
;   (punpcklbw).  Every caller in this file passes a zeroed mm0 as `unpack`,
;   which turns the four loaded bytes into four zero-extended words.
;   The four dst registers are expected to be distinct from each other and
;   from `unpack`.
%macro LOAD_4 9
    movd        %1, %5
    punpcklbw   %1, %9
    movd        %2, %6
    punpcklbw   %2, %9
    movd        %3, %7
    punpcklbw   %3, %9
    movd        %4, %8
    punpcklbw   %4, %9
%endmacro

; FILT_2 acc, x
;   Per 16-bit word (wrapping arithmetic): acc = acc - 5 * x.
;   Implemented as acc -= x, then x <<= 2, then acc -= x.
;   Side effect: x is left holding 4 * (its original value).
%macro FILT_2 2
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2
%endmacro

; FILT_4 acc, a, b
;   Per 16-bit word (wrapping arithmetic): acc = acc + 20 * (a + b).
;   Implemented as s = a + b; acc += 4 * s; acc += 16 * s.
;   Side effect: a is left holding 16 * (a + b); b is not modified.
%macro FILT_4 3
    paddw %2, %3
    psllw %2, 2
    paddw %1, %2
    psllw %2, 2
    paddw %1, %2
%endmacro

; FILT_6 acc, x, y, rnd
;   Per 16-bit word: acc = (acc - 5 * x + y + rnd) >> 5, using wrapping
;   adds/subtracts and an arithmetic (sign-preserving) right shift.
;   Side effect: x is left holding 4 * (its original value); y and rnd are
;   not modified.
%macro FILT_6 4
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2
    paddw %1, %3
    paddw %1, %4
    psraw %1, 5
%endmacro

; FILT_ALL base
;   6-tap filter down a column of rows, 4 pixels wide, without rounding or
;   shifting.  With rN = the 4 bytes at [base + N * rcx], widened to words:
;
;       mm1 = r0 - 5*r1 + 20*r2 + 20*r3 - 5*r4 + r5     (16-bit, wrapping)
;
;   Implicit inputs (not macro arguments):
;       rcx = row stride
;       rbx = address offset of row 3 (the caller in this file sets 3 * stride)
;       rdx = address offset of row 5 (the caller in this file sets 5 * stride)
;       mm0 = zero (used to unpack bytes to words)
;   Output:   mm1
;   Clobbers: mm2, mm3, mm4, mm5, mm6
%macro FILT_ALL 1
    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
    FILT_2      mm1, mm2
    movd        mm5, [%1 + 4 * rcx]
    movd        mm6, [%1 + rdx]
    FILT_4      mm1, mm3, mm4
    punpcklbw   mm5, mm0
    punpcklbw   mm6, mm0
    ; same as FILT_6 without the rounding add and the final shift
    psubw       mm1, mm5
    psllw       mm5, 2
    psubw       mm1, mm5
    paddw       mm1, mm6
%endmacro




;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext

;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
;                                 uint8_t *dst2, int i_dst2_stride,
;                                  uint8_t *src, int i_src_stride,
;                                  int i_width, int i_height );
;
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; For every row this routine
;   1. runs FILT_ALL (6 rows starting at src - 2 * src_stride) across the
;      row, 4 pixels at a time, storing the unrounded 16-bit results in a
;      temporary buffer on the stack and writing (v + 16) >> 5, saturated to
;      bytes, to dst1;
;   2. runs the 1,-5,20,20,-5,1 filter horizontally over that 16-bit buffer
;      in 32-bit precision and writes (v + 512) >> 10, saturated to bytes,
;      to dst2.
;
; Register roles after setup:
;   r12 = tsrc (src - 2 * src_stride, advanced one row per iteration)
;   r13 = src_stride      rcx/rbx/rdx = 1x/3x/5x src_stride (for FILT_ALL)
;   r8  = dst1 row        r9  = dst1_stride
;   r10 = dst2 row        r11 = dst2_stride
;   r14 = width           r15 = remaining height
;   rax = column offset   rsi = column source pointer   rdi = dst1 row - 4
;   mm0 = zero
;
; Preconditions implied by the loop exits (all are "jnz" on equality):
;   width is a multiple of 4 and at least 12; height is at least 1.
; The stack buffer is 2 * src_stride + 24 bytes; the code writes
; 2 * width + 16 bytes into it.
; MMX state is left dirty (no emms is executed here).
;-----------------------------------------------------------------------------
ALIGN 16
x264_center_filter_mmxext :

    push        r15
    pushreg     r15
%ifdef WIN64
    push        rdi
    pushreg     rdi
    push        rsi
    pushreg     rsi
%endif

    push        rbp
    pushreg     rbp
    push        rbx
    pushreg     rbx
    push        r12
    pushreg     r12
    push        r13
    pushreg     r13
    push        r14
    pushreg     r14
    lea         rbp,    [rsp]
    setframe    rbp, 0
    endprolog

    ; WIN64: 8 registers were pushed (64 bytes); the stack arguments sit
    ; above them.  rsp still equals rbp at this point.
%ifdef WIN64
    movsxd      r13,    dword [rsp+64+48]   ; src_stride
    mov         r12,    [rsp+64+40]         ; src
%else
    movsxd      r13,    r9d                 ; src_stride
    mov         r12,    r8                  ; src
%endif
    sub         r12,    r13
    sub         r12,    r13                 ; tsrc = src - 2 * src_stride

    ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
    lea         rax,    [r13 + r13 + 24 + tbuffer]
    sub         rsp,    rax

    ; r8/r9 are overwritten only after src/src_stride were copied out above
    ; (non-WIN64) and, under WIN64, after parm3q/parm4d have been read.
    mov         r10,    parm3q                 ; dst2
    movsxd      r11,    parm4d                 ; dst2_stride
    mov         r8,     parm1q                 ; dst1
    movsxd      r9,     parm2d                 ; dst1_stride
%ifdef WIN64
    movsxd      r14,    dword [rbp + 64 + 56]  ; width
    movsxd      r15,    dword [rbp + 64 + 64]  ; height
%else
    ; 6 registers were pushed (48 bytes) + return address
    movsxd      r14,    dword [rbp + 56]    ; width
    movsxd      r15,    dword [rbp + 64]    ; height
%endif

    ; row offsets consumed implicitly by FILT_ALL
    mov         rcx,    r13                 ; src_stride
    lea         rbx,    [r13 + r13 * 2]     ; 3 * src_stride
    lea         rdx,    [r13 + r13 * 4]     ; 5 * src_stride

    pxor        mm0,    mm0                 ; 0 ---> mm0

.loopcy:

    xor         eax,    eax
    mov         rsi,    r12             ; tsrc

    ; ---- pass 1, first 4 columns ------------------------------------------
    FILT_ALL    rsi

    ; left padding of the temp row: 4 copies of the column-0 result
    pshufw      mm2,    mm1, 0
    movq        [rsp + tbuffer],  mm2
    movq        [rsp + tbuffer + 8],  mm1
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5

    packuswb    mm1,    mm1
    movd        [r8],   mm1             ; dst1[0] = mm1

    ; rax runs 4 ahead of the dst1 column (it counts the padding words),
    ; hence rdi = dst1 - 4
    add         rax,    8
    add         rsi,    4
    lea         rdi,    [r8 - 4]        ; rdi = dst1 - 4

    ; ---- pass 1, middle columns -------------------------------------------
.loopcx1:

    FILT_ALL    rsi

    movq        [rsp + tbuffer + 2 * rax],  mm1
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [rdi + rax],  mm1   ; dst1[rax - 4] = mm1

    add         rsi,    4
    add         rax,    4
    cmp         rax,    r14         ; cmp rax, width
    jnz         .loopcx1

    ; ---- pass 1, last 4 columns + right padding ---------------------------
    FILT_ALL    rsi

    ; NOTE(review): immediate 7 selects source words 3,1,0,0, so only the
    ; first padding word replicates the last column; .loopcx2 reads the
    ; first three padding words.  If plain edge replication was intended
    ; the immediate would be 0xFF -- confirm before changing.
    pshufw      mm2,    mm1,  7
    movq        [rsp + tbuffer + 2 * rax],  mm1
    movq        [rsp + tbuffer + 2 * rax + 8],  mm2
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [rdi + rax],  mm1   ; dst1[rax - 4] = mm1

    add         r12,    r13         ; tsrc = tsrc + src_stride

    add         r8,     r9          ; dst1 = dst1 + dst1_stride

    xor         eax,    eax

    ; ---- pass 2: horizontal 6-tap over the 16-bit temp row ----------------
.loopcx2:

    ; six word-aligned windows, one word apart; "+ 4" skips the first two
    ; padding words so that the window at offset 0 is column x - 2
    movq        mm2,    [rsp + 2 * rax + 2  + 4 + tbuffer]
    movq        mm3,    [rsp + 2 * rax + 4  + 4 + tbuffer]
    movq        mm4,    [rsp + 2 * rax + 6  + 4 + tbuffer]
    movq        mm5,    [rsp + 2 * rax + 8  + 4 + tbuffer]
    movq        mm1,    [rsp + 2 * rax      + 4 + tbuffer]
    movq        mm6,    [rsp + 2 * rax + 10 + 4 + tbuffer]
    paddw       mm2,    mm5         ; mm2 = t[x-1] + t[x+2]  (weight -5)
    paddw       mm3,    mm4         ; mm3 = t[x]   + t[x+1]  (weight 20)
    paddw       mm1,    mm6         ; mm1 = t[x-2] + t[x+3]  (weight 1)

    movq        mm5,    [mmx_dw_20 GLOBAL]
    movq        mm4,    [mmx_dw_5 GLOBAL]
    movq        mm6,    mm1
    pxor        mm7,    mm7

    ; pair each mm2 word with 20 / each mm3 word with -5 (and vice versa)
    ; so that pmaddwd yields 20 * mm3 - 5 * mm2 as 32-bit values
    punpckhwd   mm5,    mm2
    punpcklwd   mm4,    mm3
    punpcklwd   mm2,    [mmx_dw_20 GLOBAL]
    punpckhwd   mm3,    [mmx_dw_5 GLOBAL]

    pcmpgtw     mm7,    mm1         ; mm7 = sign mask of mm1 (0 > mm1)

    pmaddwd     mm2,    mm4         ; low two columns
    pmaddwd     mm3,    mm5         ; high two columns

    ; sign-extend the weight-1 term to dwords
    punpcklwd   mm1,    mm7
    punpckhwd   mm6,    mm7

    paddd       mm2,    mm1
    paddd       mm3,    mm6

    paddd       mm2,    [mmx_dd_one GLOBAL]
    paddd       mm3,    [mmx_dd_one GLOBAL]

    psrad       mm2,    10
    psrad       mm3,    10

    packssdw    mm2,    mm3
    packuswb    mm2,    mm0

    movd        [r10 + rax], mm2    ; dst2[rax] = mm2

    add         rax,    4
    cmp         rax,    r14         ; cmp rax, width
    jnz         .loopcx2

    add         r10,    r11         ; dst2 += dst2_stride
    dec         r15                 ; height
    jnz         .loopcy

    ; release the temp buffer
    lea         rsp,    [rbp]

    pop         r14
    pop         r13
    pop         r12
    pop         rbx
    pop         rbp
%ifdef WIN64
    pop         rsi
    pop         rdi
%endif
    pop         r15

    ret

;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
;                                     uint8_t *src, int i_src_stride,
;                                     int i_width, int i_height );
;
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; Horizontal 6-tap filter (1, -5, 20, 20, -5, 1) over src[x-2 .. x+3],
; result (v + 16) >> 5 saturated to bytes, 8 output pixels per iteration.
;
; Register roles after setup:
;   rdx = src - 2 (current row)   r11 = src_stride
;   r9  = dst (current row)       r10 = dst_stride
;   r8  = width                   rcx = remaining height
;   rax = column offset
;   mm0 = zero                    mm7 = four words of 16 (rounding term)
;
; Preconditions implied by the loop exits ("jnz" on equality):
;   width is a non-zero multiple of 8; height is at least 1.
; Each iteration reads source bytes up to offset x + 10 (relative to src)
; and prefetches further ahead.
; Leaf function: no callee-saved registers are used and the stack is not
; touched.  MMX state is left dirty (no emms is executed here).
;-----------------------------------------------------------------------------
ALIGN 16
x264_horizontal_filter_mmxext :
    ; The parmN macros come from amd64inc.asm.  Each argument register is
    ; read before it is reused below; keep this ordering.
    movsxd      r10,    parm2d               ; dst_stride
    movsxd      r11,    parm4d               ; src_stride
%ifdef WIN64
    mov         rdx,    r8                   ; src
    mov         r9,     rcx                  ; dst
    movsxd      rcx,    parm6d               ; height
%else
    movsxd      rcx,    parm6d               ; height
    mov         r9,     rdi                  ; dst
%endif

    movsxd      r8,     parm5d               ; width

    pxor        mm0,    mm0
    movq        mm7,    [mmx_dw_one GLOBAL]

    sub         rdx,    2                    ; first tap is at src[x - 2]

.loophy:

    xor         eax,    eax

.loophx:

    prefetchnta [rdx + rax + 48]

    ; ---- output pixels x .. x+3 (accumulated in mm1) ----------------------
    LOAD_4      mm1,    mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
    FILT_2      mm1,    mm2
    movd        mm5,    [rdx + rax + 4]
    movd        mm6,    [rdx + rax + 5]
    FILT_4      mm1,    mm3, mm4
    ; start loading taps for the second group while the first finishes
    movd        mm2,    [rdx + rax + 4]
    movd        mm3,    [rdx + rax + 6]
    punpcklbw   mm5,    mm0
    punpcklbw   mm6,    mm0
    FILT_6      mm1,    mm5, mm6, mm7
    ; ---- output pixels x+4 .. x+7 (accumulated in mm2) --------------------
    ; mm6 (offset 5) is reused as the -5 tap of the second group
    movd        mm4,    [rdx + rax + 7]
    movd        mm5,    [rdx + rax + 8]
    punpcklbw   mm2,    mm0
    punpcklbw   mm3,    mm0                  ; mm2(1), mm3(20), mm6(-5) ready
    FILT_2      mm2,    mm6
    movd        mm6,    [rdx + rax + 9]
    punpcklbw   mm4,    mm0
    punpcklbw   mm5,    mm0                  ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
    FILT_4      mm2,    mm3, mm4
    punpcklbw   mm6,    mm0
    FILT_6      mm2,    mm5, mm6, mm7

    packuswb    mm1,    mm2                  ; 8 words -> 8 saturated bytes
    movq        [r9 + rax],  mm1

    add         rax,    8
    cmp         rax,    r8                   ; cmp rax, width
    jnz         .loophx

    add         rdx,    r11                  ; src_pitch
    add         r9,     r10                  ; dst_pitch

    dec         rcx
    jnz         .loophy

    ret