VideoLAN / x264

Commit c17218e8, authored Mar 16, 2008 by Loren Merritt

    merge x86_32 and x86_64 asm, with macros to abstract calling convention and register names

Parent: 3445cca4
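Before this commit every asm function existed twice: once against the x86_32 stack calling convention in common/i386/, once against the x86_64 register convention in common/amd64/. The merged sources in common/x86/ assemble for either target, with x86inc.asm plus x86inc-32.asm / x86inc-64.asm mapping argument and register names per architecture (selected by the -DARCH_X86_64 flag visible in the Makefile hunk below). A rough sketch of the idea in C preprocessor terms; the names here are invented for illustration, not the actual x86inc.asm macros:

    /* Conceptual sketch only: the real abstraction is written as NASM
     * %define/%macro directives. ARG1 and PTR_SIZE are hypothetical. */
    #include <stdio.h>

    #ifdef ARCH_X86_64
    #define ARG1     "rdi"      /* first argument arrives in a register */
    #define PTR_SIZE 8
    #else
    #define ARG1     "[esp+4]"  /* first argument arrives on the stack */
    #define PTR_SIZE 4
    #endif

    int main(void)
    {
        /* One "template" expands differently per target, which is what
         * lets a single .asm source serve both ABIs. */
        printf("mov reg, %s  ; %d-byte pointers\n", ARG1, PTR_SIZE);
        return 0;
    }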
Changes: 44 files
.gitignore

@@ -13,6 +13,7 @@
config.h
config.mak
x264
checkasm
gtk/test
gtk/x264_gtk_encode
gtk/x264_icon.h
Makefile

@@ -2,6 +2,8 @@
include config.mak

all: default

SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
       common/frame.c common/dct.c common/cpu.c common/cabac.c \
       common/common.c common/mdate.c common/set.c \

@@ -18,30 +20,26 @@ SRCS += common/visualize.c common/display-x11.c
endif

# MMX/SSE optims
ifeq ($(ARCH),X86)
ifneq ($(AS),)
SRCS   += common/i386/mc-c.c common/i386/predict-c.c
ASMSRC  = common/i386/dct-a.asm common/i386/cpu-a.asm \
          common/i386/pixel-a.asm common/i386/mc-a.asm \
          common/i386/mc-a2.asm common/i386/predict-a.asm \
          common/i386/pixel-sse2.asm common/i386/quant-a.asm \
          common/i386/deblock-a.asm
X86SRC0 = dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm \
          pixel-a.asm predict-a.asm quant-a.asm sad-a.asm \
          cpu-32.asm dct-32.asm
X86SRC  = $(X86SRC0:%=common/x86/%)
ifeq ($(ARCH),X86)
SRCS   += common/x86/mc-c.c common/x86/predict-c.c
ASMSRC  = $(X86SRC) common/x86/pixel-32.asm
OBJASM  = $(ASMSRC:%.asm=%.o)
ASFLAGS += -Icommon/i386/
endif
ASFLAGS += -Icommon/x86/
$(OBJASM): common/x86/x86inc.asm common/x86/x86inc-32.asm
endif

# MMX/SSE optims
ifeq ($(ARCH),X86_64)
ifneq ($(AS),)
SRCS   += common/i386/mc-c.c common/i386/predict-c.c
ASMSRC  = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
          common/amd64/pixel-a.asm common/amd64/mc-a.asm \
          common/amd64/mc-a2.asm common/amd64/predict-a.asm \
          common/amd64/pixel-sse2.asm common/amd64/quant-a.asm \
          common/amd64/deblock-a.asm
SRCS   += common/x86/mc-c.c common/x86/predict-c.c
ASMSRC  = $(X86SRC:-32.asm=-64.asm)
OBJASM  = $(ASMSRC:%.asm=%.o)
ASFLAGS += -Icommon/amd64
ASFLAGS += -Icommon/x86/ -DARCH_X86_64
$(OBJASM): common/x86/x86inc.asm common/x86/x86inc-64.asm
endif
endif

@@ -69,7 +67,6 @@ OBJCLI = $(SRCCLI:%.c=%.o)
DEP  = depend

.PHONY: all default fprofiled clean distclean install install-gtk uninstall dox test testclean

all: default
default: $(DEP) x264$(EXE)

@@ -89,8 +86,6 @@ libx264gtk.a: muxers.o libx264.a
checkasm: tools/checkasm.o libx264.a
	$(CC) -o $@ $+ $(LDFLAGS)

common/amd64/*.o: common/amd64/amd64inc.asm
common/i386/*.o: common/i386/i386inc.asm
%.o: %.asm
	$(AS) $(ASFLAGS) -o $@ $<

# delete local/anonymous symbols, so they don't show up in oprofile
common/amd64/dct-a.asm
deleted file mode 100644
;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;* Min Chen <chenm001.163.com> (converted to nasm)
;* Loren Merritt <lorenm@u.washington.edu> (dct8)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2004.04.28 portab all 4x4 function to nasm (CM) *
;* *
;*****************************************************************************
BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "amd64inc.asm"
%macro MMX_ZERO 1
    pxor    %1, %1
%endmacro

%macro MMX_LOAD_DIFF_4P 5
    movd        %1, %4
    punpcklbw   %1, %3
    movd        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro

%macro MMX_LOAD_DIFF_8P 5
    movq        %1, %4
    punpcklbw   %1, %3
    movq        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro
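The two LOAD_DIFF macros compute one row of the residual: %4 and %5 address a row of the encoded frame and of the prediction, and %3 is a zeroed register that punpcklbw uses to widen the bytes to 16 bits before psubw. A scalar model (the 8P variant is identical over 8 pixels):

    #include <stdint.h>

    /* Scalar model of MMX_LOAD_DIFF_4P: widen two rows of pixels to
     * 16 bits and subtract, producing one row of the residual. */
    static void load_diff_4p(int16_t d[4], const uint8_t *fenc, const uint8_t *fdec)
    {
        for (int i = 0; i < 4; i++)
            d[i] = (int16_t)fenc[i] - (int16_t)fdec[i];
    }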
%macro MMX_SUMSUB_BA 2
    paddw   %1, %2
    paddw   %2, %2
    psubw   %2, %1
%endmacro

%macro MMX_SUMSUB_BADC 4
    paddw   %1, %2
    paddw   %3, %4
    paddw   %2, %2
    paddw   %4, %4
    psubw   %2, %1
    psubw   %4, %3
%endmacro

%macro MMX_SUMSUB2_AB 3
    movq    %3, %1
    paddw   %1, %1
    paddw   %1, %2
    psubw   %3, %2
    psubw   %3, %2
%endmacro

%macro MMX_SUMSUBD2_AB 4
    movq    %4, %1
    movq    %3, %2
    psraw   %2, 1
    psraw   %4, 1
    paddw   %1, %2
    psubw   %4, %3
%endmacro
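These sum/difference helpers are the butterflies the transforms below are built from. MMX_SUMSUB_BA manages without a scratch register: after a += b the old a is still recoverable from b, since 2b - (a+b) = b - a. In C terms:

    #include <stdint.h>

    /* MMX_SUMSUB_BA: a' = a+b, b' = b-a, with no temporary register in
     * the asm (paddw %2,%2 then psubw %2,%1 computes 2b-(a+b) = b-a). */
    static void sumsub_ba(int16_t *a, int16_t *b)
    {
        int16_t t = *a;
        *a = (int16_t)(*a + *b);
        *b = (int16_t)(*b - t);
    }

    /* MMX_SUMSUBD2_AB, used by the 4x4 IDCT:
     * a' = a + (b>>1), and d = (a>>1) - b lands in a scratch register. */
    static void sumsubd2_ab(int16_t *a, int16_t *b, int16_t *d)
    {
        int16_t a0 = *a, b0 = *b;
        *a = (int16_t)(a0 + (b0 >> 1));
        *d = (int16_t)((a0 >> 1) - b0);
    }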
%macro SBUTTERFLY 5
    mov%1       %5, %3
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro

;-----------------------------------------------------------------------------
; input ABCD output ADTC
;-----------------------------------------------------------------------------
%macro MMX_TRANSPOSE 5
    SBUTTERFLY q, wd, %1, %2, %5
    SBUTTERFLY q, wd, %3, %4, %2
    SBUTTERFLY q, dq, %1, %3, %4
    SBUTTERFLY q, dq, %5, %2, %3
%endmacro
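SBUTTERFLY is a punpckl/punpckh pair: it interleaves two registers elementwise, the low half landing in %3 and the high half in %5 (via the mov%1 copy). MMX_TRANSPOSE then transposes a 4x4 block of words in two interleave rounds, first at 16-bit then at 32-bit granularity; the columns come out permuted across registers, which is what the "input ABCD output ADTC" note records. A runnable scalar model, one struct per 64-bit register:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef struct { uint16_t w[4]; } mmreg;   /* one MMX register */

    /* SBUTTERFLY q, wd: interleave words of a and b; a keeps the low
     * half (punpcklwd), t receives the high half (punpckhwd). */
    static void sbutterfly_wd(mmreg *a, mmreg *b, mmreg *t)
    {
        uint16_t x[8];
        for (int i = 0; i < 4; i++) { x[2*i] = a->w[i]; x[2*i+1] = b->w[i]; }
        memcpy(t->w, x + 4, sizeof t->w);
        memcpy(a->w, x,     sizeof a->w);
    }

    /* SBUTTERFLY q, dq: the same interleave at 32-bit granularity. */
    static void sbutterfly_dq(mmreg *a, mmreg *b, mmreg *t)
    {
        uint16_t x[8];
        for (int i = 0; i < 2; i++) {
            x[4*i+0] = a->w[2*i]; x[4*i+1] = a->w[2*i+1];
            x[4*i+2] = b->w[2*i]; x[4*i+3] = b->w[2*i+1];
        }
        memcpy(t->w, x + 4, sizeof t->w);
        memcpy(a->w, x,     sizeof a->w);
    }

    int main(void)
    {
        mmreg A = {{ 0,  1,  2,  3}}, B = {{ 4,  5,  6,  7}};
        mmreg C = {{ 8,  9, 10, 11}}, D = {{12, 13, 14, 15}}, T;
        sbutterfly_wd(&A, &B, &T);   /* SBUTTERFLY q, wd, %1, %2, %5 */
        sbutterfly_wd(&C, &D, &B);   /* SBUTTERFLY q, wd, %3, %4, %2 */
        sbutterfly_dq(&A, &C, &D);   /* SBUTTERFLY q, dq, %1, %3, %4 */
        sbutterfly_dq(&T, &B, &C);   /* SBUTTERFLY q, dq, %5, %2, %3 */
        /* Columns now live in A, D, T, C: the "output ADTC" above. */
        printf("%d %d %d %d\n", A.w[0], A.w[1], A.w[2], A.w[3]); /* 0 4 8 12 */
        return 0;
    }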
;-----------------------------------------------------------------------------
; input ABCDEFGH output AFHDTECB
;-----------------------------------------------------------------------------
%macro SSE2_TRANSPOSE8x8 9
    SBUTTERFLY dqa, wd,  %1, %2, %9
    SBUTTERFLY dqa, wd,  %3, %4, %2
    SBUTTERFLY dqa, wd,  %5, %6, %4
    SBUTTERFLY dqa, wd,  %7, %8, %6
    SBUTTERFLY dqa, dq,  %1, %3, %8
    SBUTTERFLY dqa, dq,  %9, %2, %3
    SBUTTERFLY dqa, dq,  %5, %7, %2
    SBUTTERFLY dqa, dq,  %4, %6, %7
    SBUTTERFLY dqa, qdq, %1, %5, %6
    SBUTTERFLY dqa, qdq, %9, %4, %5
    SBUTTERFLY dqa, qdq, %8, %2, %4
    SBUTTERFLY dqa, qdq, %3, %7, %2
%endmacro
%macro MMX_STORE_DIFF_4P 5
    paddw       %1, %3
    psraw       %1, 6
    movd        %2, %5
    punpcklbw   %2, %4
    paddsw      %1, %2
    packuswb    %1, %1
    movd        %5, %1
%endmacro

%macro MMX_STORE_DIFF_8P 4
    psraw       %1, 6
    movq        %2, %4
    punpcklbw   %2, %3
    paddsw      %1, %2
    packuswb    %1, %1
    movq        %4, %1
%endmacro
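The store macros run the load in reverse: shift the reconstructed residual down by 6, add it to the prediction bytes (widened by punpcklbw against a zero register), and let paddsw plus packuswb saturate the result to 0..255. The 4P variant adds the +32 rounding bias itself (%3 is pw_32); in the 8x8 path below the bias is added once before the second IDCT8_1D pass, which is why MMX_STORE_DIFF_8P takes no bias argument. Scalar model:

    #include <stdint.h>

    static uint8_t clip_uint8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    /* Scalar model of MMX_STORE_DIFF_4P: add the rounded residual to the
     * prediction and saturate, as paddsw/packuswb do. */
    static void store_diff_4p(uint8_t *dst, const int16_t res[4])
    {
        for (int i = 0; i < 4; i++)
            dst[i] = clip_uint8(dst[i] + ((res[i] + 32) >> 6));
    }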
;=============================================================================
; Constants
;=============================================================================

SECTION_RODATA
pw_1:  times 8 dw 1
pw_32: times 8 dw 32

;=============================================================================
; Code
;=============================================================================

SECTION .text
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx
    movq    mm0, [parm1q+ 0]
    movq    mm1, [parm1q+ 8]
    movq    mm2, [parm1q+16]
    movq    mm3, [parm1q+24]

    MMX_SUMSUB_BADC  mm1, mm0, mm3, mm2    ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
    MMX_SUMSUB_BADC  mm3, mm1, mm2, mm0    ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23

    MMX_TRANSPOSE    mm3, mm1, mm0, mm2, mm4  ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0

    MMX_SUMSUB_BADC  mm2, mm3, mm0, mm4    ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
    MMX_SUMSUB_BADC  mm0, mm2, mm4, mm3    ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23

    movq    mm6, [pw_1 GLOBAL]
    paddw   mm0, mm6
    paddw   mm2, mm6
    psraw   mm0, 1
    movq    [parm1q+ 0], mm0
    psraw   mm2, 1
    movq    [parm1q+ 8], mm2
    paddw   mm3, mm6
    paddw   mm4, mm6
    psraw   mm3, 1
    movq    [parm1q+16], mm3
    psraw   mm4, 1
    movq    [parm1q+24], mm4
    ret
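Per the register comments, this is a 4x4 Hadamard transform of the DC coefficients, one pass per direction with MMX_TRANSPOSE in between and a rounded halving on output (the pw_1 add before psraw 1). A C reference following those comments:

    #include <stdint.h>

    /* C reference for x264_dct4x4dc_mmx, transcribed from the s01/d01
     * comments: 4x4 Hadamard of the DC block, (x+1)>>1 on output. */
    static void dct4x4dc(int16_t d[4][4])
    {
        int16_t tmp[4][4];
        for (int i = 0; i < 4; i++) {   /* vertical butterflies, as the
                                         * asm's first SUMSUB_BADC pair */
            int s01 = d[0][i] + d[1][i], d01 = d[0][i] - d[1][i];
            int s23 = d[2][i] + d[3][i], d23 = d[2][i] - d[3][i];
            tmp[0][i] = (int16_t)(s01 + s23);
            tmp[1][i] = (int16_t)(s01 - s23);
            tmp[2][i] = (int16_t)(d01 - d23);
            tmp[3][i] = (int16_t)(d01 + d23);
        }
        for (int i = 0; i < 4; i++) {   /* horizontal, after the transpose */
            int s01 = tmp[i][0] + tmp[i][1], d01 = tmp[i][0] - tmp[i][1];
            int s23 = tmp[i][2] + tmp[i][3], d23 = tmp[i][2] - tmp[i][3];
            d[i][0] = (int16_t)((s01 + s23 + 1) >> 1);
            d[i][1] = (int16_t)((s01 - s23 + 1) >> 1);
            d[i][2] = (int16_t)((d01 - d23 + 1) >> 1);
            d[i][3] = (int16_t)((d01 + d23 + 1) >> 1);
        }
    }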
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx
    movq    mm0, [parm1q+ 0]
    movq    mm1, [parm1q+ 8]
    movq    mm2, [parm1q+16]
    movq    mm3, [parm1q+24]

    MMX_SUMSUB_BADC  mm1, mm0, mm3, mm2    ; mm1=s01  mm0=d01  mm3=s23  mm2=d23
    MMX_SUMSUB_BADC  mm3, mm1, mm2, mm0    ; mm3=s01+s23  mm1=s01-s23  mm2=d01+d23  mm0=d01-d23

    MMX_TRANSPOSE    mm3, mm1, mm0, mm2, mm4  ; in: mm3, mm1, mm0, mm2  out: mm3, mm2, mm4, mm0

    MMX_SUMSUB_BADC  mm2, mm3, mm0, mm4    ; mm2=s01  mm3=d01  mm0=s23  mm4=d23
    MMX_SUMSUB_BADC  mm0, mm2, mm4, mm3    ; mm0=s01+s23  mm2=s01-s23  mm4=d01+d23  mm3=d01-d23

    movq    [parm1q+ 0], mm0
    movq    [parm1q+ 8], mm2
    movq    [parm1q+16], mm3
    movq    [parm1q+24], mm4
    ret
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx
    MMX_ZERO    mm7

    ; Load 4 lines
    MMX_LOAD_DIFF_4P  mm0, mm6, mm7, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
    MMX_LOAD_DIFF_4P  mm1, mm6, mm7, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
    MMX_LOAD_DIFF_4P  mm2, mm6, mm7, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
    MMX_LOAD_DIFF_4P  mm3, mm6, mm7, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]

    MMX_SUMSUB_BADC  mm3, mm0, mm2, mm1    ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
    MMX_SUMSUB_BA    mm2, mm3              ; mm2=s03+s12  mm3=s03-s12
    MMX_SUMSUB2_AB   mm0, mm1, mm4         ; mm0=2.d03+d12  mm4=d03-2.d12

    ; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
    MMX_TRANSPOSE    mm2, mm0, mm3, mm4, mm1

    MMX_SUMSUB_BADC  mm3, mm2, mm1, mm4    ; mm3=s03  mm2=d03  mm1=s12  mm4=d12
    MMX_SUMSUB_BA    mm1, mm3              ; mm1=s03+s12  mm3=s03-s12
    MMX_SUMSUB2_AB   mm2, mm4, mm0         ; mm2=2.d03+d12  mm0=d03-2.d12

    movq    [parm1q+ 0], mm1
    movq    [parm1q+ 8], mm2
    movq    [parm1q+16], mm3
    movq    [parm1q+24], mm0
    ret
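Reading the s03/d03 comments back into C: each 1-D pass of the 4x4 transform butterflies (s03, d03, s12, d12) and emits (s03+s12, 2*d03+d12, s03-s12, d03-2*d12). FENC_STRIDE and FDEC_STRIDE are assumed here to be x264's fixed strides of 16 and 32:

    #include <stdint.h>

    enum { FENC_STRIDE = 16, FDEC_STRIDE = 32 };  /* assumed x264 values */

    static void sub4x4_dct(int16_t dct[4][4], const uint8_t *pix1, const uint8_t *pix2)
    {
        int16_t d[4][4], tmp[4][4];

        /* residual, as the MMX_LOAD_DIFF_4P rows above */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                d[y][x] = (int16_t)(pix1[x + y*FENC_STRIDE] - pix2[x + y*FDEC_STRIDE]);

        for (int i = 0; i < 4; i++) {   /* vertical pass */
            int s03 = d[0][i] + d[3][i], d03 = d[0][i] - d[3][i];
            int s12 = d[1][i] + d[2][i], d12 = d[1][i] - d[2][i];
            tmp[0][i] = (int16_t)(  s03 +   s12);
            tmp[1][i] = (int16_t)(2*d03 +   d12);
            tmp[2][i] = (int16_t)(  s03 -   s12);
            tmp[3][i] = (int16_t)(  d03 - 2*d12);
        }
        for (int i = 0; i < 4; i++) {   /* horizontal pass, after transpose */
            int s03 = tmp[i][0] + tmp[i][3], d03 = tmp[i][0] - tmp[i][3];
            int s12 = tmp[i][1] + tmp[i][2], d12 = tmp[i][1] - tmp[i][2];
            dct[i][0] = (int16_t)(  s03 +   s12);
            dct[i][1] = (int16_t)(2*d03 +   d12);
            dct[i][2] = (int16_t)(  s03 -   s12);
            dct[i][3] = (int16_t)(  d03 - 2*d12);
        }
    }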
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx
    ; Load dct coeffs
    movq    mm0, [parm2q+ 0]    ; dct
    movq    mm1, [parm2q+ 8]
    movq    mm2, [parm2q+16]
    movq    mm3, [parm2q+24]

    MMX_SUMSUB_BA    mm2, mm0              ; mm2=s02  mm0=d02
    MMX_SUMSUBD2_AB  mm1, mm3, mm5, mm4    ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
    MMX_SUMSUB_BADC  mm1, mm2, mm4, mm0    ; mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13

    ; in: mm1, mm4, mm0, mm2  out: mm1, mm2, mm3, mm0
    MMX_TRANSPOSE    mm1, mm4, mm0, mm2, mm3

    MMX_SUMSUB_BA    mm3, mm1              ; mm3=s02  mm1=d02
    MMX_SUMSUBD2_AB  mm2, mm0, mm5, mm4    ; mm2=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
    MMX_SUMSUB_BADC  mm2, mm3, mm4, mm1    ; mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13

    MMX_ZERO    mm7
    movq    mm6, [pw_32 GLOBAL]

    MMX_STORE_DIFF_4P  mm2, mm0, mm6, mm7, [parm1q+0*FDEC_STRIDE]
    MMX_STORE_DIFF_4P  mm4, mm0, mm6, mm7, [parm1q+1*FDEC_STRIDE]
    MMX_STORE_DIFF_4P  mm1, mm0, mm6, mm7, [parm1q+2*FDEC_STRIDE]
    MMX_STORE_DIFF_4P  mm3, mm0, mm6, mm7, [parm1q+3*FDEC_STRIDE]
    ret
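The inverse pairs s02/d02 with the half-coefficient terms from MMX_SUMSUBD2_AB, then MMX_STORE_DIFF_4P applies (x+32)>>6 and adds the result to the prediction. A C reference matching the comments:

    #include <stdint.h>

    enum { FDEC_STRIDE = 32 };  /* assumed x264 value */

    static uint8_t clip_uint8(int x) { return x < 0 ? 0 : x > 255 ? 255 : x; }

    static void add4x4_idct(uint8_t *dst, const int16_t dct[4][4])
    {
        int16_t tmp[4][4], d[4][4];

        for (int i = 0; i < 4; i++) {   /* vertical pass */
            int s02 =  dct[0][i]       +  dct[2][i];
            int d02 =  dct[0][i]       -  dct[2][i];
            int s13 =  dct[1][i]       + (dct[3][i] >> 1);
            int d13 = (dct[1][i] >> 1) -  dct[3][i];
            tmp[0][i] = (int16_t)(s02 + s13);
            tmp[1][i] = (int16_t)(d02 + d13);
            tmp[2][i] = (int16_t)(d02 - d13);
            tmp[3][i] = (int16_t)(s02 - s13);
        }
        for (int i = 0; i < 4; i++) {   /* horizontal pass */
            int s02 =  tmp[i][0]       +  tmp[i][2];
            int d02 =  tmp[i][0]       -  tmp[i][2];
            int s13 =  tmp[i][1]       + (tmp[i][3] >> 1);
            int d13 = (tmp[i][1] >> 1) -  tmp[i][3];
            d[i][0] = (int16_t)(s02 + s13);
            d[i][1] = (int16_t)(d02 + d13);
            d[i][2] = (int16_t)(d02 - d13);
            d[i][3] = (int16_t)(s02 - s13);
        }
        for (int y = 0; y < 4; y++)     /* MMX_STORE_DIFF_4P rows */
            for (int x = 0; x < 4; x++)
                dst[x + y*FDEC_STRIDE] =
                    clip_uint8(dst[x + y*FDEC_STRIDE] + ((d[y][x] + 32) >> 6));
    }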
;=============================================================================
; 8x8 Transform
;=============================================================================

; in:  ABCDEFGH
; out: FBCGEDHI
%macro DCT8_1D 10
    MMX_SUMSUB_BA  %8, %1    ; %8=s07, %1=d07
    MMX_SUMSUB_BA  %7, %2    ; %7=s16, %2=d16
    MMX_SUMSUB_BA  %6, %3    ; %6=s25, %3=d25
    MMX_SUMSUB_BA  %5, %4    ; %5=s34, %4=d34

    MMX_SUMSUB_BA  %5, %8    ; %5=a0, %8=a2
    MMX_SUMSUB_BA  %6, %7    ; %6=a1, %7=a3

    movdqa  %9, %1
    psraw   %9, 1
    paddw   %9, %1
    paddw   %9, %2
    paddw   %9, %3           ; %9=a4

    movdqa  %10, %4
    psraw   %10, 1
    paddw   %10, %4
    paddw   %10, %2
    psubw   %10, %3          ; %10=a7

    MMX_SUMSUB_BA  %4, %1
    psubw   %1, %3
    psubw   %4, %2
    psraw   %3, 1
    psraw   %2, 1
    psubw   %1, %3           ; %1=a5
    psubw   %4, %2           ; %4=a6

    MMX_SUMSUB_BA  %6, %5    ; %6=b0, %5=b4

    movdqa  %2, %10
    psraw   %2, 2
    paddw   %2, %9           ; %2=b1
    psraw   %9, 2
    psubw   %9, %10          ; %9=b7

    movdqa  %3, %7
    psraw   %3, 1
    paddw   %3, %8           ; %3=b2
    psraw   %8, 1
    psubw   %8, %7           ; %8=b6

    movdqa  %7, %4
    psraw   %7, 2
    paddw   %7, %1           ; %7=b3
    psraw   %1, 2
    psubw   %4, %1           ; %4=b5
%endmacro
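Transcribed from the a0..a7 / b0..b7 comments, one DCT8_1D stage in C. (The asm additionally leaves its outputs permuted across registers, which is what the "out: FBCGEDHI" note records; the C writes them in order.)

    #include <stdint.h>

    /* One 8-point stage of the H.264 8x8 forward transform,
     * following the macro's own comments. */
    static void dct8_1d(int16_t dst[8], const int16_t src[8])
    {
        int s07 = src[0] + src[7], d07 = src[0] - src[7];
        int s16 = src[1] + src[6], d16 = src[1] - src[6];
        int s25 = src[2] + src[5], d25 = src[2] - src[5];
        int s34 = src[3] + src[4], d34 = src[3] - src[4];

        int a0 = s07 + s34;
        int a1 = s16 + s25;
        int a2 = s07 - s34;
        int a3 = s16 - s25;
        int a4 = d16 + d25 + d07 + (d07 >> 1);
        int a5 = d07 - d34 - d25 - (d25 >> 1);
        int a6 = d07 + d34 - d16 - (d16 >> 1);
        int a7 = d16 - d25 + d34 + (d34 >> 1);

        dst[0] = (int16_t)( a0 + a1);          /* b0 */
        dst[1] = (int16_t)( a4 + (a7 >> 2));   /* b1 */
        dst[2] = (int16_t)( a2 + (a3 >> 1));   /* b2 */
        dst[3] = (int16_t)( a5 + (a6 >> 2));   /* b3 */
        dst[4] = (int16_t)( a0 - a1);          /* b4 */
        dst[5] = (int16_t)( a6 - (a5 >> 2));   /* b5 */
        dst[6] = (int16_t)((a2 >> 1) - a3);    /* b6 */
        dst[7] = (int16_t)((a4 >> 2) - a7);    /* b7 */
    }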
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
    MMX_ZERO  xmm9

    MMX_LOAD_DIFF_8P  xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P  xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]

    DCT8_1D            xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
    SSE2_TRANSPOSE8x8  xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
    DCT8_1D            xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9

    movdqa  [parm1q+0x00], xmm4
    movdqa  [parm1q+0x10], xmm3
    movdqa  [parm1q+0x20], xmm8
    movdqa  [parm1q+0x30], xmm2
    movdqa  [parm1q+0x40], xmm0
    movdqa  [parm1q+0x50], xmm6
    movdqa  [parm1q+0x60], xmm1
    movdqa  [parm1q+0x70], xmm7
    ret
; in:  ABCDEFGH
; out: IBHDEACG
%macro IDCT8_1D 10
    MMX_SUMSUB_BA  %5, %1    ; %5=a0, %1=a2

    movdqa  %10, %3
    psraw   %3, 1
    psubw   %3, %7           ; %3=a4
    psraw   %7, 1
    paddw   %7, %10          ; %7=a6

    movdqa  %9, %2
    psraw   %9, 1
    paddw   %9, %2
    paddw   %9, %4
    paddw   %9, %6           ; %9=a7

    movdqa  %10, %6
    psraw   %10, 1
    paddw   %10, %6
    paddw   %10, %8
    psubw   %10, %2          ; %10=a5

    psubw   %2, %4
    psubw   %6, %4
    paddw   %2, %8
    psubw   %6, %8
    psraw   %4, 1
    psraw   %8, 1
    psubw   %2, %4           ; %2=a3
    psubw   %6, %8           ; %6=a1

    MMX_SUMSUB_BA  %7, %5    ; %7=b0, %5=b6
    MMX_SUMSUB_BA  %3, %1    ; %3=b2, %1=b4

    movdqa  %4, %9
    psraw   %4, 2
    paddw   %4, %6           ; %4=b1
    psraw   %6, 2
    psubw   %9, %6           ; %9=b7

    movdqa  %8, %10
    psraw   %8, 2
    paddw   %8, %2           ; %8=b3
    psraw   %2, 2
    psubw   %2, %10          ; %2=b5

    MMX_SUMSUB_BA  %9, %7    ; %9=c0, %7=c7
    MMX_SUMSUB_BA  %2, %3    ; %2=c1, %3=c6
    MMX_SUMSUB_BA  %8, %1    ; %8=c2, %1=c5
    MMX_SUMSUB_BA  %4, %5    ; %4=c3, %5=c4
%endmacro
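Likewise the inverse stage, reading the a/b/c comments back into C:

    #include <stdint.h>

    /* One 8-point stage of the H.264 8x8 inverse transform,
     * following the macro's own comments. */
    static void idct8_1d(int16_t dst[8], const int16_t src[8])
    {
        int a0 =  src[0] + src[4];
        int a2 =  src[0] - src[4];
        int a4 = (src[2] >> 1) - src[6];
        int a6 = (src[6] >> 1) + src[2];
        int a1 = -src[3] + src[5] - src[7] - (src[7] >> 1);
        int a3 =  src[1] + src[7] - src[3] - (src[3] >> 1);
        int a5 = -src[1] + src[7] + src[5] + (src[5] >> 1);
        int a7 =  src[3] + src[5] + src[1] + (src[1] >> 1);

        int b0 = a0 + a6;
        int b2 = a2 + a4;
        int b4 = a2 - a4;
        int b6 = a0 - a6;
        int b1 = a1 + (a7 >> 2);
        int b3 = a3 + (a5 >> 2);
        int b5 = (a3 >> 2) - a5;
        int b7 = a7 - (a1 >> 2);

        dst[0] = (int16_t)(b0 + b7);   /* c0 */
        dst[1] = (int16_t)(b2 + b5);   /* c1 */
        dst[2] = (int16_t)(b4 + b3);   /* c2 */
        dst[3] = (int16_t)(b6 + b1);   /* c3 */
        dst[4] = (int16_t)(b6 - b1);   /* c4 */
        dst[5] = (int16_t)(b4 - b3);   /* c5 */
        dst[6] = (int16_t)(b2 - b5);   /* c6 */
        dst[7] = (int16_t)(b0 - b7);   /* c7 */
    }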
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
    movdqa  xmm0, [parm2q+0x00]
    movdqa  xmm1, [parm2q+0x10]
    movdqa  xmm2, [parm2q+0x20]
    movdqa  xmm3, [parm2q+0x30]
    movdqa  xmm4, [parm2q+0x40]
    movdqa  xmm5, [parm2q+0x50]
    movdqa  xmm6, [parm2q+0x60]
    movdqa  xmm7, [parm2q+0x70]

    IDCT8_1D           xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
    SSE2_TRANSPOSE8x8  xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
    paddw   xmm9, [pw_32 GLOBAL]  ; rounding for the >>6 at the end
    IDCT8_1D           xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2

    MMX_ZERO  xmm15
    MMX_STORE_DIFF_8P  xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
    MMX_STORE_DIFF_8P  xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
    ret
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
;                                   uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1
    call %2
    add  parm1q, %3
    add  parm2q, %4-%5*FENC_STRIDE
    add  parm3q, %4-%5*FDEC_STRIDE
    call %2
    add  parm1q, %3
    add  parm2q, %4*FENC_STRIDE-%6
    add  parm3q, %4*FDEC_STRIDE-%6
    call %2
    add  parm1q, %3
    add  parm2q, %4-%5*FENC_STRIDE
    add  parm3q, %4-%5*FDEC_STRIDE
    jmp  %2
%endmacro
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
cglobal %1
    call %2
    add  parm1q, %4-%5*FDEC_STRIDE
    add  parm2q, %3
    call %2
    add  parm1q, %4*FDEC_STRIDE-%6
    add  parm2q, %3
    call %2
    add  parm1q, %4-%5*FDEC_STRIDE
    add  parm2q, %3
    jmp  %2
%endmacro
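Both macros implement the same composition trick: an NxN transform is four calls into the half-size kernel, with the coefficient pointer advanced by %3 bytes per call and the pixel pointer(s) walked through the four sub-blocks; the final call becomes a tail jmp, saving a ret. In C terms, for the 8x8-from-4x4 case instantiated just below (strides again assumed to be x264's 16 and 32):

    #include <stdint.h>

    /* C model of SUB_NxN_DCT's composition: four 4x4 kernel calls,
     * coefficients advancing linearly (%3 = 32 bytes = 16 int16_t)
     * while the pixel pointers visit the four 4x4 sub-blocks. */
    enum { FENC_STRIDE = 16, FDEC_STRIDE = 32 };

    typedef void (*dct4x4_fn)(int16_t *dct, const uint8_t *pix1, const uint8_t *pix2);

    static void sub8x8_dct(dct4x4_fn sub4x4, int16_t *dct,
                           const uint8_t *pix1, const uint8_t *pix2)
    {
        sub4x4(dct,      pix1,                       pix2);
        sub4x4(dct + 16, pix1 + 4,                   pix2 + 4);
        sub4x4(dct + 32, pix1 + 4*FENC_STRIDE,       pix2 + 4*FDEC_STRIDE);
        sub4x4(dct + 48, pix1 + 4*FENC_STRIDE + 4,   pix2 + 4*FDEC_STRIDE + 4);
    }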
SUB_NxN_DCT   x264_sub8x8_dct_mmx,      x264_sub4x4_dct_mmx,     32, 4, 0,  4
ADD_NxN_IDCT  x264_add8x8_idct_mmx,     x264_add4x4_idct_mmx,    32, 4, 0,  4
SUB_NxN_DCT   x264_sub16x16_dct_mmx,    x264_sub8x8_dct_mmx,     32, 4, 4, 12
ADD_NxN_IDCT  x264_add16x16_idct_mmx,   x264_add8x8_idct_mmx,    32, 4, 4, 12
SUB_NxN_DCT   x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0,  8
ADD_NxN_IDCT  x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0,  8
;-----------------------------------------------------------------------------
; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_field_sse2
    punpcklwd xmm0, [parm2q]
    punpckhwd xmm1, [parm2q]
    punpcklwd xmm2, [parm2q+16]
    punpckhwd xmm3, [parm2q+16]
    psrad   xmm0, 16
    psrad   xmm1, 16
    psrad   xmm2, 16
    psrad   xmm3, 16
    movq    [parm1q   ], xmm0
    movdqa  [parm1q+16], xmm1
    movdqa  [parm1q+32], xmm2
    movhlps xmm0, xmm0
    movdqa  [parm1q+48], xmm3
    movq    [parm1q+12], xmm0
    movd    [parm1q+ 8], xmm1
    ret
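As the loads and stores appear to work out (the punpck/psrad pairs sign-extend the sixteen 16-bit coefficients to 32 bits, the bulk goes out in raster order, and the trailing movq/movd stores patch elements 2..4), the function realizes the field scan order 0, 1, 4, 2, 3, 5, ..., 15. A scalar model of that reading:

    #include <stdint.h>

    /* Scalar model of x264_zigzag_scan_4x4_field_sse2; dct is the 4x4
     * block flattened to 16 coefficients in raster order. */
    static void zigzag_scan_4x4_field(int level[16], const int16_t dct[16])
    {
        for (int i = 0; i < 16; i++)
            level[i] = dct[i];   /* raster order, sign-extended */
        level[2] = dct[4];       /* the movd to +8  */
        level[3] = dct[2];       /* the movq to +12 */
        level[4] = dct[3];
    }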
common/amd64/deblock-a.asm
deleted file mode 100644
;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.