Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
X
x264
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
8
Issues
8
List
Boards
Labels
Service Desk
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
VideoLAN
x264
Commits
c17218e8
Commit
c17218e8
authored
Mar 16, 2008
by
Loren Merritt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
merge x86_32 and x86_64 asm, with macros to abstract calling convention and register names
parent
3445cca4
Changes
44
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
44 changed files
with
6474 additions
and
9311 deletions
+6474
-9311
.gitignore
.gitignore
+1
-0
Makefile
Makefile
+16
-21
common/amd64/dct-a.asm
common/amd64/dct-a.asm
+0
-520
common/amd64/deblock-a.asm
common/amd64/deblock-a.asm
+0
-475
common/amd64/mc-a2.asm
common/amd64/mc-a2.asm
+0
-320
common/amd64/pixel-a.asm
common/amd64/pixel-a.asm
+0
-1301
common/amd64/pixel-sse2.asm
common/amd64/pixel-sse2.asm
+0
-1074
common/cpu.c
common/cpu.c
+2
-0
common/dct.c
common/dct.c
+1
-3
common/frame.c
common/frame.c
+5
-8
common/i386/deblock-a.asm
common/i386/deblock-a.asm
+0
-503
common/i386/mc-a.asm
common/i386/mc-a.asm
+0
-633
common/i386/pixel-a.asm
common/i386/pixel-a.asm
+0
-1835
common/i386/pixel-sse2.asm
common/i386/pixel-sse2.asm
+0
-1052
common/i386/predict-a.asm
common/i386/predict-a.asm
+0
-629
common/i386/quant-a.asm
common/i386/quant-a.asm
+0
-298
common/mc.c
common/mc.c
+3
-5
common/pixel.c
common/pixel.c
+7
-4
common/predict.c
common/predict.c
+1
-1
common/quant.c
common/quant.c
+1
-1
common/x86/cpu-32.asm
common/x86/cpu-32.asm
+15
-44
common/x86/cpu-64.asm
common/x86/cpu-64.asm
+8
-48
common/x86/dct-32.asm
common/x86/dct-32.asm
+560
-0
common/x86/dct-64.asm
common/x86/dct-64.asm
+243
-0
common/x86/dct-a.asm
common/x86/dct-a.asm
+295
-0
common/x86/dct.h
common/x86/dct.h
+0
-1
common/x86/deblock-a.asm
common/x86/deblock-a.asm
+620
-0
common/x86/mc-a.asm
common/x86/mc-a.asm
+637
-0
common/x86/mc-a2.asm
common/x86/mc-a2.asm
+146
-135
common/x86/mc-c.c
common/x86/mc-c.c
+13
-8
common/x86/mc.h
common/x86/mc.h
+0
-1
common/x86/pixel-32.asm
common/x86/pixel-32.asm
+460
-0
common/x86/pixel-a.asm
common/x86/pixel-a.asm
+1711
-0
common/x86/pixel.h
common/x86/pixel.h
+9
-4
common/x86/predict-a.asm
common/x86/predict-a.asm
+224
-170
common/x86/predict-c.c
common/x86/predict-c.c
+35
-35
common/x86/predict.h
common/x86/predict.h
+0
-1
common/x86/quant-a.asm
common/x86/quant-a.asm
+81
-74
common/x86/quant.h
common/x86/quant.h
+0
-0
common/x86/sad-a.asm
common/x86/sad-a.asm
+974
-0
common/x86/x86inc-32.asm
common/x86/x86inc-32.asm
+3
-36
common/x86/x86inc-64.asm
common/x86/x86inc-64.asm
+7
-35
common/x86/x86inc.asm
common/x86/x86inc.asm
+328
-0
tools/checkasm.c
tools/checkasm.c
+68
-36
No files found.
.gitignore
View file @
c17218e8
...
@@ -13,6 +13,7 @@
...
@@ -13,6 +13,7 @@
config.h
config.h
config.mak
config.mak
x264
x264
checkasm
gtk/test
gtk/test
gtk/x264_gtk_encode
gtk/x264_gtk_encode
gtk/x264_icon.h
gtk/x264_icon.h
...
...
Makefile
View file @
c17218e8
...
@@ -2,6 +2,8 @@
...
@@ -2,6 +2,8 @@
include
config.mak
include
config.mak
all
:
default
SRCS
=
common/mc.c common/predict.c common/pixel.c common/macroblock.c
\
SRCS
=
common/mc.c common/predict.c common/pixel.c common/macroblock.c
\
common/frame.c common/dct.c common/cpu.c common/cabac.c
\
common/frame.c common/dct.c common/cpu.c common/cabac.c
\
common/common.c common/mdate.c common/set.c
\
common/common.c common/mdate.c common/set.c
\
...
@@ -18,30 +20,26 @@ SRCS += common/visualize.c common/display-x11.c
...
@@ -18,30 +20,26 @@ SRCS += common/visualize.c common/display-x11.c
endif
endif
# MMX/SSE optims
# MMX/SSE optims
ifeq
($(ARCH),X86)
ifneq
($(AS),)
ifneq
($(AS),)
SRCS
+=
common/i386/mc-c.c common/i386/predict-c.c
X86SRC0
=
dct-a.asm deblock-a.asm mc-a.asm mc-a2.asm
\
ASMSRC
=
common/i386/dct-a.asm common/i386/cpu-a.asm
\
pixel-a.asm predict-a.asm quant-a.asm sad-a.asm
\
common/i386/pixel-a.asm common/i386/mc-a.asm
\
cpu-32.asm dct-32.asm
common/i386/mc-a2.asm common/i386/predict-a.asm
\
X86SRC
=
$
(
X86SRC0:%
=
common/x86/%
)
common/i386/pixel-sse2.asm common/i386/quant-a.asm
\
common/i386/deblock-a.asm
ifeq
($(ARCH),X86)
SRCS
+=
common/x86/mc-c.c common/x86/predict-c.c
ASMSRC
=
$(X86SRC)
common/x86/pixel-32.asm
OBJASM
=
$(ASMSRC:%.asm=%.o)
OBJASM
=
$(ASMSRC:%.asm=%.o)
ASFLAGS
+=
-Icommon
/
i3
86/
ASFLAGS
+=
-Icommon
/
x
86/
endif
$(OBJASM)
:
common/x86/x86inc.asm common/x86/x86inc-32.asm
endif
endif
# MMX/SSE optims
ifeq
($(ARCH),X86_64)
ifeq
($(ARCH),X86_64)
ifneq
($(AS),)
SRCS
+=
common/x86/mc-c.c common/x86/predict-c.c
SRCS
+=
common/i386/mc-c.c common/i386/predict-c.c
ASMSRC
=
$
(
X86SRC:-32.asm
=
-64
.asm
)
ASMSRC
=
common/amd64/dct-a.asm common/amd64/cpu-a.asm
\
common/amd64/pixel-a.asm common/amd64/mc-a.asm
\
common/amd64/mc-a2.asm common/amd64/predict-a.asm
\
common/amd64/pixel-sse2.asm common/amd64/quant-a.asm
\
common/amd64/deblock-a.asm
OBJASM
=
$(ASMSRC:%.asm=%.o)
OBJASM
=
$(ASMSRC:%.asm=%.o)
ASFLAGS
+=
-Icommon
/amd64
ASFLAGS
+=
-Icommon
/x86/
-DARCH_X86_64
$(OBJASM)
:
common/x86/x86inc.asm common/x86/x86inc-64.asm
endif
endif
endif
endif
...
@@ -69,7 +67,6 @@ OBJCLI = $(SRCCLI:%.c=%.o)
...
@@ -69,7 +67,6 @@ OBJCLI = $(SRCCLI:%.c=%.o)
DEP
=
depend
DEP
=
depend
.PHONY
:
all default fprofiled clean distclean install install-gtk uninstall dox test testclean
.PHONY
:
all default fprofiled clean distclean install install-gtk uninstall dox test testclean
all
:
default
default
:
$(DEP) x264$(EXE)
default
:
$(DEP) x264$(EXE)
...
@@ -89,8 +86,6 @@ libx264gtk.a: muxers.o libx264.a
...
@@ -89,8 +86,6 @@ libx264gtk.a: muxers.o libx264.a
checkasm
:
tools/checkasm.o libx264.a
checkasm
:
tools/checkasm.o libx264.a
$(CC)
-o
$@
$+
$(LDFLAGS)
$(CC)
-o
$@
$+
$(LDFLAGS)
common/amd64/*.o
:
common/amd64/amd64inc.asm
common/i386/*.o
:
common/i386/i386inc.asm
%.o
:
%.asm
%.o
:
%.asm
$(AS)
$(ASFLAGS)
-o
$@
$<
$(AS)
$(ASFLAGS)
-o
$@
$<
# delete local/anonymous symbols, so they don't show up in oprofile
# delete local/anonymous symbols, so they don't show up in oprofile
...
...
common/amd64/dct-a.asm
deleted
100644 → 0
View file @
3445cca4
;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;* Min Chen <chenm001.163.com> (converted to nasm)
;* Loren Merritt <lorenm@u.washington.edu> (dct8)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2004.04.28 ported all 4x4 functions to nasm (CM) *
;* *
;*****************************************************************************
BITS
64
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
%include "amd64inc.asm"
; MMX_ZERO reg
;   Clears `reg` by xor-ing it with itself.
%macro MMX_ZERO 1
    pxor        %1, %1
%endmacro
; MMX_LOAD_DIFF_4P dst, tmp, hi, srcA, srcB
;   movd-loads srcA into dst and srcB into tmp, interleaves the low bytes of
;   each with `hi` (callers in this file pass a zeroed register, which widens
;   the bytes to words), then leaves dst = dst - tmp (16-bit lanes).
;   Clobbers: tmp.
%macro MMX_LOAD_DIFF_4P 5
    movd        %1, %4
    punpcklbw   %1, %3
    movd        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro
; MMX_LOAD_DIFF_8P dst, tmp, hi, srcA, srcB
;   Same sequence as MMX_LOAD_DIFF_4P but the loads are movq (8 bytes each):
;   dst = unpack(srcA, hi) - unpack(srcB, hi) in 16-bit lanes.
;   Clobbers: tmp.
%macro MMX_LOAD_DIFF_8P 5
    movq        %1, %4
    punpcklbw   %1, %3
    movq        %2, %5
    punpcklbw   %2, %3
    psubw       %1, %2
%endmacro
; MMX_SUMSUB_BA a, b
;   Butterfly on 16-bit lanes without a temporary:
;     a' = a + b
;     b' = 2*b - (a + b) = b - a
%macro MMX_SUMSUB_BA 2
    paddw       %1, %2          ; a + b
    paddw       %2, %2          ; 2b
    psubw       %2, %1          ; 2b - (a + b)
%endmacro
; MMX_SUMSUB_BADC a, b, c, d
;   Two interleaved butterflies on 16-bit lanes:
;     a' = a + b    b' = b - a
;     c' = c + d    d' = d - c
%macro MMX_SUMSUB_BADC 4
    paddw       %1, %2          ; a + b
    paddw       %3, %4          ; c + d
    paddw       %2, %2          ; 2b
    paddw       %4, %4          ; 2d
    psubw       %2, %1          ; b - a
    psubw       %4, %3          ; d - c
%endmacro
; MMX_SUMSUB2_AB a, b, out
;   On 16-bit lanes:
;     a'  = 2*a + b
;     out = a - 2*b
;   b is left unchanged.
%macro MMX_SUMSUB2_AB 3
    movq        %3, %1          ; out = a
    paddw       %1, %1          ; 2a
    paddw       %1, %2          ; 2a + b
    psubw       %3, %2          ; a - b
    psubw       %3, %2          ; a - 2b
%endmacro
; MMX_SUMSUBD2_AB a, b, tmp, out
;   On 16-bit lanes (>> is arithmetic):
;     a'  = a + (b >> 1)
;     out = (a >> 1) - b
;   On exit tmp holds the original b, and b holds b >> 1.
%macro MMX_SUMSUBD2_AB 4
    movq        %4, %1          ; out = a
    movq        %3, %2          ; tmp = b
    psraw       %2, 1           ; b >> 1
    psraw       %4, 1           ; a >> 1
    paddw       %1, %2          ; a + (b >> 1)
    psubw       %4, %3          ; (a >> 1) - b
%endmacro
; SBUTTERFLY movsfx, unpksfx, a, b, t
;   movsfx  is appended to "mov"            (e.g. q -> movq, dqa -> movdqa)
;   unpksfx is appended to "punpckl/punpckh" (e.g. wd, dq, qdq)
;   Result: a = punpckl(a, b), t = punpckh(original a, b). b is unchanged.
%macro SBUTTERFLY 5
    mov%1       %5, %3          ; keep a copy of a for the high half
    punpckl%2   %3, %4
    punpckh%2   %5, %4
%endmacro
;-----------------------------------------------------------------------------
; MMX_TRANSPOSE A, B, C, D, T
;   4x4 transpose of 16-bit elements built from four SBUTTERFLY steps.
;   Inputs are taken in the order A B C D; the transposed rows come back in
;   the registers A D T C (B is consumed as scratch).
;-----------------------------------------------------------------------------
%macro MMX_TRANSPOSE 5
    SBUTTERFLY  q, wd, %1, %2, %5
    SBUTTERFLY  q, wd, %3, %4, %2
    SBUTTERFLY  q, dq, %1, %3, %4
    SBUTTERFLY  q, dq, %5, %2, %3
%endmacro
;-----------------------------------------------------------------------------
; SSE2_TRANSPOSE8x8 A, B, C, D, E, F, G, H, T
;   8x8 transpose of 16-bit elements in three rounds of SBUTTERFLY
;   (word, dword, then qword interleaves).
;   Inputs are taken in the order A..H; the transposed rows come back in the
;   registers A F H D T E C B (G is consumed as scratch).
;-----------------------------------------------------------------------------
%macro SSE2_TRANSPOSE8x8 9
    ; round 1: interleave words
    SBUTTERFLY  dqa, wd,  %1, %2, %9
    SBUTTERFLY  dqa, wd,  %3, %4, %2
    SBUTTERFLY  dqa, wd,  %5, %6, %4
    SBUTTERFLY  dqa, wd,  %7, %8, %6
    ; round 2: interleave dwords
    SBUTTERFLY  dqa, dq,  %1, %3, %8
    SBUTTERFLY  dqa, dq,  %9, %2, %3
    SBUTTERFLY  dqa, dq,  %5, %7, %2
    SBUTTERFLY  dqa, dq,  %4, %6, %7
    ; round 3: interleave qwords
    SBUTTERFLY  dqa, qdq, %1, %5, %6
    SBUTTERFLY  dqa, qdq, %9, %4, %5
    SBUTTERFLY  dqa, qdq, %8, %2, %4
    SBUTTERFLY  dqa, qdq, %3, %7, %2
%endmacro
; MMX_STORE_DIFF_4P val, tmp, round, hi, dst
;   val = (val + round) >> 6 (arithmetic), then adds the 4 bytes currently at
;   dst (interleaved with `hi`; callers pass a zeroed register) using a signed
;   saturating add, packs back to bytes with unsigned saturation and writes
;   4 bytes to dst.
;   Clobbers: val, tmp.
%macro MMX_STORE_DIFF_4P 5
    paddw       %1, %3          ; add rounding term
    psraw       %1, 6
    movd        %2, %5          ; existing pixels
    punpcklbw   %2, %4
    paddsw      %1, %2
    packuswb    %1, %1
    movd        %5, %1
%endmacro
; MMX_STORE_DIFF_8P val, tmp, hi, dst
;   8-byte counterpart of MMX_STORE_DIFF_4P, except that no rounding term is
;   added here: val is shifted right by 6 as-is, so any rounding must already
;   be folded into val by the caller.
;   Clobbers: val, tmp.
%macro MMX_STORE_DIFF_8P 4
    psraw       %1, 6
    movq        %2, %4          ; existing pixels
    punpcklbw   %2, %3
    paddsw      %1, %2
    packuswb    %1, %1
    movq        %4, %1
%endmacro
;=============================================================================
; Read-only data
;=============================================================================

; NOTE(review): the scraped text shows "SECTION" and "_RODATA" as separate
; tokens; this is assumed to be the single SECTION_RODATA macro provided by
; the include file -- confirm against the original file.
SECTION_RODATA

pw_1:   times 8 dw 1            ; eight 16-bit words, each 1
pw_32:  times 8 dw 32           ; eight 16-bit words, each 32

;=============================================================================
; Executable code
;=============================================================================

SECTION .text
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;
; In/out: parm1q = d, transformed in place (four 8-byte rows).
; Two butterfly passes separated by a transpose; every output word is then
; computed as (x + 1) >> 1 before being stored.
; Uses mm0-mm4 and mm6.
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx
    movq        mm0, [parm1q +  0]
    movq        mm1, [parm1q +  8]
    movq        mm2, [parm1q + 16]
    movq        mm3, [parm1q + 24]

    ; first pass
    MMX_SUMSUB_BADC mm1, mm0, mm3, mm2      ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
    MMX_SUMSUB_BADC mm3, mm1, mm2, mm0      ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23

    ; rows in: mm3 mm1 mm0 mm2  ->  rows out: mm3 mm2 mm4 mm0
    MMX_TRANSPOSE   mm3, mm1, mm0, mm2, mm4

    ; second pass
    MMX_SUMSUB_BADC mm2, mm3, mm0, mm4      ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
    MMX_SUMSUB_BADC mm0, mm2, mm4, mm3      ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23

    ; round, halve, and write back
    movq        mm6, [pw_1 GLOBAL]
    paddw       mm0, mm6
    paddw       mm2, mm6
    psraw       mm0, 1
    movq        [parm1q +  0], mm0
    psraw       mm2, 1
    movq        [parm1q +  8], mm2
    paddw       mm3, mm6
    paddw       mm4, mm6
    psraw       mm3, 1
    movq        [parm1q + 16], mm3
    psraw       mm4, 1
    movq        [parm1q + 24], mm4
    ret
;-----------------------------------------------------------------------------
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;
; In/out: parm1q = d, transformed in place (four 8-byte rows).
; Same butterfly / transpose / butterfly sequence as x264_dct4x4dc_mmx, but
; the results are stored directly, without the rounding add and shift.
; Uses mm0-mm4.
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx
    movq        mm0, [parm1q +  0]
    movq        mm1, [parm1q +  8]
    movq        mm2, [parm1q + 16]
    movq        mm3, [parm1q + 24]

    ; first pass
    MMX_SUMSUB_BADC mm1, mm0, mm3, mm2      ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
    MMX_SUMSUB_BADC mm3, mm1, mm2, mm0      ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23

    ; rows in: mm3 mm1 mm0 mm2  ->  rows out: mm3 mm2 mm4 mm0
    MMX_TRANSPOSE   mm3, mm1, mm0, mm2, mm4

    ; second pass
    MMX_SUMSUB_BADC mm2, mm3, mm0, mm4      ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
    MMX_SUMSUB_BADC mm0, mm2, mm4, mm3      ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23

    movq        [parm1q +  0], mm0
    movq        [parm1q +  8], mm2
    movq        [parm1q + 16], mm3
    movq        [parm1q + 24], mm4
    ret
;-----------------------------------------------------------------------------
; void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
;
; In:  parm1q = dct (output, 32 bytes)
;      parm2q = pix1, rows FENC_STRIDE apart
;      parm3q = pix2, rows FDEC_STRIDE apart
; Forms the 4x4 difference pix1 - pix2 and runs the 1-D transform twice with
; a transpose in between.
; Uses mm0-mm4, mm6 (load scratch) and mm7 (zero).
;-----------------------------------------------------------------------------
cglobal x264_sub4x4_dct_mmx
    MMX_ZERO    mm7

    ; difference of four 4-pixel rows, widened to words
    MMX_LOAD_DIFF_4P mm0, mm6, mm7, [parm2q + 0*FENC_STRIDE], [parm3q + 0*FDEC_STRIDE]
    MMX_LOAD_DIFF_4P mm1, mm6, mm7, [parm2q + 1*FENC_STRIDE], [parm3q + 1*FDEC_STRIDE]
    MMX_LOAD_DIFF_4P mm2, mm6, mm7, [parm2q + 2*FENC_STRIDE], [parm3q + 2*FDEC_STRIDE]
    MMX_LOAD_DIFF_4P mm3, mm6, mm7, [parm2q + 3*FENC_STRIDE], [parm3q + 3*FDEC_STRIDE]

    ; first pass
    MMX_SUMSUB_BADC mm3, mm0, mm2, mm1      ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
    MMX_SUMSUB_BA   mm2, mm3                ; mm2=s03+s12 mm3=s03-s12
    MMX_SUMSUB2_AB  mm0, mm1, mm4           ; mm0=2*d03+d12 mm4=d03-2*d12

    ; rows in: mm2 mm0 mm3 mm4  ->  rows out: mm2 mm4 mm1 mm3
    MMX_TRANSPOSE   mm2, mm0, mm3, mm4, mm1

    ; second pass
    MMX_SUMSUB_BADC mm3, mm2, mm1, mm4      ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
    MMX_SUMSUB_BA   mm1, mm3                ; mm1=s03+s12 mm3=s03-s12
    MMX_SUMSUB2_AB  mm2, mm4, mm0           ; mm2=2*d03+d12 mm0=d03-2*d12

    movq        [parm1q +  0], mm1
    movq        [parm1q +  8], mm2
    movq        [parm1q + 16], mm3
    movq        [parm1q + 24], mm0
    ret
;-----------------------------------------------------------------------------
; void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
;
; In:  parm1q = p_dst, rows FDEC_STRIDE apart (read, then overwritten)
;      parm2q = dct (32 bytes of coefficients)
; Runs the 1-D inverse transform twice with a transpose in between, then
; computes (x + 32) >> 6 and adds the result to the four destination rows
; with saturation.
; Uses mm0-mm7.
;-----------------------------------------------------------------------------
cglobal x264_add4x4_idct_mmx
    ; fetch the four coefficient rows
    movq        mm0, [parm2q +  0]
    movq        mm1, [parm2q +  8]
    movq        mm2, [parm2q + 16]
    movq        mm3, [parm2q + 24]

    ; first pass
    MMX_SUMSUB_BA   mm2, mm0                ; mm2=s02 mm0=d02
    MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4      ; mm1 = r1 + (r3>>1), mm4 = (r1>>1) - r3
    MMX_SUMSUB_BADC mm1, mm2, mm4, mm0      ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13

    ; rows in: mm1 mm4 mm0 mm2  ->  rows out: mm1 mm2 mm3 mm0
    MMX_TRANSPOSE   mm1, mm4, mm0, mm2, mm3

    ; second pass
    MMX_SUMSUB_BA   mm3, mm1                ; mm3=s02 mm1=d02
    MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4      ; mm2 = r1 + (r3>>1), mm4 = (r1>>1) - r3
    MMX_SUMSUB_BADC mm2, mm3, mm4, mm1      ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13

    ; round, scale and accumulate into the destination rows
    MMX_ZERO    mm7
    movq        mm6, [pw_32 GLOBAL]
    MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [parm1q + 0*FDEC_STRIDE]
    MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [parm1q + 1*FDEC_STRIDE]
    MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [parm1q + 2*FDEC_STRIDE]
    MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [parm1q + 3*FDEC_STRIDE]
    ret
; =============================================================================
; 8x8 Transform
; =============================================================================
; DCT8_1D A, B, C, D, E, F, G, H, I, J
;   One 1-D pass of the 8-point forward transform on 16-bit lanes.
;   in:  A..H (%1..%8) hold inputs 0..7; I, J (%9, %10) are scratch.
;   out: results 0..7 are left in F B C G E D H I
;        (%6=b0 %2=b1 %3=b2 %7=b3 %5=b4 %4=b5 %8=b6 %9=b7).
;   A and J end up holding intermediate values.
%macro DCT8_1D 10
    ; stage 1: sums and differences of mirrored inputs
    MMX_SUMSUB_BA %8, %1        ; %8=s07, %1=d07
    MMX_SUMSUB_BA %7, %2        ; %7=s16, %2=d16
    MMX_SUMSUB_BA %6, %3        ; %6=s25, %3=d25
    MMX_SUMSUB_BA %5, %4        ; %5=s34, %4=d34

    ; stage 2, even half
    MMX_SUMSUB_BA %5, %8        ; %5=a0, %8=a2
    MMX_SUMSUB_BA %6, %7        ; %6=a1, %7=a3

    ; stage 2, odd half
    movdqa      %9, %1
    psraw       %9, 1
    paddw       %9, %1
    paddw       %9, %2
    paddw       %9, %3          ; %9=a4

    movdqa      %10, %4
    psraw       %10, 1
    paddw       %10, %4
    paddw       %10, %2
    psubw       %10, %3         ; %10=a7

    MMX_SUMSUB_BA %4, %1
    psubw       %1, %3
    psubw       %4, %2
    psraw       %3, 1
    psraw       %2, 1
    psubw       %1, %3          ; %1=a5
    psubw       %4, %2          ; %4=a6

    ; stage 3: final outputs
    MMX_SUMSUB_BA %6, %5        ; %6=b0, %5=b4

    movdqa      %2, %10
    psraw       %2, 2
    paddw       %2, %9          ; %2=b1
    psraw       %9, 2
    psubw       %9, %10         ; %9=b7

    movdqa      %3, %7
    psraw       %3, 1
    paddw       %3, %8          ; %3=b2
    psraw       %8, 1
    psubw       %8, %7          ; %8=b6

    movdqa      %7, %4
    psraw       %7, 2
    paddw       %7, %1          ; %7=b3
    psraw       %1, 2
    psubw       %4, %1          ; %4=b5
%endmacro
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;
; In:  parm1q = dct (output, 128 bytes, written with movdqa)
;      parm2q = pix1, rows FENC_STRIDE apart
;      parm3q = pix2, rows FDEC_STRIDE apart
; Forms the 8x8 difference pix1 - pix2 and applies DCT8_1D, an 8x8 transpose,
; and DCT8_1D again. The register lists passed to each step follow the output
; permutation documented on the preceding macro.
; Uses xmm0-xmm9.
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2
    MMX_ZERO    xmm9

    ; difference of eight 8-pixel rows, widened to words (xmm8 = load scratch)
    MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q + 0*FENC_STRIDE], [parm3q + 0*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q + 1*FENC_STRIDE], [parm3q + 1*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q + 2*FENC_STRIDE], [parm3q + 2*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q + 3*FENC_STRIDE], [parm3q + 3*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q + 4*FENC_STRIDE], [parm3q + 4*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q + 5*FENC_STRIDE], [parm3q + 5*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q + 6*FENC_STRIDE], [parm3q + 6*FDEC_STRIDE]
    MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q + 7*FENC_STRIDE], [parm3q + 7*FDEC_STRIDE]

    DCT8_1D           xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
    SSE2_TRANSPOSE8x8 xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
    DCT8_1D           xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9

    ; write rows 0..7 of the result
    movdqa      [parm1q + 0x00], xmm4
    movdqa      [parm1q + 0x10], xmm3
    movdqa      [parm1q + 0x20], xmm8
    movdqa      [parm1q + 0x30], xmm2
    movdqa      [parm1q + 0x40], xmm0
    movdqa      [parm1q + 0x50], xmm6
    movdqa      [parm1q + 0x60], xmm1
    movdqa      [parm1q + 0x70], xmm7
    ret
; IDCT8_1D A, B, C, D, E, F, G, H, I, J
;   One 1-D pass of the 8-point inverse transform on 16-bit lanes.
;   in:  A..H (%1..%8) hold inputs 0..7; I, J (%9, %10) are scratch.
;   out: results 0..7 are left in I B H D E A C G
;        (%9=c0 %2=c1 %8=c2 %4=c3 %5=c4 %1=c5 %3=c6 %7=c7).
;   F and J end up holding intermediate values.
%macro IDCT8_1D 10
    ; even half
    MMX_SUMSUB_BA %5, %1        ; %5=a0, %1=a2
    movdqa      %10, %3
    psraw       %3, 1
    psubw       %3, %7          ; %3=a4
    psraw       %7, 1
    paddw       %7, %10         ; %7=a6

    ; odd half
    movdqa      %9, %2
    psraw       %9, 1
    paddw       %9, %2
    paddw       %9, %4
    paddw       %9, %6          ; %9=a7

    movdqa      %10, %6
    psraw       %10, 1
    paddw       %10, %6
    paddw       %10, %8
    psubw       %10, %2         ; %10=a5

    psubw       %2, %4
    psubw       %6, %4
    paddw       %2, %8
    psubw       %6, %8
    psraw       %4, 1
    psraw       %8, 1
    psubw       %2, %4          ; %2=a3
    psubw       %6, %8          ; %6=a1

    ; second stage
    MMX_SUMSUB_BA %7, %5        ; %7=b0, %5=b6
    MMX_SUMSUB_BA %3, %1        ; %3=b2, %1=b4

    movdqa      %4, %9
    psraw       %4, 2
    paddw       %4, %6          ; %4=b1
    psraw       %6, 2
    psubw       %9, %6          ; %9=b7

    movdqa      %8, %10
    psraw       %8, 2
    paddw       %8, %2          ; %8=b3
    psraw       %2, 2
    psubw       %2, %10         ; %2=b5

    ; final butterflies
    MMX_SUMSUB_BA %9, %7        ; %9=c0, %7=c7
    MMX_SUMSUB_BA %2, %3        ; %2=c1, %3=c6
    MMX_SUMSUB_BA %8, %1        ; %8=c2, %1=c5
    MMX_SUMSUB_BA %4, %5        ; %4=c3, %5=c4
%endmacro
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;
; In:  parm1q = p_dst, rows FDEC_STRIDE apart (read, then overwritten)
;      parm2q = dct (128 bytes, read with movdqa)
; Applies IDCT8_1D, an 8x8 transpose, and IDCT8_1D again, then shifts each
; result right by 6 and adds it to the destination rows with saturation.
; The +32 rounding term is added to a single register between the two passes
; rather than in the store macro.
; Uses xmm0-xmm9, xmm14 (store scratch) and xmm15 (zero).
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2
    movdqa      xmm0, [parm2q + 0x00]
    movdqa      xmm1, [parm2q + 0x10]
    movdqa      xmm2, [parm2q + 0x20]
    movdqa      xmm3, [parm2q + 0x30]
    movdqa      xmm4, [parm2q + 0x40]
    movdqa      xmm5, [parm2q + 0x50]
    movdqa      xmm6, [parm2q + 0x60]
    movdqa      xmm7, [parm2q + 0x70]

    IDCT8_1D          xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
    SSE2_TRANSPOSE8x8 xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
    paddw       xmm9, [pw_32 GLOBAL]        ; rounding for the final >>6
    IDCT8_1D          xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2

    ; scale and accumulate rows 0..7 into the destination
    MMX_ZERO    xmm15
    MMX_STORE_DIFF_8P xmm8, xmm14, xmm15, [parm1q + 0*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm0, xmm14, xmm15, [parm1q + 1*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm1, xmm14, xmm15, [parm1q + 2*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm3, xmm14, xmm15, [parm1q + 3*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm5, xmm14, xmm15, [parm1q + 4*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm9, xmm14, xmm15, [parm1q + 5*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm6, xmm14, xmm15, [parm1q + 6*FDEC_STRIDE]
    MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q + 7*FDEC_STRIDE]
    ret
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
;                                   uint8_t *pix1, uint8_t *pix2 )
;
; SUB_NxN_DCT name, sub, dct_step, w, h, back
;   Defines function `name`, which runs the smaller transform `sub` on four
;   sub-blocks. Between invocations parm1q advances by dct_step, and
;   parm2q/parm3q advance by:
;     after block 1:  w - h*STRIDE
;     after block 2:  w*STRIDE - back
;     after block 3:  w - h*STRIDE
;   with STRIDE = FENC_STRIDE for parm2q and FDEC_STRIDE for parm3q.
;   The fourth invocation is a tail jump, so `sub`'s ret returns to the
;   caller of `name`.
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
cglobal %1
    call        %2
    add         parm1q, %3
    add         parm2q, %4-%5*FENC_STRIDE
    add         parm3q, %4-%5*FDEC_STRIDE

    call        %2
    add         parm1q, %3
    add         parm2q, %4*FENC_STRIDE-%6
    add         parm3q, %4*FDEC_STRIDE-%6

    call        %2
    add         parm1q, %3
    add         parm2q, %4-%5*FENC_STRIDE
    add         parm3q, %4-%5*FDEC_STRIDE

    jmp         %2
%endmacro
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;
; ADD_NxN_IDCT name, sub, dct_step, w, h, back
;   Defines function `name`, which runs the smaller inverse transform `sub`
;   on four sub-blocks. Between invocations parm2q advances by dct_step and
;   parm1q advances by:
;     after block 1:  w - h*FDEC_STRIDE
;     after block 2:  w*FDEC_STRIDE - back
;     after block 3:  w - h*FDEC_STRIDE
;   The fourth invocation is a tail jump, so `sub`'s ret returns to the
;   caller of `name`.
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
cglobal %1
    call        %2
    add         parm1q, %4-%5*FDEC_STRIDE
    add         parm2q, %3

    call        %2
    add         parm1q, %4*FDEC_STRIDE-%6
    add         parm2q, %3

    call        %2
    add         parm1q, %4-%5*FDEC_STRIDE
    add         parm2q, %3

    jmp         %2
%endmacro
; Larger transforms generated from the smaller ones.
; Columns: new function, sub-block function, dct_step, w, h, back.

; 8x8 from four 4x4 blocks
SUB_NxN_DCT  x264_sub8x8_dct_mmx,      x264_sub4x4_dct_mmx,     32, 4, 0,  4
ADD_NxN_IDCT x264_add8x8_idct_mmx,     x264_add4x4_idct_mmx,    32, 4, 0,  4

; 16x16 from four of the 8x8 functions generated just above
SUB_NxN_DCT  x264_sub16x16_dct_mmx,    x264_sub8x8_dct_mmx,     32, 4, 4, 12
ADD_NxN_IDCT x264_add16x16_idct_mmx,   x264_add8x8_idct_mmx,    32, 4, 4, 12

; 16x16 from four 8x8 (dct8) blocks
SUB_NxN_DCT  x264_sub16x16_dct8_sse2,  x264_sub8x8_dct8_sse2,  128, 8, 0,  8
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0,  8
;-----------------------------------------------------------------------------
; void __cdecl x264_zigzag_scan_4x4_field_sse2( int level[16], int16_t dct[4][4] )
;
; In:  parm1q = level (output; offsets +16/+32/+48 are written with movdqa)
;      parm2q = dct   (32 bytes, read as two 16-byte memory operands)
; Each 16-bit coefficient is widened to a sign-extended dword: punpck*wd
; places the memory word in the upper half of every dword and psrad 16 shifts
; it back down with sign fill. The previous register contents only reach the
; lower halves, which the shift discards.
; Most of the output is a straight copy; bytes 8..19 are patched afterwards
; by the overlapping movq/movd stores at the end.
; Uses xmm0-xmm3.
;-----------------------------------------------------------------------------
cglobal x264_zigzag_scan_4x4_field_sse2
    punpcklwd   xmm0, [parm2q]              ; coefficients 0..3
    punpckhwd   xmm1, [parm2q]              ; coefficients 4..7
    punpcklwd   xmm2, [parm2q + 16]         ; coefficients 8..11
    punpckhwd   xmm3, [parm2q + 16]         ; coefficients 12..15
    psrad       xmm0, 16
    psrad       xmm1, 16
    psrad       xmm2, 16
    psrad       xmm3, 16

    movq        [parm1q],      xmm0         ; bytes 0..7   <- coefficients 0,1
    movdqa      [parm1q + 16], xmm1         ; bytes 16..31 <- coefficients 4..7
    movdqa      [parm1q + 32], xmm2         ; bytes 32..47 <- coefficients 8..11
    movhlps     xmm0, xmm0                  ; bring coefficients 2,3 to the low qword
    movdqa      [parm1q + 48], xmm3         ; bytes 48..63 <- coefficients 12..15
    movq        [parm1q + 12], xmm0         ; bytes 12..19 <- coefficients 2,3 (overwrites 16..19)
    movd        [parm1q + 8],  xmm1         ; bytes 8..11  <- coefficient 4
    ret
common/amd64/deblock-a.asm
deleted
100644 → 0
View file @
3445cca4
;*****************************************************************************
;* deblock-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.