VideoLAN / x264 · Commit c6e72b86

Authored Feb 11, 2009 by Anton Mitrofanov; committed by Fiona Glaser, Feb 11, 2009
Windows 64-bit support
A "make distclean" is probably required after updating to this revision.
Parent: ef48e51d

Showing 21 changed files with 682 additions and 356 deletions (+682 -356)
Makefile                   +5    -5
common/common.c            +1    -1
common/pixel.c             +1    -1
common/x86/cabac-a.asm     +20   -17
common/x86/cpu-a.asm       +11   -8
common/x86/dct-32.asm      +4    -4
common/x86/dct-64.asm      +4    -4
common/x86/dct-a.asm       +48   -18
common/x86/deblock-a.asm   +49   -37
common/x86/mc-a.asm        +57   -54
common/x86/mc-a2.asm       +20   -10
common/x86/mc-c.c          +1    -1
common/x86/pixel-a.asm     +69   -55
common/x86/predict-a.asm   +4    -4
common/x86/quant-a.asm     +79   -74
common/x86/sad-a.asm       +70   -21
common/x86/x86inc.asm      +122  -25
configure                  +32   -8
extras/getopt.c            +5    -0
tools/checkasm-a.asm       +77   -6
tools/checkasm.c           +3    -3
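Nearly all of the churn below falls into a few recurring patterns. Win64 is an LLP64 platform, so long stays 32 bits while pointers are 64 bits, and pointer-sized casts move to intptr_t. Its calling convention also differs from the SysV x86-64 ABI (first integer arguments in rcx/rdx/r8/r9 instead of rdi/rsi/rdx/rcx, xmm6-xmm15 callee-saved), which is what the extra register/XMM counts on cglobal lines, the WIN64 conditionals, and the ret -> RET epilogue changes are serving; most of the supporting machinery lands in x86inc.asm. A minimal, standalone C sketch of the data-model half of this (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int x = 0;
    void *p = &x;
    /* LP64 (Linux/macOS x86-64): long and pointers are both 8 bytes, so the
     * old (long) casts happened to work.
     * LLP64 (Win64): long stays 4 bytes while pointers are 8 bytes, which is
     * why the patch switches pointer arithmetic to intptr_t. */
    printf("sizeof(long) = %zu, sizeof(void *) = %zu, sizeof(intptr_t) = %zu\n",
           sizeof(long), sizeof(void *), sizeof(intptr_t));
    printf("16-byte aligned? %s\n", ((intptr_t)p & 15) ? "no" : "yes");
    return 0;
}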
Makefile

@@ -74,8 +74,8 @@ DEP = depend
 default: $(DEP) x264$(EXE)
 
 libx264.a: .depend $(OBJS) $(OBJASM)
-	ar rc libx264.a $(OBJS) $(OBJASM)
-	ranlib libx264.a
+	$(AR) rc libx264.a $(OBJS) $(OBJASM)
+	$(RANLIB) libx264.a
 
 $(SONAME): .depend $(OBJS) $(OBJASM)
 	$(CC) -shared -o $@ $(OBJS) $(OBJASM) $(SOFLAGS) $(LDFLAGS)
@@ -89,7 +89,7 @@ checkasm: tools/checkasm.o libx264.a
 %.o: %.asm
 	$(AS) $(ASFLAGS) -o $@ $<
 # delete local/anonymous symbols, so they don't show up in oprofile
-	-@ strip -x $@
+	-@ $(STRIP) -x $@
 
 .depend: config.mak
 	rm -f .depend
@@ -135,7 +135,7 @@ endif
 clean:
 	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(SONAME) *.a x264 x264.exe .depend TAGS
-	rm -f checkasm checkasm.exe tools/checkasm.o
+	rm -f checkasm checkasm.exe tools/checkasm.o tools/checkasm-a.o
 	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno)
 	- sed -e 's/ *-fprofile-\(generate\|use\)//g' config.mak > config.mak2 && mv config.mak2 config.mak
@@ -150,7 +150,7 @@ install: x264$(EXE) $(SONAME)
 	install -m 644 libx264.a $(DESTDIR)$(libdir)
 	install -m 644 x264.pc $(DESTDIR)$(libdir)/pkgconfig
 	install x264$(EXE) $(DESTDIR)$(bindir)
-	ranlib $(DESTDIR)$(libdir)/libx264.a
+	$(RANLIB) $(DESTDIR)$(libdir)/libx264.a
 ifeq ($(SYS),MINGW)
 	$(if $(SONAME), install -m 755 $(SONAME) $(DESTDIR)$(bindir))
 else
common/common.c

@@ -710,7 +710,7 @@ void *x264_malloc( int i_size )
     buf = (uint8_t *) malloc( i_size + 15 + sizeof(void **) + sizeof(int) );
     align_buf = buf + 15 + sizeof(void **) + sizeof(int);
-    align_buf -= (long) align_buf & 15;
+    align_buf -= (intptr_t) align_buf & 15;
     *( (void **) ( align_buf - sizeof(void **) ) ) = buf;
     *( (int *) ( align_buf - sizeof(void **) - sizeof(int) ) ) = i_size;
     return align_buf;
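The only functional change in this hunk is the cast used to round the pointer up: (long) loses the high half of a 64-bit pointer on LLP64 Win64, intptr_t does not. For reference, a standalone sketch of the same over-allocate-and-align bookkeeping, with a matching free it implies (my pairing, assuming the hidden-header layout shown above; not code from the patch):

#include <stdint.h>
#include <stdlib.h>

/* Over-allocate, round the pointer up to a 16-byte boundary with an
 * intptr_t mask (safe on both LP64 and LLP64), and stash the original
 * pointer and the size just below the aligned block. */
static void *aligned_malloc16( int i_size )
{
    uint8_t *align_buf;
    uint8_t *buf = malloc( i_size + 15 + sizeof(void **) + sizeof(int) );
    if( !buf )
        return NULL;
    align_buf = buf + 15 + sizeof(void **) + sizeof(int);
    align_buf -= (intptr_t)align_buf & 15;                /* the cast this hunk fixes */
    *( (void **)( align_buf - sizeof(void **) ) ) = buf;  /* original pointer, for free() */
    *( (int *)( align_buf - sizeof(void **) - sizeof(int) ) ) = i_size;
    return align_buf;
}

static void aligned_free16( void *p )
{
    if( p )
        free( *( (void **)( (uint8_t *)p - sizeof(void **) ) ) );
}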
common/pixel.c

@@ -99,7 +99,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
 {
     int64_t i_ssd = 0;
     int x, y;
-    int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2)&15);
+    int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2)&15);
 #define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
                                           pix2 + y*i_pix2 + x, i_pix2 );
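Same long -> intptr_t substitution here: the test ORs both pixel pointers and both strides together and checks the low four bits once, so any misalignment disables the aligned SSE2 path. A standalone restatement of that predicate (illustrative only, not part of the patch):

#include <stdint.h>

/* Nonzero only when both pointers and both strides are multiples of 16,
 * so an aligned-load SIMD path is safe for every row. intptr_t keeps the
 * pointer casts well defined on LLP64 (Win64), where long is only 32 bits. */
static int is_aligned16( const uint8_t *pix1, intptr_t i_pix1,
                         const uint8_t *pix2, intptr_t i_pix2 )
{
    return !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
}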
common/x86/cabac-a.asm

@@ -32,7 +32,10 @@ cextern x264_cabac_transition
 cextern x264_cabac_renorm_shift
 
 ; t3 must be ecx, since it's used for shift.
-%ifdef ARCH_X86_64
+%ifdef WIN64
+    DECLARE_REG_TMP 3,1,2,0,4,5,6,10
+    %define pointer resq
+%elifdef ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,6,10
     %define pointer resq
 %else
@@ -67,10 +70,10 @@ endstruc
 %endmacro
 
 cglobal x264_cabac_encode_decision_asm, 0,7
-    movifnidn t0d, r0m
+    movifnidn t0,  r0mp
     movifnidn t1d, r1m
-    mov   t5d, [r0+cb.range]
-    movzx t3d, byte [r0+cb.state+t1]
+    mov   t5d, [t0+cb.range]
+    movzx t3d, byte [t0+cb.state+t1]
     mov   t4d, t5d
     shr   t5d, 6
     and   t5d, 3
@@ -80,23 +83,23 @@ cglobal x264_cabac_encode_decision_asm, 0,7
     shr   t6d, 6
     movifnidn t2d, r2m
     cmp   t6d, t2d
-    mov   t6d, [r0+cb.low]
+    mov   t6d, [t0+cb.low]
     lea   t7,  [t6+t4]
     cmovne t4d, t5d
     cmovne t6d, t7d
     LOAD_GLOBAL t3d, x264_cabac_transition, t2, t3*2
     movifnidn t1d, r1m
-    mov   [r0+cb.state+t1], t3b
+    mov   [t0+cb.state+t1], t3b
 .renorm:
     mov   t3d, t4d
     shr   t3d, 3
     LOAD_GLOBAL t3d, x264_cabac_renorm_shift, 0, t3
     shl   t4d, t3b
     shl   t6d, t3b
-    add   t3d, [r0+cb.queue]
-    mov   [r0+cb.range], t4d
-    mov   [r0+cb.low], t6d
-    mov   [r0+cb.queue], t3d
+    add   t3d, [t0+cb.queue]
+    mov   [t0+cb.range], t4d
+    mov   [t0+cb.low], t6d
+    mov   [t0+cb.queue], t3d
     cmp   t3d, 8
     jge   .putbyte
     REP_RET
@@ -111,12 +114,12 @@ cglobal x264_cabac_encode_decision_asm, 0,7
     sub   t3d, 10
     and   t6d, t1d
     cmp   t2b, 0xff ; FIXME is a 32bit op faster?
-    mov   [r0+cb.queue], t3d
-    mov   [r0+cb.low], t6d
+    mov   [t0+cb.queue], t3d
+    mov   [t0+cb.low], t6d
     mov   t1d, t2d
-    mov   t4,  [r0+cb.p]
+    mov   t4,  [t0+cb.p]
     je    .postpone
-    mov   t5d, [r0+cb.bytes_outstanding]
+    mov   t5d, [t0+cb.bytes_outstanding]
     shr   t1d, 8 ; carry
     add   [t4-1], t1b
     test  t5d, t5d
@@ -130,10 +133,10 @@ cglobal x264_cabac_encode_decision_asm, 0,7
 .no_outstanding:
     mov   [t4], t2b
     inc   t4
-    mov   [r0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
-    mov   [r0+cb.p], t4
+    mov   [t0+cb.bytes_outstanding], t5d ; is zero, but a reg has smaller opcode than an immediate
+    mov   [t0+cb.p], t4
     RET
 .postpone:
-    inc   dword [r0+cb.bytes_outstanding]
+    inc   dword [t0+cb.bytes_outstanding]
     RET
common/x86/cpu-a.asm

@@ -27,22 +27,24 @@
 SECTION .text
 
 %ifdef ARCH_X86_64
 
 ;-----------------------------------------------------------------------------
 ; int x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
 ;-----------------------------------------------------------------------------
-cglobal x264_cpu_cpuid
+cglobal x264_cpu_cpuid, 5,7
     push    rbx
-    mov     r10,  r3
-    mov     r11,  r2
-    mov     r9,   r1
+    mov     r11,  r1
+    mov     r10,  r2
+    movifnidn r9, r3
     movifnidn r8, r4
     mov     eax,  r0d
     cpuid
-    mov     [r9],  eax
-    mov     [r11], ebx
-    mov     [r10], ecx
+    mov     [r11], eax
+    mov     [r10], ebx
+    mov     [r9],  ecx
     mov     [r8],  edx
     pop     rbx
-    ret
+    RET
 %else
@@ -102,6 +104,7 @@ cglobal x264_stack_align
     call ecx
     leave
     ret
+%endif
 
 ;-----------------------------------------------------------------------------
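The cpuid wrapper now declares its argument count to cglobal and loads the output pointers through movifnidn, so the same body works under both calling conventions. Purely as an aside, the same query is available from C with compiler builtins; a sketch using GCC/Clang's <cpuid.h> (not something this patch adds; MSVC would use __cpuid() from <intrin.h> instead):

#include <cpuid.h>   /* GCC/Clang on x86; MSVC has __cpuid() in <intrin.h> */
#include <stdio.h>

/* C counterpart of the wrapper above: run CPUID for one leaf and hand the
 * four result registers back through pointers. Returns 0 if the leaf is
 * not supported. */
static int cpu_cpuid( unsigned op, unsigned *eax, unsigned *ebx,
                      unsigned *ecx, unsigned *edx )
{
    return __get_cpuid( op, eax, ebx, ecx, edx );
}

int main(void)
{
    unsigned a, b, c, d;
    if( cpu_cpuid( 1, &a, &b, &c, &d ) )
        printf( "SSE2:%d SSSE3:%d\n", (d >> 26) & 1, (c >> 9) & 1 );
    return 0;
}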
common/x86/dct-32.asm

@@ -189,7 +189,7 @@ dct8_mmx:
 ; void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 cglobal x264_sub8x8_dct8_mmx, 3,3
-global x264_sub8x8_dct8_mmx %+ .skip_prologue
+global x264_sub8x8_dct8_mmx.skip_prologue
 .skip_prologue:
     INIT_MMX
     call load_diff_4x8_mmx
@@ -255,7 +255,7 @@ idct8_mmx:
 ; void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
 cglobal x264_add8x8_idct8_mmx, 2,2
-global x264_add8x8_idct8_mmx %+ .skip_prologue
+global x264_add8x8_idct8_mmx.skip_prologue
 .skip_prologue:
     INIT_MMX
     add word [r1], 32
@@ -348,7 +348,7 @@ INIT_XMM
 ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 cglobal x264_sub8x8_dct8_sse2, 3,3
-global x264_sub8x8_dct8_sse2 %+ .skip_prologue
+global x264_sub8x8_dct8_sse2.skip_prologue
 .skip_prologue:
     LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
     LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
@@ -372,7 +372,7 @@ global x264_sub8x8_dct8_sse2 %+ .skip_prologue
 ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
 cglobal x264_add8x8_idct8_sse2, 2,2
-global x264_add8x8_idct8_sse2 %+ .skip_prologue
+global x264_add8x8_idct8_sse2.skip_prologue
 .skip_prologue:
     UNSPILL r1, 1,2,3,5,6,7
     IDCT8_1D 0,1,2,3,4,5,6,7,r1
common/x86/dct-64.asm

@@ -86,7 +86,7 @@ INIT_XMM
 ;-----------------------------------------------------------------------------
 ; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
-cglobal x264_sub8x8_dct8_sse2
+cglobal x264_sub8x8_dct8_sse2, 3,3,10
     LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
     LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
     LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
@@ -108,7 +108,7 @@ cglobal x264_sub8x8_dct8_sse2
     movdqa [r0+0x50], m5
     movdqa [r0+0x60], m6
     movdqa [r0+0x70], m7
-    ret
+    RET
 
 %macro IDCT8_1D 10
@@ -167,7 +167,7 @@ cglobal x264_sub8x8_dct8_sse2
 ;-----------------------------------------------------------------------------
 ; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
-cglobal x264_add8x8_idct8_sse2
+cglobal x264_add8x8_idct8_sse2, 2,2,10
     movdqa m0, [r1+0x00]
     movdqa m1, [r1+0x10]
     movdqa m2, [r1+0x20]
@@ -191,6 +191,6 @@ cglobal x264_add8x8_idct8_sse2
     STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
     STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
     STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
-    ret
+    RET
common/x86/dct-a.asm

@@ -155,12 +155,16 @@ cglobal x264_add4x4_idct_mmx, 2,2
 
 INIT_XMM
-cglobal x264_sub8x8_dct_sse2, 3,3
+cglobal x264_sub8x8_dct_sse2, 3,3,8
 .skip_prologue:
     call .8x4
     add  r0, 64
     add  r1, 4*FENC_STRIDE
     add  r2, 4*FDEC_STRIDE
+%ifdef WIN64
+    call .8x4
+    RET
+%endif
 .8x4:
     SUB_DCT4 2x4x4W
     movhps [r0+32], m0
@@ -169,11 +173,15 @@ cglobal x264_sub8x8_dct_sse2, 3,3
     movhps [r0+56], m3
     ret
 
-cglobal x264_add8x8_idct_sse2, 2,2
+cglobal x264_add8x8_idct_sse2, 2,2,8
 .skip_prologue:
     call .8x4
     add  r1, 64
     add  r0, 4*FDEC_STRIDE
+%ifdef WIN64
+    call .8x4
+    RET
+%endif
 .8x4:
     movq  m0, [r1+ 0]
     movq  m1, [r1+ 8]
@@ -192,6 +200,9 @@ cglobal x264_add8x8_idct_sse2, 2,2
 %macro SUB_NxN_DCT 6
 cglobal %1, 3,3
 .skip_prologue:
+%ifdef WIN64
+    sub  rsp, 8
+%endif
     call %2
     add  r0, %3
     add  r1, %4-%5-%6*FENC_STRIDE
@@ -204,6 +215,9 @@ cglobal %1, 3,3
     add  r0, %3
     add  r1, %4-%5-%6*FENC_STRIDE
     add  r2, %4-%5-%6*FDEC_STRIDE
+%ifdef WIN64
+    add  rsp, 8
+%endif
     jmp  %2
 %endmacro
@@ -213,6 +227,9 @@ cglobal %1, 3,3
 %macro ADD_NxN_IDCT 6
 cglobal %1, 2,2
 .skip_prologue:
+%ifdef WIN64
+    sub  rsp, 8
+%endif
     call %2
     add  r0, %4-%5-%6*FDEC_STRIDE
     add  r1, %3
@@ -222,25 +239,30 @@ cglobal %1, 2,2
     call %2
     add  r0, %4-%5-%6*FDEC_STRIDE
     add  r1, %3
+%ifdef WIN64
+    add  rsp, 8
+%endif
     jmp  %2
 %endmacro
 
 %ifndef ARCH_X86_64
-SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx  %+ .skip_prologue, 32, 4, 0, 0
-ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx %+ .skip_prologue, 32, 4, 0, 0
-SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx  %+ .skip_prologue, 32, 8, 4, 4
-ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx %+ .skip_prologue, 32, 8, 4, 4
+SUB_NxN_DCT  x264_sub8x8_dct_mmx,    x264_sub4x4_dct_mmx.skip_prologue,  32, 4, 0, 0
+ADD_NxN_IDCT x264_add8x8_idct_mmx,   x264_add4x4_idct_mmx.skip_prologue, 32, 4, 0, 0
+SUB_NxN_DCT  x264_sub16x16_dct_mmx,  x264_sub8x8_dct_mmx.skip_prologue,  32, 8, 4, 4
+ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx.skip_prologue, 32, 8, 4, 4
 cextern x264_sub8x8_dct8_mmx.skip_prologue
 cextern x264_add8x8_idct8_mmx.skip_prologue
-SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx  %+ .skip_prologue, 128, 8, 0, 0
-ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx %+ .skip_prologue, 128, 8, 0, 0
+SUB_NxN_DCT  x264_sub16x16_dct8_mmx,  x264_sub8x8_dct8_mmx.skip_prologue,  128, 8, 0, 0
+ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx.skip_prologue, 128, 8, 0, 0
+%define x264_sub8x8_dct_sse2   x264_sub8x8_dct_sse2.skip_prologue
+%define x264_add8x8_idct_sse2  x264_add8x8_idct_sse2.skip_prologue
+%define x264_sub8x8_dct8_sse2  x264_sub8x8_dct8_sse2.skip_prologue
+%define x264_add8x8_idct8_sse2 x264_add8x8_idct8_sse2.skip_prologue
 %endif
-SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2 %+ .skip_prologue,  64, 8, 0, 4
-ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2 %+ .skip_prologue, 64, 8, 0, 4
+SUB_NxN_DCT  x264_sub16x16_dct_sse2,  x264_sub8x8_dct_sse2,  64, 8, 0, 4
+ADD_NxN_IDCT x264_add16x16_idct_sse2, x264_add8x8_idct_sse2, 64, 8, 0, 4
 
 cextern x264_sub8x8_dct8_sse2
 cextern x264_add8x8_idct8_sse2
@@ -286,7 +308,7 @@ cglobal x264_add8x8_idct_dc_mmx, 2,2
     punpcklbw mm1, mm1
     ADD_DC mm0, mm1, r0-FDEC_STRIDE*4
     ADD_DC mm2, mm3, r0
-    ret
+    RET
 
 cglobal x264_add8x8_idct_dc_ssse3, 2,2
     movq  xmm0, [r1]
@@ -324,7 +346,7 @@ cglobal x264_add8x8_idct_dc_ssse3, 2,2
     movhps [r0+FDEC_STRIDE*1], xmm3
     movhps [r0+FDEC_STRIDE*2], xmm4
     movhps [r0+FDEC_STRIDE*3], xmm5
-    ret
+    RET
 
 cglobal x264_add16x16_idct_dc_mmx, 2,3
     mov  r2, 4
@@ -348,7 +370,7 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
     add  r0, FDEC_STRIDE*4
     dec  r2
     jg   .loop
-    ret
+    REP_RET
 
 %macro IDCT_DC_STORE 3
     movdqa xmm4, [r0+%1+FDEC_STRIDE*0]
@@ -369,9 +391,13 @@ cglobal x264_add16x16_idct_dc_mmx, 2,3
     movdqa [r0+%1+FDEC_STRIDE*3], xmm7
 %endmacro
 
-cglobal x264_add16x16_idct_dc_sse2, 2,2
+cglobal x264_add16x16_idct_dc_sse2, 2,2,8
     call .loop
     add  r0, FDEC_STRIDE*4
+%ifdef WIN64
+    call .loop
+    RET
+%endif
 .loop:
     add  r0, FDEC_STRIDE*4
     movq xmm0, [r1+ 0]
@@ -399,9 +425,13 @@ cglobal x264_add16x16_idct_dc_sse2, 2,2
     IDCT_DC_STORE 0, xmm2, xmm3
     ret
 
-cglobal x264_add16x16_idct_dc_ssse3, 2,2
+cglobal x264_add16x16_idct_dc_ssse3, 2,2,8
     call .loop
     add  r0, FDEC_STRIDE*4
+%ifdef WIN64
+    call .loop
+    RET
+%endif
 .loop:
     add  r0, FDEC_STRIDE*4
     movdqa xmm0, [r1]
@@ -428,7 +458,7 @@ cglobal x264_add16x16_idct_dc_ssse3, 2,2
 ; void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
 %macro SCAN_8x8 1
-cglobal x264_zigzag_scan_8x8_frame_%1, 2,2
+cglobal x264_zigzag_scan_8x8_frame_%1, 2,2,8
     movdqa xmm0, [r1]
     movdqa xmm1, [r1+16]
     movdq2q mm0, xmm0
@@ -703,7 +733,7 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
 ;-----------------------------------------------------------------------------
 ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
 ;-----------------------------------------------------------------------------
-cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
+cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
     movd xmm0, [r1+0*FENC_STRIDE]
     movd xmm1, [r1+1*FENC_STRIDE]
     movd xmm2, [r1+2*FENC_STRIDE]
common/x86/deblock-a.asm

@@ -278,7 +278,7 @@ SECTION .text
 ; void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal x264_deblock_v_luma_sse2
+cglobal x264_deblock_v_luma_sse2, 5,5,10
     movd    m8, [r4] ; tc0
     lea     r4, [r1*3]
     dec     r2d        ; alpha-1
@@ -318,54 +318,66 @@ cglobal x264_deblock_v_luma_sse2
     DEBLOCK_P0_Q0
     mova    [r4+2*r1], m1
     mova    [r0], m2
-    ret
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
 INIT_MMX
-cglobal x264_deblock_h_luma_sse2
-    movsxd r10, esi
+cglobal x264_deblock_h_luma_sse2, 5,7
+    movsxd r10, r1d
     lea    r11, [r10+r10*2]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
+%ifdef WIN64
+    sub    rsp, 0x98
+    %define pix_tmp rsp+0x30
+%else
     sub    rsp, 0x68
     %define pix_tmp rsp
+%endif
 
     ; transpose 6x16 -> tmp space
-    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9+r10*8]
-    TRANSPOSE6x8_MEM  PASS8ROWS(rax, r9, r10, r11), pix_tmp+8
+    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r10, r11), pix_tmp+8
 
     ; vertical filter
     ; alpha, beta, tc0 are still in r2d, r3d, r4
-    ; don't backup rax, r9, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
+    ; don't backup r6, r5, r10, r11 because x264_deblock_v_luma_sse2 doesn't use them
     lea    r0, [pix_tmp+0x30]
-    mov    esi, 0x10
+    mov    r1d, 0x10
+%ifdef WIN64
+    mov    [rsp+0x20], r4
+%endif
     call   x264_deblock_v_luma_sse2
 
     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
-    add    rax, 2
-    add    r9,  2
+    add    r6, 2
+    add    r5, 2
     movq   m0, [pix_tmp+0x18]
     movq   m1, [pix_tmp+0x28]
     movq   m2, [pix_tmp+0x38]
     movq   m3, [pix_tmp+0x48]
-    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
 
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6,  r10
+    sub    r5,  r10
     shr    r10, 3
     movq   m0, [pix_tmp+0x10]
     movq   m1, [pix_tmp+0x20]
     movq   m2, [pix_tmp+0x30]
     movq   m3, [pix_tmp+0x40]
-    TRANSPOSE8x4_STORE  PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x4_STORE  PASS8ROWS(r6, r5, r10, r11)
 
+%ifdef WIN64
+    add    rsp, 0x98
+%else
     add    rsp, 0x68
-    ret
+%endif
+    RET
 %else
@@ -388,7 +400,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
     mova    m3, [r0+r1] ; q1
     LOAD_MASK r2, r3
-    mov     r3, r4m
+    mov     r3, r4mp
     movd    m4, [r3] ; tc0
     punpcklbw m4, m4
     punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
@@ -428,7 +440,7 @@ cglobal x264_deblock_%2_luma_%1, 5,5
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal x264_deblock_h_luma_%1, 0,5
-    mov    r0, r0m
+    mov    r0, r0mp
     mov    r3, r1m
     lea    r4, [r3*3]
     sub    r0, 4
@@ -459,7 +471,7 @@ cglobal x264_deblock_h_luma_%1, 0,5
     ADD    esp, 20
 
     ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
-    mov    r0, r0m
+    mov    r0, r0mp
     sub    r0, 2
     lea    r1, [r0+r4]
@@ -609,7 +621,7 @@ DEBLOCK_LUMA sse2, v, 16
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_%2_luma_intra_%1, 4,6
+cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
 %ifndef ARCH_X86_64
     sub     esp, 0x60
 %endif
@@ -671,34 +683,34 @@ INIT_MMX
 ;-----------------------------------------------------------------------------
 ; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
 ;-----------------------------------------------------------------------------
-cglobal x264_deblock_h_luma_intra_%1
+cglobal x264_deblock_h_luma_intra_%1, 4,7
     movsxd r10, r1d
     lea    r11, [r10*3]
-    lea    rax, [r0-4]
-    lea    r9,  [r0-4+r11]
+    lea    r6,  [r0-4]
+    lea    r5,  [r0-4+r11]
     sub    rsp, 0x88
     %define pix_tmp rsp
 
     ; transpose 8x16 -> tmp space
-    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
-    lea    rax, [rax+r10*8]
-    lea    r9,  [r9+r10*8]
-    TRANSPOSE8x8_MEM  PASS8ROWS(rax, r9, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
+    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
+    lea    r6, [r6+r10*8]
+    lea    r5, [r5+r10*8]
+    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 
     lea    r0,  [pix_tmp+0x40]
     mov    r1,  0x10
     call   x264_deblock_v_luma_intra_%1
 
     ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
-    lea    r9, [rax+r11]
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    lea    r5, [r6+r11]
+    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     shl    r10, 3
-    sub    rax, r10
-    sub    r9,  r10
+    sub    r6,  r10
+    sub    r5,  r10
     shr    r10, 3
-    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(rax, r9, r10, r11)
+    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
     add    rsp, 0x88
-    ret
+    RET
 %else
 cglobal x264_deblock_h_luma_intra_%1, 2,4
     lea    r3,  [r1*3]
@@ -727,7 +739,7 @@ cglobal x264_deblock_h_luma_intra_%1, 2,4
     ADD    esp, 16
 
     mov    r1, r1m
-    mov    r0, r0m
+    mov    r0, r0mp
     lea    r3, [r1*3]
     sub    r0, 4
     lea    r2, [r0+r3]
common/x86/mc-a.asm

@@ -42,14 +42,17 @@ SECTION .text
 ; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
 %ifdef ARCH_X86_64
     DECLARE_REG_TMP 0,1,2,3,4,5,10,11
-    %macro AVG_START 0
-        PROLOGUE 6,7
+    %macro AVG_START 0-1 0
+        PROLOGUE 6,7,%1
+%ifdef WIN64
+        movsxd r5, r5d
+%endif
 .height_loop:
     %endmacro
 %else
     DECLARE_REG_TMP 1,2,3,4,5,6,1,2
-    %macro AVG_START 0
-        PROLOGUE 0,7
+    %macro AVG_START 0-1 0
+        PROLOGUE 0,7,%1
         mov t0, r0m
         mov t1, r1m
        mov t2, r2m
@@ -72,30 +75,30 @@ SECTION .text
 %macro BIWEIGHT_MMX 2
     movh      m0, %1
     movh      m1, %2
-    punpcklbw m0, m7
-    punpcklbw m1, m7
-    pmullw    m0, m4
-    pmullw    m1, m5
+    punpcklbw m0, m5
+    punpcklbw m1, m5
+    pmullw    m0, m2
+    pmullw    m1, m3
     paddw     m0, m1
-    paddw     m0, m6
+    paddw     m0, m4
     psraw     m0, 6
 %endmacro
 
 %macro BIWEIGHT_START_MMX 0
-    movd    m4, r6m
-    SPLATW  m4, m4   ; weight_dst
-    mova    m5, [pw_64 GLOBAL]
-    psubw   m5, m4   ; weight_src
-    mova    m6, [pw_32 GLOBAL] ; rounding
-    pxor    m7, m7
+    movd    m2, r6m
+    SPLATW  m2, m2   ; weight_dst
+    mova    m3, [pw_64 GLOBAL]
+    psubw   m3, m2   ; weight_src
+    mova    m4, [pw_32 GLOBAL] ; rounding
+    pxor    m5, m5
 %endmacro
 
 %macro BIWEIGHT_SSSE3 2
     movh      m0, %1
     movh      m1, %2
     punpcklbw m0, m1
-    pmaddubsw m0, m5
-    paddw     m0, m6
+    pmaddubsw m0, m3
+    paddw     m0, m4
     psraw     m0, 6
 %endmacro
@@ -105,9 +108,9 @@ SECTION .text
     sub    t7d, t6d
     shl    t7d, 8
     add    t6d, t7d
-    movd   m5, t6d
-    mova   m6, [pw_32 GLOBAL]
-    SPLATW m5, m5   ; weight_dst,src
+    movd   m3, t6d
+    mova   m4, [pw_32 GLOBAL]
+    SPLATW m3, m3   ; weight_dst,src
 %endmacro
@@ -116,27 +119,27 @@ SECTION .text
     packuswb m0, m0
     movh     [%1], m0
 %else
-    SWAP 0, 2
+    SWAP 0, 6
     BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
-    packuswb m2, m0
-    mova     [%1], m2
+    packuswb m6, m0
+    mova     [%1], m6
 %endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
 ; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
 ;-----------------------------------------------------------------------------
-%macro AVG_WEIGHT 2
-cglobal x264_pixel_avg_weight_w%2_%1, 0,0
+%macro AVG_WEIGHT 2-3 0
+cglobal x264_pixel_avg_weight_w%2_%1
     BIWEIGHT_START
-    AVG_START
+    AVG_START %3
 %if %2==8 && mmsize==16
     BIWEIGHT [t2], [t4]
-    SWAP 0, 2
+    SWAP 0, 6
    BIWEIGHT [t2+t3], [t4+t5]
-    packuswb m2, m0
-    movlps   [t0], m2
-    movhps   [t0+t1], m2
+    packuswb m6, m0
+    movlps   [t0], m6
+    movhps   [t0+t1], m6
 %else
 %assign x 0
 %rep 1+%2/(mmsize*2)
@@ -161,15 +164,15 @@ AVG_WEIGHT mmxext, 8
 AVG_WEIGHT mmxext, 16
 INIT_XMM
 %define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
 AVG_WEIGHT sse2, 8