VideoLAN / x264 / Commits

Commit 43e4162f, authored Oct 06, 2006 by Loren Merritt

merge center_filter_mmx with horizontal_filter_mmx

git-svn-id: svn://svn.videolan.org/x264/trunk@584 df754926-b1dd-0310-bc7b-ec298dee348c

parent c485e7e7
Changes 3
common/amd64/mc-a2.asm
...
...
@@ -30,316 +30,246 @@ BITS 64
; Read only data
;=============================================================================
SECTION .rodata
SECTION .rodata align=16

ALIGN 16
mmx_dw_16: times 4 dw 16
mmx_dw_32: times 4 dw 32

%assign tbuffer 0
pw_1:  times 4 dw 1
pw_16: times 4 dw 16
pw_32: times 4 dw 32
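; word-sized rounding constants: pw_16 is added before the >>5 in the vertical
; pass, pw_32 before the >>6 in the center pass, pw_1 before the >>1 of
; FILT_PACK 1 in the horizontal pass (see the code below)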
;=============================================================================
; Macros
;=============================================================================
%macro LOAD_4 9
    movd      %1, %5
    movd      %2, %6
    movd      %3, %7
    movd      %4, %8
    punpcklbw %1, %9
    punpcklbw %2, %9
    punpcklbw %3, %9
    punpcklbw %4, %9
%endmacro
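; LOAD_4: loads four packed-byte dwords (%5..%8) into %1..%4 and widens the
; bytes to 16-bit words by interleaving with %9 (a zeroed register at every
; call site)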
%macro FILT_2 2
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2

%macro LOAD_ADD 3
    movd      %1,  %2
    movd      mm7, %3
    punpcklbw %1,  mm0
    punpcklbw mm7, mm0
    paddw     %1,  mm7
%endmacro

%macro FILT_4 3
    paddw %2, %3
    psllw %2, 2
    paddw %1, %2
    psllw %2, 2
    paddw %1, %2

%macro FILT_V 0
    psubw mm1, mm2      ; a-b
    psubw mm4, mm5
    psubw mm2, mm3      ; b-c
    psubw mm5, mm6
    psllw mm2, 2
    psllw mm5, 2
    psubw mm1, mm2      ; a-5*b+4*c
    psubw mm4, mm5
    psllw mm3, 4
    psllw mm6, 4
    paddw mm1, mm3      ; a-5*b+20*c
    paddw mm4, mm6
%endmacro
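; FILT_V: with a in mm1/mm4, b in mm2/mm5 and c in mm3/mm6 (two lanes of four
; words, as loaded by LOAD_ADD from row pairs 0+5, 1+4 and 2+3), computes the
; 6-tap sum a - 5*b + 20*c, i.e. taps (1,-5,20,20,-5,1), leaving the unrounded
; results in mm1 and mm4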
%macro FILT_6 4
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2
    paddw %1, %3
    paddw %1, %4
    psraw %1, 5

%macro FILT_H 0
    psubw mm1, mm2      ; a-b
    psubw mm4, mm5
    psraw mm1, 2        ; (a-b)/4
    psraw mm4, 2
    psubw mm1, mm2      ; (a-b)/4-b
    psubw mm4, mm5
    paddw mm1, mm3      ; (a-b)/4-b+c
    paddw mm4, mm6
    psraw mm1, 2        ; ((a-b)/4-b+c)/4
    psraw mm4, 2
    paddw mm1, mm3      ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    paddw mm4, mm6
%endmacro
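; the identity used above, step by step (exact in rational arithmetic; the
; psraw shifts truncate the intermediates):
;   (a-b)/4 - b + c        = (a - 5*b + 4*c)/4
;   ((a - 5*b + 4*c)/4)/4  = (a - 5*b + 4*c)/16
;   (a - 5*b + 4*c)/16 + c = (a - 5*b + 20*c)/16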
%macro FILT_ALL 1
    LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
    FILT_2 mm1, mm2
    movd   mm5, [%1 + 4 * rcx]
    movd   mm6, [%1 + rdx]
    FILT_4 mm1, mm3, mm4
    punpcklbw mm5, mm0
    punpcklbw mm6, mm0
    psubw  mm1, mm5
    psllw  mm5, 2
    psubw  mm1, mm5
    paddw  mm1, mm6

%macro FILT_PACK 1
    paddw    mm1, mm7
    paddw    mm4, mm7
    psraw    mm1, %1
    psraw    mm4, %1
    packuswb mm1, mm4
%endmacro
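; FILT_PACK: adds mm7 (a rounding constant at the call sites), arithmetic-
; shifts both word lanes right by %1, and packs them back to unsigned bytes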
;=============================================================================
; Code
;=============================================================================
SECTION .text

cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext
cglobal x264_hpel_filter_mmxext
cglobal x264_plane_copy_mmxext
;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
; uint8_t *dst2, int i_dst2_stride,
; uint8_t *src, int i_src_stride,
; int i_width, int i_height );
;
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
; int i_stride, int i_width, int i_height );
;-----------------------------------------------------------------------------
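; (dstv, dstc and dsth receive the vertically, centrally and horizontally
; interpolated half-pel planes; all three passes use the same 6-tap
; (1,-5,20,20,-5,1) kernel implemented by the macros above)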
ALIGN 16
x264_center_filter_mmxext:
x264_hpel_filter_mmxext:
    push r15
    pushreg r15
%ifdef WIN64
    push rdi
    pushreg rdi
    push rsi
    pushreg rsi
%endif
    push rbp
    pushreg rbp
    push rbx
    pushreg rbx
    push r12
    pushreg r12
    push r13
    pushreg r13
    push r14
    pushreg r14
    lea    rbp, [rsp]
    mov    rbp, rsp
    setframe rbp, 0
    endprolog
%ifdef WIN64
    movsxd r13, dword [rsp + 64 + 48]   ; src_stride
    mov    r12, [rsp + 64 + 40]         ; src
    mov    rdi, parm1q
    mov    rsi, parm2q
    mov    rdx, parm3q
    mov    rcx, parm4q
    movsxd r8,  dword [rbp + 72]
    movsxd r9,  dword [rbp + 80]
    mov    ebx, dword [rbp + 88]
%else
    movsxd r13, r9d                     ; src_stride
    mov    r12, r8                      ; src
    mov    ebx, dword [rbp + 24]
%endif
    sub    r12, r13
    sub    r12, r13                     ; tsrc = src - 2 * src_stride
    ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
    lea    rax, [r13 + r13 + 24 + tbuffer]
%define dsth rdi
%define dstv rsi
%define dstc rdx
%define src rcx
%define stride r8
%define width r9
%define height ebx
%define stride3 r10
%define stride5 r11
%define x rax
%define tbuffer rsp + 8
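; the %defines above alias the argument registers to readable names; tbuffer
; points at a stack scratch row that holds the unrounded 16-bit vertical-filter
; sums, which the center pass re-reads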
    lea    stride3, [stride * 3]
    lea    stride5, [stride * 5]
    sub    src, stride
    sub    src, stride
    lea    rax, [stride * 2 + 24]
    sub    rsp, rax
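; reserve stride*2+24 bytes of stack for the scratch row of word-sized
; intermediates (see the alignment note above)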
    mov    r10, parm3q                  ; dst2
    movsxd r11, parm4d                  ; dst2_stride
    mov    r8,  parm1q                  ; dst1
    movsxd r9,  parm2d                  ; dst1_stride
%ifdef WIN64
    movsxd r14, dword [rbp + 64 + 56]   ; width
    movsxd r15, dword [rbp + 64 + 64]   ; height
%else
    movsxd r14, dword [rbp + 56]        ; width
    movsxd r15, dword [rbp + 64]        ; height
%endif
    pxor   mm0, mm0
    mov    rcx, r13                     ; src_stride
    lea    rbx, [r13 + r13 * 2]         ; 3 * src_stride
    lea    rdx, [r13 + r13 * 4]         ; 5 * src_stride

.loopy:
    pxor   mm0, mm0                     ; 0 ---> mm0
    xor    x, x

ALIGN 16
.vertical_filter:
.loopcy:
    prefetchnta [src + stride5 + 32]
    mov    rax, 4
    mov    rsi, r12                     ; tsrc
    lea    rdi, [r8 - 4]                ; rdi = dst1 - 4
    movq   mm7, [mmx_dw_16 GLOBAL]      ; for rounding
    LOAD_ADD mm1, [src], [src + stride5]                        ; a0
    LOAD_ADD mm2, [src + stride], [src + stride * 4]            ; b0
    LOAD_ADD mm3, [src + stride * 2], [src + stride3]           ; c0
    LOAD_ADD mm4, [src + 4], [src + stride5 + 4]                ; a1
    LOAD_ADD mm5, [src + stride + 4], [src + stride * 4 + 4]    ; b1
    LOAD_ADD mm6, [src + stride * 2 + 4], [src + stride3 + 4]   ; c1
.vertical_filter:
FILT_V
    prefetchnta [rsi + rdx + 32]
    FILT_ALL rsi
    movq   mm7, mm1
    FILT_ALL rsi + 4
    movq   mm6, [mmx_dw_16 GLOBAL]
    movq   [rsp + tbuffer + 2 * rax], mm7
    movq   [rsp + tbuffer + 2 * rax + 8], mm1
    paddw  mm7, mm6
    paddw  mm1, mm6
    psraw  mm7, 5
    movq   mm7, [pw_16 GLOBAL]
    movq   [tbuffer + x * 2], mm1
    movq   [tbuffer + x * 2 + 8], mm4
    paddw  mm1, mm7
    paddw  mm4, mm7
    psraw  mm1, 5
    packuswb mm7, mm1
    movntq [rdi + rax], mm7             ; dst1[rax - 4]
    cmp    rax, r14                     ; cmp rax, width
    lea    rsi, [rsi + 8]
    lea    rax, [rax + 8]
    jl     .vertical_filter
    psraw  mm4, 5
    packuswb mm1, mm4
    movntq [dstv + x], mm1
    pshufw mm2, [rsp + tbuffer + 8], 0
    movq   [rsp + tbuffer], mm2         ; pad left
; no need to pad right, since loopcx1 already did 4 extra pixels
    add    x, 8
    add    src, 8
    cmp    x, width
    jle    .vertical_filter
    add    r12, r13                     ; tsrc = tsrc + src_stride
    add    r8,  r9                      ; dst1 = dst1 + dst1_stride
    xor    rax, rax
    movq   mm7, [mmx_dw_32 GLOBAL]      ; for rounding
    pshufw mm2, [tbuffer], 0
    movq   [tbuffer - 8], mm2           ; pad left
    ; no need to pad right, since vertical_filter already did 4 extra pixels
    sub    src, x
    xor    x, x
    movq   mm7, [pw_32 GLOBAL]
.center_filter:
    movq   mm1, [rsp + 2 * rax + 4 + tbuffer]
    movq   mm2, [rsp + 2 * rax + 2 + 4 + tbuffer]
    movq   mm3, [rsp + 2 * rax + 4 + 4 + tbuffer]
    movq   mm4, [rsp + 2 * rax + 8 + 4 + tbuffer]
    movq   mm5, [rsp + 2 * rax + 10 + 4 + tbuffer]
    paddw  mm3, [rsp + 2 * rax + 6 + 4 + tbuffer]
    paddw  mm2, mm4
    paddw  mm1, mm5
    movq   mm6, [rsp + 2 * rax + 12 + 4 + tbuffer]
    paddw  mm4, [rsp + 2 * rax + 18 + 4 + tbuffer]
    paddw  mm5, [rsp + 2 * rax + 16 + 4 + tbuffer]
    paddw  mm6, [rsp + 2 * rax + 14 + 4 + tbuffer]
    psubw  mm1, mm2                     ; a-b
    psubw  mm4, mm5
    psraw  mm1, 2                       ; (a-b)/4
    psraw  mm4, 2
    psubw  mm1, mm2                     ; (a-b)/4-b
    psubw  mm4, mm5
    paddw  mm1, mm3                     ; (a-b)/4-b+c
    paddw  mm4, mm6
    psraw  mm1, 2                       ; ((a-b)/4-b+c)/4
    psraw  mm4, 2
    paddw  mm1, mm3                     ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    paddw  mm4, mm6
    paddw  mm1, mm7                     ; +32
    paddw  mm4, mm7
    psraw  mm1, 6
    psraw  mm4, 6
    packuswb mm1, mm4
    movntq [r10 + rax], mm1             ; dst2[rax]
    movq   mm1, [tbuffer + x * 2 - 4]
    movq   mm2, [tbuffer + x * 2 - 2]
    movq   mm3, [tbuffer + x * 2]
    movq   mm4, [tbuffer + x * 2 + 4]
    movq   mm5, [tbuffer + x * 2 + 6]
    paddw  mm3, [tbuffer + x * 2 + 2]   ; c0
    paddw  mm2, mm4                     ; b0
    paddw  mm1, mm5                     ; a0
    movq   mm6, [tbuffer + x * 2 + 8]
    paddw  mm4, [tbuffer + x * 2 + 14]  ; a1
    paddw  mm5, [tbuffer + x * 2 + 12]  ; b1
    paddw  mm6, [tbuffer + x * 2 + 10]  ; c1
    FILT_H
    FILT_PACK 6
    movntq [dstc + x], mm1
    add    x, 8
    cmp    x, width
    jl     .center_filter
    lea    src, [src + stride * 2]
    xor    x, x
.horizontal_filter:
    movd   mm1, [src + x - 2]
    movd   mm2, [src + x - 1]
    movd   mm3, [src + x]
    movd   mm6, [src + x + 1]
    movd   mm4, [src + x + 2]
    movd   mm5, [src + x + 3]
    punpcklbw mm1, mm0
    punpcklbw mm2, mm0
    punpcklbw mm3, mm0
    punpcklbw mm6, mm0
    punpcklbw mm4, mm0
    punpcklbw mm5, mm0
    paddw  mm3, mm6                     ; c0
    paddw  mm2, mm4                     ; b0
    paddw  mm1, mm5                     ; a0
    movd   mm7, [src + x + 7]
    movd   mm6, [src + x + 6]
    punpcklbw mm7, mm0
    punpcklbw mm6, mm0
    paddw  mm4, mm7                     ; c1
    paddw  mm5, mm6                     ; b1
    movd   mm7, [src + x + 5]
    movd   mm6, [src + x + 4]
    punpcklbw mm7, mm0
    punpcklbw mm6, mm0
    paddw  mm6, mm7                     ; a1
    add    rax, 8
    cmp    rax, r14                     ; cmp rax, width
    jnz    .center_filter
    movq   mm7, [pw_1 GLOBAL]
    FILT_H
    FILT_PACK 1
    movntq [dsth + x], mm1
    add    r10, r11                     ; dst2 += dst2_stride
    dec    r15                          ; height
    jnz    .loopcy
    add    x, 8
    cmp    x, width
    jl     .horizontal_filter
    lea    rsp, [rbp]
    sub    src, stride
    add    dsth, stride
    add    dstv, stride
    add    dstc, stride
    dec    height
    jg     .loopy
    pop    r14
    pop    r13
    pop    r12
    mov    rsp, rbp
    pop    rbx
    pop    rbp
%ifdef WIN64
    pop    rsi
    pop    rdi
%endif
    pop    r15
    ret
;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src, int i_src_stride,
; int i_width, int i_height );
;
;-----------------------------------------------------------------------------
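; (standalone horizontal 6-tap pass: same kernel as above, producing 8 output
; bytes per inner-loop iteration with a plain movq store)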
ALIGN 16
x264_horizontal_filter_mmxext:
    movsxd r10, parm2d                  ; dst_stride
    movsxd r11, parm4d                  ; src_stride
%ifdef WIN64
    mov    rdx, r8                      ; src
    mov    r9,  rcx                     ; dst
    movsxd rcx, parm6d                  ; height
%else
    movsxd rcx, parm6d                  ; height
    mov    r9,  rdi                     ; dst
%endif
    movsxd r8,  parm5d                  ; width
    pxor   mm0, mm0
    movq   mm7, [mmx_dw_16 GLOBAL]
    sub    rdx, 2
loophy:
    xor    rax, rax
loophx:
    prefetchnta [rdx + rax + 48]
    LOAD_4 mm1, mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
    FILT_2 mm1, mm2
    movd   mm5, [rdx + rax + 4]
    movd   mm6, [rdx + rax + 5]
    FILT_4 mm1, mm3, mm4
    movd   mm2, [rdx + rax + 4]
    movd   mm3, [rdx + rax + 6]
    punpcklbw mm5, mm0
    punpcklbw mm6, mm0
    FILT_6 mm1, mm5, mm6, mm7
    movd   mm4, [rdx + rax + 7]
    movd   mm5, [rdx + rax + 8]
    punpcklbw mm2, mm0
    punpcklbw mm3, mm0
    ; mm2(1), mm3(20), mm6(-5) ready
    FILT_2 mm2, mm6
    movd   mm6, [rdx + rax + 9]
    punpcklbw mm4, mm0
    punpcklbw mm5, mm0
    ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
    FILT_4 mm2, mm3, mm4
    punpcklbw mm6, mm0
    FILT_6 mm2, mm5, mm6, mm7
    packuswb mm1, mm2
    movq   [r9 + rax], mm1
    add    rax, 8
    cmp    rax, r8                      ; cmp rax, width
    jnz    loophx
    add    rdx, r11                     ; src_pitch
    add    r9,  r10                     ; dst_pitch
    dec    rcx
    jnz    loophy
    ret
;-----------------------------------------------------------------------------
...
...
common/i386/mc-a2.asm
...
...
@@ -33,350 +33,245 @@ BITS 32
SECTION _RODATA

ALIGN 16
mmx_dw_one: times 4 dw 16
mmx_dd_one: times 2 dd 512
mmx_dw_20:  times 4 dw 20
mmx_dw_5:   times 4 dw -5
%assign twidth 0
%assign theight 4
%assign tdstp1 8
%assign tdstp2 12
%assign tdst1 16
%assign tdst2 20
%assign tsrc 24
%assign tsrcp 28
%assign toffset 32
%assign tbuffer 36
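; (these %assign values appear to be byte offsets into a stack scratch area
; used by the i386 version of the filter, which lies outside this hunk)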
pw_1:  times 4 dw 1
pw_16: times 4 dw 16
pw_32: times 4 dw 32
;=============================================================================
; Macros
;=============================================================================
%macro LOAD_4 9
    movd      %1, %5
    movd      %2, %6
    movd      %3, %7
    movd      %4, %8
    punpcklbw %1, %9
    punpcklbw %2, %9
    punpcklbw %3, %9
    punpcklbw %4, %9
%endmacro
%macro FILT_2 2
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2

%macro LOAD_ADD 3
    movd      %1,  %2
    movd      mm7, %3
    punpcklbw %1,  mm0
    punpcklbw mm7, mm0
    paddw     %1,  mm7
%endmacro

%macro FILT_4 3
    paddw %2, %3
    psllw %2, 2
    paddw %1, %2
    psllw %2, 2
    paddw %1, %2

%macro FILT_V 0
    psubw mm1, mm2      ; a-b
    psubw mm4, mm5
    psubw mm2, mm3      ; b-c
    psubw mm5, mm6
    psllw mm2, 2
    psllw mm5, 2
    psubw mm1, mm2      ; a-5*b+4*c
    psubw mm4, mm5
    psllw mm3, 4
    psllw mm6, 4
    paddw mm1, mm3      ; a-5*b+20*c
    paddw mm4, mm6
%endmacro
%macro FILT_6 4
    psubw %1, %2
    psllw %2, 2
    psubw %1, %2
    paddw %1, %3
    paddw %1, %4
    psraw %1, 5

%macro FILT_H 0
    psubw mm1, mm2      ; a-b
    psubw mm4, mm5
    psraw mm1, 2        ; (a-b)/4
    psraw mm4, 2
    psubw mm1, mm2      ; (a-b)/4-b
    psubw mm4, mm5
    paddw mm1, mm3      ; (a-b)/4-b+c
    paddw mm4, mm6
    psraw mm1, 2        ; ((a-b)/4-b+c)/4
    psraw mm4, 2
    paddw mm1, mm3      ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    paddw mm4, mm6
%endmacro
%macro FILT_ALL 1
    LOAD_4 mm1, mm2, mm3, mm4, [%1], [%1 + ecx], [%1 + 2 * ecx], [%1 + ebx], mm0
    FILT_2 mm1, mm2
    movd   mm5, [%1 + 4 * ecx]
    movd   mm6, [%1 + edx]
    FILT_4 mm1, mm3, mm4
    punpcklbw mm5, mm0
    punpcklbw mm6, mm0
    psubw  mm1, mm5
    psllw  mm5, 2
    psubw  mm1, mm5
    paddw  mm1, mm6

%macro FILT_PACK 1
    paddw    mm1, mm7
    paddw    mm4, mm7
    psraw    mm1, %1
    psraw    mm4, %1
    packuswb mm1, mm4
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text

cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext
cglobal x264_hpel_filter_mmxext
cglobal x264_plane_copy_mmxext
;-----------------------------------------------------------------------------
;