Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
918a791b
Commit
918a791b
authored
Mar 05, 2006
by
Loren Merritt
Browse files
more amd64 mmx intra prediction
git-svn-id:
svn://svn.videolan.org/x264/trunk@454
df754926-b1dd-0310-bc7b-ec298dee348c
parent
469b4e50
Changes
6
Hide whitespace changes
Inline
Side-by-side
common/amd64/predict-a.asm
View file @
918a791b
...
...
@@ -60,15 +60,21 @@ ALIGN 4
SECTION
.rodata
al
ign
=
16
ALIGN
8
ALIGN
16
pw_2:
times
4
dw
2
pw_8:
times
4
dw
8
pb_1:
times
8
db
1
pb_1:
times
16
db
1
pw_3210:
dw
0
dw
1
dw
2
dw
3
ALIGN
16
pb_00s_ff:
times
8
db
0
pb_0s_ff:
times
7
db
0
db
0xff
;=============================================================================
; Code
...
...
@@ -76,29 +82,44 @@ pw_3210:
SECTION
.text
cglobal
predict_4x4_ddl_mmxext
cglobal
predict_4x4_vl_mmxext
cglobal
predict_8x8_v_mmxext
cglobal
predict_8x8_ddl_mmxext
cglobal
predict_8x8_ddl_sse2
cglobal
predict_8x8_ddr_sse2
cglobal
predict_8x8_vl_sse2
cglobal
predict_8x8_vr_core_mmxext
cglobal
predict_8x8_dc_core_mmxext
cglobal
predict_8x8c_v_mmx
cglobal
predict_8x8c_dc_core_mmxext
cglobal
predict_8x8c_p_core_mmx
cglobal
predict_16x16_p_core_mmx
cglobal
predict_8x8c_p_core_mmx
ext
cglobal
predict_16x16_p_core_mmx
ext
cglobal
predict_16x16_v_mmx
cglobal
predict_16x16_dc_core_mmxext
cglobal
predict_16x16_dc_top_mmxext
%macro PRED8x8_LOWPASS 2
movq
mm3
,
mm1
pavgb
mm1
,
mm2
pxor
mm2
,
mm3
movq
%
1
,
%
2
pand
mm2
,
[
pb_1
GLOBAL
]
psubusb
mm1
,
mm2
pavgb
%
1
,
mm1
; %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
;-----------------------------------------------------------------------------
; PRED8x8_LOWPASS0 dest, left, right, src, tmp, suffix
;
;   output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2, per byte
;   %2 = t[n-1], %3 = t[n+1], %4 = t[n] (register or memory), %5 = scratch
;   %6 is pasted onto "mov": q -> movq, dqa -> movdqa
;   overwrites %2, %3 and %5 as well as %1
;-----------------------------------------------------------------------------
%macro PRED8x8_LOWPASS0 6
    mov%6       %5, %2                  ; tmp = left, saved before pavgb overwrites it
    pavgb       %2, %3                  ; left = (left + right + 1) >> 1
    pxor        %3, %5                  ; right = left ^ right
    mov%6       %1, %4                  ; dest = src
    pand        %3, [pb_1 GLOBAL]       ; low bit of each byte = the round-up pavgb added
    psubusb     %2, %3                  ; left = (left + right) >> 1, rounded down
    pavgb       %1, %2                  ; dest = (src + left + 1) >> 1
%endmacro
; 64-bit (mmx register) form of the 3-tap lowpass.
; args: dest, left, right, src, tmp -- forwarded unchanged, moves done with movq
%macro PRED8x8_LOWPASS 5
    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, q
%endmacro
; 128-bit (xmm register) form of the 3-tap lowpass.
; args: dest, left, right, src, tmp -- forwarded unchanged, moves done with movdqa
%macro PRED8x8_LOWPASS_XMM 5
    PRED8x8_LOWPASS0 %1, %2, %3, %4, %5, dqa
%endmacro
%macro PRED8x8_LOAD_TOP 0
; output: mm0 = filtered t0..t7
%macro PRED8x8_LOAD_TOP_FILT 0
sub
parm1q
,
FDEC_STRIDE
and
parm2d
,
12
...
...
@@ -108,20 +129,116 @@ cglobal predict_16x16_dc_top_mmxext
cmp
parm2d
,
byte
8
jge
.have_topleft
mov
al
,
[
parm1q
]
mov
ah
,
[
parm1q
]
mov
ah
,
al
pinsrw
mm1
,
eax
,
0
.have_topleft:
and
parm2d
,
byte
4
jne
.have_topright
mov
al
,
[
parm1q
+
7
]
mov
ah
,
[
parm1q
+
7
]
mov
ah
,
al
pinsrw
mm2
,
eax
,
3
.have_topright:
PRED8x8_LOWPASS
mm0
,
[
parm1q
]
PRED8x8_LOWPASS
mm0
,
mm1
,
mm2
,
[
parm1q
],
mm7
%endmacro
;-----------------------------------------------------------------------------
; PRED8x8_LOAD_TOP_TOPRIGHT_XMM
;
;   input:  parm1q = src, parm2d = i_neighbors (only bits 8 and 4 are looked at:
;           8 -> top-left present, 4 -> top-right present)
;   output: xmm0 = unfiltered t0..t15
;           xmm1 = unfiltered tl..t14   (row loaded from one byte to the left)
;           xmm2 = unfiltered t1..t15, last sample repeated
;           parm1q is left pointing FDEC_STRIDE bytes above src
;   missing neighbours are replaced by replicating the nearest top sample
;   overwrites eax, parm2d and the flags
;-----------------------------------------------------------------------------
%macro PRED8x8_LOAD_TOP_TOPRIGHT_XMM 0
    sub         parm1q, FDEC_STRIDE
    and         parm2d, 12
    movdqu      xmm1, [parm1q-1]
    cmp         parm2d, byte 8
    jge         .have_topleft
    ; no top-left: overwrite bytes 0-1 of xmm1 with t0 t0
    mov         al, [parm1q]
    mov         ah, al
    pinsrw      xmm1, eax, 0
.have_topleft:
    and         parm2d, byte 4
    jne         .have_topright
    ; no top-right: fill the upper half of all three vectors with t7
    mov         al, [parm1q+7]
    mov         ah, al
    pinsrw      xmm1, eax, 4
    pshufhw     xmm1, xmm1, 0           ; broadcast word 4 over words 4..7
    movdqa      xmm0, xmm1
    movdqa      xmm2, xmm1
    psrldq      xmm0, 1
    psrldq      xmm2, 2
    pshufhw     xmm0, xmm0, 0           ; refill the bytes the shift zeroed
    pshufhw     xmm2, xmm2, 0
    jmp         .done_topright
.have_topright:
    movdqu      xmm0, [parm1q]
    movdqa      xmm2, xmm0
    psrldq      xmm2, 1
    mov         al, [parm1q+15]
    mov         ah, al
    pinsrw      xmm2, eax, 7            ; last word = t15 t15
.done_topright:
%endmacro
;-----------------------------------------------------------------------------
;
; void predict_4x4_ddl_mmxext( uint8_t *src )
;
; Lowpass-filters the 8 bytes of the row above src, then writes 4 rows of
; 4 bytes, each row taken one byte further along the filtered vector.
; Overwrites mm0-mm5 and moves parm1q up by one row.
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_4x4_ddl_mmxext:
    sub         parm1q, FDEC_STRIDE
    movq        mm3, [parm1q]           ; centre taps
    movq        mm1, [parm1q-1]         ; left taps
    movq        mm2, mm3
    movq        mm4, [pb_0s_ff GLOBAL]
    psrlq       mm2, 8
    pand        mm4, mm3                ; isolate byte 7 of the top row
    por         mm2, mm4                ; right taps, last byte repeated
    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5

    ; Y counts rows from the row above the block, so Y = 1..4 are rows 0..3
%assign Y 1
%rep 4
    psrlq       mm0, 8
    movd        [parm1q+Y*FDEC_STRIDE], mm0
%assign Y (Y+1)
%endrep

    ret
;-----------------------------------------------------------------------------
;
; void predict_4x4_vl_mmxext( uint8_t *src )
;
; Rows 0 and 2 come from the 2-tap average of the row above src, rows 1 and 3
; from its 3-tap lowpass; rows 2 and 3 are taken one byte further along.
; Overwrites mm0-mm5.
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_4x4_vl_mmxext:
    movq        mm1, [parm1q-FDEC_STRIDE]
    movq        mm3, mm1
    movq        mm2, mm1
    psrlq       mm3, 8                  ; top row advanced by one sample
    psrlq       mm2, 16                 ; top row advanced by two samples
    movq        mm4, mm3
    pavgb       mm4, mm1                ; mm4 = (t[n] + t[n+1] + 1) >> 1
    PRED8x8_LOWPASS mm0, mm1, mm2, mm3, mm5

    movd        [parm1q+0*FDEC_STRIDE], mm4
    movd        [parm1q+1*FDEC_STRIDE], mm0
    psrlq       mm4, 8
    psrlq       mm0, 8
    movd        [parm1q+2*FDEC_STRIDE], mm4
    movd        [parm1q+3*FDEC_STRIDE], mm0

    ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_v_mmxext( uint8_t *src, int i_neighbors )
...
...
@@ -130,7 +247,7 @@ cglobal predict_16x16_dc_top_mmxext
ALIGN
16
predict_8x8_v_mmxext:
PRED8x8_LOAD_TOP
PRED8x8_LOAD_TOP
_FILT
STORE8x8
mm0
,
mm0
ret
...
...
@@ -144,9 +261,9 @@ ALIGN 16
predict_8x8_dc_core_mmxext:
movq
mm1
,
[
parm3q
-
1
]
movq
mm2
,
[
parm3q
+
1
]
PRED8x8_LOWPASS
mm4
,
[
parm3q
]
PRED8x8_LOWPASS
mm4
,
mm1
,
mm2
,
[
parm3q
]
,
mm7
PRED8x8_LOAD_TOP
PRED8x8_LOAD_TOP
_FILT
pxor
mm1
,
mm1
psadbw
mm0
,
mm1
...
...
@@ -160,6 +277,280 @@ predict_8x8_dc_core_mmxext:
STORE8x8
mm0
,
mm0
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_ddl_mmxext( uint8_t *src, int i_neighbors )
;
; i_neighbors: bit 8 = top-left present, bit 4 = top-right present (the only
; bits examined).  Missing neighbours are replaced by replicating the nearest
; top sample.  The top and top-right rows are lowpass-filtered once to form the
; edge, then filtered a second time to form the prediction, which is stored
; bottom row first, sliding one byte per row.
; Overwrites eax, parm2d, mm0-mm7 and moves parm1q up by one row.
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddl_mmxext:
    sub         parm1q, FDEC_STRIDE
    and         parm2d, 12
    movq        mm1, [parm1q-1]
    movq        mm2, [parm1q+1]
    cmp         parm2d, byte 8
    jge         .have_topleft
    mov         al, [parm1q]
    mov         ah, al
    pinsrw      mm1, eax, 0             ; no top-left: bytes 0-1 = t0 t0
.have_topleft:
    and         parm2d, byte 4
    jne         .have_topright
    mov         al, [parm1q+7]
    mov         ah, al                  ; was a second load of [parm1q+7]; same value
    pinsrw      mm2, eax, 3             ; no top-right: last word = t7 t7
    pshufw      mm3, mm2, 0xff          ; filtered top-right = t7 in every byte
    jmp         .done_topright
.have_topright:
    movq        mm5, [parm1q+9]
    mov         al, [parm1q+15]
    mov         ah, al
    pinsrw      mm5, eax, 3             ; last word = t15 t15
    movq        mm4, [parm1q+7]
    PRED8x8_LOWPASS mm3, mm4, mm5, [parm1q+8], mm7
.done_topright:
;?0123456789abcdeff
; [-mm0--][-mm3--]
;[-mm1--][-mm4--]
;  [-mm2--][-mm5--]
    PRED8x8_LOWPASS mm0, mm1, mm2, [parm1q], mm7

    ; build the left/right neighbour vectors of the filtered edge mm0:mm3
    movq        mm1, mm0
    movq        mm2, mm0
    psllq       mm1, 8
    psrlq       mm2, 8
    movq        mm6, mm3
    movq        mm4, mm3
    psllq       mm6, 56
    movq        mm7, mm0
    por         mm2, mm6                ; byte 0 of mm3 carried into byte 7 of mm2
    psllq       mm4, 8
    movq        mm5, mm3
    movq        mm6, mm3
    psrlq       mm5, 8
    pand        mm6, [pb_0s_ff GLOBAL]  ; keep only byte 7 of mm3
    psrlq       mm7, 56
    por         mm5, mm6                ; last sample repeated
    por         mm4, mm7                ; byte 7 of mm0 carried into byte 0 of mm4
    PRED8x8_LOWPASS mm6, mm1, mm2, mm0, mm7
    PRED8x8_LOWPASS mm7, mm4, mm5, mm3, mm2

    ; mm6:mm7 is shifted up one byte per row, storing mm7 each time
%assign Y 8
%rep 6
    movq        [parm1q+Y*FDEC_STRIDE], mm7
    movq        mm1, mm6
    psllq       mm7, 8
    psrlq       mm1, 56
    psllq       mm6, 8
    por         mm7, mm1
%assign Y (Y-1)
%endrep
    movq        [parm1q+Y*FDEC_STRIDE], mm7
    psllq       mm7, 8
    psrlq       mm6, 56
    por         mm7, mm6
%assign Y (Y-1)
    movq        [parm1q+Y*FDEC_STRIDE], mm7

    ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_ddl_sse2( uint8_t *src, int i_neighbors )
;
; Loads top + top-right through PRED8x8_LOAD_TOP_TOPRIGHT_XMM, applies the
; 3-tap lowpass twice (edge filter, then prediction filter) and stores 8 rows,
; each taken one byte further along the result.
; Overwrites eax, parm2d, xmm0-xmm5; parm1q ends one row above src.
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddl_sse2:
    PRED8x8_LOAD_TOP_TOPRIGHT_XMM
;?0123456789abcdeff
; [-----xmm0-----]
;[-----xmm1-----]
;  [-----xmm2-----]
    movdqa      xmm3, [pb_00s_ff GLOBAL]
    PRED8x8_LOWPASS_XMM xmm4, xmm1, xmm2, xmm0, xmm5
    movdqa      xmm1, xmm4
    movdqa      xmm2, xmm4
    pand        xmm3, xmm4              ; keep only byte 15 of the filtered edge
    psrldq      xmm2, 1
    pslldq      xmm1, 1
    por         xmm2, xmm3              ; right taps, last sample repeated
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5

    ; Y counts rows from the row above the block, so Y = 1..8 are rows 0..7
%assign Y 1
%rep 8
    psrldq      xmm0, 1
    movq        [parm1q+Y*FDEC_STRIDE], xmm0
%assign Y (Y+1)
%endrep

    ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_ddr_sse2( uint8_t *src, int i_neighbors )
;
; Gathers left column, top-left and top row into a 17-byte scratch buffer,
; lowpass-filters it twice (edge filter, then prediction filter) and stores
; the rows bottom-up from two vectors that are one byte apart.
; Only bit 4 (top-right present) of i_neighbors is examined.
; Overwrites eax, edx, r8, parm2d, mm0, xmm0-xmm5.
; NOTE(review): the scratch buffer sits at [rsp-24 .. rsp-8] without moving
; rsp, i.e. it relies on the area below rsp staying intact -- confirm for
; every target ABI.
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_ddr_sse2:
    lea         r8, [rsp-24]

    ; t0..t7 -> r8+8
    movq        mm0, [parm1q-FDEC_STRIDE]
    movq        [r8+8], mm0

    ; t8 if top-right is present, otherwise t7 again -> r8+16
    and         parm2d, byte 4
    mov         al, [parm1q-FDEC_STRIDE+7]
    cmovnz      ax, [parm1q-FDEC_STRIDE+8]
    mov         [r8+16], al

    ; pack l6 l5 l4 l3 into edx and l2 l1 l0 lt into eax, one byte at a time
    mov         dh, [parm1q+3*FDEC_STRIDE-1]
    mov         dl, [parm1q+4*FDEC_STRIDE-1]
    mov         ah, [parm1q-1*FDEC_STRIDE-1]
    mov         al, [parm1q+0*FDEC_STRIDE-1]
    shl         edx, 16
    shl         eax, 16
    mov         dh, [parm1q+5*FDEC_STRIDE-1]
    mov         dl, [parm1q+6*FDEC_STRIDE-1]
    mov         ah, [parm1q+1*FDEC_STRIDE-1]
    mov         al, [parm1q+2*FDEC_STRIDE-1]
    mov         [r8+4], eax
    mov         [r8], edx

    ; xmm4 = l7, xmm5 = (3*l7 + l6 + 2) >> 2
    movzx       eax, byte [parm1q+7*FDEC_STRIDE-1]
    movd        xmm4, eax
    movzx       edx, dl
    lea         eax, [rax+2*rax+2]
    add         eax, edx
    shr         eax, 2
    movd        xmm5, eax

; r8 -> {l6 l5 l4 l3 l2 l1 l0 lt t0 t1 t2 t3 t4 t5 t6 t7 t8}

    ; first pass: filter the raw edge
    movdqu      xmm0, [r8]
    movdqu      xmm2, [r8+1]
    movdqa      xmm1, xmm0
    pslldq      xmm1, 1
    por         xmm1, xmm4              ; l7 enters as byte 0
    PRED8x8_LOWPASS_XMM xmm3, xmm1, xmm2, xmm0, xmm4

    ; second pass: filter the filtered edge
    movdqa      xmm1, xmm3
    movdqa      xmm2, xmm3
    pslldq      xmm1, 1
    psrldq      xmm2, 1
    por         xmm1, xmm5              ; xmm5 enters as byte 0
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm3, xmm4

    ; xmm1 trails xmm0 by one byte; the two alternate rows, 2 bytes per step
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1
%assign Y 7
%rep 3
    movq        [parm1q+Y*FDEC_STRIDE], xmm0
    psrldq      xmm0, 2
    movq        [parm1q+(Y-1)*FDEC_STRIDE], xmm1
    psrldq      xmm1, 2
%assign Y (Y-2)
%endrep
    movq        [parm1q+1*FDEC_STRIDE], xmm0
    movq        [parm1q+0*FDEC_STRIDE], xmm1

    ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_vl_sse2( uint8_t *src, int i_neighbors )
;
; Filters top + top-right once to form the edge, then derives a 2-tap average
; (xmm3) and a 3-tap lowpass (xmm0) of that edge.  The two vectors are stored
; on alternating rows, each advancing one byte per row pair.
; Overwrites eax, parm2d, xmm0-xmm5; parm1q ends one row above src.
;
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_vl_sse2:
    PRED8x8_LOAD_TOP_TOPRIGHT_XMM
    PRED8x8_LOWPASS_XMM xmm4, xmm1, xmm2, xmm0, xmm5
    movdqa      xmm2, xmm4
    movdqa      xmm1, xmm4
    movdqa      xmm3, xmm4
    psrldq      xmm2, 1
    pslldq      xmm1, 1
    pavgb       xmm3, xmm2
    PRED8x8_LOWPASS_XMM xmm0, xmm1, xmm2, xmm4, xmm5
; xmm0: (t0 + 2*t1 + t2 + 2) >> 2
; xmm3: (t0 + t1 + 1) >> 1

    ; Y counts rows from the row above the block
%assign Y 1
%rep 3
    psrldq      xmm0, 1
    movq        [parm1q+Y*FDEC_STRIDE], xmm3
    movq        [parm1q+(Y+1)*FDEC_STRIDE], xmm0
    psrldq      xmm3, 1
%assign Y (Y+2)
%endrep
    ; final row pair, written outside %rep so xmm3 is not shifted once more
    psrldq      xmm0, 1
    movq        [parm1q+Y*FDEC_STRIDE], xmm3
    movq        [parm1q+(Y+1)*FDEC_STRIDE], xmm0

    ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_vr_core_mmxext( uint8_t *src, int i_neighbors, uint16_t ltt0 )
;
; Only bit 4 (top-right present) of i_neighbors is examined.  ltt0 is inserted
; as the low word of the left-tap vector.
; Overwrites eax, parm2d, mm0-mm5, mm7 and moves parm1q up by one row.
;
;-----------------------------------------------------------------------------
; fills only some pixels:
; f0123456789abcdef
; 0 .......
; 1 ,,,,,,
; 2 ......
; 3 ,,,,,
; 4 .....
; 5 ,,,,
; 6 ....
; 7 ,,,
ALIGN 16
predict_8x8_vr_core_mmxext:
    sub         parm1q, FDEC_STRIDE
    movq        mm1, [parm1q-1]
    movq        mm2, [parm1q+1]
    and         parm2d, byte 4
    jne         .have_topright
    mov         al, [parm1q+7]
    mov         ah, al
    pinsrw      mm2, eax, 3             ; no top-right: last word = t7 t7
.have_topright:
    PRED8x8_LOWPASS mm4, mm1, mm2, [parm1q], mm7    ; mm4 = filtered top row

    movq        mm1, mm4
    movq        mm2, mm4
    psllq       mm1, 8
    movq        mm3, mm4
    pinsrw      mm1, parm3d, 0          ; low word of the left taps = ltt0
    psrlq       mm2, 8
    pavgb       mm3, mm1                ; 2-tap average
    PRED8x8_LOWPASS mm0, mm1, mm2, mm4, mm5         ; 3-tap lowpass

    ; every row is a full 8-byte store; both vectors move up one byte per
    ; row pair, so zeros enter at the low end
%assign Y 1
%rep 3
    psllq       mm0, 8
    movq        [parm1q+Y*FDEC_STRIDE], mm3
    movq        [parm1q+(Y+1)*FDEC_STRIDE], mm0
    psllq       mm3, 8
%assign Y (Y+2)
%endrep
    ; final row pair, written outside %rep so mm3 is not shifted once more
    psllq       mm0, 8
    movq        [parm1q+Y*FDEC_STRIDE], mm3
    movq        [parm1q+(Y+1)*FDEC_STRIDE], mm0

    ret
;-----------------------------------------------------------------------------
;
; void predict_8x8c_v_mmx( uint8_t *src )
...
...
@@ -213,12 +604,12 @@ predict_8x8c_dc_core_mmxext:
;-----------------------------------------------------------------------------
;
; void predict_8x8c_p_core_mmx( uint8_t *src, int i00, int b, int c )
; void predict_8x8c_p_core_mmx
ext
( uint8_t *src, int i00, int b, int c )
;
;-----------------------------------------------------------------------------
ALIGN
16
predict_8x8c_p_core_mmx:
predict_8x8c_p_core_mmx
ext
:
movd
mm0
,
parm2d
movd
mm2
,
parm3d
movd
mm4
,
parm4d
...
...
@@ -230,19 +621,16 @@ predict_8x8c_p_core_mmx:
psllw
mm1
,
2
paddsw
mm0
,
mm2
; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw
mm1
,
mm0
; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
pxor
mm5
,
mm5
mov
eax
,
8
ALIGN
4
.loop:
movq
mm6
,
mm0
movq
mm7
,
mm1
movq
mm5
,
mm0
movq
mm6
,
mm1
psraw
mm5
,
5
psraw
mm6
,
5
psraw
mm7
,
5
pmaxsw
mm6
,
mm5
pmaxsw
mm7
,
mm5
packuswb
mm6
,
mm7
movq
[
parm1q
],
mm6
packuswb
mm5
,
mm6
movq
[
parm1q
],
mm5
paddsw
mm0
,
mm4
paddsw
mm1
,
mm4
...
...
@@ -255,12 +643,12 @@ ALIGN 4
;-----------------------------------------------------------------------------
;
; void predict_16x16_p_core_mmx( uint8_t *src, int i00, int b, int c )
; void predict_16x16_p_core_mmx
ext
( uint8_t *src, int i00, int b, int c )
;
;-----------------------------------------------------------------------------
ALIGN
16
predict_16x16_p_core_mmx:
predict_16x16_p_core_mmx
ext
:
movd
mm0
,
parm2d
movd
mm2
,
parm3d
movd
mm4
,
parm4d
...
...
@@ -277,28 +665,23 @@ predict_16x16_p_core_mmx:
paddsw
mm1
,
mm0
; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw
mm2
,
mm0
; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
paddsw
mm3
,
mm1
; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
pxor
mm5
,
mm5
mov
eax
,
16
ALIGN
4
.loop:
movq
mm6
,
mm0
movq
mm7
,
mm1
movq
mm5
,
mm0
movq
mm6
,
mm1
psraw
mm5
,
5
psraw
mm6
,
5
psraw
mm7
,
5
pmaxsw
mm6
,
mm5
pmaxsw
mm7
,
mm5
packuswb
mm6
,
mm7
movq
[
parm1q
],
mm6
movq
mm6
,
mm2
movq
mm7
,
mm3
packuswb
mm5
,
mm6
movq
[
parm1q
],
mm5
movq
mm5
,
mm2
movq
mm6
,
mm3
psraw
mm5
,
5
psraw
mm6
,
5
psraw
mm7
,
5
pmaxsw
mm6
,
mm5
pmaxsw
mm7
,
mm5
packuswb
mm6
,
mm7
movq
[
parm1q
+
8
],
mm6
packuswb
mm5
,
mm6
movq
[
parm1q
+
8
],
mm5
paddsw
mm0
,
mm4
paddsw
mm1
,
mm4
...
...
common/i386/predict-a.asm
View file @
918a791b
...
...
@@ -74,8 +74,8 @@ cglobal predict_8x8_v_mmxext
cglobal
predict_8x8_dc_core_mmxext
cglobal
predict_8x8c_v_mmx
cglobal
predict_8x8c_dc_core_mmxext
cglobal
predict_8x8c_p_core_mmx
cglobal
predict_16x16_p_core_mmx
cglobal
predict_8x8c_p_core_mmx
ext
cglobal
predict_16x16_p_core_mmx
ext
cglobal
predict_16x16_v_mmx
cglobal
predict_16x16_dc_core_mmxext
cglobal
predict_16x16_dc_top_mmxext
...
...
@@ -103,7 +103,7 @@ cglobal predict_16x16_dc_top_mmxext
cmp
eax
,
byte
8
jge
.have_topleft
mov
al
,
[
edx
]
mov
ah
,
[
edx
]
mov
ah
,
al
pinsrw
mm1
,
eax
,
0
mov
eax
,
[
picesp
+
8
]
.have_topleft:
...
...
@@ -111,7 +111,7 @@ cglobal predict_16x16_dc_top_mmxext
and
eax
,
byte
4
jne
.have_topright
mov
al
,
[
edx
+
7
]
mov
ah
,
[
edx
+
7
]
mov
ah
,
al
pinsrw
mm2
,
eax
,
3
.have_topright:
...
...
@@ -231,12 +231,12 @@ predict_8x8c_dc_core_mmxext:
;-----------------------------------------------------------------------------
;
; void predict_8x8c_p_core_mmx( uint8_t *src, int i00, int b, int c )
; void predict_8x8c_p_core_mmx
ext
( uint8_t *src, int i00, int b, int c )