Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
VideoLAN
x264
Commits
a5a6d0ee
Commit
a5a6d0ee
authored
Sep 21, 2011
by
Fiona Glaser
Browse files
Initial XOP and FMA4 support on AMD Bulldozer
~10% faster Hadamard functions (SATD/SA8D/hadamard_ac) plus other improvements.
parent
e73b85b5
Changes
20
Hide whitespace changes
Inline
Side-by-side
common/cpu.c
View file @
a5a6d0ee
...
...
@@ -63,6 +63,8 @@ const x264_cpu_name_t x264_cpu_names[] =
{
"SSE4"
,
SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
|
X264_CPU_SSE4
},
{
"SSE4.2"
,
SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
|
X264_CPU_SSE4
|
X264_CPU_SSE42
},
{
"AVX"
,
SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
|
X264_CPU_SSE4
|
X264_CPU_SSE42
|
X264_CPU_AVX
},
{
"XOP"
,
SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
|
X264_CPU_SSE4
|
X264_CPU_SSE42
|
X264_CPU_AVX
|
X264_CPU_XOP
},
{
"FMA4"
,
SSE2
|
X264_CPU_SSE3
|
X264_CPU_SSSE3
|
X264_CPU_SSE4
|
X264_CPU_SSE42
|
X264_CPU_AVX
|
X264_CPU_FMA4
},
#undef SSE2
{
"Cache32"
,
X264_CPU_CACHELINE_32
},
{
"Cache64"
,
X264_CPU_CACHELINE_64
},
...
...
@@ -175,6 +177,14 @@ uint32_t x264_cpu_detect( void )
cpu
|=
X264_CPU_SSE_MISALIGN
;
x264_cpu_mask_misalign_sse
();
}
if
(
cpu
&
X264_CPU_AVX
)
{
if
(
ecx
&
0x00000800
)
/* XOP */
cpu
|=
X264_CPU_XOP
;
if
(
ecx
&
0x00010000
)
/* FMA4 */
cpu
|=
X264_CPU_FMA4
;
}
}
}
...
...
common/dct.c
View file @
a5a6d0ee
...
...
@@ -887,6 +887,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
if
(
cpu
&
X264_CPU_SHUFFLE_IS_FAST
)
pf_progressive
->
scan_4x4
=
x264_zigzag_scan_4x4_frame_avx
;
}
if
(
cpu
&
X264_CPU_XOP
)
pf_progressive
->
scan_4x4
=
x264_zigzag_scan_4x4_frame_xop
;
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if
(
cpu
&
X264_CPU_ALTIVEC
)
...
...
common/pixel.c
View file @
a5a6d0ee
...
...
@@ -496,6 +496,7 @@ SATD_X_DECL6( _sse2 )
SATD_X_DECL7
(
_ssse3
)
SATD_X_DECL7
(
_sse4
)
SATD_X_DECL7
(
_avx
)
SATD_X_DECL7
(
_xop
)
#endif // !HIGH_BIT_DEPTH
#endif
...
...
@@ -1134,9 +1135,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf
->
intra_satd_x9_4x4
=
x264_intra_satd_x9_4x4_avx
;
}
INIT5
(
ssd
,
_avx
);
#if ARCH_X86_64
pixf
->
sa8d
[
PIXEL_16x16
]
=
x264_pixel_sa8d_16x16_avx
;
pixf
->
sa8d
[
PIXEL_8x8
]
=
x264_pixel_sa8d_8x8_avx
;
#if ARCH_X86_64
pixf
->
intra_sa8d_x3_8x8
=
x264_intra_sa8d_x3_8x8_avx
;
#endif
pixf
->
ssd_nv12_core
=
x264_pixel_ssd_nv12_core_avx
;
...
...
@@ -1148,6 +1149,28 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf
->
intra_sad_x3_4x4
=
x264_intra_sad_x3_4x4_avx
;
pixf
->
intra_sad_x3_8x8
=
x264_intra_sad_x3_8x8_avx
;
}
if
(
cpu
&
X264_CPU_XOP
)
{
INIT7
(
satd
,
_xop
);
INIT7
(
satd_x3
,
_xop
);
INIT7
(
satd_x4
,
_xop
);
if
(
!
(
cpu
&
X264_CPU_STACK_MOD4
)
)
{
INIT4
(
hadamard_ac
,
_xop
);
pixf
->
intra_satd_x9_4x4
=
x264_intra_satd_x9_4x4_xop
;
}
INIT5
(
ssd
,
_xop
);
pixf
->
sa8d
[
PIXEL_16x16
]
=
x264_pixel_sa8d_16x16_xop
;
pixf
->
sa8d
[
PIXEL_8x8
]
=
x264_pixel_sa8d_8x8_xop
;
#if ARCH_X86_64
pixf
->
intra_sa8d_x3_8x8
=
x264_intra_sa8d_x3_8x8_xop
;
#endif
pixf
->
var
[
PIXEL_16x16
]
=
x264_pixel_var_16x16_xop
;
pixf
->
var
[
PIXEL_8x16
]
=
x264_pixel_var_8x16_xop
;
pixf
->
var
[
PIXEL_8x8
]
=
x264_pixel_var_8x8_xop
;
pixf
->
var2
[
PIXEL_8x8
]
=
x264_pixel_var2_8x8_xop
;
}
#endif //HAVE_MMX
#if HAVE_ARMV6
...
...
common/quant.c
View file @
a5a6d0ee
...
...
@@ -506,6 +506,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf
->
denoise_dct
=
x264_denoise_dct_avx
;
}
if
(
cpu
&
X264_CPU_XOP
)
{
pf
->
dequant_4x4_dc
=
x264_dequant_4x4dc_xop
;
if
(
h
->
param
.
i_cqm_preset
!=
X264_CQM_FLAT
)
{
pf
->
dequant_4x4
=
x264_dequant_4x4_xop
;
pf
->
dequant_8x8
=
x264_dequant_8x8_xop
;
}
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
...
...
@@ -629,6 +638,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf
->
optimize_chroma_2x2_dc
=
x264_optimize_chroma_2x2_dc_avx
;
pf
->
denoise_dct
=
x264_denoise_dct_avx
;
}
if
(
cpu
&
X264_CPU_XOP
)
{
if
(
h
->
param
.
i_cqm_preset
!=
X264_CQM_FLAT
)
{
pf
->
dequant_4x4
=
x264_dequant_4x4_xop
;
pf
->
dequant_8x8
=
x264_dequant_8x8_xop
;
}
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
...
...
common/x86/dct-a.asm
View file @
a5a6d0ee
...
...
@@ -30,20 +30,14 @@
%include "x86inc.asm"
%include "x86util.asm"
%macro SHUFFLE_16BIT 8
%rep 8
db
%
1
*
2
db
%
1
*
2
+
1
%rotate 1
%endrep
%endmacro
SECTION
_RODATA
pb_sub4frame:
db
0
,
1
,
4
,
8
,
5
,
2
,
3
,
6
,
9
,
12
,
13
,
10
,
7
,
11
,
14
,
15
pb_sub4field:
db
0
,
4
,
1
,
8
,
12
,
5
,
9
,
13
,
2
,
6
,
10
,
14
,
3
,
7
,
11
,
15
pb_subacmask:
dw
0
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
pb_scan4framea:
SHUFFLE_16BIT
6
,
3
,
7
,
0
,
4
,
1
,
2
,
5
pb_scan4frameb:
SHUFFLE_16BIT
0
,
4
,
1
,
2
,
5
,
6
,
3
,
7
pb_scan4framea:
SHUFFLE_MASK_W
6
,
3
,
7
,
0
,
4
,
1
,
2
,
5
pb_scan4frameb:
SHUFFLE_MASK_W
0
,
4
,
1
,
2
,
5
,
6
,
3
,
7
pb_scan4frame2a:
SHUFFLE_MASK_W
0
,
4
,
1
,
2
,
5
,
8
,
12
,
9
pb_scan4frame2b:
SHUFFLE_MASK_W
6
,
3
,
7
,
10
,
13
,
14
,
11
,
15
pb_idctdc_unpack:
db
0
,
0
,
0
,
0
,
1
,
1
,
1
,
1
,
2
,
2
,
2
,
2
,
3
,
3
,
3
,
3
pb_idctdc_unpack2:
db
4
,
4
,
4
,
4
,
5
,
5
,
5
,
5
,
6
,
6
,
6
,
6
,
7
,
7
,
7
,
7
...
...
@@ -1098,6 +1092,16 @@ INIT_XMM ssse3
SCAN_4x4_FRAME
INIT_XMM
avx
SCAN_4x4_FRAME
INIT_XMM
xop
cglobal
zigzag_scan_4x4_frame
,
2
,
2
mova
m0
,
[
r1
+
0
]
mova
m1
,
[
r1
+
16
]
vpperm
m2
,
m0
,
m1
,
[
pb_scan4frame2a
]
vpperm
m1
,
m0
,
m1
,
[
pb_scan4frame2b
]
mova
[
r0
+
0
],
m2
mova
[
r0
+
16
],
m1
RET
%endif
; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
...
...
common/x86/dct.h
View file @
a5a6d0ee
...
...
@@ -88,6 +88,7 @@ void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void
x264_zigzag_scan_8x8_frame_ssse3
(
int16_t
level
[
64
],
int16_t
dct
[
64
]
);
void
x264_zigzag_scan_8x8_frame_sse2
(
dctcoef
level
[
64
],
dctcoef
dct
[
64
]
);
void
x264_zigzag_scan_8x8_frame_mmx2
(
int16_t
level
[
64
],
int16_t
dct
[
64
]
);
void
x264_zigzag_scan_4x4_frame_xop
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_avx
(
dctcoef
level
[
16
],
dctcoef
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_ssse3
(
int16_t
level
[
16
],
int16_t
dct
[
16
]
);
void
x264_zigzag_scan_4x4_frame_sse2
(
int32_t
level
[
16
],
int32_t
dct
[
16
]
);
...
...
common/x86/mc-a2.asm
View file @
a5a6d0ee
...
...
@@ -1635,8 +1635,8 @@ FRAME_INIT_LOWRES
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
INIT_XMM
cglobal
mbtree_propagate_cost
_sse2
,
7
,
7
,
7
%macro MBTREE 0
cglobal
mbtree_propagate_cost
,
7
,
7
,
7
add
r6d
,
r6d
lea
r0
,
[
r0
+
r6
*
2
]
add
r1
,
r6
...
...
@@ -1660,6 +1660,20 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
pand
xmm3
,
xmm5
punpcklwd
xmm1
,
xmm4
punpcklwd
xmm3
,
xmm4
%if cpuflag(fma4)
cvtdq2ps
xmm0
,
xmm0
cvtdq2ps
xmm1
,
xmm1
vfmaddps
xmm0
,
xmm0
,
xmm6
,
xmm1
cvtdq2ps
xmm1
,
xmm2
psubd
xmm2
,
xmm3
cvtdq2ps
xmm2
,
xmm2
rcpps
xmm3
,
xmm1
mulps
xmm1
,
xmm3
mulps
xmm0
,
xmm2
addps
xmm2
,
xmm3
,
xmm3
vfnmaddps
xmm3
,
xmm1
,
xmm3
,
xmm2
mulps
xmm0
,
xmm3
%else
cvtdq2ps
xmm0
,
xmm0
mulps
xmm0
,
xmm6
; intra*invq*fps_factor>>8
cvtdq2ps
xmm1
,
xmm1
; prop
...
...
@@ -1674,11 +1688,19 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
addps
xmm3
,
xmm3
; 2 * (1/intra 1st approx)
subps
xmm3
,
xmm1
; 2nd approximation for 1/intra
mulps
xmm0
,
xmm3
; / intra
%endif
cvtps2dq
xmm0
,
xmm0
movdqa
[
r0
+
r6
*
2
],
xmm0
add
r6
,
8
jl
.loop
REP_RET
%endmacro
INIT_XMM
ss
e2
MBTREE
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
INIT_XMM
fma4
MBTREE
%macro INT16_TO_FLOAT 1
vpunpckhwd
xmm4
,
xmm
%
1
,
xmm7
...
...
@@ -1688,7 +1710,8 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
%endmacro
; FIXME: align loads/stores to 16 bytes
cglobal
mbtree_propagate_cost_avx
,
7
,
7
,
8
INIT_YMM
avx
cglobal
mbtree_propagate_cost
,
7
,
7
,
8
add
r6d
,
r6d
lea
r0
,
[
r0
+
r6
*
2
]
add
r1
,
r6
...
...
common/x86/mc-c.c
View file @
a5a6d0ee
...
...
@@ -141,6 +141,8 @@ void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
void
x264_mbtree_propagate_cost_avx
(
int
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
void
x264_mbtree_propagate_cost_fma4
(
int
*
dst
,
uint16_t
*
propagate_in
,
uint16_t
*
intra_costs
,
uint16_t
*
inter_costs
,
uint16_t
*
inv_qscales
,
float
*
fps_factor
,
int
len
);
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
...
...
@@ -741,4 +743,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if
(
!
(
cpu
&
X264_CPU_AVX
)
)
return
;
pf
->
mbtree_propagate_cost
=
x264_mbtree_propagate_cost_avx
;
if
(
!
(
cpu
&
X264_CPU_FMA4
)
)
return
;
pf
->
mbtree_propagate_cost
=
x264_mbtree_propagate_cost_fma4
;
}
common/x86/pixel-a.asm
View file @
a5a6d0ee
...
...
@@ -81,6 +81,9 @@ intrax9b_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
intrax9b_v1:
db
0
,
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
4
,
5
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
intrax9b_v2:
db
2
,
3
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
6
,
7
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
transd_shuf1:
SHUFFLE_MASK_W
0
,
8
,
2
,
10
,
4
,
12
,
6
,
14
transd_shuf2:
SHUFFLE_MASK_W
1
,
9
,
3
,
11
,
5
,
13
,
7
,
15
sw_f0:
dq
0xfff0
,
0
sq_0f:
dq
0xffffffff
,
0
pd_f0:
times
4
dd
0xffff0000
...
...
@@ -417,6 +420,12 @@ INIT_MMX ssse3
SSD
4
,
4
SSD
4
,
8
SSD
4
,
16
INIT_XMM
xop
SSD
16
,
16
SSD
8
,
8
SSD
16
,
8
SSD
8
,
16
SSD
8
,
4
%assign function_align 16
%endif
; !HIGH_BIT_DEPTH
...
...
@@ -654,20 +663,20 @@ SSD_NV12
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal
pixel_var_16x16
_mmx2
,
2
,
3
INIT_MMX
mmx2
cglobal
pixel_var_16x16
,
2
,
3
FIX_STRIDES
r1
VAR_START
0
VAR_2ROW
8
*
SI
ZEOF_PIXEL
,
16
VAR_END
16
,
16
cglobal
pixel_var_8x16
_mmx2
,
2
,
3
cglobal
pixel_var_8x16
,
2
,
3
FIX_STRIDES
r1
VAR_START
0
VAR_2ROW
r1
,
8
VAR_END
8
,
16
cglobal
pixel_var_8x8
_mmx2
,
2
,
3
cglobal
pixel_var_8x8
,
2
,
3
FIX_STRIDES
r1
VAR_START
0
VAR_2ROW
r1
,
4
...
...
@@ -702,6 +711,8 @@ INIT_XMM sse2
VAR
INIT_XMM
avx
VAR
INIT_XMM
xop
VAR
%endif
; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
...
...
@@ -756,6 +767,8 @@ INIT_XMM sse2
VAR
INIT_XMM
avx
VAR
INIT_XMM
xop
VAR
%endif
; !HIGH_BIT_DEPTH
%macro VAR2_END 0
...
...
@@ -773,8 +786,8 @@ VAR
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal
pixel_var2_8x8
_mmx2
,
5
,
6
INIT_MMX
mmx2
cglobal
pixel_var2_8x8
,
5
,
6
FIX_STRIDES
r1
,
r3
VAR_START
0
mov
r5d
,
8
...
...
@@ -809,8 +822,8 @@ cglobal pixel_var2_8x8_mmx2, 5,6
VAR2_END
RET
INIT_XMM
cglobal
pixel_var2_8x8
_sse2
,
5
,
6
,
8
INIT_XMM
ss
e2
cglobal
pixel_var2_8x8
,
5
,
6
,
8
VAR_START
1
mov
r5d
,
4
.loop:
...
...
@@ -842,7 +855,8 @@ cglobal pixel_var2_8x8_sse2, 5,6,8
RET
%ifndef HIGH_BIT_DEPTH
cglobal
pixel_var2_8x8_ssse3
,
5
,
6
,
8
%macro VAR2_8x8 0
cglobal
pixel_var2_8x8
,
5
,
6
,
8
pxor
m5
,
m5
; sum
pxor
m6
,
m6
; sum squared
mova
m7
,
[
hsub_mul
]
...
...
@@ -884,6 +898,13 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
jg
.loop
VAR2_END
RET
%endmacro
INIT_XMM
ss
se3
VAR2_8x8
INIT_XMM
xop
VAR2_8x8
%endif
; !HIGH_BIT_DEPTH
;=============================================================================
...
...
@@ -1680,6 +1701,20 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
paddusw
m2
,
m0
; 3x HADDW
%if cpuflag(xop)
phaddw
m2
,
m14
vphadduwq
m0
,
m15
movhlps
m1
,
m0
vphadduwq
m2
,
m2
; i8x8_v, i8x8_h
paddd
m0
,
m1
; i8x8_dc
packusdw
m2
,
m0
; i8x8_v, i8x8_h, i8x8_dc
pxor
m3
,
m3
psrlw
m2
,
1
pavgw
m2
,
m3
movq
[
r2
],
m2
; i8x8_v, i8x8_h
psrldq
m2
,
8
movd
[
r2
+
8
],
m2
; i8x8_dc
%else
movdqa
m7
,
[
pw_1
]
pmaddwd
m2
,
m7
pmaddwd
m14
,
m7
...
...
@@ -1697,6 +1732,7 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
movq
[
r2
],
m3
; i8x8_v, i8x8_h
psrldq
m3
,
8
movd
[
r2
+
8
],
m3
; i8x8_dc
%endif
RET
%endif
; ARCH_X86_64
%endmacro
; INTRA_SA8D_SSE2
...
...
@@ -2088,11 +2124,9 @@ cglobal intra_satd_x3_8x8c, 0,6
psignw
m
%
1
,
[
pw_pmpmpmpm
]
paddw
m0
,
m
%
1
psllw
m0
,
2
; hadamard(top), hadamard(left)
mova
m1
,
m0
mova
m2
,
m0
movhlps
m3
,
m0
pshufb
m1
,
[
intrax9b_v1
]
pshufb
m2
,
[
intrax9b_v2
]
pshufb
m1
,
m0
,
[
intrax9b_v1
]
pshufb
m2
,
m0
,
[
intrax9b_v2
]
paddw
m0
,
m3
psignw
m3
,
[
pw_pmmpzzzz
]
; FIXME could this be eliminated?
pavgw
m0
,
[
pw_16
]
...
...
@@ -2122,8 +2156,14 @@ cglobal intra_satd_x3_8x8c, 0,6
%endif
movhlps
m2
,
m1
paddw
m1
,
m2
%if cpuflag(xop)
vphaddwq
m3
,
m3
vphaddwq
m1
,
m1
packssdw
m1
,
m3
%else
phaddw
m1
,
m3
pmaddwd
m1
,
[
pw_1
]
; v, _, h, dc
%endif
%endmacro
; INTRA_X9_VHDC
%macro INTRA_X9_END 1
...
...
@@ -2167,6 +2207,7 @@ cglobal intra_satd_x3_8x8c, 0,6
;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
%if notcpuflag(xop)
cglobal
intra_sad_x9_4x4
,
3
,
3
,
9
%ifdef ARCH_X86_64
INTRA_X9_PRED
intrax9a
,
m8
...
...
@@ -2206,12 +2247,10 @@ cglobal intra_sad_x9_4x4, 3,3,9
mova
m7
,
[
rsp
]
%define %%zero [pb_0]
%endif
mova
m3
,
m7
mova
m5
,
m7
pshufb
m3
,
m7
,
[
intrax9a_vh1
]
pshufb
m5
,
m7
,
[
intrax9a_vh2
]
pshufb
m7
,
[
intrax9a_dc
]
pshufb
m3
,
[
intrax9a_vh1
]
psadbw
m7
,
%%
zero
pshufb
m5
,
[
intrax9a_vh2
]
psrlw
m7
,
2
psadbw
m3
,
m0
pavgw
m7
,
%%
zero
...
...
@@ -2236,6 +2275,7 @@ cglobal intra_sad_x9_4x4, 3,3,9
add
rsp
,
0x1c
%endif
RET
%endif
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
...
...
@@ -2940,6 +2980,16 @@ INTRA_X9
%endif
HADAMARD_AC_SSE2
%define TRANS TRANS_XOP
INIT_XMM
xop
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INTRA_X9
%endif
HADAMARD_AC_SSE2
;=============================================================================
; SSIM
;=============================================================================
...
...
common/x86/pixel.h
View file @
a5a6d0ee
...
...
@@ -62,16 +62,19 @@ DECL_X1( ssd, sse2slow )
DECL_X1
(
ssd
,
sse2
)
DECL_X1
(
ssd
,
ssse3
)
DECL_X1
(
ssd
,
avx
)
DECL_X1
(
ssd
,
xop
)
DECL_X1
(
satd
,
mmx2
)
DECL_X1
(
satd
,
sse2
)
DECL_X1
(
satd
,
ssse3
)
DECL_X1
(
satd
,
sse4
)
DECL_X1
(
satd
,
avx
)
DECL_X1
(
satd
,
xop
)
DECL_X1
(
sa8d
,
mmx2
)
DECL_X1
(
sa8d
,
sse2
)
DECL_X1
(
sa8d
,
ssse3
)
DECL_X1
(
sa8d
,
sse4
)
DECL_X1
(
sa8d
,
avx
)
DECL_X1
(
sa8d
,
xop
)
DECL_X1
(
sad
,
cache32_mmx2
);
DECL_X1
(
sad
,
cache64_mmx2
);
DECL_X1
(
sad
,
cache64_sse2
);
...
...
@@ -84,11 +87,13 @@ DECL_X4( sad, cache64_ssse3 );
DECL_PIXELS
(
uint64_t
,
var
,
mmx2
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
var
,
sse2
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
var
,
avx
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
var
,
xop
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
hadamard_ac
,
mmx2
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
hadamard_ac
,
sse2
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
hadamard_ac
,
ssse3
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
hadamard_ac
,
sse4
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
hadamard_ac
,
avx
,
(
pixel
*
pix
,
int
i_stride
))
DECL_PIXELS
(
uint64_t
,
hadamard_ac
,
xop
,
(
pixel
*
pix
,
int
i_stride
))
void
x264_intra_satd_x3_4x4_mmx2
(
pixel
*
,
pixel
*
,
int
*
);
...
...
@@ -110,6 +115,7 @@ void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void
x264_intra_sa8d_x3_8x8_sse2
(
pixel
*
,
pixel
*
,
int
*
);
void
x264_intra_sa8d_x3_8x8_ssse3
(
uint8_t
*
,
uint8_t
*
,
int
*
);
void
x264_intra_sa8d_x3_8x8_avx
(
uint8_t
*
,
uint8_t
*
,
int
*
);
void
x264_intra_sa8d_x3_8x8_xop
(
uint8_t
*
,
uint8_t
*
,
int
*
);
void
x264_intra_sad_x3_8x8_mmx2
(
pixel
*
,
pixel
*
,
int
*
);
void
x264_intra_sad_x3_8x8_sse2
(
pixel
*
,
pixel
*
,
int
*
);
void
x264_intra_sad_x3_8x8_ssse3
(
pixel
*
,
pixel
*
,
int
*
);
...
...
@@ -117,6 +123,7 @@ void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
int
x264_intra_satd_x9_4x4_ssse3
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
int
x264_intra_satd_x9_4x4_sse4
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
int
x264_intra_satd_x9_4x4_avx
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
int
x264_intra_satd_x9_4x4_xop
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
int
x264_intra_sad_x9_4x4_ssse3
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
int
x264_intra_sad_x9_4x4_sse4
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
int
x264_intra_sad_x9_4x4_avx
(
uint8_t
*
,
uint8_t
*
,
uint16_t
*
);
...
...
@@ -141,6 +148,7 @@ float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int
x264_pixel_var2_8x8_mmx2
(
pixel
*
,
int
,
pixel
*
,
int
,
int
*
);
int
x264_pixel_var2_8x8_sse2
(
pixel
*
,
int
,
pixel
*
,
int
,
int
*
);
int
x264_pixel_var2_8x8_ssse3
(
uint8_t
*
,
int
,
uint8_t
*
,
int
,
int
*
);
int
x264_pixel_var2_8x8_xop
(
uint8_t
*
,
int
,
uint8_t
*
,
int
,
int
*
);
int
x264_pixel_vsad_mmx2
(
pixel
*
src
,
int
stride
,
int
height
);
int
x264_pixel_vsad_sse2
(
pixel
*
src
,
int
stride
,
int
height
);
...
...
common/x86/predict-a.asm
View file @
a5a6d0ee
...
...
@@ -582,9 +582,9 @@ PREDICT_4x4_V1 b
;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
INIT_MMX
mmx2
%ifdef HIGH_BIT_DEPTH
INIT_MMX
cglobal
predict_4x4_dc_mmx2
,
1
,
1
cglobal
predict_4x4_dc
,
1
,
1
mova
m2
,
[
r0
+
0
*
FDEC_STRIDEB
-
4
*
SI
ZEOF_PIXEL
]
paddw
m2
,
[
r0
+
1
*
FDEC_STRIDEB
-
4
*
SI
ZEOF_PIXEL
]
paddw
m2
,
[
r0
+
2
*
FDEC_STRIDEB
-
4
*
SI
ZEOF_PIXEL
]
...
...
@@ -603,8 +603,7 @@ cglobal predict_4x4_dc_mmx2, 1,1
RET
%else
; !HIGH_BIT_DEPTH
INIT_MMX
cglobal
predict_4x4_dc_mmx2
,
1
,
4
cglobal
predict_4x4_dc
,
1
,
4
pxor
mm7
,
mm7
movd
mm0
,
[
r0
-
FDEC_STRIDEB
]
psadbw
mm0
,
mm7
...
...
@@ -797,8 +796,8 @@ PREDICT_8x8_H bw, W
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM
cglobal
predict_8x8_dc
_sse2
,
2
,
2
INIT_XMM
ss
e2
cglobal
predict_8x8_dc
,
2
,
2
movu
m0
,
[
r1
+
14
]
paddw
m0
,
[
r1
+
32
]
HADDW
m0
,
m1
...
...
@@ -809,8 +808,8 @@ cglobal predict_8x8_dc_sse2, 2,2
REP_RET
%else
; !HIGH_BIT_DEPTH
INIT_MMX
cglobal
predict_8x8_dc
_mmx2
,
2
,
2
INIT_MMX
mmx2
cglobal
predict_8x8_dc
,
2
,
2
pxor
mm0
,
mm0
pxor
mm1
,
mm1
psadbw
mm0
,
[
r1
+
7
]
...
...
@@ -839,9 +838,9 @@ cglobal %1, 2,2
STORE8x8
m0
,
m0
RET
%endmacro
INIT_XMM
PREDICT_8x8_DC
predict_8x8_dc_top
_sse2
,
32
,
mova
PREDICT_8x8_DC
predict_8x8_dc_left
_sse2
,
14
,
movu
INIT_XMM
ss
e2
PREDICT_8x8_DC
predict_8x8_dc_top
,
32
,
mova
PREDICT_8x8_DC
predict_8x8_dc_left
,
14
,
movu
%else
; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
...
...
@@ -1106,9 +1105,9 @@ ALIGN 4
REP_RET
%endif
; !ARCH_X86_64
INIT_XMM
%macro PREDICT_8x8C 0
%ifdef HIGH_BIT_DEPTH
cglobal
predict_8x8c_p_core
_sse2
,
1
,
1
,
7
cglobal
predict_8x8c_p_core
,
1
,
1
,
7
movd
m0
,
r1m
movd
m2
,
r2m
movd
m4
,
r3m
...
...
@@ -1133,7 +1132,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1,7
jg
.loop
REP_RET
%else
; !HIGH_BIT_DEPTH
cglobal
predict_8x8c_p_core
_sse2
,
1
,
1
cglobal
predict_8x8c_p_core
,
1
,
1
movd
m0
,
r1m
movd
m2
,
r2m
movd
m4
,
r3m
...
...
@@ -1163,12 +1162,19 @@ call .loop
movhps
[
r0
+
FDEC_STRIDE
*
3
],
m5
RET
%endif
; HIGH_BIT_DEPTH
%endmacro
INIT_XMM
ss
e2
PREDICT_8x8C
INIT_XMM
avx
PREDICT_8x8C
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
cglobal
predict_16x16_p_core_mmx2
,
1
,
2
INIT_MMX
mmx2
cglobal
predict_16x16_p_core
,
1
,
2
LOAD_PLANE_ARGS
movq
mm5
,
mm2
movq
mm1
,
mm2
...
...
@@ -1912,8 +1918,8 @@ PREDICT_16x16_H
%endif
%endmacro
INIT_MMX
cglobal
predict_16x16_dc_core
_mmx2
,
1
,
2
INIT_MMX
mmx2
cglobal
predict_16x16_dc_core
,
1
,
2
%ifdef ARCH_X86_64
movd
m6
,
r1d
PRED16x16_DC
m6
,
5
...
...
@@ -1922,20 +1928,20 @@ cglobal predict_16x16_dc_core_mmx2, 1,2
%endif
REP_RET
INIT_MMX
<