Commit a5a6d0ee authored by Fiona Glaser

Initial XOP and FMA4 support on AMD Bulldozer

~10% faster Hadamard functions (SATD/SA8D/hadamard_ac) plus other improvements.
parent e73b85b5
......@@ -63,6 +63,8 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
{"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
{"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
{"XOP", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_XOP},
{"FMA4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX|X264_CPU_FMA4},
#undef SSE2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
......@@ -175,6 +177,14 @@ uint32_t x264_cpu_detect( void )
cpu |= X264_CPU_SSE_MISALIGN;
x264_cpu_mask_misalign_sse();
}
if( cpu & X264_CPU_AVX )
{
if( ecx&0x00000800 ) /* XOP */
cpu |= X264_CPU_XOP;
if( ecx&0x00010000 ) /* FMA4 */
cpu |= X264_CPU_FMA4;
}
}
}
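For reference, the new masks test ECX of CPUID leaf 0x80000001, where AMD reports XOP in bit 11 (0x00000800) and FMA4 in bit 16 (0x00010000); both are gated on AVX having been detected, since they reuse the VEX/XOP encodings and need OS-saved YMM state. A minimal standalone sketch of the same check, assuming GCC/Clang's cpuid.h (detect_amd_ext and its flag values are illustrative, not x264's API):

#include <cpuid.h>  /* GCC/Clang helper; an assumption about the toolchain */
#include <stdint.h>

static uint32_t detect_amd_ext( int have_avx )
{
    unsigned eax, ebx, ecx, edx;
    uint32_t flags = 0;
    /* leaf 0x80000001: ECX bit 11 = XOP, bit 16 = FMA4 */
    if( have_avx && __get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
    {
        if( ecx & (1u << 11) ) flags |= 1; /* XOP  */
        if( ecx & (1u << 16) ) flags |= 2; /* FMA4 */
    }
    return flags;
}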
......
......@@ -887,6 +887,8 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
}
if( cpu&X264_CPU_XOP )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
......
......@@ -496,6 +496,7 @@ SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL7( _sse4 )
SATD_X_DECL7( _avx )
SATD_X_DECL7( _xop )
#endif // !HIGH_BIT_DEPTH
#endif
......@@ -1134,9 +1135,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_avx;
}
INIT5( ssd, _avx );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_avx;
#endif
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
......@@ -1148,6 +1149,28 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx;
}
if( cpu&X264_CPU_XOP )
{
INIT7( satd, _xop );
INIT7( satd_x3, _xop );
INIT7( satd_x4, _xop );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _xop );
pixf->intra_satd_x9_4x4 = x264_intra_satd_x9_4x4_xop;
}
INIT5( ssd, _xop );
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8= x264_intra_sa8d_x3_8x8_xop;
#endif
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
}
#endif //HAVE_MMX
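The satd/satd_x3/satd_x4/hadamard_ac families hooked up above are the Hadamard-based metrics the commit message credits with the ~10% speedup. As a reminder of what the _xop versions compute, here is a scalar 4x4 SATD in the spirit of x264's C reference (a sketch, not the exact source): the pixel differences are run through a horizontal and a vertical 4-point Hadamard transform and the absolute transform coefficients are summed.

#include <stdlib.h>

static int satd_4x4_ref( const uint8_t *pix1, int s1, const uint8_t *pix2, int s2 )
{
    int tmp[4][4], sum = 0;
    for( int y = 0; y < 4; y++, pix1 += s1, pix2 += s2 )
    {
        int a0 = pix1[0] - pix2[0], a1 = pix1[1] - pix2[1];
        int a2 = pix1[2] - pix2[2], a3 = pix1[3] - pix2[3];
        /* horizontal 4-point Hadamard of the differences */
        int t0 = a0 + a1, t1 = a0 - a1, t2 = a2 + a3, t3 = a2 - a3;
        tmp[y][0] = t0 + t2; tmp[y][2] = t0 - t2;
        tmp[y][1] = t1 + t3; tmp[y][3] = t1 - t3;
    }
    for( int x = 0; x < 4; x++ )
    {
        /* vertical pass, then sum of absolute coefficients */
        int t0 = tmp[0][x] + tmp[1][x], t1 = tmp[0][x] - tmp[1][x];
        int t2 = tmp[2][x] + tmp[3][x], t3 = tmp[2][x] - tmp[3][x];
        sum += abs(t0 + t2) + abs(t0 - t2) + abs(t1 + t3) + abs(t1 - t3);
    }
    return sum >> 1;
}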
#if HAVE_ARMV6
......
......@@ -506,6 +506,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->denoise_dct = x264_denoise_dct_avx;
}
if( cpu&X264_CPU_XOP )
{
pf->dequant_4x4_dc = x264_dequant_4x4dc_xop;
if( h->param.i_cqm_preset != X264_CQM_FLAT )
{
pf->dequant_4x4 = x264_dequant_4x4_xop;
pf->dequant_8x8 = x264_dequant_8x8_xop;
}
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
......@@ -629,6 +638,15 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
if( cpu&X264_CPU_XOP )
{
if( h->param.i_cqm_preset != X264_CQM_FLAT )
{
pf->dequant_4x4 = x264_dequant_4x4_xop;
pf->dequant_8x8 = x264_dequant_8x8_xop;
}
}
#endif // HAVE_MMX
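For context, the dequant routines dispatched here implement H.264's scaling rule: each coefficient is multiplied by its dequant-matrix entry, then shifted by qp/6 relative to a fixed bias, with rounding when the net shift is to the right. A scalar sketch along the lines of x264's C reference in common/quant.c (names and the dctcoef width simplified):

static void dequant_4x4_ref( int32_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf    = i_qp % 6;
    int i_qbits = i_qp / 6 - 4;
    if( i_qbits >= 0 )
        for( int i = 0; i < 16; i++ )
            dct[i] = dct[i] * dequant_mf[i_mf][i] << i_qbits;
    else
    {
        int f = 1 << (-i_qbits - 1); /* rounding offset for the right shift */
        for( int i = 0; i < 16; i++ )
            dct[i] = ( dct[i] * dequant_mf[i_mf][i] + f ) >> -i_qbits;
    }
}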
#if HAVE_ALTIVEC
......
......@@ -30,20 +30,14 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea: SHUFFLE_MASK_W 6,3,7,0,4,1,2,5
pb_scan4frameb: SHUFFLE_MASK_W 0,4,1,2,5,6,3,7
pb_scan4frame2a: SHUFFLE_MASK_W 0,4,1,2,5,8,12,9
pb_scan4frame2b: SHUFFLE_MASK_W 6,3,7,10,13,14,11,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
pb_idctdc_unpack2: db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
......@@ -1098,6 +1092,16 @@ INIT_XMM ssse3
SCAN_4x4_FRAME
INIT_XMM avx
SCAN_4x4_FRAME
INIT_XMM xop
cglobal zigzag_scan_4x4_frame, 2,2
mova m0, [r1+ 0]
mova m1, [r1+16]
vpperm m2, m0, m1, [pb_scan4frame2a]
vpperm m1, m0, m1, [pb_scan4frame2b]
mova [r0+ 0], m2
mova [r0+16], m1
RET
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
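vpperm is what makes the two-instruction scan above possible: unlike pshufb, it selects bytes from the 32-byte concatenation of two source registers, so the 16-bit 4x4 coefficient block (two XMM registers) can be zigzagged with one shuffle per output register. A scalar model of the byte-select behavior relied on here, ignoring the selector's high control bits (which can additionally invert or modify bytes):

static void vpperm_select( uint8_t dst[16], const uint8_t src1[16],
                           const uint8_t src2[16], const uint8_t sel[16] )
{
    for( int i = 0; i < 16; i++ )
    {
        int s = sel[i] & 31;                      /* 5-bit index into {src1,src2} */
        dst[i] = s < 16 ? src1[s] : src2[s - 16];
    }
}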
......
......@@ -88,6 +88,7 @@ void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
......
......@@ -1635,8 +1635,8 @@ FRAME_INIT_LOWRES
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
%macro MBTREE 0
cglobal mbtree_propagate_cost, 7,7,7
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
......@@ -1660,6 +1660,20 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
pand xmm3, xmm5
punpcklwd xmm1, xmm4
punpcklwd xmm3, xmm4
%if cpuflag(fma4)
cvtdq2ps xmm0, xmm0
cvtdq2ps xmm1, xmm1
vfmaddps xmm0, xmm0, xmm6, xmm1
cvtdq2ps xmm1, xmm2
psubd xmm2, xmm3
cvtdq2ps xmm2, xmm2
rcpps xmm3, xmm1
mulps xmm1, xmm3
mulps xmm0, xmm2
addps xmm2, xmm3, xmm3
vfnmaddps xmm3, xmm1, xmm3, xmm2
mulps xmm0, xmm3
%else
cvtdq2ps xmm0, xmm0
mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
cvtdq2ps xmm1, xmm1 ; prop
......@@ -1674,11 +1688,19 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
mulps xmm0, xmm3 ; / intra
%endif
cvtps2dq xmm0, xmm0
movdqa [r0+r6*2], xmm0
add r6, 8
jl .loop
REP_RET
%endmacro
INIT_XMM sse2
MBTREE
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
INIT_XMM fma4
MBTREE
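For reference, a scalar model of the loop, reconstructed from the comments in the SSE2 path below (the real C fallback lives in common/mc.c; rounding details omitted): the propagated cost is scaled by the fraction of the block's cost not explained by inter prediction. The division by intra is done with RCPPS (a ~12-bit reciprocal) plus one Newton-Raphson step, x1 = x0*(2 - d*x0); FMA4 collapses that step into a single vfnmaddps and fuses the multiply-add forming propagate_amount into vfmaddps.

static void mbtree_propagate_cost_ref( int *dst, uint16_t *propagate_in,
                                       uint16_t *intra_costs, uint16_t *inter_costs,
                                       uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor / 256.f;
    for( int i = 0; i < len; i++ )
    {
        float intra  = intra_costs[i];
        /* propagate_amount = prop + (intra*invq*fps_factor>>8), per the asm comments */
        float amount = propagate_in[i] + intra * inv_qscales[i] * fps;
        dst[i] = (int)( amount * (intra - inter_costs[i]) / intra );
    }
}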
%macro INT16_TO_FLOAT 1
vpunpckhwd xmm4, xmm%1, xmm7
......@@ -1688,7 +1710,8 @@ cglobal mbtree_propagate_cost_sse2, 7,7,7
%endmacro
; FIXME: align loads/stores to 16 bytes
INIT_YMM avx
cglobal mbtree_propagate_cost, 7,7,8
add r6d, r6d
lea r0, [r0+r6*2]
add r1, r6
......
......@@ -141,6 +141,8 @@ void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
......@@ -741,4 +743,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_AVX) )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
if( !(cpu&X264_CPU_FMA4) )
return;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
}
......@@ -81,6 +81,9 @@ intrax9b_vh1: db 6, 7, 8, 9, 6, 7, 8, 9, 4, 3, 2, 1, 4, 3, 2, 1
intrax9b_v1: db 0, 1,-1,-1,-1,-1,-1,-1, 4, 5,-1,-1,-1,-1,-1,-1
intrax9b_v2: db 2, 3,-1,-1,-1,-1,-1,-1, 6, 7,-1,-1,-1,-1,-1,-1
transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
sw_f0: dq 0xfff0, 0
sq_0f: dq 0xffffffff, 0
pd_f0: times 4 dd 0xffff0000
......@@ -417,6 +420,12 @@ INIT_MMX ssse3
SSD 4, 4
SSD 4, 8
SSD 4, 16
INIT_XMM xop
SSD 16, 16
SSD 8, 8
SSD 16, 8
SSD 8, 16
SSD 8, 4
%assign function_align 16
%endif ; !HIGH_BIT_DEPTH
......@@ -654,20 +663,20 @@ SSD_NV12
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
cglobal pixel_var_8x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 8, 16
cglobal pixel_var_8x8, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
......@@ -702,6 +711,8 @@ INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; HIGH_BIT_DEPTH
%ifndef HIGH_BIT_DEPTH
......@@ -756,6 +767,8 @@ INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; !HIGH_BIT_DEPTH
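The var functions instantiated above return two accumulators packed into one uint64_t (matching the DECL_PIXELS( uint64_t, var, ... ) prototypes): the pixel sum in the low 32 bits and the sum of squares in the high 32 bits, from which the caller derives the variance. A scalar sketch:

#include <stdint.h>

static uint64_t pixel_var_8x8_ref( const uint8_t *pix, int stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++, pix += stride )
        for( int x = 0; x < 8; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return sum + ((uint64_t)sqr << 32); /* sum in the low half, ssd in the high */
}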
%macro VAR2_END 0
......@@ -773,8 +786,8 @@ VAR
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, int, pixel *, int, int * )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var2_8x8, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, 8
......@@ -809,8 +822,8 @@ cglobal pixel_var2_8x8_mmx2, 5,6
VAR2_END
RET
INIT_XMM sse2
cglobal pixel_var2_8x8, 5,6,8
VAR_START 1
mov r5d, 4
.loop:
......@@ -842,7 +855,8 @@ cglobal pixel_var2_8x8_sse2, 5,6,8
RET
%ifndef HIGH_BIT_DEPTH
%macro VAR2_8x8 0
cglobal pixel_var2_8x8, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
......@@ -884,6 +898,13 @@ cglobal pixel_var2_8x8_ssse3, 5,6,8
jg .loop
VAR2_END
RET
%endmacro
INIT_XMM ssse3
VAR2_8x8
INIT_XMM xop
VAR2_8x8
%endif ; !HIGH_BIT_DEPTH
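var2_8x8 is the same statistic taken over the difference of two 8x8 blocks, returning the variance and writing the SSD through a pointer; a hedged scalar sketch (the >>6 divides by the 64 samples):

static int pixel_var2_8x8_ref( const uint8_t *pix1, int s1,
                               const uint8_t *pix2, int s2, int *ssd )
{
    int sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++, pix1 += s1, pix2 += s2 )
        for( int x = 0; x < 8; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d;
            sqr += d * d;
        }
    *ssd = sqr;
    return sqr - ( sum * sum >> 6 );
}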
;=============================================================================
......@@ -1680,6 +1701,20 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
paddusw m2, m0
; 3x HADDW
%if cpuflag(xop)
phaddw m2, m14
vphadduwq m0, m15
movhlps m1, m0
vphadduwq m2, m2 ; i8x8_v, i8x8_h
paddd m0, m1 ; i8x8_dc
packusdw m2, m0 ; i8x8_v, i8x8_h, i8x8_dc
pxor m3, m3
psrlw m2, 1
pavgw m2, m3
movq [r2], m2 ; i8x8_v, i8x8_h
psrldq m2, 8
movd [r2+8], m2 ; i8x8_dc
%else
movdqa m7, [pw_1]
pmaddwd m2, m7
pmaddwd m14, m7
......@@ -1697,6 +1732,7 @@ cglobal intra_sa8d_x3_8x8, 3,3,16
movq [r2], m3 ; i8x8_v, i8x8_h
psrldq m3, 8
movd [r2+8], m3 ; i8x8_dc
%endif
RET
%endif ; ARCH_X86_64
%endmacro ; INTRA_SA8D_SSE2
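The XOP branch above leans on vphadduwq, which sums the four unsigned 16-bit words of each 64-bit lane into that lane; that is why the pmaddwd-by-pw_1 plus movhlps/paddd reduction chain of the SSE2 path shrinks to a couple of instructions. A scalar model of the instruction's semantics:

#include <stdint.h>

static void vphadduwq_model( uint64_t dst[2], const uint16_t src[8] )
{
    /* each result qword = sum of the 4 words in the corresponding source lane */
    dst[0] = (uint64_t)src[0] + src[1] + src[2] + src[3];
    dst[1] = (uint64_t)src[4] + src[5] + src[6] + src[7];
}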
......@@ -2088,11 +2124,9 @@ cglobal intra_satd_x3_8x8c, 0,6
psignw m%1, [pw_pmpmpmpm]
paddw m0, m%1
psllw m0, 2 ; hadamard(top), hadamard(left)
movhlps m3, m0
pshufb m1, m0, [intrax9b_v1]
pshufb m2, m0, [intrax9b_v2]
paddw m0, m3
psignw m3, [pw_pmmpzzzz] ; FIXME could this be eliminated?
pavgw m0, [pw_16]
......@@ -2122,8 +2156,14 @@ cglobal intra_satd_x3_8x8c, 0,6
%endif
movhlps m2, m1
paddw m1, m2
%if cpuflag(xop)
vphaddwq m3, m3
vphaddwq m1, m1
packssdw m1, m3
%else
phaddw m1, m3
pmaddwd m1, [pw_1] ; v, _, h, dc
%endif
%endmacro ; INTRA_X9_VHDC
%macro INTRA_X9_END 1
......@@ -2167,6 +2207,7 @@ cglobal intra_satd_x3_8x8c, 0,6
;-----------------------------------------------------------------------------
; int intra_sad_x9_4x4( uint8_t *fenc, uint8_t *fdec, uint16_t *bitcosts )
;-----------------------------------------------------------------------------
%if notcpuflag(xop)
cglobal intra_sad_x9_4x4, 3,3,9
%ifdef ARCH_X86_64
INTRA_X9_PRED intrax9a, m8
......@@ -2206,12 +2247,10 @@ cglobal intra_sad_x9_4x4, 3,3,9
mova m7, [rsp]
%define %%zero [pb_0]
%endif
pshufb m3, m7, [intrax9a_vh1]
pshufb m5, m7, [intrax9a_vh2]
pshufb m7, [intrax9a_dc]
psadbw m7, %%zero
psrlw m7, 2
psadbw m3, m0
pavgw m7, %%zero
......@@ -2236,6 +2275,7 @@ cglobal intra_sad_x9_4x4, 3,3,9
add rsp, 0x1c
%endif
RET
%endif
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
......@@ -2940,6 +2980,16 @@ INTRA_X9
%endif
HADAMARD_AC_SSE2
%define TRANS TRANS_XOP
INIT_XMM xop
SATDS_SSE2
SA8D
%ifndef HIGH_BIT_DEPTH
INTRA_SA8D_SSE2
INTRA_X9
%endif
HADAMARD_AC_SSE2
;=============================================================================
; SSIM
;=============================================================================
......
......@@ -62,16 +62,19 @@ DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
DECL_X1( ssd, xop )
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
......@@ -84,11 +87,13 @@ DECL_X4( sad, cache64_ssse3 );
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
......@@ -110,6 +115,7 @@ void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_avx ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_xop ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_sse2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_8x8_ssse3 ( pixel *, pixel *, int * );
......@@ -117,6 +123,7 @@ void x264_intra_sad_x3_8x8_avx ( pixel *, pixel *, int * );
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
......@@ -141,6 +148,7 @@ float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmx2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_var2_8x8_xop( uint8_t *, int, uint8_t *, int, int * );
int x264_pixel_vsad_mmx2( pixel *src, int stride, int height );
int x264_pixel_vsad_sse2( pixel *src, int stride, int height );
......
......@@ -582,9 +582,9 @@ PREDICT_4x4_V1 b
;-----------------------------------------------------------------------------
; void predict_4x4_dc( pixel *src )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,1
mova m2, [r0+0*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+1*FDEC_STRIDEB-4*SIZEOF_PIXEL]
paddw m2, [r0+2*FDEC_STRIDEB-4*SIZEOF_PIXEL]
......@@ -603,8 +603,7 @@ cglobal predict_4x4_dc_mmx2, 1,1
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_4x4_dc, 1,4
pxor mm7, mm7
movd mm0, [r0-FDEC_STRIDEB]
psadbw mm0, mm7
......@@ -797,8 +796,8 @@ PREDICT_8x8_H bw, W
; void predict_8x8_dc( pixel *src, pixel *edge );
;-----------------------------------------------------------------------------
%ifdef HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal predict_8x8_dc, 2,2
movu m0, [r1+14]
paddw m0, [r1+32]
HADDW m0, m1
......@@ -809,8 +808,8 @@ cglobal predict_8x8_dc_sse2, 2,2
REP_RET
%else ; !HIGH_BIT_DEPTH
INIT_MMX mmx2
cglobal predict_8x8_dc, 2,2
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r1+7]
......@@ -839,9 +838,9 @@ cglobal %1, 2,2
STORE8x8 m0, m0
RET
%endmacro
INIT_XMM sse2
PREDICT_8x8_DC predict_8x8_dc_top , 32, mova
PREDICT_8x8_DC predict_8x8_dc_left, 14, movu
%else ; !HIGH_BIT_DEPTH
%macro PREDICT_8x8_DC 2
......@@ -1106,9 +1105,9 @@ ALIGN 4
REP_RET
%endif ; !ARCH_X86_64
%macro PREDICT_8x8C 0
%ifdef HIGH_BIT_DEPTH
cglobal predict_8x8c_p_core, 1,1,7
movd m0, r1m
movd m2, r2m
movd m4, r3m
......@@ -1133,7 +1132,7 @@ cglobal predict_8x8c_p_core_sse2, 1,1,7
jg .loop
REP_RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x8c_p_core, 1,1
movd m0, r1m
movd m2, r2m
movd m4, r3m
......@@ -1163,12 +1162,19 @@ call .loop
movhps [r0+FDEC_STRIDE*3], m5
RET
%endif ; HIGH_BIT_DEPTH
%endmacro
INIT_XMM sse2
PREDICT_8x8C
INIT_XMM avx
PREDICT_8x8C
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
;-----------------------------------------------------------------------------
%ifndef ARCH_X86_64
INIT_MMX mmx2
cglobal predict_16x16_p_core, 1,2
LOAD_PLANE_ARGS
movq mm5, mm2
movq mm1, mm2
......@@ -1912,8 +1918,8 @@ PREDICT_16x16_H
%endif
%endmacro
INIT_MMX mmx2
cglobal predict_16x16_dc_core, 1,2
%ifdef ARCH_X86_64
movd m6, r1d
PRED16x16_DC m6, 5
......@@ -1922,20 +1928,20 @@ cglobal predict_16x16_dc_core_mmx2, 1,2
%endif
REP_RET
INIT_MMX mmx2
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC [pw_8], 4
REP_RET
INIT_MMX mmx2
%ifdef HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16 m0, m0, m0, m0
REP_RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
pshufw m0, m0, 0
packuswb m0, m0
......@@ -1969,25 +1975,25 @@ cglobal predict_16x16_dc_left_core_mmx2, 1,1
%endif
%endmacro
INIT_XMM sse2
cglobal predict_16x16_dc_core, 2,2,4
movd m3, r1m
PRED16x16_DC_SSE2 m3, 5
REP_RET
cglobal predict_16x16_dc_top, 1,2
PRED16x16_DC_SSE2 [pw_8], 4
REP_RET
INIT_XMM sse2
%ifdef HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,2
movd m0, r1m
SPLATW m0, m0
STORE16x16_SSE2 m0, m0
REP_RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_16x16_dc_left_core, 1,1
movd m0, r1m
SPLATW m0, m0
packuswb m0, m0
......
......@@ -191,55 +191,65 @@ PREDICT_8x8_P( sse2 )
#endif //!HIGH_BIT_DEPTH
#if HAVE_X86_INLINE_ASM
#define PREDICT_8x8C_P_CORE\
V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
+ 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
+ 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
+ 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
H += -4 * src[-1*FDEC_STRIDE -1];\
int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
int b = ( 17 * H + 16 ) >> 5;\
int c = ( 17 * V + 16 ) >> 5;\
#if HIGH_BIT_DEPTH
static void x264_predict_8x8c_p_sse2( uint16_t *src )
#else
static void x264_predict_8x8c_p_ssse3( uint8_t *src )
#endif
{
int a, b, c, i00;
int H, V;
#if HIGH_BIT_DEPTH
asm (
"movdqa %1, %%xmm0 \n"
"pmaddwd %2, %%xmm0 \n"
"movhlps %%xmm0, %%xmm1 \n"
"paddd %%xmm1, %%xmm0 \n"
"pshuflw $14, %%xmm0, %%xmm1 \n"
"paddd %%xmm1, %%xmm0 \n"
"movd %%xmm0, %0 \n"
:"=r"(H)
:"m"(src[-FDEC_STRIDE]), "m"(*pw_m32101234)
);
#else
asm (
"movq %1, %%mm0 \n"
"pmaddubsw %2, %%mm0 \n"
"pshufw $14, %%mm0, %%mm1 \n"
"paddw %%mm1, %%mm0 \n"
"pshufw $1, %%mm0, %%mm1 \n"
"paddw %%mm1, %%mm0 \n"
"movd %%mm0, %0 \n"
"movswl %w0, %0 \n"
:"=r"(H)
:"m"(src[-FDEC_STRIDE]), "m"(*pb_m32101234)
);
#endif
V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )
+ 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )
+ 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )
+ 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );
H += -4 * src[-1*FDEC_STRIDE -1];
a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );
b = ( 17 * H + 16 ) >> 5;
c = ( 17 * V + 16 ) >> 5;
i00 = a -3*b -3*c + 16;
#if HIGH_BIT_DEPTH
x264_predict_8x8c_p_core_sse2( src, a, b, c );
#else
x264_predict_8x8c_p_core_sse2( src, i00, b, c );
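The inline asm above computes the plane-prediction gradient H: pmaddwd (pmaddubsw in the 8-bit path) against pw_m32101234/pb_m32101234 applies weights across the row above the block, and the shared macro then folds in the -4*top-left term and derives b, c and i00. A scalar equivalent, assuming the -3..4 weights implied by the constant names:

static int plane_h( const uint8_t *src ) /* src = top-left pixel of the 8x8 chroma block */
{
    int H = 0;
    for( int i = 0; i < 8; i++ )
        H += (i - 3) * src[i - FDEC_STRIDE]; /* weights -3..4 over the top row */
    return H - 4 * src[-1*FDEC_STRIDE - 1]; /* fold in the top-left term */
}
/* then: b = (17*H+16)>>5, c = (17*V+16)>>5, i00 = a - 3*b - 3*c + 16 */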