Commit 7e60fcd7 authored by Loren Merritt, committed by Fiona Glaser

Enable some existing asm functions that were missing function pointers

pixel_ads1_avx, predict_8x8_hd_avx
High bit depth mc_copy_w8_sse2, denoise_dct_avx, prefetch_fenc/ref, and several pixel*sse4.
parent 52f287e8
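For context: x264 routes every speed-critical primitive through a table of function pointers that an init function fills in according to the detected CPU flags. An assembly routine that is built but never assigned to its slot is dead weight, which is exactly the bug this commit fixes. A minimal sketch of the pattern, with hypothetical names standing in for the real tables in common/pixel.c and friends:

```c
/* Minimal sketch of x264-style CPU dispatch. All names here are
 * illustrative, not the real x264 symbols. */
#include <stdint.h>
#include <stdlib.h>

typedef struct {
    int (*sad_16x16)( const uint8_t *a, int sa, const uint8_t *b, int sb );
} pixel_functions_t;

/* Portable C fallback. */
static int sad_16x16_c( const uint8_t *a, int sa, const uint8_t *b, int sb )
{
    int sum = 0;
    for( int y = 0; y < 16; y++ )
        for( int x = 0; x < 16; x++ )
            sum += abs( a[y*sa+x] - b[y*sb+x] );
    return sum;
}

/* Stand-in for a hand-written AVX version. */
static int sad_16x16_avx( const uint8_t *a, int sa, const uint8_t *b, int sb )
{
    return sad_16x16_c( a, sa, b, sb );
}

#define CPU_AVX 0x1

static void pixel_init( int cpu, pixel_functions_t *pf )
{
    pf->sad_16x16 = sad_16x16_c;          /* safe default */
    if( cpu & CPU_AVX )
        pf->sad_16x16 = sad_16x16_avx;    /* omit this line and the AVX
                                             routine is compiled but never
                                             called: a "missing function
                                             pointer" */
}
```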
common/pixel.c
@@ -546,6 +546,7 @@ INTRA_MBCMP( sad, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
+INTRA_MBCMP( sad, 4, v, h, dc, , _ssse3 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
#endif
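For orientation between hunks: the INTRA_MBCMP macro stamps out intra_sad_x3/intra_satd_x3 helpers that score three intra prediction modes in one call, and the added _ssse3 line instantiates the 4x4 SAD variant for high bit depth. Roughly, and simplified (the real macro also parameterizes the chroma suffix and relies on x264-internal predict functions, types, and strides):

```c
/* Simplified sketch of what INTRA_MBCMP( sad, 4, v, h, dc, , _ssse3 )
 * generates: predict each mode into the decode buffer, then score it
 * against the encoded block with the matching SAD routine. */
static void x264_intra_sad_x3_4x4_ssse3( pixel *fenc, pixel *fdec, int res[3] )
{
    x264_predict_4x4_v_c( fdec );   /* vertical   */
    res[0] = x264_pixel_sad_4x4_ssse3( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
    x264_predict_4x4_h_c( fdec );   /* horizontal */
    res[1] = x264_pixel_sad_4x4_ssse3( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
    x264_predict_4x4_dc_c( fdec );  /* DC         */
    res[2] = x264_pixel_sad_4x4_ssse3( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );
}
```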
@@ -873,10 +874,35 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+        pixf->intra_sad_x3_4x4  = x264_intra_sad_x3_4x4_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
}
+    if( cpu&X264_CPU_SSE4 )
+    {
+        if( !(cpu&X264_CPU_STACK_MOD4) )
+        {
+            INIT4( hadamard_ac, _sse4 );
+        }
+        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
+        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
+    }
+    if( cpu&X264_CPU_AVX )
+    {
+        INIT_ADS( _avx );
+        if( !(cpu&X264_CPU_STACK_MOD4) )
+        {
+            INIT4( hadamard_ac, _avx );
+        }
+        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
+        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
+        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
+        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
+    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
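Two details of the new high-bit-depth blocks above: INIT4 is one of pixel.c's batch-assignment helpers, and the !(cpu&X264_CPU_STACK_MOD4) guard skips hadamard_ac on configurations where the stack may be only 4-byte aligned, since those routines presumably spill to aligned stack temporaries. The helper plausibly expands along these lines (illustrative; check pixel.c for the exact definition):

```c
/* Plausible shape of the INIT4 helper: wire the four largest block
 * sizes of one metric to a given instruction-set suffix. */
#define INIT4( name, cpu )\
    pixf->name[PIXEL_16x16] = x264_pixel_##name##_16x16##cpu;\
    pixf->name[PIXEL_16x8]  = x264_pixel_##name##_16x8##cpu;\
    pixf->name[PIXEL_8x16]  = x264_pixel_##name##_8x16##cpu;\
    pixf->name[PIXEL_8x8]   = x264_pixel_##name##_8x8##cpu;
```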
@@ -1038,6 +1064,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+            pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
@@ -1062,8 +1090,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
-        /* Slower on Conroe, so only enable under SSE4 */
-        pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_ssse3;
}
if( cpu&X264_CPU_AVX )
@@ -1071,8 +1097,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
-        pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
-        pixf->ads[PIXEL_16x8]  = x264_pixel_ads2_avx;
+        INIT_ADS( _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
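The last hunk above is where pixel_ads1_avx from the commit message gets enabled: the two hand-written ads assignments covered only ads4 and ads2, while INIT_ADS covers all three ads sizes. Its presumed expansion:

```c
/* Presumed expansion of INIT_ADS -- one slot per ads variant, so the
 * old explicit list that stopped at ads2 silently left ads1 unset. */
#define INIT_ADS( cpu )\
    pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
    pixf->ads[PIXEL_16x8]  = x264_pixel_ads2##cpu;\
    pixf->ads[PIXEL_8x8]   = x264_pixel_ads1##cpu;
```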
common/quant.c
@@ -428,6 +428,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
+    if( cpu&X264_CPU_AVX )
+    {
+        pf->denoise_dct = x264_denoise_dct_avx;
+    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
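denoise_dct, whose AVX version is enabled above for high-bit-depth builds, is a small kernel: accumulate each coefficient's magnitude into a per-position sum, then shrink the coefficient toward zero by a per-position offset. A sketch close in spirit to x264's C reference (types simplified here; treat the details as approximate):

```c
#include <stdint.h>

typedef int32_t  dctcoef;   /* x264 uses 16- or 32-bit coefs depending on bit depth */
typedef uint32_t udctcoef;

/* Approximate shape of x264's denoise_dct C fallback. */
static void denoise_dct_c( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
{
    for( int i = 0; i < size; i++ )
    {
        int level = dct[i];
        int sign = level >> 31;               /* -1 if negative, else 0 */
        level = (level + sign) ^ sign;        /* branchless abs()       */
        sum[i] += level;                      /* statistics for later   */
        level -= offset[i];                   /* shrink toward zero     */
        dct[i] = level < 0 ? 0 : (level ^ sign) - sign;  /* restore sign */
    }
}
```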
common/x86/mc-a.asm
@@ -1291,19 +1291,21 @@ MC_COPY 16
;=============================================================================
; prefetch
;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-;                     uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+;                     pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
INIT_MMX
%ifdef ARCH_X86_64
cglobal prefetch_fenc_mmx2, 5,5
+    FIX_STRIDES r1d, r3d
and r4d, 3
mov eax, r4d
imul r4d, r1d
-    lea r0, [r0+r4*4+64]
+    lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
@@ -1311,7 +1313,7 @@ cglobal prefetch_fenc_mmx2, 5,5
prefetcht0 [r0+r1]
imul eax, r3d
-    lea r2, [r2+rax*2+64]
+    lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
prefetcht0 [r2]
prefetcht0 [r2+r3]
RET
@@ -1321,9 +1323,10 @@ cglobal prefetch_fenc_mmx2, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
+    FIX_STRIDES r1
and r2, 3
imul r2, r1
-    lea r0, [r0+r2*4+64]
+    lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
@@ -1333,21 +1336,23 @@ cglobal prefetch_fenc_mmx2, 0,3
mov r2, r4m
mov r1, r3m
mov r0, r2m
+    FIX_STRIDES r1
and r2, 3
imul r2, r1
-    lea r0, [r0+r2*2+64]
+    lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
ret
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref_mmx2, 3,3
+    FIX_STRIDES r1d
dec r2d
and r2d, r1d
-    lea r0, [r0+r2*8+64]
+    lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
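The pattern in the prefetch changes above is uniform: strides arrive in pixels, so FIX_STRIDES converts them to bytes, and the fixed +64 cacheline offset becomes 64*SIZEOF_PIXEL (SIZEOF_PIXEL is 2 in high-bit-depth builds). A rough C rendering of prefetch_fenc's address math, using GCC's __builtin_prefetch as a stand-in for prefetcht0:

```c
#include <stdint.h>

#define SIZEOF_PIXEL 2          /* high bit depth: 16-bit pixels */
typedef uint16_t pixel;

/* Rough C equivalent of prefetch_fenc_mmx2: touch four luma rows and
 * two chroma rows of the macroblock selected by mb_x. Approximate. */
static void prefetch_fenc( pixel *pix_y, int stride_y,
                           pixel *pix_uv, int stride_uv, int mb_x )
{
    stride_y  *= SIZEOF_PIXEL;  /* FIX_STRIDES: pixel stride -> byte stride */
    stride_uv *= SIZEOF_PIXEL;
    mb_x &= 3;
    uint8_t *y  = (uint8_t*)pix_y  + mb_x*stride_y*4  + 64*SIZEOF_PIXEL;
    uint8_t *uv = (uint8_t*)pix_uv + mb_x*stride_uv*2 + 64*SIZEOF_PIXEL;
    for( int i = 0; i < 4; i++ )
        __builtin_prefetch( y + i*stride_y );   /* prefetcht0 */
    __builtin_prefetch( uv );
    __builtin_prefetch( uv + stride_uv );
}
```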
common/x86/mc.h
@@ -85,8 +85,8 @@ void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
-void x264_prefetch_ref_mmx2( uint8_t *, int, int );
+void x264_prefetch_fenc_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_ref_mmx2( pixel *, int, int );
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
common/x86/mc-c.c
@@ -225,7 +225,11 @@ static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int,
};
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+#if HIGH_BIT_DEPTH
+MC_COPY_WTAB(sse2,mmx,sse2,sse2)
+#else
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#endif
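The table built by MC_COPY_WTAB is indexed by width/4, and the new #if picks a per-width set of copy routines for each build: in high-bit-depth builds an 8-pixel row is 16 bytes, so the previously unused mc_copy_w8_sse2 becomes worthwhile, while 8-bit builds keep the MMX version. A sketch of the resulting sse2 table (function names as declared in mc.h; the exact layout is an assumption):

```c
/* Assumed expansion of MC_COPY_WTAB(sse2,...): entry [w/4] holds the
 * copy routine for block width w; widths 0 and 12 are never used. */
static void (* const x264_mc_copy_wtab_sse2[5])( pixel *, int, pixel *, int, int ) =
{
    NULL,                   /* w = 0: unused                            */
    x264_mc_copy_w4_mmx,    /* w = 4                                    */
#if HIGH_BIT_DEPTH
    x264_mc_copy_w8_sse2,   /* w = 8: 16 bytes/row in 10-bit, SSE2 wins */
#else
    x264_mc_copy_w8_mmx,    /* w = 8: only 8 bytes/row in 8-bit         */
#endif
    NULL,                   /* w = 12: unused                           */
    x264_mc_copy_w16_sse2,  /* w = 16                                   */
};
```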
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
@@ -510,6 +514,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_MMX2) )
return;
+    pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+    pf->prefetch_ref  = x264_prefetch_ref_mmx2;
pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
@@ -605,8 +612,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH
-    pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
-    pf->prefetch_ref  = x264_prefetch_ref_mmx2;
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&X264_CPU_CACHELINE_32 )
common/x86/predict-c.c
@@ -566,6 +566,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
+        pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
#endif // HIGH_BIT_DEPTH
}