Commit 189c30d3 authored by Loren Merritt, committed by Fiona Glaser

Cosmetics: s/mmxext/mmx2/

parent b37de189
@@ -40,7 +40,7 @@ static uint8_t *x264_nal_escape_c( uint8_t *dst, uint8_t *src, uint8_t *end )
 }
 #if HAVE_MMX
-uint8_t *x264_nal_escape_mmxext( uint8_t *dst, uint8_t *src, uint8_t *end );
+uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
 uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
 uint8_t *x264_nal_escape_avx( uint8_t *dst, uint8_t *src, uint8_t *end );
 #endif
@@ -90,8 +90,8 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
 {
     pf->nal_escape = x264_nal_escape_c;
 #if HAVE_MMX
-    if( cpu&X264_CPU_MMXEXT )
-        pf->nal_escape = x264_nal_escape_mmxext;
+    if( cpu&X264_CPU_MMX2 )
+        pf->nal_escape = x264_nal_escape_mmx2;
     if( (cpu&X264_CPU_SSE2) && (cpu&X264_CPU_SSE2_IS_FAST) )
         pf->nal_escape = x264_nal_escape_sse2;
     if( cpu&X264_CPU_AVX )
......
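Aside: the x264_nal_escape_* functions renamed above perform H.264 emulation prevention: while copying a NAL payload, a 0x03 byte is inserted wherever two consecutive zero bytes would otherwise be followed by a byte <= 0x03. A minimal scalar sketch of that contract (illustrative only; x264's actual x264_nal_escape_c differs in detail):

    #include <stdint.h>

    /* Copy [src,end) to dst, emitting an emulation-prevention 0x03
     * whenever two zero bytes are followed by a byte <= 0x03.
     * Returns the new end of dst. */
    static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
    {
        int zeros = 0;
        while( src < end )
        {
            if( zeros >= 2 && *src <= 0x03 )
            {
                *dst++ = 0x03;
                zeros = 0;
            }
            zeros = ( *src == 0 ) ? zeros + 1 : 0;
            *dst++ = *src++;
        }
        return dst;
    }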
@@ -47,31 +47,33 @@
 const x264_cpu_name_t x264_cpu_names[] =
 {
-    {"Altivec", X264_CPU_ALTIVEC},
-//  {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
-    {"MMX2", X264_CPU_MMX|X264_CPU_MMXEXT},
-    {"MMXEXT", X264_CPU_MMX|X264_CPU_MMXEXT},
-//  {"SSE", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE}, // there are no sse1 functions in x264
-    {"SSE2Slow",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_SLOW},
-    {"SSE2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2},
-    {"SSE2Fast",X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE2_IS_FAST},
-    {"SSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3},
-    {"SSSE3", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
-    {"FastShuffle", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SHUFFLE_IS_FAST},
-    {"SSE4.1", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
-    {"SSE4", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
-    {"SSE4.2", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
-    {"AVX", X264_CPU_MMX|X264_CPU_MMXEXT|X264_CPU_SSE|X264_CPU_SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
-    {"Cache32", X264_CPU_CACHELINE_32},
-    {"Cache64", X264_CPU_CACHELINE_64},
-    {"SSEMisalign", X264_CPU_SSE_MISALIGN},
-    {"LZCNT", X264_CPU_LZCNT},
+    {"Altivec", X264_CPU_ALTIVEC},
+//  {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
+    {"MMX2", X264_CPU_MMX|X264_CPU_MMX2},
+    {"MMXEXT", X264_CPU_MMX|X264_CPU_MMX2},
+//  {"SSE", X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE}, // there are no sse1 functions in x264
+#define SSE2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_SSE|X264_CPU_SSE2
+    {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
+    {"SSE2", SSE2},
+    {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
+    {"SSE3", SSE2|X264_CPU_SSE3},
+    {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
+    {"FastShuffle", SSE2|X264_CPU_SHUFFLE_IS_FAST},
+    {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
+    {"SSE4.2", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42},
+    {"AVX", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4|X264_CPU_SSE42|X264_CPU_AVX},
+#undef SSE2
+    {"Cache32", X264_CPU_CACHELINE_32},
+    {"Cache64", X264_CPU_CACHELINE_64},
+    {"SSEMisalign", X264_CPU_SSE_MISALIGN},
+    {"LZCNT", X264_CPU_LZCNT},
     {"Slow_mod4_stack", X264_CPU_STACK_MOD4},
-    {"ARMv6", X264_CPU_ARMV6},
-    {"NEON", X264_CPU_NEON},
-    {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
-    {"SlowCTZ", X264_CPU_SLOW_CTZ},
-    {"SlowAtom", X264_CPU_SLOW_ATOM},
+    {"ARMv6", X264_CPU_ARMV6},
+    {"NEON", X264_CPU_NEON},
+    {"Fast_NEON_MRC", X264_CPU_FAST_NEON_MRC},
+    {"SlowCTZ", X264_CPU_SLOW_CTZ},
+    {"SlowAtom", X264_CPU_SLOW_ATOM},
     {"", 0},
 };
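Aside: the #define SSE2 ... #undef SSE2 pair introduced above is a scoped-macro idiom: the shared flag prefix is named once, used by the rows that need it, then undefined so the short name cannot leak into the rest of the file. The same pattern in isolation (all names here are illustrative, not from x264):

    #include <stdio.h>

    enum { FLAG_A = 1, FLAG_B = 2, FLAG_C = 4 };

    typedef struct { const char *name; unsigned flags; } cap_name_t;

    /* Define the common prefix immediately before the table... */
    #define BASE FLAG_A|FLAG_B
    static const cap_name_t caps[] =
    {
        {"B", BASE},          /* unparenthesized BASE is safe here: it is */
        {"C", BASE|FLAG_C},   /* only ever combined with further '|'      */
        {"", 0},
    };
    /* ...and undefine it right after, limiting its scope to the table. */
    #undef BASE

    int main( void )
    {
        for( int i = 0; caps[i].name[0]; i++ )
            printf( "%s: %#x\n", caps[i].name, caps[i].flags );
        return 0;
    }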
@@ -122,7 +124,7 @@ uint32_t x264_cpu_detect( void )
     else
         return 0;
     if( edx&0x02000000 )
-        cpu |= X264_CPU_MMXEXT|X264_CPU_SSE;
+        cpu |= X264_CPU_MMX2|X264_CPU_SSE;
     if( edx&0x04000000 )
         cpu |= X264_CPU_SSE2;
     if( ecx&0x00000001 )
@@ -155,7 +157,7 @@ uint32_t x264_cpu_detect( void )
         cpu |= X264_CPU_SLOW_CTZ;
     x264_cpu_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx );
     if( edx&0x00400000 )
-        cpu |= X264_CPU_MMXEXT;
+        cpu |= X264_CPU_MMX2;
     if( cpu & X264_CPU_SSE2 )
     {
         if( ecx&0x00000040 ) /* SSE4a */
......
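Aside: the bit tests in this file read CPUID feature flags: leaf 1 EDX bit 25 (0x02000000) is SSE, which implies the MMX2 integer subset, hence MMX2|SSE above; EDX bit 26 is SSE2; ECX bit 0 is SSE3; and extended leaf 0x80000001 EDX bit 22 (0x00400000) is AMD's MMX extensions. A standalone sketch using GCC/Clang's <cpuid.h> instead of x264's own x264_cpu_cpuid() helper (illustrative only):

    #include <cpuid.h>
    #include <stdio.h>

    int main( void )
    {
        unsigned eax, ebx, ecx, edx;

        /* Leaf 1: standard feature flags. */
        if( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return 1;
        if( edx & (1u << 25) ) /* SSE -> also the MMX2 integer ops */
            puts( "MMX2 + SSE" );
        if( edx & (1u << 26) ) /* SSE2 */
            puts( "SSE2" );
        if( ecx & (1u << 0) )  /* SSE3 */
            puts( "SSE3" );

        /* Extended leaf 0x80000001, EDX bit 22: AMD MMX extensions. */
        if( __get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) && ( edx & (1u << 22) ) )
            puts( "MMX2 (AMD flag)" );
        return 0;
    }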
@@ -460,7 +460,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
         dctf->dct4x4dc = x264_dct4x4dc_mmx;
         dctf->idct4x4dc = x264_idct4x4dc_mmx;
-        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
+        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
 #if !ARCH_X86_64
         dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
@@ -783,11 +783,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX2 )
     {
-        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
-        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
-        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
+        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
+        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
+        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
     }
     if( cpu&X264_CPU_SSE2_IS_FAST )
         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
......
@@ -625,42 +625,42 @@ void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
 void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                   int mvy_limit, int bframe );
-void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                   int mvy_limit, int bframe );
-void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                   int mvy_limit, int bframe );
-void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
-                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
-                                   int mvy_limit, int bframe );
+void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
+void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
+void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
+void x264_deblock_strength_avx  ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
 #if ARCH_X86
-void x264_deblock_h_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_luma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-void x264_deblock_v_chroma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
 #if HIGH_BIT_DEPTH
-void x264_deblock_v_luma_mmxext( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_luma_intra_mmxext( pixel *pix, int stride, int alpha, int beta );
+void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
 #else
 // FIXME this wrapper has a significant cpu cost
-static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
 {
-    x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
-    x264_deblock_v8_luma_mmxext( pix+8, stride, alpha, beta, tc0+2 );
+    x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 );
+    x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 );
 }
-static void x264_deblock_v_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
+static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta )
 {
-    x264_deblock_v8_luma_intra_mmxext( pix, stride, alpha, beta );
-    x264_deblock_v8_luma_intra_mmxext( pix+8, stride, alpha, beta );
+    x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta );
+    x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta );
 }
 #endif // HIGH_BIT_DEPTH
 #endif
@@ -695,19 +695,19 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
     pf->deblock_strength = deblock_strength_c;
 #if HAVE_MMX
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX2 )
     {
 #if ARCH_X86
-        pf->deblock_luma[1] = x264_deblock_v_luma_mmxext;
-        pf->deblock_luma[0] = x264_deblock_h_luma_mmxext;
-        pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
-        pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
-        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
-        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
-        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
-        pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
+        pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
+        pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
+        pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
+        pf->deblock_chroma[0] = x264_deblock_h_chroma_mmx2;
+        pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
+        pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
+        pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
+        pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmx2;
 #endif
-        pf->deblock_strength = x264_deblock_strength_mmxext;
+        pf->deblock_strength = x264_deblock_strength_mmx2;
         if( cpu&X264_CPU_SSE2 )
         {
             pf->deblock_strength = x264_deblock_strength_sse2;
......
@@ -115,7 +115,7 @@ void __intel_cpu_indicator_init( void )
         __intel_cpu_indicator = 0x200;
     else if( cpu&X264_CPU_SSE )
         __intel_cpu_indicator = 0x80;
-    else if( cpu&X264_CPU_MMXEXT )
+    else if( cpu&X264_CPU_MMX2 )
         __intel_cpu_indicator = 8;
     else
         __intel_cpu_indicator = 1;
......
@@ -483,7 +483,7 @@ SATD_X( 4x4, cpu )
 SATD_X_DECL7()
 #if HAVE_MMX
-SATD_X_DECL7( _mmxext )
+SATD_X_DECL7( _mmx2 )
 #if !HIGH_BIT_DEPTH
 SATD_X_DECL6( _sse2 )
 SATD_X_DECL7( _ssse3 )
@@ -513,7 +513,7 @@ void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[33], int res[3] )
 INTRA_MBCMP_8x8( sad, )
 INTRA_MBCMP_8x8(sa8d, )
 #if HIGH_BIT_DEPTH && HAVE_MMX
-INTRA_MBCMP_8x8( sad, _mmxext)
+INTRA_MBCMP_8x8( sad, _mmx2 )
 INTRA_MBCMP_8x8( sad, _sse2 )
 INTRA_MBCMP_8x8( sad, _ssse3 )
 INTRA_MBCMP_8x8(sa8d, _sse2 )
@@ -538,14 +538,14 @@ INTRA_MBCMP( sad, 16, v, h, dc, , )
 INTRA_MBCMP(satd, 16, v, h, dc, , )
 #if HIGH_BIT_DEPTH && HAVE_MMX
-INTRA_MBCMP( sad, 4, v, h, dc, , _mmxext)
-INTRA_MBCMP(satd, 4, v, h, dc, , _mmxext)
-INTRA_MBCMP( sad, 8, dc, h, v, c, _mmxext)
-INTRA_MBCMP(satd, 8, dc, h, v, c, _mmxext)
-INTRA_MBCMP( sad, 16, v, h, dc, , _mmxext)
-INTRA_MBCMP(satd, 16, v, h, dc, , _mmxext)
-INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
-INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
+INTRA_MBCMP( sad, 4, v, h, dc, , _mmx2 )
+INTRA_MBCMP(satd, 4, v, h, dc, , _mmx2 )
+INTRA_MBCMP( sad, 8, dc, h, v, c, _mmx2 )
+INTRA_MBCMP(satd, 8, dc, h, v, c, _mmx2 )
+INTRA_MBCMP( sad, 16, v, h, dc, , _mmx2 )
+INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
+INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
+INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
 INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
 INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
 #endif
@@ -790,30 +790,30 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #if HIGH_BIT_DEPTH
 #if HAVE_MMX
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX2 )
     {
-        INIT7( sad, _mmxext );
-        INIT7( sad_x3, _mmxext );
-        INIT7( sad_x4, _mmxext );
-        INIT7( satd, _mmxext );
-        INIT7( satd_x3, _mmxext );
-        INIT7( satd_x4, _mmxext );
-        INIT4( hadamard_ac, _mmxext );
-        INIT7( ssd, _mmxext );
-        INIT_ADS( _mmxext );
-        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
-        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
-        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
-        pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
-        pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
-        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmxext;
-        pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
-        pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
-        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
+        INIT7( sad, _mmx2 );
+        INIT7( sad_x3, _mmx2 );
+        INIT7( sad_x4, _mmx2 );
+        INIT7( satd, _mmx2 );
+        INIT7( satd_x3, _mmx2 );
+        INIT7( satd_x4, _mmx2 );
+        INIT4( hadamard_ac, _mmx2 );
+        INIT7( ssd, _mmx2 );
+        INIT_ADS( _mmx2 );
+        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
+        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+        pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+        pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
+        pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
+        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
+        pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
+        pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
     }
     if( cpu&X264_CPU_SSE2 )
     {
@@ -885,59 +885,59 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( ssd, _mmx );
     }
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX2 )
     {
-        INIT7( sad, _mmxext );
-        INIT7_NAME( sad_aligned, sad, _mmxext );
-        INIT7( sad_x3, _mmxext );
-        INIT7( sad_x4, _mmxext );
-        INIT7( satd, _mmxext );
-        INIT7( satd_x3, _mmxext );
-        INIT7( satd_x4, _mmxext );
-        INIT4( hadamard_ac, _mmxext );
-        INIT_ADS( _mmxext );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
-        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
-        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext;
+        INIT7( sad, _mmx2 );
+        INIT7_NAME( sad_aligned, sad, _mmx2 );
+        INIT7( sad_x3, _mmx2 );
+        INIT7( sad_x4, _mmx2 );
+        INIT7( satd, _mmx2 );
+        INIT7( satd_x3, _mmx2 );
+        INIT7( satd_x4, _mmx2 );
+        INIT4( hadamard_ac, _mmx2 );
+        INIT_ADS( _mmx2 );
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
+        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
+        pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
 #if ARCH_X86
-        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
-        pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
-        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
-        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
-        pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
-        pixf->vsad = x264_pixel_vsad_mmxext;
+        pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
+        pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
+        pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
+        pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
+        pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+        pixf->vsad = x264_pixel_vsad_mmx2;
         if( cpu&X264_CPU_CACHELINE_32 )
         {
-            INIT5( sad, _cache32_mmxext );
-            INIT4( sad_x3, _cache32_mmxext );
-            INIT4( sad_x4, _cache32_mmxext );
+            INIT5( sad, _cache32_mmx2 );
+            INIT4( sad_x3, _cache32_mmx2 );
+            INIT4( sad_x4, _cache32_mmx2 );
         }
         else if( cpu&X264_CPU_CACHELINE_64 )
        {
-            INIT5( sad, _cache64_mmxext );
-            INIT4( sad_x3, _cache64_mmxext );
-            INIT4( sad_x4, _cache64_mmxext );
+            INIT5( sad, _cache64_mmx2 );
+            INIT4( sad_x3, _cache64_mmx2 );
+            INIT4( sad_x4, _cache64_mmx2 );
         }
 #else
         if( cpu&X264_CPU_CACHELINE_64 )
         {
-            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmxext;
-            pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmxext;
-            pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmxext;
-            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmxext;
-            pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmxext;
-            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmxext;
-            pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmxext;
+            pixf->sad[PIXEL_8x16] = x264_pixel_sad_8x16_cache64_mmx2;
+            pixf->sad[PIXEL_8x8] = x264_pixel_sad_8x8_cache64_mmx2;
+            pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_cache64_mmx2;
+            pixf->sad_x3[PIXEL_8x16] = x264_pixel_sad_x3_8x16_cache64_mmx2;
+            pixf->sad_x3[PIXEL_8x8] = x264_pixel_sad_x3_8x8_cache64_mmx2;
+            pixf->sad_x4[PIXEL_8x16] = x264_pixel_sad_x4_8x16_cache64_mmx2;
+            pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_cache64_mmx2;
         }
 #endif
-        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
-        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
-        pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
-        pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
-        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmxext;
-        pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
-        pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
+        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmx2;
+        pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmx2;
+        pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmx2;
+        pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmx2;
+        pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
+        pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
+        pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
     }
     if( cpu&X264_CPU_SSE2 )
......
@@ -349,28 +349,28 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #if HIGH_BIT_DEPTH
 #if HAVE_MMX
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX2 )
     {
 #if ARCH_X86
         pf->denoise_dct = x264_denoise_dct_mmx;
-        pf->decimate_score15 = x264_decimate_score15_mmxext;
-        pf->decimate_score16 = x264_decimate_score16_mmxext;
+        pf->decimate_score15 = x264_decimate_score15_mmx2;
+        pf->decimate_score16 = x264_decimate_score16_mmx2;
         if( cpu&X264_CPU_SLOW_CTZ )
         {
-            pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
-            pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
         }
-        pf->decimate_score64 = x264_decimate_score64_mmxext;
-        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
-        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
-        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
-        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
-        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
-        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
+        pf->decimate_score64 = x264_decimate_score64_mmx2;
+        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
+        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
+        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
 #endif
-        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
         if( cpu&X264_CPU_LZCNT )
-            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
     }
     if( cpu&X264_CPU_SSE2 )
     {
@@ -397,7 +397,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
         if( cpu&X264_CPU_LZCNT )
         {
-            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
+            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
             pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
             pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
             pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
@@ -437,7 +437,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4 = x264_quant_4x4_mmx;
         pf->quant_8x8 = x264_quant_8x8_mmx;
         pf->dequant_4x4 = x264_dequant_4x4_mmx;
-        pf->dequant_4x4_dc = x264_dequant_4x4dc_mmxext;
+        pf->dequant_4x4_dc = x264_dequant_4x4dc_mmx2;
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
         if( h->param.i_cqm_preset == X264_CQM_FLAT )
         {
@@ -448,31 +448,31 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
 #endif
     }
-    if( cpu&X264_CPU_MMXEXT )
+    if( cpu&X264_CPU_MMX2 )
     {
-        pf->quant_2x2_dc = x264_quant_2x2_dc_mmxext;
+        pf->quant_2x2_dc = x264_quant_2x2_dc_mmx2;
 #if ARCH_X86
-        pf->quant_4x4_dc = x264_quant_4x4_dc_mmxext;
-        pf->decimate_score15 = x264_decimate_score15_mmxext;
-        pf->decimate_score16 = x264_decimate_score16_mmxext;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
+        pf->decimate_score15 = x264_decimate_score15_mmx2;
+        pf->decimate_score16 = x264_decimate_score16_mmx2;
         if( cpu&X264_CPU_SLOW_CTZ )
        {
-            pf->decimate_score15 = x264_decimate_score15_mmxext_slowctz;
-            pf->decimate_score16 = x264_decimate_score16_mmxext_slowctz;
+            pf->decimate_score15 = x264_decimate_score15_mmx2_slowctz;
+            pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
         }
-        pf->decimate_score64 = x264_decimate_score64_mmxext;
-        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmxext;
-        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmxext;
-        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmxext;
-        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmxext;
-        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmxext;
+        pf->decimate_score64 = x264_decimate_score64_mmx2;
+        pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
+        pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
+        pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
 #endif
-        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
-        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+        pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
+        pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
         if( cpu&X264_CPU_LZCNT )
         {
-            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
-            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+            pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
+            pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
         }
     }
......
@@ -124,7 +124,7 @@ ALIGN 16
 %endmacro
 INIT_MMX
-NAL_ESCAPE mmxext
+NAL_ESCAPE mmx2
 INIT_XMM
 NAL_ESCAPE sse2
 INIT_AVX
......
@@ -767,7 +767,7 @@ ADD16x16 avx
 %endmacro
 INIT_MMX
-cglobal sub8x8_dct_dc_mmxext, 3,3
+cglobal sub8x8_dct_dc_mmx2, 3,3
     DCTDC_2ROW_MMX m0, m4, 0
     DCTDC_2ROW_MMX m5, m6, 2
     paddw m0, m5
@@ -1028,12 +1028,12 @@ cglobal zigzag_scan_8x8_frame_%1, 2,2,8*(mmsize/16)
 %ifdef HIGH_BIT_DEPTH
 INIT_XMM
-SCAN_8x8_FRAME sse2 , 4 , dq, qdq, dq, d
+SCAN_8x8_FRAME sse2, 4 , dq, qdq, dq, d
 INIT_AVX
-SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
+SCAN_8x8_FRAME avx , 4 , dq, qdq, dq, d
 %else
 INIT_MMX
-SCAN_8x8_FRAME mmxext, 16, q , dq , wd, w
+SCAN_8x8_FRAME mmx2, 16, q , dq , wd, w
 %endif
 ;-----------------------------------------------------------------------------
@@ -1123,7 +1123,7 @@ cglobal zigzag_scan_4x4_field_sse2, 2,3
 ; void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
 ; sse2 is only 1 cycle faster, and ssse3/pshufb is slower on core2
-cglobal zigzag_scan_4x4_field_mmxext, 2,3
+cglobal zigzag_scan_4x4_field_mmx2, 2,3
     pshufw mm0, [r1+4], 0xd2
     movq mm1, [r1+16]
    movq mm2, [r1+24]
@@ -1228,12 +1228,12 @@ cglobal zigzag_scan_8x8_field_%1, 2,3,8*(mmsize/16)
 %endmacro
 %ifdef HIGH_BIT_DEPTH
 INIT_XMM
-SCAN_8x8 sse4 , d, dq, qdq, dq, 4
+SCAN_8x8 sse4, d, dq, qdq, dq, 4
 INIT_AVX
-SCAN_8x8 avx , d, dq, qdq, dq, 4
+SCAN_8x8 avx , d, dq, qdq, dq, 4
 %else
 INIT_MMX
-SCAN_8x8 mmxext, w, wd, dq , q , 16
+SCAN_8x8 mmx2, w, wd, dq , q , 16
 %endif
 ;-----------------------------------------------------------------------------
......
@@ -28,18 +28,18 @@
 #ifndef X264_I386_DCT_H
 #define X264_I386_DCT_H
-void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
-void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
-void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
-void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_ssse3 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_mmxext( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
-void x264_sub8x8_dct_dc_sse2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
+void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
+void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct_dc_sse2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
 void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
 void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
@@ -84,19 +84,19 @@ void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][64] );
 void x264_add8x8_idct8_avx ( uint8_t *dst, int16_t dct [64] );
 void x264_add16x16_idct8_avx ( uint8_t *dst, int16_t dct[4][64] );
-void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
-void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
-void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[16] );
-void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_mmxext( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
+void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
+void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
+void x264_zigzag_scan_4x4_field_mmx2 ( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
 int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
@@ -105,8 +105,8 @@ int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
 int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
-void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
 void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_avx( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 #endif
@@ -792,8 +792,8 @@ cglobal deblock_h_luma_intra_%1, 4,7,8*(mmsize/16)
 %ifndef ARCH_X86_64
 INIT_MMX
-DEBLOCK_LUMA mmxext
-DEBLOCK_LUMA_INTRA mmxext
+DEBLOCK_LUMA mmx2
+DEBLOCK_LUMA_INTRA mmx2
 INIT_XMM
 DEBLOCK_LUMA sse2
 DEBLOCK_LUMA_INTRA sse2
@@ -1314,7 +1314,7 @@ cglobal deblock_h_luma_%1, 0,5
 %endmacro ; DEBLOCK_LUMA
 INIT_MMX
-DEBLOCK_LUMA mmxext, v8, 8
+DEBLOCK_LUMA mmx2, v8, 8
 INIT_XMM