Commit 4442eace authored by Henrik Gramner, committed by Fiona Glaser

Faster chroma weight cost calculation

New assembly function with SSE2, SSSE3 and XOP implementations for calculating the absolute sum of differences.
parent e8952dff
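Background note: despite the similar name, the absolute sum of differences (asd) is a different metric from the usual sum of absolute differences (SAD). SAD applies abs() per pixel and measures total per-pixel error; asd applies abs() once, after summing the signed differences, so opposite-signed errors cancel and only the difference in overall (DC) level remains. A minimal scalar sketch of the distinction (standalone example, not part of the commit):

#include <stdio.h>
#include <stdlib.h>

/* Sum of absolute differences: abs() inside the loop. */
static int sad( const unsigned char *a, const unsigned char *b, int n )
{
    int sum = 0;
    for( int i = 0; i < n; i++ )
        sum += abs( a[i] - b[i] );
    return sum;
}

/* Absolute sum of differences: abs() outside the loop, so per-pixel
 * errors of opposite sign cancel and only the difference in average
 * (DC) level survives. */
static int asd( const unsigned char *a, const unsigned char *b, int n )
{
    int sum = 0;
    for( int i = 0; i < n; i++ )
        sum += a[i] - b[i];
    return abs( sum );
}

int main( void )
{
    unsigned char a[4] = { 100, 100, 100, 100 };
    unsigned char b[4] = { 110,  90, 110,  90 };
    /* Prints "sad=40 asd=0": the blocks differ pixel by pixel
     * but have identical DC levels. */
    printf( "sad=%d asd=%d\n", sad( a, b, 4 ), asd( a, b, 4 ) );
    return 0;
}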
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -703,6 +703,15 @@ int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
     return (score_field < score_frame);
 }
 
+static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height )
+{
+    int sum = 0;
+    for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
+        for( int x = 0; x < 8; x++ )
+            sum += pix1[x] - pix2[x];
+    return abs( sum );
+}
+
 /****************************************************************************
  * successive elimination
  ****************************************************************************/
@@ -814,6 +823,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
     pixf->vsad = pixel_vsad;
+    pixf->asd8 = pixel_asd8;
     pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4;
     pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4;
@@ -888,6 +898,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT4( hadamard_ac, _sse2 );
         }
         pixf->vsad = x264_pixel_vsad_sse2;
+        pixf->asd8 = x264_pixel_asd8_sse2;
         pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
         pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2;
         pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
@@ -915,6 +926,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT4( hadamard_ac, _ssse3 );
         }
         pixf->vsad = x264_pixel_vsad_ssse3;
+        pixf->asd8 = x264_pixel_asd8_ssse3;
         pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_ssse3;
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
@@ -951,6 +963,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_XOP )
     {
         pixf->vsad = x264_pixel_vsad_xop;
+        pixf->asd8 = x264_pixel_asd8_xop;
     }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
@@ -1035,6 +1048,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
         pixf->vsad = x264_pixel_vsad_sse2;
+        pixf->asd8 = x264_pixel_asd8_sse2;
     }
 
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -1126,6 +1140,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
+        pixf->asd8 = x264_pixel_asd8_ssse3;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -89,6 +89,7 @@ typedef struct
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
     int (*vsad)( pixel *, intptr_t, int );
+    int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
     uint64_t (*var[4])( pixel *pix, intptr_t stride );
     int (*var2[4])( pixel *pix1, intptr_t stride1,
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4001,6 +4001,73 @@ SSIM
 INIT_XMM avx
 SSIM
 
+;-----------------------------------------------------------------------------
+; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+;-----------------------------------------------------------------------------
+%macro ASD8 0
+cglobal pixel_asd8, 5,5
+    pxor     m0, m0
+    pxor     m1, m1
+.loop:
+%if HIGH_BIT_DEPTH
+    paddw    m0, [r0]
+    paddw    m1, [r2]
+    paddw    m0, [r0+2*r1]
+    paddw    m1, [r2+2*r3]
+    lea      r0, [r0+4*r1]
+    paddw    m0, [r0]
+    paddw    m1, [r2+4*r3]
+    lea      r2, [r2+4*r3]
+    paddw    m0, [r0+2*r1]
+    paddw    m1, [r2+2*r3]
+    lea      r0, [r0+4*r1]
+    lea      r2, [r2+4*r3]
+%else
+    movq     m2, [r0]
+    movq     m3, [r2]
+    movhps   m2, [r0+r1]
+    movhps   m3, [r2+r3]
+    lea      r0, [r0+2*r1]
+    psadbw   m2, m1
+    psadbw   m3, m1
+    movq     m4, [r0]
+    movq     m5, [r2+2*r3]
+    lea      r2, [r2+2*r3]
+    movhps   m4, [r0+r1]
+    movhps   m5, [r2+r3]
+    lea      r0, [r0+2*r1]
+    paddw    m0, m2
+    psubw    m0, m3
+    psadbw   m4, m1
+    psadbw   m5, m1
+    lea      r2, [r2+2*r3]
+    paddw    m0, m4
+    psubw    m0, m5
+%endif
+    sub     r4d, 4
+    jg .loop
+%if HIGH_BIT_DEPTH
+    psubw    m0, m1
+    HADDW    m0, m1
+    ABSD     m1, m0
+%else
+    movhlps  m1, m0
+    paddw    m0, m1
+    ABSW     m1, m0
+%endif
+    movd    eax, m1
+    RET
+%endmacro
+
+INIT_XMM sse2
+ASD8
+INIT_XMM ssse3
+ASD8
+%if HIGH_BIT_DEPTH
+INIT_XMM xop
+ASD8
+%endif
+
 ;=============================================================================
 ; Successive Elimination ADS
 ;=============================================================================
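The loop is unrolled four rows per iteration, and the 8-bit path packs two rows into one register with movq+movhps. Its key trick is psadbw against a zeroed register (m1): the SAD of eight bytes against zero is simply their sum, so accumulating the psadbw results of pix1 and subtracting those of pix2 leaves sum(pix1) - sum(pix2) without ever widening pixels manually. The high-bit-depth path instead accumulates raw 16-bit pixels with paddw and does one psubw/HADDW/ABSD reduction at the end. A hedged C intrinsics sketch of the 8-bit idea (function name and layout are mine, not from the commit; assumes height <= 16 as in x264's use, so the running difference fits in a signed 16-bit word):

#include <emmintrin.h> /* SSE2 */
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

static int asd8_sse2_sketch( const uint8_t *pix1, ptrdiff_t stride1,
                             const uint8_t *pix2, ptrdiff_t stride2, int height )
{
    __m128i zero = _mm_setzero_si128();
    __m128i acc  = _mm_setzero_si128();
    for( int y = 0; y < height; y++ )
    {
        /* Load one 8-pixel row from each block. */
        __m128i a = _mm_loadl_epi64( (const __m128i*)(pix1 + y*stride1) );
        __m128i b = _mm_loadl_epi64( (const __m128i*)(pix2 + y*stride2) );
        /* psadbw vs. zero: each 64-bit lane now holds the plain sum of
         * its eight bytes (only the low lane is nonzero here). */
        acc = _mm_add_epi16( acc, _mm_sad_epu8( a, zero ) );
        acc = _mm_sub_epi16( acc, _mm_sad_epu8( b, zero ) );
    }
    /* 16 rows * 8 cols * 255 < 2^15, so the low word of acc holds the
     * signed running difference; its absolute value is the result. */
    return abs( (int16_t)_mm_cvtsi128_si32( acc ) );
}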
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -159,6 +159,9 @@ int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
 int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
 int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
 int x264_pixel_vsad_xop  ( pixel *src, intptr_t stride, int height );
+int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+int x264_pixel_asd8_xop  ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
 
 #define DECL_ADS( size, suffix ) \
 int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
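These declarations serve both bit depths: x264 typedefs pixel to uint8_t or uint16_t depending on HIGH_BIT_DEPTH at build time, so the same prototypes cover the psadbw-based 8-bit path and the paddw-based high-bit-depth path assembled above.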
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -220,15 +220,12 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
 {
     unsigned int cost = 0;
     int i_stride = fenc->i_stride[1];
-    int i_offset = i_stride / 2;
     int i_lines = fenc->i_lines[1];
     int i_width = fenc->i_width[1];
-    pixel *src = ref + i_offset;
+    pixel *src = ref + (i_stride >> 1);
     ALIGNED_ARRAY_16( pixel, buf, [8*16] );
     int pixoff = 0;
-    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
     int height = 16 >> CHROMA_V_SHIFT;
-    ALIGNED_16( static pixel flat[8] ) = {0};
     if( w )
     {
         for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
@@ -239,19 +236,15 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
              * But testing shows that for chroma the DC coefficient is by far the most
              * important part of the coding cost. Thus a more useful chroma weight is
              * obtained by comparing each block's DC coefficient instead of the actual
-             * pixels.
-             *
-             * FIXME: add a (faster) asm sum function to replace sad. */
-            cost += abs( h->pixf.sad_aligned[chromapix]( buf, 8, flat, 0 ) -
-                         h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
+             * pixels. */
+            cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height );
         }
         cost += x264_weight_slice_header_cost( h, w, 1 );
     }
     else
         for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
             for( int x = 0; x < i_width; x += 8, pixoff += 8 )
-                cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) -
-                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
+                cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height );
     x264_emms();
     return cost;
 }
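This is where the speedup comes from. The flat[] block was all zeros, so each sad_aligned( x, flat ) call degenerated to the plain pixel sum of an 8-pixel-wide block, and the old expression abs( sad(buf, flat) - sad(src, flat) ) computed |sum(buf) - sum(src)|. The new asd8 call computes the identical quantity, |sum(buf - src)|, in a single fused pass instead of two SAD calls plus an abs(). A small self-contained check of that identity (standalone example, not part of the commit):

#include <assert.h>
#include <stdlib.h>

/* With an all-zero reference block, SAD of unsigned pixels is just
 * their sum, which is what the old code relied on. */
static int sum8( const unsigned char *p )
{
    int s = 0;
    for( int i = 0; i < 8; i++ )
        s += p[i];
    return s;
}

/* The new single-pass formulation. */
static int asd8_c( const unsigned char *a, const unsigned char *b )
{
    int s = 0;
    for( int i = 0; i < 8; i++ )
        s += a[i] - b[i];
    return abs( s );
}

int main( void )
{
    unsigned char buf[8] = { 12, 34, 56, 78, 90, 21, 43, 65 };
    unsigned char src[8] = { 11, 35, 55, 79, 89, 22, 42, 66 };
    /* Old formulation vs. new: both yield |sum(buf) - sum(src)|. */
    assert( abs( sum8( buf ) - sum8( src ) ) == asd8_c( buf, src ) );
    return 0;
}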
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -475,6 +475,21 @@ static int check_pixel( int cpu_ref, int cpu_new )
     }
     report( "pixel vsad :" );
 
+    ok = 1; used_asm = 0;
+    if( pixel_asm.asd8 != pixel_ref.asd8 )
+    {
+        set_func_name( "asd8" );
+        used_asm = 1;
+        int res_c = call_c( pixel_c.asd8,   pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
+        int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
+        if( res_c != res_a )
+        {
+            ok = 0;
+            fprintf( stderr, "asd: %d != %d\n", res_c, res_a );
+        }
+    }
+    report( "pixel asd :" );
+
 #define TEST_INTRA_X3( name, i8x8, ... ) \
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \