Commit 9fe6e5e6 authored by Fiona Glaser

Optimize variance asm + minor changes

Remove SAD argument from var; it's not needed anymore.
Speed up var asm a bit by eliminating psadbw and instead HADDWing at the end.
Eliminate all remaining warnings with gcc 3.4 on Cygwin.
Port another minor optimization from lavc (pskip).
parent 8761805b
@@ -140,8 +140,8 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
     int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
     if( i_refa == -2 || i_refb == -2 ||
-        ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
-        ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
+        !( i_refa | *(uint32_t*)mv_a ) ||
+        !( i_refb | *(uint32_t*)mv_b ) )
     {
         *(uint32_t*)mv = 0;
     }
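A note on the hunk above: the lavc trick works because two values are both zero exactly when their bitwise OR is zero, so each pair of comparisons collapses into a single OR plus a logical NOT. A minimal self-contained sketch of the equivalence, assuming a small integer reference index and a packed 32-bit motion vector (the function names are illustrative, not x264 API):

#include <assert.h>
#include <stdint.h>

/* (ref == 0 && mv == 0)  <=>  !( ref | mv ):
 * a bitwise OR is zero iff every operand is zero, so two compares and a
 * short-circuiting && become one OR and one test. */
static int both_zero_cmp( int ref, uint32_t mv ) { return ref == 0 && mv == 0; }
static int both_zero_or( int ref, uint32_t mv )  { return !( ref | mv ); }

int main( void )
{
    for( int ref = -2; ref <= 2; ref++ )
        for( uint32_t mv = 0; mv <= 3; mv++ )
            assert( both_zero_cmp( ref, mv ) == both_zero_or( ref, mv ) );
    return 0;
}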
@@ -140,7 +140,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
  * pixel_var_wxh
  ****************************************************************************/
 #define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+static int name( uint8_t *pix, int i_stride ) \
 { \
     uint32_t var = 0, sum = 0, sqr = 0; \
     int x, y; \
@@ -154,7 +154,6 @@ static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
         pix += i_stride; \
     } \
     var = sqr - (sum * sum >> shift); \
-    *sad = sum; \
     return var; \
 }
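For context on the change above: with N = 2^shift pixels, the macro returns the block variance scaled by N, i.e. var = Σx² − (Σx)²/N, and Σx is exactly the block's SAD against an all-zero block — so the *sad output was just the sum, which no caller needs anymore. A standalone sketch of the 8x8 case (pixel_var_8x8_ref is a hypothetical name mirroring what PIXEL_VAR_C expands to):

#include <stdint.h>

/* Variance of an 8x8 block, scaled by the pixel count (64 = 1<<6):
 * returns sqr - sum*sum/64. No overflow: sum <= 64*255 = 16320,
 * so sum*sum < 2^28, and sqr <= 64*255*255 < 2^23. */
static int pixel_var_8x8_ref( const uint8_t *pix, int i_stride )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
        pix += i_stride;
    }
    return sqr - (sum * sum >> 6);
}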
@@ -74,7 +74,7 @@ typedef struct
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
-    int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+    int (*var[4])( uint8_t *pix, int stride );
     uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
@@ -237,13 +237,8 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
 %endmacro
 %macro VAR_END 1
-%if mmsize == 16
-    movhlps m0, m5
-    paddw   m5, m0
-%endif
-    movifnidn r2d, r2m
+    HADDW   m5, m7
     movd    r1d, m5
-    movd    [r2], m5 ; return sum
     imul    r1d, r1d
     HADDD   m6, m1
     shr     r1d, %1
@@ -258,27 +253,25 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
     mova      m0, [r0]
     mova      m1, m0
     mova      m3, [r0+%1]
-    mova      m2, m0
-    punpcklbw m0, m7
     mova      m4, m3
+    punpcklbw m0, m7
     punpckhbw m1, m7
 %ifidn %1, r1
     lea       r0, [r0+%1*2]
 %else
     add       r0, r1
 %endif
-    punpckhbw m4, m7
-    psadbw    m2, m7
-    paddw     m5, m2
-    mova      m2, m3
     punpcklbw m3, m7
+    punpckhbw m4, m7
+    paddw     m5, m0
     dec t3d
-    psadbw    m2, m7
     pmaddwd   m0, m0
-    paddw     m5, m2
+    paddw     m5, m1
     pmaddwd   m1, m1
+    paddw     m5, m3
     paddd     m6, m0
     pmaddwd   m3, m3
+    paddw     m5, m4
     paddd     m6, m1
     pmaddwd   m4, m4
     paddd     m6, m3
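The reworked loop above is the core of the speedup: instead of a psadbw against zero on every iteration to fold byte sums into m5, the already-unpacked words are accumulated directly with paddw, and a single HADDW in VAR_END reduces the word lanes at the end. The lanes cannot overflow: for 16x16, each 16-bit lane accumulates at most 32 pixels × 255 = 8160. A rough C model of that strategy, with eight uint16_t accumulators standing in for an SSE2 register (names illustrative, not the actual asm):

#include <stdint.h>

static uint32_t var_16x16_model( const uint8_t *pix, int stride )
{
    uint16_t lane_sum[8] = {0};  /* m5: per-lane word sums via paddw */
    uint32_t sqr = 0;            /* m6: sum of squares via pmaddwd/paddd */
    for( int y = 0; y < 16; y++ )
    {
        for( int x = 0; x < 16; x++ )
        {
            lane_sum[x & 7] += pix[x];  /* 2 of 16 columns per lane per row */
            sqr += pix[x] * pix[x];
        }
        pix += stride;
    }
    uint32_t sum = 0;            /* HADDW: one horizontal reduce at the end */
    for( int i = 0; i < 8; i++ )
        sum += lane_sum[i];
    return sqr - (sum * sum >> 8);
}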
@@ -287,7 +280,7 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
 %endmacro
 ;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal x264_pixel_var_16x16_mmxext, 2,3
@@ -315,13 +308,12 @@ cglobal x264_pixel_var_8x8_sse2, 2,3
     lea       r0, [r0+r1*2]
     mova      m1, m0
     punpcklbw m0, m7
-    mova      m2, m1
     punpckhbw m1, m7
     dec t3d
+    paddw     m5, m0
+    paddw     m5, m1
     pmaddwd   m0, m0
     pmaddwd   m1, m1
-    psadbw    m2, m7
-    paddw     m5, m2
     paddd     m6, m0
     paddd     m6, m1
     jnz .loop
......@@ -67,8 +67,8 @@ DECL_X4( sad, cache64_mmxext );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride, uint32_t *sad ))
DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
@@ -174,8 +174,8 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
  * and putting it after floating point ops. As a result, we put the emms at the end of the
  * function and make sure that its always called before the float math. Noinline makes
  * sure no reordering goes on. */
-    unsigned int var=0, sad, i;
-    for( i=0; i<3; i++ )
+    unsigned int var = 0, i;
+    for( i = 0; i < 3; i++ )
     {
         int w = i ? 8 : 16;
         int stride = frame->i_stride[i];
@@ -184,7 +184,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
             : w * (mb_x + mb_y * stride);
         int pix = i ? PIXEL_8x8 : PIXEL_16x16;
         stride <<= h->mb.b_interlaced;
-        var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
+        var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
     }
     var = X264_MAX(var,1);
     x264_emms();
@@ -290,7 +290,7 @@ int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame )
     header[slen] = 0;
     if (strncmp(header, Y4M_FRAME_MAGIC, slen))
     {
-        fprintf(stderr, "Bad header magic (%08X <=> %s)\n",
+        fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
                 *((uint32_t*)header), header);
         return -1;
     }
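The y4m fix above is presumably one of the warning cleanups: %X expects an unsigned int, which uint32_t is not guaranteed to be on every platform, so gcc warns about the mismatched format; the PRIx32 macro from <inttypes.h> expands to whatever length modifier matches uint32_t on the target. A minimal usage sketch (the magic value here is illustrative):

#include <inttypes.h>
#include <stdio.h>

int main( void )
{
    uint32_t magic = 0x4d415246;  /* the bytes "FRAM" read as a little-endian word */
    /* "%"PRIx32 concatenates into a specifier that is correct for uint32_t
     * on any platform, unlike a hard-coded %X or %08X. */
    printf( "Bad header magic (%"PRIx32")\n", magic );
    return 0;
}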
@@ -330,16 +330,15 @@ static int check_pixel( int cpu_ref, int cpu_new )
 #define TEST_PIXEL_VAR( i ) \
     if( pixel_asm.var[i] != pixel_ref.var[i] ) \
     { \
-        uint32_t res_c, res_asm; \
-        uint32_t sad_c, sad_asm; \
+        int res_c, res_asm; \
         set_func_name( "%s_%s", "var", pixel_names[i] ); \
         used_asm = 1; \
-        res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
-        res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
-        if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+        res_c = call_c( pixel_c.var[i], buf1, 16 ); \
+        res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+        if( res_c != res_asm ) \
         { \
             ok = 0; \
-            fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+            fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
         } \
     }