Commit 104511d6 authored by Fiona Glaser

intra_sad_x3_4x4 assembly

parent 82aef940
......@@ -666,6 +666,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
......
......@@ -100,7 +100,9 @@ typedef struct
void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_mbcmp_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
......
......@@ -79,6 +79,7 @@ DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
......
......@@ -257,6 +257,53 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4
SAD_END_SSE2
RET
;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;
; Computes the SAD between a 4x4 source block and three intra predictions
; built from the neighbouring pixels of the reconstructed block:
;   res[0] = cost of V  (vertical)   prediction
;   res[1] = cost of H  (horizontal) prediction
;   res[2] = cost of DC prediction
;
; Registers (r0/r1/r2 are presumably mapped to the three arguments by the
; cglobal macro, which is defined outside this view):
;   r0 = fenc, rows FENC_STRIDE apart
;   r1 = fdec, rows FDEC_STRIDE apart; reads the row above ([r1-FDEC_STRIDE])
;        and the 4 bytes to the left of each of the 4 rows ([r1+row-4])
;   r2 = res
; Uses mm0-mm7; mm7 is held at zero throughout.
;-----------------------------------------------------------------------------
cglobal x264_intra_sad_x3_4x4_mmxext, 3,3
; mm7 = 0, used later as the "sum bytes" operand of psadbw and for pavgw
pxor mm7, mm7
; mm0 = the 4 pixels directly above the block (top neighbours)
movd mm0, [r1-FDEC_STRIDE]
; mm1 = source row 0, mm2 = source row 2 (low dwords)
movd mm1, [r0+FENC_STRIDE*0]
movd mm2, [r0+FENC_STRIDE*2]
; mm0 = top | top : the vertical prediction for two rows at once
punpckldq mm0, mm0
; mm1 = source rows 0,1   mm2 = source rows 2,3 (one row per dword)
punpckldq mm1, [r0+FENC_STRIDE*1]
punpckldq mm2, [r0+FENC_STRIDE*3]
; keep a copy of top|top in mm6; its high dword feeds the DC sum below
movq mm6, mm0
; V cost = SAD(rows 0-1, top|top) + SAD(rows 2-3, top|top)
movq mm3, mm1
psadbw mm3, mm0
psadbw mm0, mm2
paddw mm0, mm3
movd [r2], mm0 ;res[0] = V prediction cost
; Load the 4 bytes ending just left of each row; the left neighbour pixel
; of each row is the highest byte of the loaded dword.
movd mm3, [r1+FDEC_STRIDE*0-4]
movd mm0, [r1+FDEC_STRIDE*1-4]
movd mm4, [r1+FDEC_STRIDE*2-4]
movd mm5, [r1+FDEC_STRIDE*3-4]
; Interleave bytes of row pairs: the top word of mm3 becomes {left0,left1},
; the top word of mm4 becomes {left2,left3}.
punpcklbw mm3, mm0
punpcklbw mm4, mm5
; mm5 high dword = {left0,left1,left2,left3}
movq mm5, mm3
punpckhwd mm5, mm4
; mm5 = 4 left pixels (low dword) | 4 top pixels (high dword, from mm6)
punpckhdq mm5, mm6
; SAD against zero = sum of the 8 neighbour pixels, left in the low word
psadbw mm5, mm7
; Broadcast the left pixels to build the horizontal prediction:
; mm3 = left0 x4 | left1 x4,  mm4 = left2 x4 | left3 x4
punpckhbw mm3, mm3
punpckhbw mm4, mm4
punpckhwd mm3, mm3
punpckhwd mm4, mm4
; DC = ((sum >> 2) + 1) >> 1, which equals (sum + 4) >> 3
psraw mm5, 2
pavgw mm5, mm7
; Replicate the DC byte into all 8 bytes of mm5
punpcklbw mm5, mm5
pshufw mm5, mm5, 0x0 ;DC prediction
; DC cost = SAD(rows 0-1, DC) + SAD(rows 2-3, DC)
movq mm6, mm5
psadbw mm5, mm1
psadbw mm6, mm2
; H cost = SAD(rows 0-1, left0/left1) + SAD(rows 2-3, left2/left3)
; (mm1/mm2 are overwritten here; the source rows are no longer needed)
psadbw mm1, mm3
psadbw mm2, mm4
paddw mm5, mm6
paddw mm1, mm2
movd [r2+8], mm5 ;res[2] = DC prediction cost
movd [r2+4], mm1 ;res[1] = H prediction cost
RET
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
......
......@@ -760,7 +760,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
int i_cost;
int i_satd_thresh = X264_MIN3( i_satd_inter, a->i_satd_i16x16, a->i_satd_i8x8 );
h->mb.i_cbp_luma = 0;
b_merged_satd = h->pixf.intra_satd_x3_4x4 && h->pixf.mbcmp[0] == h->pixf.satd[0];
b_merged_satd = h->pixf.intra_mbcmp_x3_4x4 && !h->mb.b_lossless;
if( a->i_mbrd )
i_satd_thresh = i_satd_thresh * (10-a->b_fast_intra)/8;
......
......@@ -606,6 +606,7 @@ static void mbcmp_init( x264_t *h )
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
......
......@@ -406,6 +406,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
report( "intra satd_x3 :" );
TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 );
report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment