Commit 82aef940 authored by Fiona Glaser

intra_sad_x3_8x8c assembly

Also fix intra_sad_x3_16x16's use of "n" as a loop variable (broke SWAP)
parent 291b6ab1
......@@ -662,8 +662,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
......@@ -753,6 +754,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
......
......@@ -97,7 +97,9 @@ typedef struct
void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
......
......@@ -81,6 +81,8 @@ void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
......
......@@ -28,6 +28,8 @@
; Read-only constants used by the SAD routines below.
SECTION_RODATA
; pshufb mask that selects byte 3 for every destination byte; the ssse3 paths
; use it to broadcast the last byte of a 4-byte load (the left neighbour pixel).
pb_3: times 16 db 3
; pshufb masks for the 8x8 chroma DC prediction: the four DC values sit in the
; low bytes of four 16-bit words (bytes 0,2,4,6). c0 expands words 0 and 1 to
; four bytes each, c1 does the same for words 2 and 3.
pb_shuf8x8c0: db 0,0,0,0,2,2,2,2
pb_shuf8x8c1: db 4,4,4,4,6,6,6,6
; 32-bit constant 64; not referenced by the code visible in this excerpt.
sw_64: dd 64
SECTION .text
......@@ -255,6 +257,125 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4
SAD_END_SSE2
RET
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
; INTRA_SAD_HV_ITER row, cpuname
;
; Handles two consecutive rows (row and row+1) of the 8x8 block and produces
; their SADs against two predictions:
;   horizontal - each source row vs. its left neighbour pixel replicated 8 times
;   vertical   - each source row vs. the row above the block
;
; Expected on entry (as set up by INTRA_SAD_8x8C):
;   r0 = source block, addressed with FENC_STRIDE
;   r1 = reconstructed block advanced by FDEC_STRIDE*4, hence the -4/-3 row bias
;   m6 = the 8 pixels directly above the block
;   m7 = pb_3 shuffle mask (ssse3 variant only)
; Running totals: m0 = horizontal SAD, m2 = vertical SAD.
; Scratch: m1, m3, m4, m5.
%macro INTRA_SAD_HV_ITER 2
%ifidn %2, ssse3
; Load the 4 bytes ending at column -1 of each row; byte 3 is the left neighbour.
movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4]
movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4]
; m7 holds all-3 indices, so every byte becomes the left neighbour pixel.
pshufb m1, m7
pshufb m3, m7
%else
; Load the 8 bytes ending at column -1 of each row; byte 7 is the left neighbour.
movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8]
movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8]
; Double each of the upper 4 bytes so the top word holds the neighbour twice...
punpckhbw m1, m1
punpckhbw m3, m3
; ...then replicate that top word into all four words.
pshufw m1, m1, 0xff
pshufw m3, m3, 0xff
%endif
; Source rows for this pair.
movq m4, [r0 + FENC_STRIDE*(%1+0)]
movq m5, [r0 + FENC_STRIDE*(%1+1)]
; Horizontal prediction SAD (m1/m3 are overwritten with the sums).
psadbw m1, m4
psadbw m3, m5
; Vertical prediction SAD against the top row (m4/m5 are overwritten).
psadbw m4, m6
psadbw m5, m6
; Combine the two rows: m1 = horizontal, m4 = vertical.
paddw m1, m3
paddw m4, m5
%if %1
; Later row pairs: add into the running totals.
paddw m0, m1
paddw m2, m4
%else
; First row pair: there is no total yet, so the fresh sums become the totals.
; NOTE(review): SWAP is defined outside this excerpt; presumably it exchanges
; which registers the names m0/m1 and m2/m4 refer to, avoiding a copy - confirm.
SWAP 0,1
SWAP 2,4
%endif
%endmacro
; INTRA_SAD_8x8C cpuname
;
; Emits x264_intra_sad_x3_8x8c_<cpuname>, which computes the SAD of one 8x8
; source block against three intra predictions built from the neighbouring
; reconstructed pixels, and stores the three 32-bit results:
;   [r2+0] = DC prediction (four 4x4 quadrants, each with its own DC value)
;   [r2+4] = horizontal prediction
;   [r2+8] = vertical prediction
; Registers follow the prototype in the header comment: r0 = fenc, r1 = fdec,
; r2 = res. The only difference between variants is how bytes are broadcast
; (pshufb for ssse3, unpack/pshufw otherwise).
%macro INTRA_SAD_8x8C 1
; NOTE(review): cglobal is defined outside this excerpt; "3,3" is presumably the
; argument count and the number of general-purpose registers used - confirm.
cglobal x264_intra_sad_x3_8x8c_%1, 3,3
; m6 = the 8 pixels directly above the block (vertical predictor).
movq m6, [r1 - FDEC_STRIDE]
; Bias r1 to the middle row so rows 0..7 are addressed as offsets -4..+3.
add r1, FDEC_STRIDE*4
%ifidn %1,ssse3
; Broadcast mask consumed by INTRA_SAD_HV_ITER.
movq m7, [pb_3 GLOBAL]
%endif
; Horizontal and vertical SADs, two rows per invocation.
INTRA_SAD_HV_ITER 0, %1
INTRA_SAD_HV_ITER 2, %1
INTRA_SAD_HV_ITER 4, %1
INTRA_SAD_HV_ITER 6, %1
; Store the finished horizontal (m0) and vertical (m2) totals.
movd [r2+4], m0
movd [r2+8], m2
; ---- DC prediction ----
; m7 = 0, used below both for zero-extension and as the psadbw/pavgw operand.
pxor m7, m7
; Gather the left neighbour column. Each load is the 8 bytes ending at column
; -1 of a row (rows 0,2,4,6 here; rows 1,3,5,7 come in as memory operands).
movq m2, [r1 + FDEC_STRIDE*-4 - 8]
movq m4, [r1 + FDEC_STRIDE*-2 - 8]
movq m3, [r1 + FDEC_STRIDE* 0 - 8]
movq m5, [r1 + FDEC_STRIDE* 2 - 8]
; Interleave row pairs; the top two bytes of each result are the two left pixels.
punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8]
punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8]
punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8]
punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8]
; Interleave again: the top 4 bytes now hold the left pixels of 4 rows.
punpckhbw m2, m4
punpckhbw m3, m5
; Move those 4 bytes to the bottom and clear everything else.
psrlq m2, 32
psrlq m3, 32
; SAD against zero is a plain byte sum.
psadbw m2, m7 ; s2 = sum of left pixels, rows 0-3
psadbw m3, m7 ; s3 = sum of left pixels, rows 4-7
; Split the top row into its left and right halves.
movq m1, m6
; NOTE(review): SWAP is defined outside this excerpt; presumably m0 now names
; the register holding the top row, and m6 the old (already stored) m0 - confirm.
SWAP 0, 6
punpckldq m0, m7
punpckhdq m1, m7
psadbw m0, m7 ; s0 = sum of top pixels, columns 0-3
psadbw m1, m7 ; s1 = sum of top pixels, columns 4-7
; Pack the four sums into one register as 16-bit words.
punpcklwd m0, m1
punpcklwd m2, m3
punpckldq m0, m2 ;s0 s1 s2 s3
; Build the per-quadrant numerators by adding two shuffled copies.
pshufw m3, m0, 11110110b ;s2,s1,s3,s3
pshufw m0, m0, 01110100b ;s0,s1,s3,s1
paddw m0, m3 ; s0+s2, 2*s1, 2*s3, s1+s3
; Shift right by 2, then average with zero (adds 1 and shifts by 1): a rounded
; divide by 8, i.e. (s0+s2+4)>>3, (s1+2)>>2, (s3+2)>>2, (s1+s3+4)>>3.
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
; Expand the four DC words into two 8-byte predictor rows.
%ifidn %1, ssse3
movq m1, m0
pshufb m0, [pb_shuf8x8c0 GLOBAL]
pshufb m1, [pb_shuf8x8c1 GLOBAL]
%else
; Words to bytes, then double each byte twice to get 4 copies of every DC.
packuswb m0, m0
punpcklbw m0, m0
movq m1, m0
punpcklbw m0, m0 ; 4x dc0 4x dc1
punpckhbw m1, m1 ; 4x dc2 4x dc3
%endif
; Load source rows 0-5; rows 6 and 7 are fetched later, once m0 is free.
movq m2, [r0+FENC_STRIDE*0]
movq m3, [r0+FENC_STRIDE*1]
movq m4, [r0+FENC_STRIDE*2]
movq m5, [r0+FENC_STRIDE*3]
movq m6, [r0+FENC_STRIDE*4]
movq m7, [r0+FENC_STRIDE*5]
; Rows 0-3 use the upper-quadrant predictor in m0.
psadbw m2, m0
psadbw m3, m0
psadbw m4, m0
psadbw m5, m0
; m0 is no longer needed as a predictor, so reuse it for row 6.
movq m0, [r0+FENC_STRIDE*6]
; Rows 4-7 use the lower-quadrant predictor in m1.
psadbw m6, m1
psadbw m7, m1
psadbw m0, m1
; Last use of m1, so it can receive the row 7 result directly.
psadbw m1, [r0+FENC_STRIDE*7]
; Reduce the eight per-row sums to one total in m2.
paddw m2, m3
paddw m4, m5
paddw m6, m7
paddw m0, m1
paddw m2, m4
paddw m6, m0
paddw m2, m6
; Store the DC prediction SAD.
movd [r2], m2
RET
%endmacro
; Instantiate the 8x8 chroma routine twice, producing
; x264_intra_sad_x3_8x8c_mmxext and x264_intra_sad_x3_8x8c_ssse3.
; NOTE(review): INIT_MMX is defined outside this excerpt; presumably it maps the
; m0-m7 names onto the 64-bit MMX registers for both variants - confirm.
INIT_MMX
INTRA_SAD_8x8C mmxext
INTRA_SAD_8x8C ssse3
;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
......@@ -272,11 +393,11 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
%ifidn %1, ssse3
mova m1, [pb_3 GLOBAL]
%endif
%assign n 0
%assign x 0
%rep 16
movzx r4d, byte [r1-1+FDEC_STRIDE*n]
movzx r4d, byte [r1-1+FDEC_STRIDE*x]
add r3d, r4d
%assign n n+1
%assign x x+1
%endrep
add r3d, 16
shr r3d, 5
......
......@@ -539,6 +539,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
int i_max;
int predict_mode[4];
int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
uint8_t *p_dstc[2], *p_srcc[2];
......@@ -553,11 +554,11 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
a->i_satd_i8x8chroma = COST_MAX;
if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
if( i_max == 4 && b_merged_satd )
{
int satdu[4], satdv[4];
h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
satdu[I_PRED_CHROMA_P] =
......
......@@ -605,6 +605,7 @@ static void mbcmp_init( x264_t *h )
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
......
......@@ -405,6 +405,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
report( "intra satd_x3 :" );
TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment