Commit c1f64a50 authored by Loren Merritt
Browse files

Use sa8d instead of satd for i8x8 search.

+.01 dB, -.5% speed



git-svn-id: svn://svn.videolan.org/x264/trunk@512 df754926-b1dd-0310-bc7b-ec298dee348c
parent 8aa29438
......@@ -847,14 +847,14 @@ x264_pixel_satd_16x16_mmxext:
%endmacro
%macro SUM4x8_MM 0
movq [spill], mm7
MMX_ABS mm0, mm7
MMX_ABS mm1, mm7
MMX_ABS mm2, mm7
MMX_ABS mm3, mm7
movq [spill], mm6
movq [spill+8], mm7
MMX_ABS_TWO mm0, mm1, mm6, mm7
MMX_ABS_TWO mm2, mm3, mm6, mm7
paddw mm0, mm2
paddw mm1, mm3
movq mm7, [spill]
movq mm6, [spill]
movq mm7, [spill+8]
MMX_ABS_TWO mm4, mm5, mm2, mm3
MMX_ABS_TWO mm6, mm7, mm2, mm3
paddw mm4, mm6
......@@ -870,14 +870,14 @@ ALIGN 16
;-----------------------------------------------------------------------------
x264_pixel_sa8d_8x8_mmxext:
SATD_START
sub esp, 0x68
%define args esp+0x6c
sub esp, 0x70
%define args esp+0x74
%define spill esp+0x60
LOAD_DIFF_4x8P 0
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
movq [spill], mm0
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0 ; abcd-t -> adtc
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
movq [esp+0x00], mm4
movq [esp+0x08], mm7
movq [esp+0x10], mm0
......@@ -894,13 +894,13 @@ x264_pixel_sa8d_8x8_mmxext:
LOAD_DIFF_4x8P 4
HADAMARD1x8 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7
movq [spill], mm4
TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm4
movq [spill], mm7
TRANSPOSE4x4 mm0, mm1, mm2, mm3, mm7
movq [esp+0x40], mm0
movq [esp+0x48], mm3
movq [esp+0x50], mm4
movq [esp+0x50], mm7
movq [esp+0x58], mm2
movq mm4, [spill]
movq mm7, [spill]
TRANSPOSE4x4 mm4, mm5, mm6, mm7, mm0
movq mm5, [esp+0x00]
movq mm1, [esp+0x08]
......@@ -933,7 +933,7 @@ x264_pixel_sa8d_8x8_mmxext:
mov ecx, eax ; preserve rounding for 16x16
add eax, 1
shr eax, 1
add esp, 0x68
add esp, 0x70
pop ebx
ret
%undef args
......
......@@ -496,6 +496,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
x264_pixel_cmp_t sa8d = (*h->pixf.mbcmp == *h->pixf.sad) ? h->pixf.sad[PIXEL_8x8] : h->pixf.sa8d[PIXEL_8x8];
int i_satd_thresh = a->b_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
int i_cost = 0;
......@@ -520,10 +521,8 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
h->predict_8x8[i_mode]( p_dst_by, h->mb.i_neighbour8[idx] );
/* could use sa8d, but it doesn't seem worth the speed cost (without mmx at least) */
i_satd = h->pixf.mbcmp[PIXEL_8x8]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
i_satd = sa8d( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
COPY2_IF_LT( i_best, i_satd, a->i_predict8x8[idx], i_mode );
a->i_satd_i8x8_dir[i_mode][idx] = i_satd;
......@@ -588,7 +587,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE,
p_src_by, FENC_STRIDE )
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
+ a->i_lambda * (i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ? 1 : 4);
COPY2_IF_LT( i_best, i_satd, a->i_predict4x4[idx], i_mode );
}
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment