Commit dfb85477 authored by Loren Merritt

in hpel search, merge two 16x16 mc calls into one 16x17. 15% faster hpel, .3% overall.


git-svn-id: svn://svn.videolan.org/x264/trunk@638 df754926-b1dd-0310-bc7b-ec298dee348c
parent e63c3924
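The heart of the change is in refine_subpel (the final hunk below): the two vertical half-pel candidates, (omx, omy-2) and (omx, omy+2), sit exactly one full-pel row apart in the same interpolated plane, and the two horizontal candidates exactly one column apart, so each pair can be produced by one slightly larger mc call instead of two. A minimal C sketch of that idea, reusing the names from the hunk (everything outside these few lines is assumed surrounding search state; the committed code below is the authoritative version):

    /* fetch all four half-pel diamond candidates with two mc calls */
    int      stride = 32;                 /* pix[] rows are 32 bytes apart */
    uint8_t *src0, *src1, *src2, *src3;

    /* (omx,omy-2) and (omx,omy+2) are one full-pel row apart in the same
     * plane, so interpolate bh+1 rows once and reuse the extra row. */
    src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh+1 );
    src1 = src0 + stride;

    /* (omx-2,omy) and (omx+2,omy) are one column apart, so interpolate
     * bw+4 columns once (widths get rounded up to a supported kernel
     * size, hence the new 20-wide averaging routine). */
    src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx-2, omy, bw+4, bh );
    src3 = src2 + 1;

    h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );

As the existing comment in the hunk notes, the four candidates are either all half-pel or all quarter-pel, so one stride covers them; pix is declared as [2][32*18] so that a 20-wide, 17-row block fits in 32-byte rows while keeping 16-byte alignment.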
@@ -78,6 +78,11 @@ BITS 64
%define parm7d dword parm7q
%define parm8d dword parm8q
%define temp1q rdi
%define temp2q rsi
%define temp1d edi
%define temp2d esi
%macro firstpush 1
db 0x48
push %1
@@ -234,6 +239,11 @@ SECTION .text
%define parm7d dword parm7q
%define parm8d dword parm8q
%define temp1q r9
%define temp2q r8
%define temp1d r9d
%define temp2d r8d
%macro allocstack 1
%endmacro
@@ -59,6 +59,7 @@ SECTION .text
cglobal x264_pixel_avg_w4_mmxext
cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w20_mmxext
cglobal x264_pixel_avg_w16_sse2
cglobal x264_pixel_avg_weight_4x4_mmxext
@@ -103,7 +104,7 @@ ALIGN 4
lea parm3q, [parm3q+parm4q*2]
lea r10, [r10+r11*2]
lea parm1q, [parm1q+parm2q*2]
jne .height_loop
jg .height_loop
rep ret
@@ -132,7 +133,7 @@ ALIGN 4
lea parm3q, [parm3q+parm4q*2]
lea r10, [r10+r11*2]
lea parm1q, [parm1q+parm2q*2]
jne .height_loop
jg .height_loop
rep ret
ALIGN 16
@@ -159,7 +160,37 @@ ALIGN 4
lea parm3q, [parm3q+parm4q]
lea r10, [r10+r11]
lea parm1q, [parm1q+parm2q]
jne .height_loop
jg .height_loop
rep ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
x264_pixel_avg_w20_mmxext:
mov r10, parm5q ; src2
movsxd r11, parm6d ; i_src2_stride
mov eax, parm7d ; i_height
ALIGN 4
.height_loop
movq mm0, [parm3q ]
movq mm1, [parm3q+8 ]
movd mm2, [parm3q+16]
pavgb mm0, [r10 ]
pavgb mm1, [r10+8 ]
pavgb mm2, [r10+16]
movq [parm1q ], mm0
movq [parm1q+8 ], mm1
movd [parm1q+16], mm2
dec eax
lea parm3q, [parm3q+parm4q]
lea r10, [r10+r11]
lea parm1q, [parm1q+parm2q]
jg .height_loop
rep ret
ALIGN 16
@@ -183,7 +214,7 @@ ALIGN 4
lea parm3q, [parm3q+parm4q]
lea r10, [r10+r11]
lea parm1q, [parm1q+parm2q]
jne .height_loop
jg .height_loop
rep ret
@@ -244,7 +275,7 @@ x264_pixel_avg_weight_w16_mmxext:
add parm1q, parm2q
add parm3q, parm4q
dec r11d
jnz .height_loop
jg .height_loop
rep ret
ALIGN 16
@@ -260,7 +291,7 @@ x264_pixel_avg_weight_w8_mmxext:
add parm1q, parm2q
add parm3q, parm4q
dec r11d
jnz .height_loop
jg .height_loop
rep ret
ALIGN 16
@@ -301,7 +332,7 @@ ALIGN 4
lea parm1q, [parm1q+parm2q*2]
dec eax
dec eax
jne .height_loop
jg .height_loop
rep ret
ALIGN 16
@@ -329,7 +360,7 @@ ALIGN 4
lea parm1q, [parm1q+parm2q*4]
sub eax, byte 4
jnz .height_loop
jg .height_loop
rep ret
ALIGN 16
@@ -364,7 +395,7 @@ ALIGN 4
lea parm3q, [parm3q+parm4q*4]
lea parm1q, [parm1q+parm2q*4]
sub eax, byte 4
jnz .height_loop
jg .height_loop
rep ret
@@ -384,7 +415,7 @@ ALIGN 4
sub eax, byte 2
lea parm3q, [parm3q+parm4q*2]
lea parm1q, [parm1q+parm2q*2]
jnz .height_loop
jg .height_loop
rep ret
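For reference, the new 20-byte-wide averaging routine above (and its i386 counterpart further down) computes a rounded average of two source rows; pavgb rounds up, i.e. (a+b+1)>>1. A scalar C equivalent, written here only as an illustration and not part of the commit:

    static void pixel_avg_w20_c( uint8_t *dst,  int i_dst_stride,
                                 uint8_t *src1, int i_src1_stride,
                                 uint8_t *src2, int i_src2_stride,
                                 int i_height )
    {
        int x, y;
        for( y = 0; y < i_height; y++ )
        {
            for( x = 0; x < 20; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; /* same rounding as pavgb */
            dst  += i_dst_stride;
            src1 += i_src1_stride;
            src2 += i_src2_stride;
        }
    }

The MMX version processes the 20 bytes of each row as 8+8+4 (movq, movq, movd, each paired with a pavgb), which mirrors the inner loop above.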
@@ -59,6 +59,7 @@ SECTION .text
cglobal x264_pixel_avg_w4_mmxext
cglobal x264_pixel_avg_w8_mmxext
cglobal x264_pixel_avg_w16_mmxext
cglobal x264_pixel_avg_w20_mmxext
cglobal x264_pixel_avg_w16_sse2
cglobal x264_pixel_avg_weight_4x4_mmxext
@@ -112,7 +113,7 @@ ALIGN 4
lea ebx, [ebx+eax*2]
lea ecx, [ecx+edx*2]
lea edi, [edi+esi*2]
jne .height_loop
jg .height_loop
pop edi
pop esi
@@ -151,7 +152,7 @@ ALIGN 4
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
jne .height_loop
jg .height_loop
pop edi
pop esi
@@ -193,7 +194,7 @@ ALIGN 4
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
jne .height_loop
jg .height_loop
pop edi
pop esi
@@ -201,6 +202,53 @@ ALIGN 4
pop ebp
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w20_mmxext( uint8_t *dst, int i_dst_stride,
; uint8_t *src1, int i_src1_stride,
; uint8_t *src2, int i_src2_stride,
; int i_height );
;-----------------------------------------------------------------------------
x264_pixel_avg_w20_mmxext:
push ebp
push ebx
push esi
push edi
mov edi, [esp+20] ; dst
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
mov eax, [esp+32] ; i_src1_stride
mov edx, [esp+40] ; i_src2_stride
mov ebp, [esp+44] ; i_height
ALIGN 4
.height_loop
movq mm0, [ebx ]
movq mm1, [ebx+8 ]
movd mm2, [ebx+16]
pavgb mm0, [ecx ]
pavgb mm1, [ecx+8 ]
pavgb mm2, [ecx+16]
movq [edi ], mm0
movq [edi+8 ], mm1
movd [edi+16], mm2
dec ebp
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
jg .height_loop
pop edi
pop esi
pop ebx
pop ebp
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w16_sse2( uint8_t *dst, int i_dst_stride,
@@ -231,7 +279,7 @@ ALIGN 4
lea ebx, [ebx+eax]
lea ecx, [ecx+edx]
lea edi, [edi+esi]
jne .height_loop
jg .height_loop
pop edi
pop esi
@@ -302,7 +350,7 @@ x264_pixel_avg_weight_w16_mmxext:
add edi, esi
add edx, ecx
dec eax
jnz .height_loop
jg .height_loop
BIWEIGHT_END_MMX
ALIGN 16
@@ -323,7 +371,7 @@ x264_pixel_avg_weight_w8_mmxext:
lea edi, [edi+esi*2]
lea edx, [edx+ecx*2]
sub eax, byte 2
jnz .height_loop
jg .height_loop
BIWEIGHT_END_MMX
ALIGN 16
@@ -371,7 +419,7 @@ ALIGN 4
lea edi, [edi+edx*2]
dec ecx
dec ecx
jne .height_loop
jg .height_loop
pop edi
pop esi
@@ -409,7 +457,7 @@ ALIGN 4
lea edi, [edi+edx*2]
sub ecx, byte 4
jnz .height_loop
jg .height_loop
pop edi
pop esi
@@ -455,7 +503,7 @@ ALIGN 4
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
sub ecx, byte 4
jnz .height_loop
jg .height_loop
pop edi
pop esi
@@ -488,7 +536,7 @@ ALIGN 4
dec ecx
lea esi, [esi+ebx*2]
lea edi, [edi+edx*2]
jnz .height_loop
jg .height_loop
pop edi
pop esi
@@ -31,6 +31,7 @@
extern void x264_pixel_avg_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -68,13 +69,14 @@ AVG_WEIGHT(8,16)
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
static void (* const x264_pixel_avg_wtab_mmxext[5])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int, int ) =
{
NULL,
x264_pixel_avg_w4_mmxext,
x264_pixel_avg_w8_mmxext,
NULL,
x264_pixel_avg_w16_mmxext
x264_pixel_avg_w16_mmxext,
x264_pixel_avg_w16_mmxext,
x264_pixel_avg_w20_mmxext,
};
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
{
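The table above grows from five to six entries: width 20 needs its own slot for the new routine, and width 12 (an 8-wide partition fetched with the extra 4 columns) now maps to the 16-wide kernel (the two consecutive x264_pixel_avg_w16_mmxext entries) rather than to NULL. A hedged sketch of the dispatch this table presumably feeds (indexing by i_width>>2 is an assumption based on the table layout, not something shown in this diff; the parameter names follow the C fallback further down):

    /* assumed dispatch: one averaging kernel per 4-pixel step of width */
    x264_pixel_avg_wtab_mmxext[ i_width>>2 ]( dst, *i_dst_stride,
                                              src1, i_src_stride,
                                              src2, i_src_stride, i_height );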
@@ -37,6 +37,7 @@ typedef struct
int mvx, int mvy,
int i_width, int i_height );
/* may round up the dimensions if they're not a power of 2 */
uint8_t* (*get_ref)(uint8_t **, int, uint8_t *, int *,
int mvx, int mvy,
int i_width, int i_height );
@@ -224,10 +224,19 @@ uint8_t *get_ref_altivec( uint8_t *src[4], int i_src_stride,
pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_height );
break;
case 12:
case 16:
default:
pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_height );
break;
case 20:
//FIXME suboptimal
pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
src2, i_src_stride, i_height );
pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
src2+16, i_src_stride, i_height );
break;
}
return dst;
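The portable path above handles the new 20-wide case by splitting it into a 16-wide and a 4-wide average, which is what the FIXME flags as suboptimal; a dedicated 20-wide kernel along the lines of the scalar sketch earlier would do it in one pass. The call is hypothetical here, since no such C/AltiVec routine exists in this commit:

    /* hypothetical single-pass replacement for the 16+4 split above */
    pixel_avg_w20( dst, *i_dst_stride, src1, i_src_stride,
                   src2, i_src_stride, i_height );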
@@ -586,7 +586,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
DECLARE_ALIGNED( uint8_t, pix[4][16*16], 16 );
DECLARE_ALIGNED( uint8_t, pix[2][32*18], 16 ); // really 17x17, but round up for alignment
int omx, omy;
int i;
@@ -610,20 +610,12 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
{
int omx = bmx, omy = bmy;
int costs[4];
int stride = 16; // candidates are either all hpel or all qpel, so one stride is enough
int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough
uint8_t *src0, *src1, *src2, *src3;
src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh );
src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[2], &stride, omx-2, omy, bw, bh );
if( (omx|omy)&1 )
{
src1 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx, omy+2, bw, bh );
src3 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[3], &stride, omx+2, omy, bw, bh );
}
else
{
src0 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[0], &stride, omx, omy-2, bw, bh+1 );
src2 = h->mc.get_ref( m->p_fref, m->i_stride[0], pix[1], &stride, omx-2, omy, bw+4, bh );
src1 = src0 + stride;
src3 = src2 + 1;
}
h->pixf.sad_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs );
COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx ] + p_cost_mvy[omy-2], bmy, omy-2 );
COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx ] + p_cost_mvy[omy+2], bmy, omy+2 );