Commit 04dc2536 authored by Loren Merritt

lowres_init asm

rounding is changed for asm convenience. this makes the C version slower, but there's no way around that if all the implementations are to produce the same results.
parent a59f4a7b
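
The rounding change is easiest to see with concrete values. Below is a minimal standalone sketch (not part of the commit; the names are illustrative) comparing the old one-step bilinear rounding with the new two-step rounding, where each (a+b+1)>>1 is exactly what the x86 pavgb instruction computes:

#include <stdint.h>
#include <stdio.h>

/* pavgb semantics: average with round-half-up */
static int avgb( int a, int b )
{
    return (a + b + 1) >> 1;
}

int main( void )
{
    int a = 1, b = 0, c = 0, d = 0;
    /* old C code: a single rounding over the 4-pixel sum */
    int old_round = (a + b + c + d + 2) >> 2;           /* (1+2)>>2 = 0 */
    /* new code: two pavgb-style roundings, each biased upward */
    int new_round = (avgb(a, b) + avgb(c, d) + 1) >> 1; /* (1+0+1)>>1 = 1 */
    printf( "old=%d new=%d\n", old_round, new_round );
    return 0;
}

Building the outputs from pavgb is what lets the asm share each vertical average between two horizontally adjacent half-pel results; the cost is a slight upward bias relative to true bilinear, which the C reference now reproduces.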
......@@ -73,7 +73,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
if( h->frames.b_have_lowres )
{
frame->i_width_lowres = frame->i_width[0]/2;
frame->i_stride_lowres = frame->i_width_lowres + 2*PADH;
frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
frame->i_lines_lowres = frame->i_lines[0]/2;
for( i = 0; i < 4; i++ )
{
......
......@@ -283,6 +283,53 @@ static void memzero_aligned( void * dst, int n )
memset( dst, 0, n );
}
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
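/* lowres[0..3] are the half-resolution plane at the four half-pel phases:
   fullpel (dst0), +1/2 horizontal (dsth), +1/2 vertical (dstv) and
   +1/2 both (dstc), matching the arguments of frame_init_lowres_core */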
uint8_t *src = frame->plane[0];
int i_stride = frame->i_stride[0];
int i_height = frame->i_lines[0];
int i_width = frame->i_width[0];
int x, y;
// duplicate last row and column so that their interpolation doesn't have to be special-cased
for( y=0; y<i_height; y++ )
src[i_width+y*i_stride] = src[i_width-1+y*i_stride];
h->mc.memcpy_aligned( src+i_stride*i_height, src+i_stride*(i_height-1), i_width );
h->mc.frame_init_lowres_core( src, frame->lowres[0], frame->lowres[1], frame->lowres[2], frame->lowres[3],
i_stride, frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres );
x264_frame_expand_border_lowres( frame );
for( y=0; y<16; y++ )
for( x=0; x<16; x++ )
frame->i_cost_est[y][x] = -1;
}
static void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height )
{
int x,y;
for( y=0; y<height; y++ )
{
uint8_t *src1 = src0+src_stride;
uint8_t *src2 = src1+src_stride;
for( x=0; x<width; x++ )
{
// slower than naive bilinear, but matches asm
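// each (a+b+1)>>1 below is exactly what one pavgb instruction computes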
#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
dst0[x] = FILTER(src0[2*x ], src1[2*x ], src0[2*x+1], src1[2*x+1]);
dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
dstv[x] = FILTER(src1[2*x ], src2[2*x ], src1[2*x+1], src2[2*x+1]);
dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
#undef FILTER
}
src0 += src_stride*2;
dst0 += dst_stride;
dsth += dst_stride;
dstv += dst_stride;
dstc += dst_stride;
}
}
void x264_mc_init( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma;
......@@ -322,6 +369,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->prefetch_ref = prefetch_ref_null;
pf->memcpy_aligned = memcpy;
pf->memzero_aligned = memzero_aligned;
pf->frame_init_lowres_core = frame_init_lowres_core;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
......@@ -389,42 +437,3 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
}
}
}
void x264_frame_init_lowres( x264_t *h, x264_frame_t *frame )
{
// FIXME: tapfilter?
const int i_stride = frame->i_stride[0];
const int i_stride2 = frame->i_stride_lowres;
const int i_width2 = frame->i_width_lowres;
int x, y, i;
for( y = 0; y < frame->i_lines_lowres - 1; y++ )
{
uint8_t *src0 = &frame->plane[0][2*y*i_stride];
uint8_t *src1 = src0+i_stride;
uint8_t *src2 = src1+i_stride;
uint8_t *dst0 = &frame->lowres[0][y*i_stride2];
uint8_t *dsth = &frame->lowres[1][y*i_stride2];
uint8_t *dstv = &frame->lowres[2][y*i_stride2];
uint8_t *dstc = &frame->lowres[3][y*i_stride2];
for( x = 0; x < i_width2 - 1; x++ )
{
dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2;
dsth[x] = (src0[2*x+1] + src0[2*x+2] + src1[2*x+1] + src1[2*x+2] + 2) >> 2;
dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2;
dstc[x] = (src1[2*x+1] + src1[2*x+2] + src2[2*x+1] + src2[2*x+2] + 2) >> 2;
}
dst0[x] = (src0[2*x ] + src0[2*x+1] + src1[2*x ] + src1[2*x+1] + 2) >> 2;
dstv[x] = (src1[2*x ] + src1[2*x+1] + src2[2*x ] + src2[2*x+1] + 2) >> 2;
dsth[x] = (src0[2*x+1] + src1[2*x+1] + 1) >> 1;
dstc[x] = (src1[2*x+1] + src2[2*x+1] + 1) >> 1;
}
for( i = 0; i < 4; i++ )
memcpy( &frame->lowres[i][y*i_stride2], &frame->lowres[i][(y-1)*i_stride2], i_width2 );
for( y = 0; y < 16; y++ )
for( x = 0; x < 16; x++ )
frame->i_cost_est[x][y] = -1;
x264_frame_expand_border_lowres( frame );
}
......@@ -69,6 +69,8 @@ typedef struct
void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
void (*memzero_aligned)( void *dst, int n );
void (*frame_init_lowres_core)( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int src_stride, int dst_stride, int width, int height );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
......
......@@ -86,13 +86,18 @@ SECTION .text
packuswb m1, m4
%endmacro
%macro PALIGNR_SSE2 4
%ifnidn %2, %4
movdqa %4, %2
%macro PALIGNR_MMX 4
%ifnidn %4, %2
mova %4, %2
%endif
pslldq %1, 16-%3
psrldq %4, %3
por %1, %4
%if regsize == 8
psllq %1, (8-%3)*8
psrlq %4, %3*8
%else
pslldq %1, 16-%3
psrldq %4, %3
%endif
por %1, %4
%endmacro
%macro PALIGNR_SSSE3 4
......@@ -306,7 +311,7 @@ cglobal x264_hpel_filter_h_sse2, 3,3,1
jl .loop
REP_RET
%define PALIGNR PALIGNR_SSE2
%define PALIGNR PALIGNR_MMX
HPEL_V sse2
HPEL_C sse2
%define PALIGNR PALIGNR_SSSE3
......@@ -468,3 +473,185 @@ INIT_MMX
MEMZERO mmx
INIT_XMM
MEMZERO sse2
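; FILT8x4: one pavgb per row pair gives the vertical averages; a 1-byte PALIGNR
; plus pavgb gives the horizontal averages; pand against m7 (0x00ff words) and
; psrlw then deinterleave the even/odd bytes into the two horizontal phases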
%macro FILT8x4 7
mova %3, [r0+%7]
mova %4, [r0+r5+%7]
pavgb %3, %4
pavgb %4, [r0+r5*2+%7]
PALIGNR %1, %3, 1, m6
PALIGNR %2, %4, 1, m6
pavgb %1, %3
pavgb %2, %4
mova %5, %1
mova %6, %2
pand %1, m7
pand %2, m7
psrlw %5, 8
psrlw %6, 8
%endmacro
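; FILT16x2: same filtering as FILT8x4 but packs and stores the two planes
; directly, and leaves the lower source chunk in %1 so the next
; (lower-addressed) iteration can reuse it as the PALIGNR neighbor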
%macro FILT16x2 4
mova m3, [r0+%4+regsize]
mova m2, [r0+%4]
pavgb m3, [r0+%4+r5+regsize]
pavgb m2, [r0+%4+r5]
PALIGNR %1, m3, 1, m6
pavgb %1, m3
PALIGNR m3, m2, 1, m6
pavgb m3, m2
mova m5, m3
mova m4, %1
pand m3, m7
pand %1, m7
psrlw m5, 8
psrlw m4, 8
packuswb m3, %1
packuswb m5, m4
mova [%2], m3
mova [%3], m5
mova %1, m2
%endmacro
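; FILT8x2U: variant used by the plain mmxext version; gets the one-pixel shift
; by reloading from r0+1 and r0+9 with unaligned loads instead of PALIGNR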
%macro FILT8x2U 3
mova m3, [r0+%3+8]
mova m2, [r0+%3]
pavgb m3, [r0+%3+r5+8]
pavgb m2, [r0+%3+r5]
mova m1, [r0+%3+9]
mova m0, [r0+%3+1]
pavgb m1, [r0+%3+r5+9]
pavgb m0, [r0+%3+r5+1]
pavgb m1, m3
pavgb m0, m2
mova m3, m1
mova m2, m0
pand m1, m7
pand m0, m7
psrlw m3, 8
psrlw m2, 8
packuswb m0, m1
packuswb m2, m3
mova [%1], m0
mova [%2], m2
%endmacro
;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
; int src_stride, int dst_stride, int width, int height )
;-----------------------------------------------------------------------------
%macro FRAME_INIT_LOWRES 1 ; FIXME
cglobal x264_frame_init_lowres_core_%1, 6,7
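; all pointers are advanced to the end of their planes below: the filter walks
; the frame bottom-up and right-to-left, so the remaining width in r6d can
; serve directly as the horizontal loop counter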
; src += 2*(height-1)*stride + 2*width
mov r6d, r8m
dec r6d
imul r6d, r5d
add r6d, r7m
lea r0, [r0+r6*2]
; dst += (height-1)*stride + width
mov r6d, r8m
dec r6d
imul r6d, r6m
add r6d, r7m
add r1, r6
add r2, r6
add r3, r6
add r4, r6
; gap = stride - width
mov r6d, r6m
sub r6d, r7m
PUSH r6
%define dst_gap [rsp+push_size]
mov r6d, r5d
sub r6d, r7m
shl r6d, 1
PUSH r6
%define src_gap [rsp]
%if regsize == 16
; adjust for the odd end case
mov r6d, r7m
and r6d, 8
sub r1, r6
sub r2, r6
sub r3, r6
sub r4, r6
add dst_gap, r6d
%endif ; regsize
pcmpeqb m7, m7
psrlw m7, 8
.vloop:
mov r6d, r7m
%ifnidn %1, mmxext
mova m0, [r0]
mova m1, [r0+r5]
pavgb m0, m1
pavgb m1, [r0+r5*2]
%endif
%if regsize == 16
test r6d, 8
jz .hloop
sub r0, 16
FILT8x4 m0, m1, m2, m3, m4, m5, 0
packuswb m0, m4
packuswb m1, m5
movq [r1], m0
movhps [r2], m0
movq [r3], m1
movhps [r4], m1
mova m0, m2
mova m1, m3
sub r6d, 8
%endif ; regsize
.hloop:
sub r0, regsize*2
sub r1, regsize
sub r2, regsize
sub r3, regsize
sub r4, regsize
%ifdef m8
FILT8x4 m0, m1, m2, m3, m10, m11, regsize
mova m8, m0
mova m9, m1
FILT8x4 m2, m3, m0, m1, m4, m5, 0
packuswb m2, m8
packuswb m3, m9
packuswb m4, m10
packuswb m5, m11
mova [r1], m2
mova [r2], m4
mova [r3], m3
mova [r4], m5
%elifidn %1, mmxext
FILT8x2U r1, r2, 0
FILT8x2U r3, r4, r5
%else
FILT16x2 m0, r1, r2, 0
FILT16x2 m1, r3, r4, r5
%endif
sub r6d, regsize
jg .hloop
.skip:
mov r6, dst_gap
sub r0, src_gap
sub r1, r6
sub r2, r6
sub r3, r6
sub r4, r6
dec dword r8m
jg .vloop
ADD rsp, 2*push_size
RET
%endmacro ; FRAME_INIT_LOWRES
INIT_MMX
%define PALIGNR PALIGNR_MMX
FRAME_INIT_LOWRES mmxext
%ifndef ARCH_X86_64
FRAME_INIT_LOWRES cache32_mmxext
%endif
INIT_XMM
FRAME_INIT_LOWRES sse2
%define PALIGNR PALIGNR_SSSE3
FRAME_INIT_LOWRES ssse3
......@@ -65,6 +65,13 @@ extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
extern void x264_memzero_aligned_mmx( void * dst, int n );
extern void x264_memzero_aligned_sse2( void * dst, int n );
#define LOWRES(cpu) \
extern void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
LOWRES(mmxext)
LOWRES(cache32_mmxext)
LOWRES(sse2)
LOWRES(ssse3)
#define PIXEL_AVG_W(width,cpu)\
extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
......@@ -269,6 +276,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy = x264_plane_copy_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
......@@ -278,11 +286,13 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
pf->mc_luma = mc_luma_cache32_mmxext;
pf->get_ref = get_ref_cache32_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
}
else if( cpu&X264_CPU_CACHELINE_64 )
{
pf->mc_luma = mc_luma_cache64_mmxext;
pf->get_ref = get_ref_cache64_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
}
#endif
......@@ -308,6 +318,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
}
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
......@@ -325,5 +336,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
}
......@@ -135,6 +135,7 @@ DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
%if %1 < %2
......@@ -167,6 +168,7 @@ DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp
%macro PUSH_IF_USED 1 ; reg_id
......@@ -332,6 +334,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define m5 xmm5
%define m6 xmm6
%define m7 xmm7
%ifdef ARCH_X86_64
%define m8 xmm8
%define m9 xmm9
%define m10 xmm10
......@@ -340,6 +343,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
%define m13 xmm13
%define m14 xmm14
%define m15 xmm15
%endif
%endmacro
INIT_MMX
......@@ -389,6 +393,7 @@ INIT_MMX
%xdefine %1_m5 m5
%xdefine %1_m6 m6
%xdefine %1_m7 m7
%ifdef ARCH_X86_64
%xdefine %1_m8 m8
%xdefine %1_m9 m9
%xdefine %1_m10 m10
......@@ -397,6 +402,7 @@ INIT_MMX
%xdefine %1_m13 m13
%xdefine %1_m14 m14
%xdefine %1_m15 m15
%endif
%endmacro
%macro LOAD_MM_PERMUTATION 1
......@@ -408,6 +414,7 @@ INIT_MMX
%xdefine m5 %1_m5
%xdefine m6 %1_m6
%xdefine m7 %1_m7
%ifdef ARCH_X86_64
%xdefine m8 %1_m8
%xdefine m9 %1_m9
%xdefine m10 %1_m10
......@@ -416,6 +423,7 @@ INIT_MMX
%xdefine m13 %1_m13
%xdefine m14 %1_m14
%xdefine m15 %1_m15
%endif
%endmacro
%macro call 1
......
......@@ -29,7 +29,8 @@ SECTION .text
cextern printf
; max number of args used by any x264 asm function.
%define max_args 8
; (max_args % 4) must equal 3 for stack alignment
%define max_args 11
; just random numbers to reduce the chance of an incidental match
%define n3 dword 0x6549315c
......@@ -42,16 +43,15 @@ cextern printf
; long x264_checkasm_call( long (*func)(), int *ok, ... )
;-----------------------------------------------------------------------------
cglobal x264_checkasm_call, 1,7
sub esp, 12
mov r3, n3
mov r4, n4
mov r5, n5
mov r6, n6
%rep max_args
push dword [esp+36+max_args*4]
push dword [esp+24+max_args*4]
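; esp drops by 4 on each push, so this one expression walks the caller's
; argument list from the last argument to the first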
%endrep
call r0
add esp, 12+max_args*4
add esp, max_args*4
xor r3, n3
xor r4, n4
xor r5, n5
......
......@@ -761,6 +761,38 @@ static int check_mc( int cpu_ref, int cpu_new )
report( "hpel filter :" );
}
if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
{
uint8_t *dstc[4] = { buf3, buf3+1024, buf3+2048, buf3+3072 };
uint8_t *dsta[4] = { buf4, buf4+1024, buf4+2048, buf4+3072 };
set_func_name( "lowres_init" );
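/* w=40 exercises the 8-pixel tail path of the sse2/ssse3 versions, w=48 the pure 16-wide path */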
for( w=40; w<=48; w+=8 )
if( mc_a.frame_init_lowres_core != mc_ref.frame_init_lowres_core )
{
int stride = (w+8)&~15;
used_asm = 1;
call_c( mc_c.frame_init_lowres_core, buf1, dstc[0], dstc[1], dstc[2], dstc[3], w*2, stride, w, 16 );
call_a( mc_a.frame_init_lowres_core, buf1, dsta[0], dsta[1], dsta[2], dsta[3], w*2, stride, w, 16 );
for( i=0; i<16; i++)
{
for( j=0; j<4; j++)
if( memcmp( dstc[j]+i*stride, dsta[j]+i*stride, w ) )
{
ok = 0;
fprintf( stderr, "frame_init_lowres differs at plane %d line %d\n", j, i );
for( k=0; k<w; k++ )
printf( "%d ", dstc[j][k+i*stride] );
printf("\n");
for( k=0; k<w; k++ )
printf( "%d ", dsta[j][k+i*stride] );
printf("\n");
break;
}
}
}
report( "lowres init :" );
}
return ret;
}
......