Commit 6eb54835 authored by Fiona Glaser, committed by Loren Merritt

cacheline split workaround for mc_luma

parent c1e43f09
@@ -31,6 +31,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines;
int i_padv = PADV << h->param.b_interlaced;
int luma_plane_size;
if( !frame ) return NULL;
@@ -55,20 +56,20 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->i_stride[i] = i_stride >> !!i;
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
CHECKED_MALLOC( frame->buffer[i],
frame->i_stride[i] * (i_lines + 2*i_padv) >> !!i );
frame->plane[i] = ((uint8_t*)frame->buffer[i]) +
((frame->i_stride[i] * i_padv + PADH) >> !!i);
}
frame->filtered[0] = frame->plane[0];
for( i = 0; i < 3; i++ )
luma_plane_size = (frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ));
for( i = 1; i < 3; i++ )
{
CHECKED_MALLOC( frame->buffer[4+i],
frame->i_stride[0] * ( frame->i_lines[0] + 2*i_padv ) );
frame->filtered[i+1] = ((uint8_t*)frame->buffer[4+i]) +
frame->i_stride[0] * i_padv + PADH;
CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
for( i = 0; i < 4; i++ )
frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
frame->plane[0] = frame->filtered[0];
if( h->frames.b_have_lowres )
{
@@ -86,9 +87,9 @@ x264_frame_t *x264_frame_new( x264_t *h )
if( h->param.analyse.i_me_method >= X264_ME_ESA )
{
CHECKED_MALLOC( frame->buffer[7],
CHECKED_MALLOC( frame->buffer[3],
2 * frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv) * sizeof(uint16_t) );
frame->integral = (uint16_t*)frame->buffer[7] + frame->i_stride[0] * i_padv + PADH;
frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
}
frame->i_poc = -1;
@@ -132,7 +133,7 @@ fail:
void x264_frame_delete( x264_frame_t *frame )
{
int i, j;
for( i = 0; i < 8; i++ )
for( i = 0; i < 4; i++ )
x264_free( frame->buffer[i] );
for( i = 0; i < 4; i++ )
x264_free( frame->buffer_lowres[i] );
......
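The "in-phase" comment in the frame.c hunk above is the point of folding the four luma planes into a single buffer[0]: the filtered planes are spaced by exact multiples of luma_plane_size, so corresponding pixels in different planes share the same offset within a cacheline, which is presumably why the cacheline-split check in the new asm only needs to examine one of its source pointers. A rough illustration, not part of the patch; it assumes luma_plane_size is a multiple of the 64-byte cacheline, which the padded stride and macroblock-aligned height should ensure in practice:

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative only.  With one allocation, filtered[i] = base + i*luma_plane_size
 * + fixed_offset, so every plane has the same address modulo 64 whenever
 * luma_plane_size % 64 == 0.  Four independent mallocs (the old code) give no
 * such guarantee. */
static void planes_are_in_phase( uint8_t *base, size_t luma_plane_size, size_t offset )
{
    assert( luma_plane_size % 64 == 0 );
    for( int i = 1; i < 4; i++ )
    {
        uint8_t *p0 = base + offset;
        uint8_t *pi = base + i*luma_plane_size + offset;
        assert( ((uintptr_t)p0 & 63) == ((uintptr_t)pi & 63) );
    }
}
```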
@@ -56,7 +56,7 @@ typedef struct
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
void *buffer[8];
void *buffer[4];
void *buffer_lowres[4];
/* motion data */
......
@@ -30,6 +30,7 @@ pw_4: times 4 dw 4
pw_8: times 4 dw 8
pw_32: times 4 dw 32
pw_64: times 4 dw 64
sw_64: dd 64
SECTION .text
@@ -229,7 +230,8 @@ cglobal x264_pixel_avg2_w20_mmxext, 6,7
jg .height_loop
REP_RET
cglobal x264_pixel_avg2_w16_sse2, 6,7
%macro PIXEL_AVG_SSE 1
cglobal x264_pixel_avg2_w16_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -247,7 +249,7 @@ cglobal x264_pixel_avg2_w16_sse2, 6,7
jg .height_loop
REP_RET
cglobal x264_pixel_avg2_w20_sse2, 6,7
cglobal x264_pixel_avg2_w20_%1, 6,7
sub r4, r2
lea r6, [r4+r3]
.height_loop:
@@ -270,8 +272,123 @@ cglobal x264_pixel_avg2_w20_sse2, 6,7
sub r5d, 2
jg .height_loop
REP_RET
%endmacro
PIXEL_AVG_SSE sse2
%ifdef HAVE_SSE3
%define movdqu lddqu
PIXEL_AVG_SSE sse3
%undef movdqu
%endif
; Cacheline split code for processors with high latencies for loads
; split over cache lines. See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments. For simplicity and code size, only the
; MMX cacheline workaround is used. As a result, in the case of SSE2
; pixel_avg, the cacheline check functions call the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.
%macro INIT_SHIFT 2
and eax, 7
shl eax, 3
%ifdef PIC32
; both versions work, but picgetgot is slower than gpr->mmx, which is slower than mem->mmx
mov r2, 64
sub r2, eax
movd %2, eax
movd %1, r2
%else
movd %1, [sw_64 GLOBAL]
movd %2, eax
psubw %1, %2
%endif
%endmacro
%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal x264_pixel_avg2_w%1_cache%2_%3, 0,0
mov eax, r2m
and eax, 0x1f|(%2>>1)
cmp eax, (32-%1)|(%2>>1)
jle x264_pixel_avg2_w%1_%3
;w12 isn't needed because w16 is just as fast if there's no cacheline split
%if %1 == 12
jmp x264_pixel_avg2_w16_cache_mmxext
%else
jmp x264_pixel_avg2_w%1_cache_mmxext
%endif
%endmacro
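For the cacheline-64 variants, the AVG_CACHELINE_CHECK arithmetic collapses to a simple range test on the low bits of src1 (the third argument, r2m): the mask 0x1f|(64>>1) is 0x3f and the threshold (32-width)|(64>>1) equals 64-width for the widths used here, while w12 reuses the w16 workaround as the %if above shows. A hedged C rendering of that dispatch, with hypothetical function and parameter names:

```c
#include <stdint.h>

typedef void (*avg2_fn)( uint8_t *dst, int i_dst, uint8_t *src1,
                         int i_src, uint8_t *src2, int i_height );

/* Sketch of the cacheline-64 check only; the real code is the asm macro above.
 * `fast` stands for the plain MMX/SSE2 routine, `slow` for the MMX shift-based
 * cacheline workaround. */
static void avg2_cache64_check( int width, avg2_fn fast, avg2_fn slow,
                                uint8_t *dst, int i_dst, uint8_t *src1,
                                int i_src, uint8_t *src2, int i_height )
{
    /* and eax, 0x1f|(64>>1)       ->  src1 & 0x3f
     * cmp eax, (32-width)|(64>>1) ->  compare against 64-width */
    if( ((uintptr_t)src1 & 0x3f) <= (uintptr_t)(64 - width) )
        fast( dst, i_dst, src1, i_src, src2, i_height );   /* read stays in one cacheline */
    else
        slow( dst, i_dst, src1, i_src, src2, i_height );   /* read crosses a cacheline */
}
```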
%macro AVG_CACHELINE_START 0
%assign stack_offset 0
INIT_SHIFT mm6, mm7
mov eax, r4m
INIT_SHIFT mm4, mm5
PROLOGUE 6,6,0
and r2, ~7
and r4, ~7
sub r4, r2
.height_loop:
%endmacro
%macro AVG_CACHELINE_LOOP 2
movq mm0, [r2+8+%1]
movq mm1, [r2+%1]
movq mm2, [r2+r4+8+%1]
movq mm3, [r2+r4+%1]
psllq mm0, mm6
psrlq mm1, mm7
psllq mm2, mm4
psrlq mm3, mm5
por mm0, mm1
por mm2, mm3
pavgb mm0, mm2
%2 [r0+%1], mm0
%endmacro
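INIT_SHIFT stores 8*(addr&7) and 64 minus that as per-source shift counts, and AVG_CACHELINE_LOOP then stitches each unaligned 8-byte read together from two aligned reads. A little-endian C sketch of the same trick (hypothetical name; note the MMX shifts produce zero at a count of 64, so the asm needs no special case for already-aligned pointers, whereas a C shift by 64 would be undefined):

```c
#include <stdint.h>

/* Emulate one unaligned 8-byte load with two aligned loads, a right shift,
 * a left shift and an OR -- the same composition as psrlq/psllq/por above.
 * Little-endian only; the direct pointer loads mirror the asm (portable C
 * would go through memcpy). */
static uint64_t load8_split( const uint8_t *p )
{
    unsigned a = (uintptr_t)p & 7;                       /* misalignment in bytes */
    const uint64_t *q = (const uint64_t *)((uintptr_t)p & ~(uintptr_t)7);
    if( !a )
        return q[0];                                     /* already aligned */
    /* low bytes from the first aligned qword, high bytes from the second */
    return (q[0] >> (8*a)) | (q[1] << (64 - 8*a));
}
```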
x264_pixel_avg2_w8_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
add r2, r3
add r0, r1
dec r5d
jg .height_loop
RET
x264_pixel_avg2_w16_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
add r2, r3
add r0, r1
dec r5d
jg .height_loop
RET
x264_pixel_avg2_w20_cache_mmxext:
AVG_CACHELINE_START
AVG_CACHELINE_LOOP 0, movq
AVG_CACHELINE_LOOP 8, movq
AVG_CACHELINE_LOOP 16, movd
add r2, r3
add r0, r1
dec r5d
jg .height_loop
RET
%ifndef ARCH_X86_64
AVG_CACHELINE_CHECK 8, 32, mmxext
AVG_CACHELINE_CHECK 12, 32, mmxext
AVG_CACHELINE_CHECK 16, 32, mmxext
AVG_CACHELINE_CHECK 20, 32, mmxext
AVG_CACHELINE_CHECK 16, 64, mmxext
AVG_CACHELINE_CHECK 20, 64, mmxext
%endif
AVG_CACHELINE_CHECK 8, 64, mmxext
AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
;=============================================================================
; pixel copy
@@ -362,6 +479,11 @@ cglobal %1, 5,7
%endmacro
COPY_W16_SSE2 x264_mc_copy_w16_sse2, movdqu
; cacheline split with mmx has too much overhead; the speed benefit is near-zero.
; but with SSE3 the overhead is zero, so there's no reason not to include it.
%ifdef HAVE_SSE3
COPY_W16_SSE2 x264_mc_copy_w16_sse3, lddqu
%endif
COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
......
@@ -38,17 +38,11 @@ extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w12_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg2_w20_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
@@ -62,6 +56,19 @@ extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
#define PIXEL_AVG_W(width,cpu)\
extern void x264_pixel_avg2_w##width##_##cpu( uint8_t *, int, uint8_t *, int, uint8_t *, int );
/* This declares some functions that don't exist, but that isn't a problem. */
#define PIXEL_AVG_WALL(cpu)\
PIXEL_AVG_W(4,cpu); PIXEL_AVG_W(8,cpu); PIXEL_AVG_W(12,cpu); PIXEL_AVG_W(16,cpu); PIXEL_AVG_W(20,cpu);
PIXEL_AVG_WALL(mmxext)
PIXEL_AVG_WALL(cache32_mmxext)
PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse3)
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
{ \
@@ -73,40 +80,48 @@ AVG_WEIGHT(8,16)
AVG_WEIGHT(8,8)
AVG_WEIGHT(8,4)
static void (* const x264_pixel_avg_wtab_mmxext[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
{
NULL,
x264_pixel_avg2_w4_mmxext,
x264_pixel_avg2_w8_mmxext,
x264_pixel_avg2_w12_mmxext,
x264_pixel_avg2_w16_mmxext,
x264_pixel_avg2_w20_mmxext,
};
static void (* const x264_mc_copy_wtab_mmx[5])( uint8_t *, int, uint8_t *, int, int ) =
{
NULL,
x264_mc_copy_w4_mmx,
x264_mc_copy_w8_mmx,
NULL,
x264_mc_copy_w16_mmx
};
static void (* const x264_pixel_avg_wtab_sse2[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
{
NULL,
x264_pixel_avg2_w4_mmxext,
x264_pixel_avg2_w8_mmxext,
x264_pixel_avg2_w12_mmxext,
x264_pixel_avg2_w16_sse2,
x264_pixel_avg2_w20_sse2,
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
{\
NULL,\
x264_pixel_avg2_w4_##name1,\
x264_pixel_avg2_w8_##name2,\
x264_pixel_avg2_w12_##name3,\
x264_pixel_avg2_w16_##name4,\
x264_pixel_avg2_w20_##name5,\
};
static void (* const x264_mc_copy_wtab_sse2[5])( uint8_t *, int, uint8_t *, int, int ) =
{
NULL,
x264_mc_copy_w4_mmx,
x264_mc_copy_w8_mmx,
NULL,
x264_mc_copy_w16_sse2,
/* w16 sse2 is faster than w12 mmx as long as the cacheline issue is resolved */
#define x264_pixel_avg2_w12_cache64_sse2 x264_pixel_avg2_w16_cache64_sse2
#define x264_pixel_avg2_w12_sse3 x264_pixel_avg2_w16_sse3
PIXEL_AVG_WTAB(mmxext, mmxext, mmxext, mmxext, mmxext, mmxext)
#ifdef ARCH_X86
PIXEL_AVG_WTAB(cache32_mmxext, mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext, cache32_mmxext)
#endif
PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext, cache64_mmxext)
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, mmxext, sse2, sse2)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
#ifdef HAVE_SSE3
PIXEL_AVG_WTAB(cache64_sse3, mmxext, cache64_mmxext, sse3, sse3, sse3)
#endif
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
{\
NULL,\
x264_mc_copy_w4_##name1,\
x264_mc_copy_w8_##name2,\
NULL,\
x264_mc_copy_w16_##name3,\
};
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
#ifdef HAVE_SSE3
MC_COPY_WTAB(sse3,mmx,mmx,sse3)
#endif
static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
@@ -134,7 +149,15 @@ void mc_luma_##name( uint8_t *dst, int i_dst_stride,\
}
MC_LUMA(mmxext,mmxext,mmx)
#ifdef ARCH_X86
MC_LUMA(cache32_mmxext,cache32_mmxext,mmx)
MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
#ifdef HAVE_SSE3
MC_LUMA(cache64_sse3,cache64_sse3,sse3)
#endif
#define GET_REF(name)\
uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
@@ -161,7 +184,15 @@ uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
}
GET_REF(mmxext)
#ifdef ARCH_X86
GET_REF(cache32_mmxext)
GET_REF(cache64_mmxext)
#endif
GET_REF(sse2)
GET_REF(cache64_sse2)
#ifdef HAVE_SSE3
GET_REF(cache64_sse3)
#endif
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -240,6 +271,19 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
#ifdef ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&X264_CPU_CACHELINE_32 )
{
pf->mc_luma = mc_luma_cache32_mmxext;
pf->get_ref = get_ref_cache32_mmxext;
}
else if( cpu&X264_CPU_CACHELINE_SPLIT )
{
pf->mc_luma = mc_luma_cache64_mmxext;
pf->get_ref = get_ref_cache64_mmxext;
}
#endif
if( !(cpu&X264_CPU_SSE2) )
return;
@@ -257,6 +301,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
if( cpu&X264_CPU_CACHELINE_SPLIT )
{
pf->mc_luma = mc_luma_cache64_sse2;
pf->get_ref = get_ref_cache64_sse2;
#ifdef HAVE_SSE3
/* lddqu doesn't work on Core2 */
if( (cpu&X264_CPU_SSE3) && !(cpu&X264_CPU_SSSE3) )
{
pf->mc_luma = mc_luma_cache64_sse3;
pf->get_ref = get_ref_cache64_sse3;
}
#endif
}
if( !(cpu&X264_CPU_SSSE3) )
return;
......
@@ -670,7 +670,7 @@ sad_w16_align%1_sse2:
lea r2, [r2+2*r3]
dec r4
jg sad_w16_align%1_sse2
rep ret
ret
%endmacro
; computed jump assumes this loop is exactly 64 bytes
@@ -689,7 +689,7 @@ sad_w16_align%1_ssse3:
lea r2, [r2+2*r3]
dec r4
jg sad_w16_align%1_ssse3
rep ret
ret
%endmacro
%macro SAD16_CACHELINE_FUNC 2 ; cpu, height
......
@@ -476,8 +476,9 @@ static int check_mc( int cpu_ref, int cpu_new )
}
ok = 1; used_asm = 0;
for( dy = -8; dy < 8; dy++ )
for( dx = -8; dx < 8; dx++ )
for( dx = -128; dx < 128; dx++ )
{
if( rand()&15 ) continue; // running all of them is too slow
MC_TEST_LUMA( 20, 18 );
MC_TEST_LUMA( 16, 16 );
MC_TEST_LUMA( 16, 8 );
......