Commit d68f3b07 authored by Fiona Glaser

SSSE3 cachesplit workaround for avg2_w16

Palignr-based solution for the most commonly used qpel function.
1-1.5% faster overall on Core 2 chips.
parent 9dfccce4
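
The routine being specialized here, pixel_avg2_w16, writes the rounded per-byte average of two 16-pixel-wide reference rows into the destination; pavgb computes (a+b+1)>>1 for each byte. Below is a minimal scalar sketch of that operation, assuming the argument order used by the avg2 entries in the wtab further down (dst, dst stride, src1, src stride, src2, height); the function name is illustrative only:

#include <stdint.h>

/* Scalar model of pixel_avg2_w16 (illustration, not code from the tree).
 * The SSE2 version does unaligned 16-byte loads from each source; when src1
 * straddles a 64-byte cacheline that load is slow on Core 2, so the SSSE3
 * version added by this commit loads two aligned 16-byte blocks per source
 * and splices the wanted 16 bytes out of them with palignr instead. */
static void avg2_w16( uint8_t *dst, int i_dst_stride,
                      uint8_t *src1, int i_src_stride,
                      uint8_t *src2, int i_height )
{
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < 16; x++ )
            dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;   /* pavgb rounding */
        dst  += i_dst_stride;
        src1 += i_src_stride;
        src2 += i_src_stride;
    }
}
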
@@ -511,6 +511,66 @@ AVG_CACHELINE_CHECK 12, 64, mmxext
AVG_CACHELINE_CHECK 16, 64, sse2
AVG_CACHELINE_CHECK 20, 64, sse2
; computed jump assumes this loop is exactly 48 bytes
%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
ALIGN 16
avg_w16_align%1_%2_ssse3:
%if %2&15==0
    movdqa  xmm1, [r2+16]
    palignr xmm1, [r2], %1
    pavgb   xmm1, [r2+r4]
%else
    movdqa  xmm1, [r2+16]
    movdqa  xmm2, [r2+r4+16]
    palignr xmm1, [r2], %1
    palignr xmm2, [r2+r4], %2
    pavgb   xmm1, xmm2
%endif
    movdqa  [r0], xmm1
    add     r2, r3
    add     r0, r1
    dec     r5d
    jg      avg_w16_align%1_%2_ssse3
    rep ret
%endmacro
%assign j 1
%assign k 2
%rep 15
AVG16_CACHELINE_LOOP_SSSE3 j, j
AVG16_CACHELINE_LOOP_SSSE3 j, k
%assign j j+1
%assign k k+1
%endrep
cglobal x264_pixel_avg2_w16_cache64_ssse3
    mov    eax, r2m
    and    eax, 0x3f
    cmp    eax, 0x30
    jle    x264_pixel_avg2_w16_sse2
    PROLOGUE 6,7
    lea    r6, [r4+r2]
    and    r4, ~0xf
    and    r6, 0x1f
    and    r2, ~0xf
    lea    r6, [r6*3] ;(offset + align*2)*3
    sub    r4, r2
    shl    r6, 4      ;jump = (offset + align*2)*48
%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
%ifdef PIC
    lea    r11, [avg_w16_addr GLOBAL]
    add    r6, r11
%else
    lea    r6, [avg_w16_addr + r6 GLOBAL]
%endif
%ifdef UNIX64
    jmp    r6
%else
    call   r6
    RET
%endif
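
The computed jump above turns the two source misalignments into an index into the thirty 48-byte loops emitted by the %rep block: each loop handles one (src1 alignment, src2 alignment) pair, laid out in the order (1,1), (1,2), (2,2), (2,3), ..., (15,15), (15,16). Here is a rough C model of the index arithmetic, assuming (as the dispatch relies on) that the two source addresses are equal or differ by one modulo 32; the function name is illustrative, not from the tree:

#include <stddef.h>
#include <stdint.h>

/* Model of "jump = (offset + align*2)*48".  With src2 == src1 + d and
 * d == 0 or 1 (mod 32), (src1 + src2) & 31 == 2*(src1 & 15) + d: src1's
 * 16-byte misalignment selects the pair of loops and d selects the (a,a)
 * or (a,a+1) variant. */
static size_t avg_w16_jump_offset( uintptr_t src1, uintptr_t src2 )
{
    size_t idx = (src1 + src2) & 31;
    /* The base label is biased down by two 48-byte slots
     * (avg_w16_align2_2_ssse3 - avg_w16_align1_1_ssse3), so the smallest
     * index that can occur here, 2 for the (1,1) case, lands on the first
     * emitted loop. */
    return idx * 48;   /* byte offset added to that biased base */
}
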
;=============================================================================
; pixel copy
;=============================================================================
@@ -94,6 +94,7 @@ PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
PIXEL_AVG_WALL(sse2_misalign)
PIXEL_AVG_WALL(cache64_ssse3)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
@@ -119,6 +120,7 @@ PIXEL_AVG_WTAB(cache64_mmxext, mmxext, cache64_mmxext, cache64_mmxext, cache64_m
PIXEL_AVG_WTAB(sse2, mmxext, mmxext, sse2, sse2, sse2)
PIXEL_AVG_WTAB(sse2_misalign, mmxext, mmxext, sse2, sse2, sse2_misalign)
PIXEL_AVG_WTAB(cache64_sse2, mmxext, cache64_mmxext, cache64_sse2, cache64_sse2, cache64_sse2)
PIXEL_AVG_WTAB(cache64_ssse3, mmxext, cache64_mmxext, cache64_sse2, cache64_ssse3, cache64_sse2)
#define MC_COPY_WTAB(instr, name1, name2, name3)\
static void (* const x264_mc_copy_wtab_##instr[5])( uint8_t *, int, uint8_t *, int, int ) =\
@@ -166,6 +168,7 @@ MC_LUMA(cache64_mmxext,cache64_mmxext,mmx)
#endif
MC_LUMA(sse2,sse2,sse2)
MC_LUMA(cache64_sse2,cache64_sse2,sse2)
MC_LUMA(cache64_ssse3,cache64_ssse3,sse2)
#define GET_REF(name)\
static uint8_t *get_ref_##name( uint8_t *dst, int *i_dst_stride,\
@@ -199,6 +202,7 @@ GET_REF(cache64_mmxext)
GET_REF(sse2)
GET_REF(sse2_misalign)
GET_REF(cache64_sse2)
GET_REF(cache64_ssse3)
#define HPEL(align, cpu, cpuv, cpuc, cpuh)\
void x264_hpel_filter_v_##cpuv( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width);\
@@ -344,7 +348,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
    pf->mc_chroma = x264_mc_chroma_ssse3;
    if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
        pf->mc_luma = mc_luma_cache64_ssse3;
        pf->get_ref = get_ref_cache64_ssse3;
    }
    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
        pf->integral_init4v = x264_integral_init4v_ssse3;
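
On the C side, the new cache64_ssse3 average table is identical to the cache64_sse2 one except for the width-16 slot, and it is only installed (together with the matching mc_luma and get_ref wrappers) when the CPU reports both SSSE3 and 64-byte cachelines. Combined with the entry check in the asm stub, the conditions under which the new loop actually runs can be summarized as follows (illustration only, not code from the tree):

#include <stdbool.h>
#include <stdint.h>

/* When does the SSSE3 cachesplit loop run for a width-16 average?
 *  - x264_mc_init_mmx installed the cache64_ssse3 tables (requires the
 *    SSSE3 and CACHELINE_64 CPU flags), and
 *  - src1's offset within its 64-byte cacheline exceeds 0x30, i.e. a
 *    16-byte load from it would straddle the line; otherwise the stub
 *    tail-calls the plain x264_pixel_avg2_w16_sse2. */
static bool uses_ssse3_cachesplit_loop( bool has_ssse3, bool has_cacheline_64,
                                        uintptr_t src1 )
{
    return has_ssse3 && has_cacheline_64 && ( src1 & 0x3f ) > 0x30;
}
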