Commit 1af19534 authored by Fiona Glaser

Faster H asm intra prediction functions

Take advantage of the H prediction method invented for merged intra SAD and apply it to regular prediction, too.
parent 4d84a45d
......@@ -22,6 +22,7 @@
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
%macro STORE8x8 2
movq [r0 + 0*FDEC_STRIDE], %1
......@@ -66,6 +67,7 @@ SECTION_RODATA
ALIGN 16
pb_1: times 16 db 1
pb_3: times 16 db 3
pw_2: times 4 dw 2
pw_4: times 4 dw 4
pw_8: times 8 dw 8
......@@ -151,6 +153,31 @@ cglobal predict_8x8_v_mmxext, 2,2
STORE8x8 mm0, mm0
RET
;-----------------------------------------------------------------------------
; void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] )
;-----------------------------------------------------------------------------
; Horizontal 8x8 luma intra prediction: each of the 8 rows of src is filled
; with 8 copies of its left-neighbour pixel.  The left neighbours come from
; the packed edge[] array: the movu below reads edge[7..14], which appear to
; be the 8 left-edge pixels stored bottom-to-top -- TODO confirm ordering
; against the edge-building code (not visible in this chunk).
; All 8 broadcast rows are materialized in m0..m7 up front, then stored in
; one unrolled pass.
INIT_MMX
cglobal predict_8x8_h_mmxext, 2,2
movu m3, [r1+7]              ; m3 = edge[7..14] (the 8 left-edge pixels)
mova m7, m3
punpckhbw m3, m3             ; m3 = high 4 pixels, each doubled into a word
punpcklbw m7, m7             ; m7 = low 4 pixels, each doubled into a word
pshufw m0, m3, 0xff          ; broadcast word 3 of m3 -> 8 copies of one pixel
pshufw m1, m3, 0xaa          ; word 2
pshufw m2, m3, 0x55          ; word 1
pshufw m3, m3, 0x00          ; word 0 (overwrites m3 last, after all reads)
pshufw m4, m7, 0xff          ; same for the other 4 rows, from m7
pshufw m5, m7, 0xaa
pshufw m6, m7, 0x55
pshufw m7, m7, 0x00
%assign n 0
%rep 8
mova [r0+n*FDEC_STRIDE], m %+ n    ; row n <- broadcast register m"n"
%assign n n+1
%endrep
RET
;-----------------------------------------------------------------------------
; void predict_8x8_dc_mmxext( uint8_t *src, uint8_t *edge );
;-----------------------------------------------------------------------------
......@@ -367,6 +394,30 @@ cglobal predict_8x8c_v_mmx, 1,1
STORE8x8 mm0, mm0
RET
;-----------------------------------------------------------------------------
; void predict_8x8c_h_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
; Horizontal 8x8 chroma intra prediction: each row is filled with 8 copies
; of the pixel immediately to its left (src[-1] of that row), broadcast by
; the SPLATB macro (defined in x86util.asm, selected per-ISA below).
; Fully unrolled: 8 splat+store pairs, no loop counter needed.
%macro PRED_8x8C_H 1
cglobal predict_8x8c_h_%1, 1,1
%ifidn %1, ssse3
mova m1, [pb_3 GLOBAL]       ; constant needed by SPLATB_SSSE3 (presumably a
                             ; pshufb broadcast mask -- see x86util.asm)
%endif
%assign n 0
%rep 8
SPLATB m0, r0+FDEC_STRIDE*n-1, m1   ; m0 = row n's left pixel in every byte
mova [r0+FDEC_STRIDE*n], m0
%assign n n+1
%endrep
REP_RET
%endmacro

INIT_MMX
%define SPLATB SPLATB_MMX
PRED_8x8C_H mmxext
%define SPLATB SPLATB_SSSE3          ; still MMX-sized regs; only the splat differs
PRED_8x8C_H ssse3
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 )
;-----------------------------------------------------------------------------
......@@ -542,6 +593,39 @@ cglobal predict_16x16_v_sse2, 1,2
STORE16x16_SSE2 xmm0
REP_RET
;-----------------------------------------------------------------------------
; void predict_16x16_h_mmxext( uint8_t *src )
;-----------------------------------------------------------------------------
; Horizontal 16x16 intra prediction: every row is filled with 16 copies of
; its left-neighbour pixel.  The 16 rows are processed 4 at a time (unrolled
; %rep), with r1 walking the row offset from FDEC_STRIDE*12 down to 0 in
; steps of 4 rows; the loop exits when r1 goes negative.
%macro PRED_16x16_H 1
cglobal predict_16x16_h_%1, 1,2
mov r1, FDEC_STRIDE*12       ; start at the last group of 4 rows
%ifidn %1, ssse3
mova m1, [pb_3 GLOBAL]       ; constant needed by SPLATB_SSSE3 (see x86util.asm)
%endif
.vloop:
%assign n 0
%rep 4
SPLATB m0, r0+r1+FDEC_STRIDE*n-1, m1   ; m0 = left pixel of this row, splatted
mova [r0+r1+FDEC_STRIDE*n], m0
%if mmsize==8
mova [r0+r1+FDEC_STRIDE*n+8], m0       ; MMX regs are 8 bytes: store twice per row
%endif
%assign n n+1
%endrep
add r1, -FDEC_STRIDE*4       ; previous group of 4 rows; sets SF when r1 < 0
jge .vloop
REP_RET
%endmacro

; no SSE2 version: it's slower than MMX on all systems that don't support SSSE3
INIT_MMX
%define SPLATB SPLATB_MMX
PRED_16x16_H mmxext
INIT_XMM
%define SPLATB SPLATB_SSSE3
PRED_16x16_H ssse3
;-----------------------------------------------------------------------------
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left )
;-----------------------------------------------------------------------------
......
......@@ -26,13 +26,18 @@
#include <string.h>

#include "pixel.h"
extern void predict_16x16_v_mmx( uint8_t *src );
extern void predict_16x16_h_mmxext( uint8_t *src );
extern void predict_16x16_h_ssse3( uint8_t *src );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src );
extern void predict_16x16_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_p_core_mmxext( uint8_t *src, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int s2, int s3 );
extern void predict_8x8c_v_mmx( uint8_t *src );
extern void predict_8x8c_h_mmxext( uint8_t *src );
extern void predict_8x8c_h_ssse3( uint8_t *src );
extern void predict_8x8_v_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_h_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_top_mmxext( uint8_t *src, uint8_t edge[33] );
extern void predict_8x8_dc_left_mmxext( uint8_t *src, uint8_t edge[33] );
......@@ -126,28 +131,6 @@ static void predict_8x8c_dc_mmxext( uint8_t *src )
}
#ifdef ARCH_X86_64
/* Horizontal 16x16 intra prediction (C fallback): fill each of the 16 rows
 * with 16 copies of the pixel immediately to its left (src[-1]).
 *
 * Uses memset instead of the original pair of stores through a casted
 * uint64_t* (0x0101010101010101ULL * src[-1]): writing uint64_t into a
 * uint8_t buffer violates strict aliasing and silently assumes 8-byte
 * alignment of src; memset has identical effect, is well-defined, and
 * compilers expand a fixed 16-byte memset to the same wide stores. */
static void predict_16x16_h( uint8_t *src )
{
    int y;
    for( y = 0; y < 16; y++ )
    {
        memset( src, src[-1], 16 );
        src += FDEC_STRIDE;
    }
}
/* Horizontal 8x8 chroma intra prediction (C fallback): fill each of the 8
 * rows with 8 copies of its left neighbour src[-1].
 *
 * memset replaces the original store through a casted uint64_t*
 * (0x0101010101010101ULL * src[-1]), which violated strict aliasing and
 * assumed 8-byte alignment; the resulting bytes are identical. */
static void predict_8x8c_h( uint8_t *src )
{
    int y;
    for( y = 0; y < 8; y++ )
    {
        memset( src, src[-1], 8 );
        src += FDEC_STRIDE;
    }
}
static void predict_16x16_dc_left( uint8_t *src )
{
uint32_t s = 0;
......@@ -496,7 +479,6 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
pf[I_PRED_16x16_H] = predict_16x16_h;
pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
#endif
pf[I_PRED_16x16_V] = predict_16x16_v_mmx;
......@@ -505,6 +487,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
pf[I_PRED_16x16_DC] = predict_16x16_dc_mmxext;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
pf[I_PRED_16x16_P] = predict_16x16_p_mmxext;
pf[I_PRED_16x16_H] = predict_16x16_h_mmxext;
if( !(cpu&X264_CPU_SSE2) )
return;
pf[I_PRED_16x16_DC] = predict_16x16_dc_sse2;
......@@ -513,6 +496,9 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
return;
pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_sse2;
pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_16x16_H] = predict_16x16_h_ssse3;
}
void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
......@@ -520,15 +506,18 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
if( !(cpu&X264_CPU_MMX) )
return;
#ifdef ARCH_X86_64
pf[I_PRED_CHROMA_H] = predict_8x8c_h;
pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top;
#endif
pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_CHROMA_H] = predict_8x8c_h_mmxext;
pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmxext;
pf[I_PRED_CHROMA_DC] = predict_8x8c_dc_mmxext;
if( !(cpu&X264_CPU_SSSE3) )
return;
pf[I_PRED_CHROMA_H] = predict_8x8c_h_ssse3;
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
......@@ -536,6 +525,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12] )
if( !(cpu&X264_CPU_MMXEXT) )
return;
pf[I_PRED_8x8_V] = predict_8x8_v_mmxext;
pf[I_PRED_8x8_H] = predict_8x8_h_mmxext;
pf[I_PRED_8x8_DC] = predict_8x8_dc_mmxext;
pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top_mmxext;
pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left_mmxext;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment