Commit 2ed861c8 authored by Loren Merritt's avatar Loren Merritt

convert absolute difference of sums from mmx to sse2

convert mv bits cost and ads threshold from C to sse2
convert bytemask-to-list from C to scalar asm
1.6x faster me=esa (x86_64) or 1.3x faster (x86_32). (Times measure only motion estimation; overall encode speedup may vary.)


git-svn-id: svn://svn.videolan.org/x264/trunk@717 df754926-b1dd-0310-bc7b-ec298dee348c
parent c0fb035a
......@@ -1045,9 +1045,82 @@ cglobal x264_intra_satd_x3_8x8c_mmxext
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
; *(uint32_t*)(masks+width) = 0;
; for( i=0; i<width; i+=8 )
; {
; uint64_t mask = *(uint64_t*)(masks+i);
; if( !mask ) continue;
; for( j=0; j<8; j++ )
; if( mask & (255<<j*8) )
; mvs[nmv++] = i+j;
; }
; return nmv;
; }
; Scalar x86_64 implementation of the C reference above. Reached only via
; jmp from ADS_END, so the frame built by ADS_START is still live: rbp
; pushed, the mask buffer at rsp, width in r10, mvs in parm5q.
; Returns nmv in eax.
cglobal x264_pixel_ads_mvs
; mvs = parm5q
; masks = rsp
; width = r10
mov dword [rsp+r10], 0          ; zero the 4 pad bytes past masks[] (matches the C reference)
xor eax, eax                    ; eax = nmv
xor esi, esi                    ; esi = i (current mv index)
.loopi:
mov rdi, [rsp+rsi]              ; fetch 8 mask bytes at once
test rdi, rdi
jz .nexti                       ; all eight zero -> skip 8 mvs
xor ecx, ecx
; Branchless append of index i to mvs[]: the store is unconditional, but
; nmv (eax) only advances when mask byte %1 is nonzero, so a store for a
; clear byte is simply overwritten by the next candidate.
%macro TEST 1
mov [parm5q+rax*2], si
test edi, 0xff<<(%1*8)
setne cl
add eax, ecx
inc esi
%endmacro
TEST 0
TEST 1
TEST 2
TEST 3
shr rdi, 32                     ; bring the upper 4 mask bytes into edi
TEST 0
TEST 1
TEST 2
TEST 3
cmp esi, r10d
jl .loopi
leave                           ; tear down the frame pushed by ADS_START
ret
.nexti:
add esi, 8
cmp esi, r10d
jl .loopi
leave
ret
; Shared prologue for the x86_64 ads kernels: allocate a width+4 byte mask
; buffer on a 16-byte-aligned stack, leave rax pointing at it, stash width
; in r10 for x264_pixel_ads_mvs, and scale delta from uint16_t elements to
; bytes. The frame is released by the 'leave' in ads_mvs.
%macro ADS_START 0
push rbp
mov rbp, rsp
sub rsp, parm6q                 ; width bytes of mask storage
sub rsp, 4                      ; + 4 pad bytes (zeroed by ads_mvs)
and rsp, ~15                    ; keep 16-byte alignment for the movdqa store in ads1
mov rax, rsp                    ; rax = mask write pointer
mov r10d, parm6d                ; r10 = width, preserved for ads_mvs
shl parm3q, 1                   ; delta: uint16_t elements -> bytes
%endmacro
; Shared loop tail: advance sums (parm2q) and cost_mvx (parm4q) by 8*%1
; bytes, the mask pointer by 4*%1 bytes (= mvs processed per iteration),
; loop until width is exhausted, then tail-jump into ads_mvs to convert
; the mask buffer into the mv index list.
%macro ADS_END 1
add parm2q, 8*%1
add parm4q, 8*%1
add rax, 4*%1
sub parm6d, 4*%1
jg .loop
jmp x264_pixel_ads_mvs          ; tail call; ads_mvs pops the frame and returns nmv
%endmacro
;-----------------------------------------------------------------------------
; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *res, int width )
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
movq mm6, [parm1q]
......@@ -1056,7 +1129,7 @@ cglobal x264_pixel_ads4_mmxext
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
shl parm3q, 1
ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+16]
......@@ -1073,19 +1146,19 @@ cglobal x264_pixel_ads4_mmxext
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
movq [parm4q], mm0
add parm2q, 8
add parm4q, 8
sub parm5d, 4
jg .loop
nop
ret
pshufw mm1, [rbp+16], 0
paddusw mm0, [parm4q]
psubusw mm1, mm0
packsswb mm1, mm1
movd [rax], mm1
ADS_END 1
cglobal x264_pixel_ads2_mmxext
movq mm6, [parm1q]
pshufw mm5, parm7q, 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
shl parm3q, 1
ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+parm3q]
......@@ -1094,16 +1167,17 @@ cglobal x264_pixel_ads2_mmxext
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
movq [parm4q], mm0
add parm2q, 8
add parm4q, 8
sub parm5d, 4
jg .loop
nop
ret
paddusw mm0, [parm4q]
movq mm4, mm5
psubusw mm4, mm0
packsswb mm4, mm4
movd [rax], mm4
ADS_END 1
cglobal x264_pixel_ads1_mmxext
pshufw mm7, [parm1q], 0
pshufw mm6, parm7q, 0
ADS_START
.loop:
movq mm0, [parm2q]
movq mm1, [parm2q+8]
......@@ -1111,11 +1185,113 @@ cglobal x264_pixel_ads1_mmxext
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
movq [parm4q], mm0
movq [parm4q+8], mm1
add parm2q, 16
add parm4q, 16
sub parm5d, 8
jg .loop
nop
ret
paddusw mm0, [parm4q]
paddusw mm1, [parm4q+8]
movq mm4, mm6
movq mm5, mm6
psubusw mm4, mm0
psubusw mm5, mm1
packsswb mm4, mm5
movq [rax], mm4
ADS_END 2
; SSE2 successive-elimination kernels, x86_64 (%1 = cpu name suffix; MMX_ABS
; is redefined to pabsw before the ssse3 expansion below). Per candidate mv:
;   ads  = sum_k |enc_dc[k] - sums[mv + offset_k]|
;   mask = saturate_u16(thresh - (ads + cost_mvx[mv])) packed to a byte,
; so a mask byte is nonzero iff ads + mv cost < thresh. ADS_END then
; tail-jumps into x264_pixel_ads_mvs to turn masks into the mvs[] list.
; Args (see prototype above): parm1q=enc_dc, parm2q=sums, parm3q=delta,
; parm4q=cost_mvx, parm6=width, parm7=thresh.
; NOTE(review): the word shuffles keep only the low 16 bits of each
; enc_dc int — presumably the DC sums fit in uint16; confirm at callers.
%macro ADS_SSE2 1
cglobal x264_pixel_ads4_%1
movdqa xmm4, [parm1q]           ; enc_dc[0..3]
pshuflw xmm8, parm7q, 0         ; broadcast thresh
pshuflw xmm7, xmm4, 0           ; broadcast enc_dc[0]
pshuflw xmm6, xmm4, 0xAA        ; broadcast enc_dc[1]
pshufhw xmm5, xmm4, 0           ; broadcast enc_dc[2] (high qword)
pshufhw xmm4, xmm4, 0xAA        ; broadcast enc_dc[3] (high qword)
punpcklqdq xmm8, xmm8
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
punpckhqdq xmm5, xmm5
punpckhqdq xmm4, xmm4
ADS_START
; Software pipeline: xmm10/xmm11 carry this iteration's base loads of the
; two sums rows; each iteration's +16 load becomes the next one's base.
movdqu xmm10, [parm2q]
movdqu xmm11, [parm2q+parm3q]
.loop:
movdqa xmm0, xmm10
movdqu xmm1, [parm2q+16]
movdqa xmm10, xmm1              ; keep for next iteration
psubw xmm0, xmm7                ; sums[i..i+7]  - dc0
psubw xmm1, xmm6                ; sums[i+8..]   - dc1
MMX_ABS xmm0, xmm2
MMX_ABS xmm1, xmm3
movdqa xmm2, xmm11
movdqu xmm3, [parm2q+parm3q+16]
movdqa xmm11, xmm3              ; keep for next iteration
psubw xmm2, xmm5                ; sums[i+delta..]   - dc2
psubw xmm3, xmm4                ; sums[i+delta+8..] - dc3
paddw xmm0, xmm1
movdqu xmm9, [parm4q]           ; cost_mvx[i..i+7]
MMX_ABS xmm2, xmm1
MMX_ABS xmm3, xmm1
paddw xmm0, xmm2
paddw xmm0, xmm3                ; xmm0 = ads per mv
paddusw xmm0, xmm9              ; + mv cost, saturating
movdqa xmm1, xmm8
psubusw xmm1, xmm0              ; thresh - total, clamped at 0
packsswb xmm1, xmm1             ; 8 words -> 8 mask bytes
movq [rax], xmm1
ADS_END 2
cglobal x264_pixel_ads2_%1
movq xmm6, [parm1q]             ; enc_dc[0..1]
pshuflw xmm8, parm7q, 0         ; broadcast thresh
pshuflw xmm7, xmm6, 0           ; broadcast enc_dc[0]
pshuflw xmm6, xmm6, 0xAA        ; broadcast enc_dc[1]
punpcklqdq xmm8, xmm8
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
ADS_START
.loop:
movdqu xmm0, [parm2q]           ; sums[i..i+7]
movdqu xmm1, [parm2q+parm3q]    ; sums[i+delta..]
psubw xmm0, xmm7
psubw xmm1, xmm6
movdqu xmm9, [parm4q]           ; cost_mvx[i..i+7]
MMX_ABS xmm0, xmm2
MMX_ABS xmm1, xmm3
paddw xmm0, xmm1                ; ads per mv
paddusw xmm0, xmm9              ; + mv cost
movdqa xmm4, xmm8
psubusw xmm4, xmm0              ; thresh - total, clamped at 0
packsswb xmm4, xmm4
movq [rax], xmm4
ADS_END 2
cglobal x264_pixel_ads1_%1
pshuflw xmm7, [parm1q], 0       ; broadcast enc_dc[0]
pshuflw xmm8, parm7q, 0         ; broadcast thresh
punpcklqdq xmm7, xmm7
punpcklqdq xmm8, xmm8
ADS_START
.loop:                          ; 16 mvs per iteration
movdqu xmm0, [parm2q]
movdqu xmm1, [parm2q+16]
psubw xmm0, xmm7
psubw xmm1, xmm7
movdqu xmm9, [parm4q]
movdqu xmm10, [parm4q+16]
MMX_ABS xmm0, xmm2
MMX_ABS xmm1, xmm3
paddusw xmm0, xmm9              ; ads + mv cost
paddusw xmm1, xmm10
movdqa xmm4, xmm8
movdqa xmm5, xmm8
psubusw xmm4, xmm0              ; thresh - total
psubusw xmm5, xmm1
packsswb xmm4, xmm5             ; 16 words -> 16 mask bytes
movdqa [rax], xmm4              ; aligned store (buffer 16-aligned by ADS_START)
ADS_END 4
%endmacro
ADS_SSE2 sse2
%ifdef HAVE_SSE3
; pabsw computes absolute value in one instruction (SSSE3; the guard macro
; is named HAVE_SSE3 — NOTE(review): confirm it is set for SSSE3-capable
; builds). Redefine MMX_ABS (%2 scratch unused) and re-expand the kernels.
%macro MMX_ABS 2
pabsw %1, %1
%endmacro
ADS_SSE2 ssse3
%endif
......@@ -1579,24 +1579,91 @@ cglobal x264_pixel_ssim_4x4x2_core_mmxext
; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; x86_32 version of the mask-to-list conversion. Reached only via jmp from
; ADS_END, so ADS_START's frame is live: esp = masks, [ebp+24] = mvs,
; [ebp+28] = width, and the caller's ebx/edi are saved at [ebp-4]/[ebp-8].
; Returns nmv in eax.
cglobal x264_pixel_ads_mvs
mov ebx, [ebp+24] ; mvs
mov ecx, esp ; masks
mov edi, [ebp+28] ; width
mov dword [ecx+edi], 0          ; zero the 4 pad bytes past masks[]
push esi
push ebp                        ; ebp is reused below as the mask register
xor eax, eax                    ; eax = nmv
xor esi, esi                    ; esi = i (current mv index)
.loopi:
mov ebp, [ecx+esi]              ; low 4 mask bytes
mov edx, [ecx+esi+4]            ; high 4 mask bytes
or edx, ebp
jz .nexti                       ; all eight zero -> skip 8 mvs
xor edx, edx
; Branchless append of index i to mvs[]: the store is unconditional, but
; nmv (eax) only advances when mask byte %1 is nonzero, so a store for a
; clear byte is simply overwritten by the next candidate.
%macro TEST 1
mov [ebx+eax*2], si
test ebp, 0xff<<(%1*8)
setne dl
add eax, edx
inc esi
%endmacro
TEST 0
TEST 1
TEST 2
TEST 3
mov ebp, [ecx+esi]              ; esi has advanced 4, so this reloads the high 4 bytes
TEST 0
TEST 1
TEST 2
TEST 3
cmp esi, edi
jl .loopi
jmp .end
.nexti:
add esi, 8
cmp esi, edi
jl .loopi
.end:
pop ebp
pop esi
mov edi, [ebp-8]                ; restore regs pushed by ADS_START
mov ebx, [ebp-4]
leave
ret
; Shared x86_32 prologue: build an ebp frame, save ebx/edi (restored by
; ads_mvs from [ebp-4]/[ebp-8]), load the stack args into registers and
; carve the width+4 byte mask buffer off the 16-aligned stack.
; Register roles: eax=sums, ebx=delta (bytes after shl), ecx=cost_mvx,
; edx=width counter, edi=mask write pointer, esp=mask base.
%macro ADS_START 0
push ebp
mov ebp, esp
push ebx
push edi
mov eax, [ebp+12] ; sums
mov ebx, [ebp+16] ; delta
mov ecx, [ebp+20] ; cost_mvx
mov edx, [ebp+28] ; width
sub esp, edx                    ; width bytes of mask storage
sub esp, 4                      ; + 4 pad bytes (zeroed by ads_mvs)
and esp, ~15                    ; keep 16-byte alignment for the movdqa store in ads1
mov edi, esp
shl ebx, 1                      ; delta: uint16_t elements -> bytes
%endmacro
; Shared x86_32 loop tail; mirrors the x86_64 ADS_END.
%macro ADS_END 1
add eax, 8*%1                   ; sums += 4*%1 uint16s
add ecx, 8*%1                   ; cost_mvx += 4*%1 uint16s
add edi, 4*%1                   ; masks += 4*%1 bytes (= mvs processed)
sub edx, 4*%1                   ; width -= 4*%1
jg .loop
jmp x264_pixel_ads_mvs          ; tail call; restores regs and returns nmv
%endmacro
;-----------------------------------------------------------------------------
; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *res, int width )
; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
cglobal x264_pixel_ads4_mmxext
push ebx
mov eax, [esp+8]
mov eax, [esp+4]
movq mm6, [eax]
movq mm4, [eax+8]
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
pshufw mm5, mm4, 0
pshufw mm4, mm4, 0xAA
mov eax, [esp+12]
mov ebx, [esp+16]
mov ecx, [esp+20]
mov edx, [esp+24]
shl ebx, 1
ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+16]
......@@ -1613,25 +1680,20 @@ cglobal x264_pixel_ads4_mmxext
MMX_ABS mm3, mm1
paddw mm0, mm2
paddw mm0, mm3
movq [ecx], mm0
add eax, 8
add ecx, 8
sub edx, 4
jg .loop
pop ebx
ret
pshufw mm1, [ebp+32], 0
paddusw mm0, [ecx]
psubusw mm1, mm0
packsswb mm1, mm1
movd [edi], mm1
ADS_END 1
cglobal x264_pixel_ads2_mmxext
push ebx
mov eax, [esp+8]
mov eax, [esp+4]
movq mm6, [eax]
pshufw mm5, [esp+28], 0
pshufw mm7, mm6, 0
pshufw mm6, mm6, 0xAA
mov eax, [esp+12]
mov ebx, [esp+16]
mov ecx, [esp+20]
mov edx, [esp+24]
shl ebx, 1
ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+ebx]
......@@ -1640,20 +1702,18 @@ cglobal x264_pixel_ads2_mmxext
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
paddw mm0, mm1
movq [ecx], mm0
add eax, 8
add ecx, 8
sub edx, 4
jg .loop
pop ebx
ret
paddusw mm0, [ecx]
movq mm4, mm5
psubusw mm4, mm0
packsswb mm4, mm4
movd [edi], mm4
ADS_END 1
cglobal x264_pixel_ads1_mmxext
mov eax, [esp+4]
pshufw mm7, [eax], 0
mov eax, [esp+8]
mov ecx, [esp+16]
mov edx, [esp+20]
pshufw mm6, [esp+28], 0
ADS_START
.loop:
movq mm0, [eax]
movq mm1, [eax+8]
......@@ -1661,11 +1721,115 @@ cglobal x264_pixel_ads1_mmxext
psubw mm1, mm7
MMX_ABS mm0, mm2
MMX_ABS mm1, mm3
movq [ecx], mm0
movq [ecx+8], mm1
add eax, 16
add ecx, 16
sub edx, 8
jg .loop
nop
ret
paddusw mm0, [ecx]
paddusw mm1, [ecx+8]
movq mm4, mm6
movq mm5, mm6
psubusw mm4, mm0
psubusw mm5, mm1
packsswb mm4, mm5
movq [edi], mm4
ADS_END 2
; SSE2 successive-elimination kernels, x86_32 (%1 = cpu name suffix; MMX_ABS
; is redefined to pabsw before the ssse3 expansion below). Same math as the
; x86_64 version: mask byte nonzero iff ads + cost_mvx[mv] < thresh.
; Only xmm0-xmm7 exist on x86_32, so ads4 re-broadcasts thresh inside the
; loop instead of caching it, and there is no load pipelining.
; NOTE(review): the word shuffles keep only the low 16 bits of each
; enc_dc int — presumably the DC sums fit in uint16; confirm at callers.
%macro ADS_SSE2 1
cglobal x264_pixel_ads4_%1
mov eax, [esp+4] ; enc_dc
movdqa xmm4, [eax]              ; enc_dc[0..3]
pshuflw xmm7, xmm4, 0           ; broadcast enc_dc[0]
pshuflw xmm6, xmm4, 0xAA        ; broadcast enc_dc[1]
pshufhw xmm5, xmm4, 0           ; broadcast enc_dc[2] (high qword)
pshufhw xmm4, xmm4, 0xAA        ; broadcast enc_dc[3] (high qword)
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
punpckhqdq xmm5, xmm5
punpckhqdq xmm4, xmm4
ADS_START
.loop:
movdqu xmm0, [eax]              ; sums[i..i+7]
movdqu xmm1, [eax+16]           ; sums[i+8..]
psubw xmm0, xmm7                ; - dc0
psubw xmm1, xmm6                ; - dc1
MMX_ABS xmm0, xmm2
MMX_ABS xmm1, xmm3
movdqu xmm2, [eax+ebx]          ; sums[i+delta..]
movdqu xmm3, [eax+ebx+16]       ; sums[i+delta+8..]
psubw xmm2, xmm5                ; - dc2
psubw xmm3, xmm4                ; - dc3
paddw xmm0, xmm1
MMX_ABS xmm2, xmm1
MMX_ABS xmm3, xmm1
paddw xmm0, xmm2
paddw xmm0, xmm3                ; xmm0 = ads per mv
movd xmm1, [ebp+32] ; thresh
movdqu xmm2, [ecx]              ; cost_mvx[i..i+7]
pshuflw xmm1, xmm1, 0           ; re-broadcast thresh (no spare register)
punpcklqdq xmm1, xmm1
paddusw xmm0, xmm2              ; + mv cost, saturating
psubusw xmm1, xmm0              ; thresh - total, clamped at 0
packsswb xmm1, xmm1             ; 8 words -> 8 mask bytes
movq [edi], xmm1
ADS_END 2
cglobal x264_pixel_ads2_%1
mov eax, [esp+4] ; enc_dc
movq xmm6, [eax]                ; enc_dc[0..1]
movd xmm5, [esp+28] ; thresh
pshuflw xmm7, xmm6, 0           ; broadcast enc_dc[0]
pshuflw xmm6, xmm6, 0xAA        ; broadcast enc_dc[1]
pshuflw xmm5, xmm5, 0           ; broadcast thresh
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
punpcklqdq xmm5, xmm5
ADS_START
.loop:
movdqu xmm0, [eax]              ; sums[i..i+7]
movdqu xmm1, [eax+ebx]          ; sums[i+delta..]
psubw xmm0, xmm7
psubw xmm1, xmm6
movdqu xmm4, [ecx]              ; cost_mvx[i..i+7]
MMX_ABS xmm0, xmm2
MMX_ABS xmm1, xmm3
paddw xmm0, xmm1                ; ads per mv
paddusw xmm0, xmm4              ; + mv cost
movdqa xmm1, xmm5
psubusw xmm1, xmm0              ; thresh - total, clamped at 0
packsswb xmm1, xmm1
movq [edi], xmm1
ADS_END 2
cglobal x264_pixel_ads1_%1
mov eax, [esp+4] ; enc_dc
movd xmm7, [eax]                ; enc_dc[0]
movd xmm6, [esp+28] ; thresh
pshuflw xmm7, xmm7, 0           ; broadcast enc_dc[0]
pshuflw xmm6, xmm6, 0           ; broadcast thresh
punpcklqdq xmm7, xmm7
punpcklqdq xmm6, xmm6
ADS_START
.loop:                          ; 16 mvs per iteration
movdqu xmm0, [eax]
movdqu xmm1, [eax+16]
psubw xmm0, xmm7
psubw xmm1, xmm7
movdqu xmm2, [ecx]              ; cost_mvx[i..i+7]
movdqu xmm3, [ecx+16]           ; cost_mvx[i+8..]
MMX_ABS xmm0, xmm4
MMX_ABS xmm1, xmm5
paddusw xmm0, xmm2              ; ads + mv cost
paddusw xmm1, xmm3
movdqa xmm4, xmm6
movdqa xmm5, xmm6
psubusw xmm4, xmm0              ; thresh - total
psubusw xmm5, xmm1
packsswb xmm4, xmm5             ; 16 words -> 16 mask bytes
movdqa [edi], xmm4              ; aligned store (buffer 16-aligned by ADS_START)
ADS_END 4
%endmacro
ADS_SSE2 sse2
%ifdef HAVE_SSE3
; pabsw computes absolute value in one instruction (SSSE3; the guard macro
; is named HAVE_SSE3 — NOTE(review): confirm it is set for SSSE3-capable
; builds). Redefine MMX_ABS (%2 scratch unused) and re-expand the kernels.
%macro MMX_ABS 2
pabsw %1, %1
%endmacro
ADS_SSE2 ssse3
%endif
......@@ -81,11 +81,18 @@ void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *res, int width );
void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
uint16_t *res, int width );
void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
uint16_t *res, int width );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
DECL_ADS( 4, mmxext )
DECL_ADS( 2, mmxext )
DECL_ADS( 1, mmxext )
DECL_ADS( 4, sse2 )
DECL_ADS( 2, sse2 )
DECL_ADS( 1, sse2 )
DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )
#undef DECL_ADS
#endif
......@@ -408,32 +408,50 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
/****************************************************************************
* successive elimination
****************************************************************************/
static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *res, int width )
static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
int i;
int nmv=0, i;
for( i=0; i<width; i++, sums++ )
res[i] = abs( enc_dc[0] - sums[0] )
+ abs( enc_dc[1] - sums[8] )
+ abs( enc_dc[2] - sums[delta] )
+ abs( enc_dc[3] - sums[delta+8] );
{
int ads = abs( enc_dc[0] - sums[0] )
+ abs( enc_dc[1] - sums[8] )
+ abs( enc_dc[2] - sums[delta] )
+ abs( enc_dc[3] - sums[delta+8] )
+ cost_mvx[i];
if( ads < thresh )
mvs[nmv++] = i;
}
return nmv;
}
static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
uint16_t *res, int width )
static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
int i;
int nmv=0, i;
for( i=0; i<width; i++, sums++ )
res[i] = abs( enc_dc[0] - sums[0] )
+ abs( enc_dc[1] - sums[delta] );
{
int ads = abs( enc_dc[0] - sums[0] )
+ abs( enc_dc[1] - sums[delta] )
+ cost_mvx[i];
if( ads < thresh )
mvs[nmv++] = i;
}
return nmv;
}
static void pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
uint16_t *res, int width )
static int x264_pixel_ads1( int enc_dc[1], uint16_t *sums, int delta,
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
{
int i;
int nmv=0, i;
for( i=0; i<width; i++, sums++ )
res[i] = abs( enc_dc[0] - sums[0] );
{
int ads = abs( enc_dc[0] - sums[0] )
+ cost_mvx[i];
if( ads < thresh )
mvs[nmv++] = i;
}
return nmv;
}
......@@ -459,20 +477,22 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->name[PIXEL_4x8] = x264_pixel_##name##_4x8##cpu;\
pixf->name[PIXEL_4x4] = x264_pixel_##name##_4x4##cpu;
#define INIT_ADS( cpu ) \
pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
INIT7( sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
INIT7( ssd, );
INIT7( satd, );
INIT4( sa8d, );
INIT_ADS( );
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->ads[PIXEL_16x16] = pixel_ads4;
pixf->ads[PIXEL_16x8] = pixel_ads2;
pixf->ads[PIXEL_8x8] = pixel_ads1;
#ifdef HAVE_MMX
if( cpu&X264_CPU_MMX )
{
......@@ -485,10 +505,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad_x3, _mmxext );
INIT7( sad_x4, _mmxext );
INIT7( satd, _mmxext );
pixf->ads[PIXEL_16x16] = x264_pixel_ads4_mmxext;