Commit 2ed861c8 authored by Loren Merritt

convert absolute difference of sums from mmx to sse2

convert mv bits cost and ads threshold from C to sse2
convert bytemask-to-list from C to scalar asm
1.6x faster me=esa (x86_64) or 1.3x faster (x86_32). (times cover only motion estimation; overall encode speedup may vary.)
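
Editor's note: the ads kernels now fold the per-candidate mv cost and the early-termination threshold into the SIMD pass, and return a compacted list of surviving candidates instead of a raw cost array (see the new C reference versions at the bottom of this diff). A minimal caller-side sketch of the new contract; the variable and helper names here are illustrative, not from this commit:

    /* Hypothetical caller: each of the `width` candidates survives only if
       its DC-SAD estimate plus cost_mvx[i] stays under thresh; the indices
       of the survivors land in mvs. */
    int nmv = x264_pixel_ads4( enc_dc, sums, delta, cost_mvx, mvs, width, thresh );
    for( i = 0; i < nmv; i++ )
        full_sad_check( mvs[i] );  /* hypothetical exhaustive SAD at candidate mvs[i] */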


git-svn-id: svn://svn.videolan.org/x264/trunk@717 df754926-b1dd-0310-bc7b-ec298dee348c
parent c0fb035a
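
One detail worth calling out before the diff: x264_pixel_ads_mvs converts the byte masks produced by the SIMD loop into an index list without branching on each byte. A C sketch of what each expansion of the TEST macro below does (a paraphrase of the asm, not code from the commit):

    /* Store the candidate index unconditionally, then advance the output
       count only when the mask byte for this lane was nonzero. */
    mvs[nmv] = i + j;
    nmv += ( mask & ((uint64_t)255 << j*8) ) != 0;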
@@ -1045,9 +1045,82 @@ cglobal x264_intra_satd_x3_8x8c_mmxext
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+; {
+;     int nmv=0, i, j;
+;     *(uint32_t*)(masks+width) = 0;
+;     for( i=0; i<width; i+=8 )
+;     {
+;         uint64_t mask = *(uint64_t*)(masks+i);
+;         if( !mask ) continue;
+;         for( j=0; j<8; j++ )
+;             if( mask & ((uint64_t)255<<(j*8)) )
+;                 mvs[nmv++] = i+j;
+;     }
+;     return nmv;
+; }
+cglobal x264_pixel_ads_mvs
+; mvs = parm5q
+; masks = rsp
+; width = r10
+    mov dword [rsp+r10], 0
+    xor eax, eax
+    xor esi, esi
+.loopi:
+    mov rdi, [rsp+rsi]
+    test rdi, rdi
+    jz .nexti
+    xor ecx, ecx
+%macro TEST 1
+    mov [parm5q+rax*2], si
+    test edi, 0xff<<(%1*8)
+    setne cl
+    add eax, ecx
+    inc esi
+%endmacro
+    TEST 0
+    TEST 1
+    TEST 2
+    TEST 3
+    shr rdi, 32
+    TEST 0
+    TEST 1
+    TEST 2
+    TEST 3
+    cmp esi, r10d
+    jl .loopi
+    leave
+    ret
+.nexti:
+    add esi, 8
+    cmp esi, r10d
+    jl .loopi
+    leave
+    ret
+%macro ADS_START 0
+    push rbp
+    mov rbp, rsp
+    sub rsp, parm6q
+    sub rsp, 4
+    and rsp, ~15
+    mov rax, rsp
+    mov r10d, parm6d
+    shl parm3q, 1
+%endmacro
+%macro ADS_END 1
+    add parm2q, 8*%1
+    add parm4q, 8*%1
+    add rax, 4*%1
+    sub parm6d, 4*%1
+    jg .loop
+    jmp x264_pixel_ads_mvs
+%endmacro
 ;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-;                              uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
 cglobal x264_pixel_ads4_mmxext
     movq mm6, [parm1q]
@@ -1056,7 +1129,7 @@ cglobal x264_pixel_ads4_mmxext
     pshufw mm6, mm6, 0xAA
     pshufw mm5, mm4, 0
     pshufw mm4, mm4, 0xAA
-    shl parm3q, 1
+    ADS_START
 .loop:
     movq mm0, [parm2q]
     movq mm1, [parm2q+16]
@@ -1073,19 +1146,19 @@ cglobal x264_pixel_ads4_mmxext
     MMX_ABS mm3, mm1
     paddw mm0, mm2
     paddw mm0, mm3
-    movq [parm4q], mm0
-    add parm2q, 8
-    add parm4q, 8
-    sub parm5d, 4
-    jg .loop
-    nop
-    ret
+    pshufw mm1, [rbp+16], 0
+    paddusw mm0, [parm4q]
+    psubusw mm1, mm0
+    packsswb mm1, mm1
+    movd [rax], mm1
+    ADS_END 1
 cglobal x264_pixel_ads2_mmxext
     movq mm6, [parm1q]
+    pshufw mm5, parm7q, 0
     pshufw mm7, mm6, 0
     pshufw mm6, mm6, 0xAA
-    shl parm3q, 1
+    ADS_START
 .loop:
     movq mm0, [parm2q]
     movq mm1, [parm2q+parm3q]
@@ -1094,16 +1167,17 @@ cglobal x264_pixel_ads2_mmxext
     MMX_ABS mm0, mm2
     MMX_ABS mm1, mm3
     paddw mm0, mm1
-    movq [parm4q], mm0
-    add parm2q, 8
-    add parm4q, 8
-    sub parm5d, 4
-    jg .loop
-    nop
-    ret
+    paddusw mm0, [parm4q]
+    movq mm4, mm5
+    psubusw mm4, mm0
+    packsswb mm4, mm4
+    movd [rax], mm4
+    ADS_END 1
 cglobal x264_pixel_ads1_mmxext
     pshufw mm7, [parm1q], 0
+    pshufw mm6, parm7q, 0
+    ADS_START
 .loop:
     movq mm0, [parm2q]
     movq mm1, [parm2q+8]
@@ -1111,11 +1185,113 @@ cglobal x264_pixel_ads1_mmxext
     psubw mm1, mm7
     MMX_ABS mm0, mm2
     MMX_ABS mm1, mm3
-    movq [parm4q], mm0
-    movq [parm4q+8], mm1
-    add parm2q, 16
-    add parm4q, 16
-    sub parm5d, 8
-    jg .loop
-    nop
-    ret
+    paddusw mm0, [parm4q]
+    paddusw mm1, [parm4q+8]
+    movq mm4, mm6
+    movq mm5, mm6
+    psubusw mm4, mm0
+    psubusw mm5, mm1
+    packsswb mm4, mm5
+    movq [rax], mm4
+    ADS_END 2
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+    movdqa xmm4, [parm1q]
+    pshuflw xmm8, parm7q, 0
+    pshuflw xmm7, xmm4, 0
+    pshuflw xmm6, xmm4, 0xAA
+    pshufhw xmm5, xmm4, 0
+    pshufhw xmm4, xmm4, 0xAA
+    punpcklqdq xmm8, xmm8
+    punpcklqdq xmm7, xmm7
+    punpcklqdq xmm6, xmm6
+    punpckhqdq xmm5, xmm5
+    punpckhqdq xmm4, xmm4
+    ADS_START
+    movdqu xmm10, [parm2q]
+    movdqu xmm11, [parm2q+parm3q]
+.loop:
+    movdqa xmm0, xmm10
+    movdqu xmm1, [parm2q+16]
+    movdqa xmm10, xmm1
+    psubw xmm0, xmm7
+    psubw xmm1, xmm6
+    MMX_ABS xmm0, xmm2
+    MMX_ABS xmm1, xmm3
+    movdqa xmm2, xmm11
+    movdqu xmm3, [parm2q+parm3q+16]
+    movdqa xmm11, xmm3
+    psubw xmm2, xmm5
+    psubw xmm3, xmm4
+    paddw xmm0, xmm1
+    movdqu xmm9, [parm4q]
+    MMX_ABS xmm2, xmm1
+    MMX_ABS xmm3, xmm1
+    paddw xmm0, xmm2
+    paddw xmm0, xmm3
+    paddusw xmm0, xmm9
+    movdqa xmm1, xmm8
+    psubusw xmm1, xmm0
+    packsswb xmm1, xmm1
+    movq [rax], xmm1
+    ADS_END 2
+cglobal x264_pixel_ads2_%1
+    movq xmm6, [parm1q]
+    pshuflw xmm8, parm7q, 0
+    pshuflw xmm7, xmm6, 0
+    pshuflw xmm6, xmm6, 0xAA
+    punpcklqdq xmm8, xmm8
+    punpcklqdq xmm7, xmm7
+    punpcklqdq xmm6, xmm6
+    ADS_START
+.loop:
+    movdqu xmm0, [parm2q]
+    movdqu xmm1, [parm2q+parm3q]
+    psubw xmm0, xmm7
+    psubw xmm1, xmm6
+    movdqu xmm9, [parm4q]
+    MMX_ABS xmm0, xmm2
+    MMX_ABS xmm1, xmm3
+    paddw xmm0, xmm1
+    paddusw xmm0, xmm9
+    movdqa xmm4, xmm8
+    psubusw xmm4, xmm0
+    packsswb xmm4, xmm4
+    movq [rax], xmm4
+    ADS_END 2
+cglobal x264_pixel_ads1_%1
+    pshuflw xmm7, [parm1q], 0
+    pshuflw xmm8, parm7q, 0
+    punpcklqdq xmm7, xmm7
+    punpcklqdq xmm8, xmm8
+    ADS_START
+.loop:
+    movdqu xmm0, [parm2q]
+    movdqu xmm1, [parm2q+16]
+    psubw xmm0, xmm7
+    psubw xmm1, xmm7
+    movdqu xmm9, [parm4q]
+    movdqu xmm10, [parm4q+16]
+    MMX_ABS xmm0, xmm2
+    MMX_ABS xmm1, xmm3
+    paddusw xmm0, xmm9
+    paddusw xmm1, xmm10
+    movdqa xmm4, xmm8
+    movdqa xmm5, xmm8
+    psubusw xmm4, xmm0
+    psubusw xmm5, xmm1
+    packsswb xmm4, xmm5
+    movdqa [rax], xmm4
+    ADS_END 4
+%endmacro
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+    pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif
@@ -1579,24 +1579,91 @@ cglobal x264_pixel_ssim_4x4x2_core_mmxext
+; int x264_pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
+cglobal x264_pixel_ads_mvs
+    mov ebx, [ebp+24] ; mvs
+    mov ecx, esp      ; masks
+    mov edi, [ebp+28] ; width
+    mov dword [ecx+edi], 0
+    push esi
+    push ebp
+    xor eax, eax
+    xor esi, esi
+.loopi:
+    mov ebp, [ecx+esi]
+    mov edx, [ecx+esi+4]
+    or edx, ebp
+    jz .nexti
+    xor edx, edx
+%macro TEST 1
+    mov [ebx+eax*2], si
+    test ebp, 0xff<<(%1*8)
+    setne dl
+    add eax, edx
+    inc esi
+%endmacro
+    TEST 0
+    TEST 1
+    TEST 2
+    TEST 3
+    mov ebp, [ecx+esi]
+    TEST 0
+    TEST 1
+    TEST 2
+    TEST 3
+    cmp esi, edi
+    jl .loopi
+    jmp .end
+.nexti:
+    add esi, 8
+    cmp esi, edi
+    jl .loopi
+.end:
+    pop ebp
+    pop esi
+    mov edi, [ebp-8]
+    mov ebx, [ebp-4]
+    leave
+    ret
+%macro ADS_START 0
+    push ebp
+    mov ebp, esp
+    push ebx
+    push edi
+    mov eax, [ebp+12] ; sums
+    mov ebx, [ebp+16] ; delta
+    mov ecx, [ebp+20] ; cost_mvx
+    mov edx, [ebp+28] ; width
+    sub esp, edx
+    sub esp, 4
+    and esp, ~15
+    mov edi, esp
+    shl ebx, 1
+%endmacro
+%macro ADS_END 1
+    add eax, 8*%1
+    add ecx, 8*%1
+    add edi, 4*%1
+    sub edx, 4*%1
+    jg .loop
+    jmp x264_pixel_ads_mvs
+%endmacro
 ;-----------------------------------------------------------------------------
-; void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-;                              uint16_t *res, int width )
+; int x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
+;                             uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 ;-----------------------------------------------------------------------------
 cglobal x264_pixel_ads4_mmxext
-    push ebx
-    mov eax, [esp+8]
+    mov eax, [esp+4]
     movq mm6, [eax]
     movq mm4, [eax+8]
     pshufw mm7, mm6, 0
     pshufw mm6, mm6, 0xAA
     pshufw mm5, mm4, 0
     pshufw mm4, mm4, 0xAA
-    mov eax, [esp+12]
-    mov ebx, [esp+16]
-    mov ecx, [esp+20]
-    mov edx, [esp+24]
-    shl ebx, 1
+    ADS_START
 .loop:
     movq mm0, [eax]
     movq mm1, [eax+16]
@@ -1613,25 +1680,20 @@ cglobal x264_pixel_ads4_mmxext
     MMX_ABS mm3, mm1
     paddw mm0, mm2
     paddw mm0, mm3
-    movq [ecx], mm0
-    add eax, 8
-    add ecx, 8
-    sub edx, 4
-    jg .loop
-    pop ebx
-    ret
+    pshufw mm1, [ebp+32], 0
+    paddusw mm0, [ecx]
+    psubusw mm1, mm0
+    packsswb mm1, mm1
+    movd [edi], mm1
+    ADS_END 1
 cglobal x264_pixel_ads2_mmxext
-    push ebx
-    mov eax, [esp+8]
+    mov eax, [esp+4]
     movq mm6, [eax]
+    pshufw mm5, [esp+28], 0
    pshufw mm7, mm6, 0
     pshufw mm6, mm6, 0xAA
-    mov eax, [esp+12]
-    mov ebx, [esp+16]
-    mov ecx, [esp+20]
-    mov edx, [esp+24]
-    shl ebx, 1
+    ADS_START
 .loop:
     movq mm0, [eax]
     movq mm1, [eax+ebx]
@@ -1640,20 +1702,18 @@ cglobal x264_pixel_ads2_mmxext
     MMX_ABS mm0, mm2
     MMX_ABS mm1, mm3
     paddw mm0, mm1
-    movq [ecx], mm0
-    add eax, 8
-    add ecx, 8
-    sub edx, 4
-    jg .loop
-    pop ebx
-    ret
+    paddusw mm0, [ecx]
+    movq mm4, mm5
+    psubusw mm4, mm0
+    packsswb mm4, mm4
+    movd [edi], mm4
+    ADS_END 1
 cglobal x264_pixel_ads1_mmxext
     mov eax, [esp+4]
     pshufw mm7, [eax], 0
-    mov eax, [esp+8]
-    mov ecx, [esp+16]
-    mov edx, [esp+20]
+    pshufw mm6, [esp+28], 0
+    ADS_START
 .loop:
     movq mm0, [eax]
     movq mm1, [eax+8]
@@ -1661,11 +1721,115 @@ cglobal x264_pixel_ads1_mmxext
     psubw mm1, mm7
     MMX_ABS mm0, mm2
     MMX_ABS mm1, mm3
-    movq [ecx], mm0
-    movq [ecx+8], mm1
-    add eax, 16
-    add ecx, 16
-    sub edx, 8
-    jg .loop
-    nop
-    ret
+    paddusw mm0, [ecx]
+    paddusw mm1, [ecx+8]
+    movq mm4, mm6
+    movq mm5, mm6
+    psubusw mm4, mm0
+    psubusw mm5, mm1
+    packsswb mm4, mm5
+    movq [edi], mm4
+    ADS_END 2
+%macro ADS_SSE2 1
+cglobal x264_pixel_ads4_%1
+    mov eax, [esp+4] ; enc_dc
+    movdqa xmm4, [eax]
+    pshuflw xmm7, xmm4, 0
+    pshuflw xmm6, xmm4, 0xAA
+    pshufhw xmm5, xmm4, 0
+    pshufhw xmm4, xmm4, 0xAA
+    punpcklqdq xmm7, xmm7
+    punpcklqdq xmm6, xmm6
+    punpckhqdq xmm5, xmm5
+    punpckhqdq xmm4, xmm4
+    ADS_START
+.loop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax+16]
+    psubw xmm0, xmm7
+    psubw xmm1, xmm6
+    MMX_ABS xmm0, xmm2
+    MMX_ABS xmm1, xmm3
+    movdqu xmm2, [eax+ebx]
+    movdqu xmm3, [eax+ebx+16]
+    psubw xmm2, xmm5
+    psubw xmm3, xmm4
+    paddw xmm0, xmm1
+    MMX_ABS xmm2, xmm1
+    MMX_ABS xmm3, xmm1
+    paddw xmm0, xmm2
+    paddw xmm0, xmm3
+    movd xmm1, [ebp+32] ; thresh
+    movdqu xmm2, [ecx]
+    pshuflw xmm1, xmm1, 0
+    punpcklqdq xmm1, xmm1
+    paddusw xmm0, xmm2
+    psubusw xmm1, xmm0
+    packsswb xmm1, xmm1
+    movq [edi], xmm1
+    ADS_END 2
+cglobal x264_pixel_ads2_%1
+    mov eax, [esp+4] ; enc_dc
+    movq xmm6, [eax]
+    movd xmm5, [esp+28] ; thresh
+    pshuflw xmm7, xmm6, 0
+    pshuflw xmm6, xmm6, 0xAA
+    pshuflw xmm5, xmm5, 0
+    punpcklqdq xmm7, xmm7
+    punpcklqdq xmm6, xmm6
+    punpcklqdq xmm5, xmm5
+    ADS_START
+.loop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax+ebx]
+    psubw xmm0, xmm7
+    psubw xmm1, xmm6
+    movdqu xmm4, [ecx]
+    MMX_ABS xmm0, xmm2
+    MMX_ABS xmm1, xmm3
+    paddw xmm0, xmm1
+    paddusw xmm0, xmm4
+    movdqa xmm1, xmm5
+    psubusw xmm1, xmm0
+    packsswb xmm1, xmm1
+    movq [edi], xmm1
+    ADS_END 2
+cglobal x264_pixel_ads1_%1
+    mov eax, [esp+4] ; enc_dc
+    movd xmm7, [eax]
+    movd xmm6, [esp+28] ; thresh
+    pshuflw xmm7, xmm7, 0
+    pshuflw xmm6, xmm6, 0
+    punpcklqdq xmm7, xmm7
+    punpcklqdq xmm6, xmm6
+    ADS_START
+.loop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax+16]
+    psubw xmm0, xmm7
+    psubw xmm1, xmm7
+    movdqu xmm2, [ecx]
+    movdqu xmm3, [ecx+16]
+    MMX_ABS xmm0, xmm4
+    MMX_ABS xmm1, xmm5
+    paddusw xmm0, xmm2
+    paddusw xmm1, xmm3
+    movdqa xmm4, xmm6
+    movdqa xmm5, xmm6
+    psubusw xmm4, xmm0
+    psubusw xmm5, xmm1
+    packsswb xmm4, xmm5
+    movdqa [edi], xmm4
+    ADS_END 4
+%endmacro
+ADS_SSE2 sse2
+%ifdef HAVE_SSE3
+%macro MMX_ABS 2
+    pabsw %1, %1
+%endmacro
+ADS_SSE2 ssse3
+%endif
@@ -81,11 +81,18 @@ void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
                                       const uint8_t *pix2, int stride2, int sums[2][4] );
 float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
-void x264_pixel_ads4_mmxext( int enc_dc[4], uint16_t *sums, int delta,
-                             uint16_t *res, int width );
-void x264_pixel_ads2_mmxext( int enc_dc[2], uint16_t *sums, int delta,
-                             uint16_t *res, int width );
-void x264_pixel_ads1_mmxext( int enc_dc[1], uint16_t *sums, int delta,
-                             uint16_t *res, int width );
+#define DECL_ADS( size, suffix ) \
+int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
+                                     uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
+DECL_ADS( 4, mmxext )
+DECL_ADS( 2, mmxext )
+DECL_ADS( 1, mmxext )
+DECL_ADS( 4, sse2 )
+DECL_ADS( 2, sse2 )
+DECL_ADS( 1, sse2 )
+DECL_ADS( 4, ssse3 )
+DECL_ADS( 2, ssse3 )
+DECL_ADS( 1, ssse3 )
+#undef DECL_ADS
 #endif
@@ -408,32 +408,50 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
 /****************************************************************************
  * successive elimination
  ****************************************************************************/
-static void pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
-                        uint16_t *res, int width )
+static int x264_pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
+                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 {
-    int i;
+    int nmv=0, i;
     for( i=0; i<width; i++, sums++ )
-        res[i] = abs( enc_dc[0] - sums[0] )
-               + abs( enc_dc[1] - sums[8] )
-               + abs( enc_dc[2] - sums[delta] )
-               + abs( enc_dc[3] - sums[delta+8] );
+    {
+        int ads = abs( enc_dc[0] - sums[0] )
+                + abs( enc_dc[1] - sums[8] )
+                + abs( enc_dc[2] - sums[delta] )
+                + abs( enc_dc[3] - sums[delta+8] )
+                + cost_mvx[i];
+        if( ads < thresh )
+            mvs[nmv++] = i;
+    }
+    return nmv;
 }
 
-static void pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
-                        uint16_t *res, int width )
+static int x264_pixel_ads2( int enc_dc[2], uint16_t *sums, int delta,
+                            uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
 {
-    int i;
+    int nmv=0, i;
    for( i=0; i<width; i++, sums++ )
-        res[i] = abs( enc_dc[0] - sums[0] )
-               + abs( enc_dc[1] - sums[delta] );
+    {
+        int ads = abs( enc_dc[0] - sums[0] )
+                + abs( enc_dc[1] - sums[delta] )
+                + cost_mvx[i];
+        if( ads < thresh )
+            mvs[nmv++] = i;
+    }
+    return nmv;
 }
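
A closing note on the SIMD threshold test above: there is no compare instruction. Costs are accumulated with saturating adds (paddusw), subtracted from the splatted threshold with psubusw (which clamps at zero), then packed to bytes with packsswb, so a mask byte is nonzero exactly when ads < thresh. In C terms, with hypothetical helpers standing in for the saturating ops:

    uint16_t ads  = add_sat_u16( dc_sad, cost_mvx[i] );  /* paddusw */
    uint16_t diff = sub_sat_u16( thresh, ads );          /* psubusw: 0 iff ads >= thresh */
    masks[i] = pack_sat_s8( diff );                      /* packsswb: nonzero iff ads < thresh */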