Commit 5b6c5eff authored by Loren Merritt

use FDEC_STRIDE instead of a parameter in mmx dct

.5% speedup



git-svn-id: svn://svn.videolan.org/x264/trunk@479 df754926-b1dd-0310-bc7b-ec298dee348c
parent da9158b3
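
The interface change: the per-call strides (i_pix1, i_pix2, i_dst) are replaced by the compile-time constants FENC_STRIDE (16) and FDEC_STRIDE (32) defined below, so every row address becomes a fixed immediate offset and the asm no longer needs registers or a prologue for stride arithmetic. A minimal C sketch of the new contract, mirroring pixel_sub_wxh from the diff (sub4x4_diff_sketch is an illustrative name, not part of the commit):

#include <stdint.h>

#define FENC_STRIDE 16  /* row stride of the cached source block */
#define FDEC_STRIDE 32  /* row stride of the cached reconstruction */

/* Old: sub4x4_dct( dct, pix1, i_pix1, pix2, i_pix2 )
 * New: sub4x4_dct( dct, pix1, pix2 ) -- the strides are implied, so
 * y*FENC_STRIDE / y*FDEC_STRIDE fold into immediate addressing. */
static void sub4x4_diff_sketch( int16_t diff[4][4],
                                const uint8_t *pix1, const uint8_t *pix2 )
{
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            diff[y][x] = pix1[y*FENC_STRIDE + x] - pix2[y*FDEC_STRIDE + x];
}
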
@@ -272,4 +272,5 @@ SECTION .text
 %define GLOBAL
 %endif
 %assign FENC_STRIDE 16
+%assign FDEC_STRIDE 32
@@ -222,33 +222,16 @@ cglobal x264_sub4x4_dct_mmx
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+;   void x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 x264_sub4x4_dct_mmx:
-    firstpush rbx
-    pushreg   rbx
-    endprolog
-    mov     r10, parm1q             ; dct
-    mov     rax, parm2q             ; pix1
-%ifdef WIN64
-    mov     rcx, parm4q             ; pix2
-    movsxd  rdx, dword [rsp+40+8]   ; i_pix2
-    movsxd  rbx, parm3d             ; i_pix1
-%else
-    movsxd  rbx, parm3d             ; i_pix1
-    movsxd  rdx, parm5d             ; i_pix2
-%endif
     MMX_ZERO    mm7

     ; Load 4 lines
-    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [rax      ], [rcx]
-    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [rax+rbx  ], [rcx+rdx]
-    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
-    add     rax, rbx
-    add     rcx, rdx
-    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [rax+rbx*2], [rcx+rdx*2]
+    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
+    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
+    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
+    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]

     MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1 ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
@@ -263,32 +246,25 @@ x264_sub4x4_dct_mmx:
     MMX_SUMSUB_BA       mm1, mm3        ; mm1=s03+s12  mm3=s03-s12
     MMX_SUMSUB2_AB      mm2, mm4, mm0   ; mm2=2.d03+d12  mm0=d03-2.d12

-    movq    [r10+ 0], mm1   ; dct
-    movq    [r10+ 8], mm2
-    movq    [r10+16], mm3
-    movq    [r10+24], mm0
-
-    pop     rbx
+    movq    [parm1q+ 0], mm1 ; dct
+    movq    [parm1q+ 8], mm2
+    movq    [parm1q+16], mm3
+    movq    [parm1q+24], mm0
     ret
 endfunc

 cglobal x264_add4x4_idct_mmx

 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+;   void x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
 x264_add4x4_idct_mmx:
     ; Load dct coeffs
-    movq    mm0, [parm3q+ 0]    ; dct
-    movq    mm1, [parm3q+ 8]
-    movq    mm2, [parm3q+16]
-    movq    mm3, [parm3q+24]
-
-    mov     rax, parm1q         ; p_dst
-    movsxd  rcx, parm2d         ; i_dst
-    lea     rdx, [rcx+rcx*2]
+    movq    mm0, [parm2q+ 0]    ; dct
+    movq    mm1, [parm2q+ 8]
+    movq    mm2, [parm2q+16]
+    movq    mm3, [parm2q+24]

     MMX_SUMSUB_BA       mm2, mm0           ; mm2=s02  mm0=d02
     MMX_SUMSUBD2_AB     mm1, mm3, mm5, mm4 ; mm1=s13  mm4=d13 ( well 1 + 3>>1 and 1>>1 + 3)
@@ -305,10 +281,10 @@ x264_add4x4_idct_mmx:
     MMX_ZERO            mm7
     movq                mm6, [pw_32 GLOBAL]

-    MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [rax]
-    MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [rax+rcx]
-    MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [rax+rcx*2]
-    MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [rax+rdx]
+    MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [parm1q+0*FDEC_STRIDE]
+    MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [parm1q+1*FDEC_STRIDE]
+    MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [parm1q+2*FDEC_STRIDE]
+    MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [parm1q+3*FDEC_STRIDE]

     ret
@@ -374,42 +350,32 @@ cglobal x264_sub8x8_dct8_sse2
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+;   void __cdecl x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 x264_sub8x8_dct8_sse2:
-;   mov     rdi, rdi        ; dct
-;   mov     rsi, rsi        ; pix1
-    movsxd  rdx, edx        ; i_pix1
-;   mov     rcx, rcx        ; pix2
-    movsxd  r8, r8d         ; i_pix2
     MMX_ZERO xmm9

-    MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [rsi      ], [rcx]
-    MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [rsi+rdx  ], [rcx+r8]
-    MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
-    lea     r9,  [rdx+rdx*2]
-    lea     r10, [r8+r8*2]
-    add     rsi, r9
-    add     rcx, r10
-    MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [rsi      ], [rcx]
-    MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [rsi+rdx  ], [rcx+r8]
-    MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [rsi+rdx*2], [rcx+r8*2]
-    MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [rsi+r9   ], [rcx+r10]
-    MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [rsi+rdx*4], [rcx+r8*4]
+    MMX_LOAD_DIFF_8P xmm0, xmm8, xmm9, [parm2q+0*FENC_STRIDE], [parm3q+0*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm1, xmm8, xmm9, [parm2q+1*FENC_STRIDE], [parm3q+1*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm2, xmm8, xmm9, [parm2q+2*FENC_STRIDE], [parm3q+2*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm3, xmm8, xmm9, [parm2q+3*FENC_STRIDE], [parm3q+3*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm4, xmm8, xmm9, [parm2q+4*FENC_STRIDE], [parm3q+4*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm5, xmm8, xmm9, [parm2q+5*FENC_STRIDE], [parm3q+5*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm6, xmm8, xmm9, [parm2q+6*FENC_STRIDE], [parm3q+6*FDEC_STRIDE]
+    MMX_LOAD_DIFF_8P xmm7, xmm8, xmm9, [parm2q+7*FENC_STRIDE], [parm3q+7*FDEC_STRIDE]

     DCT8_1D  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9
     SSE2_TRANSPOSE8x8  xmm5, xmm1, xmm2, xmm6, xmm4, xmm3, xmm7, xmm8, xmm0
     DCT8_1D  xmm5, xmm3, xmm8, xmm6, xmm0, xmm4, xmm2, xmm1, xmm7, xmm9

-    movdqa  [rdi+0x00], xmm4
-    movdqa  [rdi+0x10], xmm3
-    movdqa  [rdi+0x20], xmm8
-    movdqa  [rdi+0x30], xmm2
-    movdqa  [rdi+0x40], xmm0
-    movdqa  [rdi+0x50], xmm6
-    movdqa  [rdi+0x60], xmm1
-    movdqa  [rdi+0x70], xmm7
+    movdqa  [parm1q+0x00], xmm4
+    movdqa  [parm1q+0x10], xmm3
+    movdqa  [parm1q+0x20], xmm8
+    movdqa  [parm1q+0x30], xmm2
+    movdqa  [parm1q+0x40], xmm0
+    movdqa  [parm1q+0x50], xmm6
+    movdqa  [parm1q+0x60], xmm1
+    movdqa  [parm1q+0x70], xmm7

     ret
@@ -470,18 +436,17 @@ cglobal x264_add8x8_idct8_sse2
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int i_dst, int16_t dct[8][8] )
+;   void __cdecl x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
 ;-----------------------------------------------------------------------------
 x264_add8x8_idct8_sse2:
-    movsxd  rsi,  esi          ; i_dst
-    movdqa  xmm0, [rdx+0x00]   ; dct
-    movdqa  xmm1, [rdx+0x10]
-    movdqa  xmm2, [rdx+0x20]
-    movdqa  xmm3, [rdx+0x30]
-    movdqa  xmm4, [rdx+0x40]
-    movdqa  xmm5, [rdx+0x50]
-    movdqa  xmm6, [rdx+0x60]
-    movdqa  xmm7, [rdx+0x70]
+    movdqa  xmm0, [parm2q+0x00]
+    movdqa  xmm1, [parm2q+0x10]
+    movdqa  xmm2, [parm2q+0x20]
+    movdqa  xmm3, [parm2q+0x30]
+    movdqa  xmm4, [parm2q+0x40]
+    movdqa  xmm5, [parm2q+0x50]
+    movdqa  xmm6, [parm2q+0x60]
+    movdqa  xmm7, [parm2q+0x70]

     IDCT8_1D   xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm9, xmm8
     SSE2_TRANSPOSE8x8  xmm9, xmm1, xmm7, xmm3, xmm4, xmm0, xmm2, xmm6, xmm5
@@ -489,15 +454,13 @@ x264_add8x8_idct8_sse2:
     IDCT8_1D   xmm9, xmm0, xmm6, xmm3, xmm5, xmm4, xmm7, xmm1, xmm8, xmm2

     MMX_ZERO   xmm15
-    MMX_STORE_DIFF_8P   xmm8, xmm14, xmm15, [rdi]
-    MMX_STORE_DIFF_8P   xmm0, xmm14, xmm15, [rdi+rsi]
-    MMX_STORE_DIFF_8P   xmm1, xmm14, xmm15, [rdi+rsi*2]
-    lea     rax, [rsi+rsi*2]
-    add     rdi, rax
-    MMX_STORE_DIFF_8P   xmm3, xmm14, xmm15, [rdi]
-    MMX_STORE_DIFF_8P   xmm5, xmm14, xmm15, [rdi+rsi]
-    MMX_STORE_DIFF_8P   xmm9, xmm14, xmm15, [rdi+rsi*2]
-    MMX_STORE_DIFF_8P   xmm6, xmm14, xmm15, [rdi+rax]
-    MMX_STORE_DIFF_8P   xmm7, xmm14, xmm15, [rdi+rsi*4]
+    MMX_STORE_DIFF_8P   xmm8, xmm14, xmm15, [parm1q+0*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm0, xmm14, xmm15, [parm1q+1*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm1, xmm14, xmm15, [parm1q+2*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm3, xmm14, xmm15, [parm1q+3*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm5, xmm14, xmm15, [parm1q+4*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm9, xmm14, xmm15, [parm1q+5*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm6, xmm14, xmm15, [parm1q+6*FDEC_STRIDE]
+    MMX_STORE_DIFF_8P   xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]

     ret
@@ -140,13 +140,13 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
     }
 }

-static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 {
     int16_t d[4][4];
     int16_t tmp[4][4];
     int i;

-    pixel_sub_wxh( (int16_t*)d, 4, pix1, i_pix1, pix2, i_pix2 );
+    pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

     for( i = 0; i < 4; i++ )
     {
@@ -175,24 +175,24 @@ static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
     }
 }

-static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 {
-    sub4x4_dct( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
-    sub4x4_dct( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 );
-    sub4x4_dct( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 );
-    sub4x4_dct( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 );
+    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
+    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
+    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 }

-static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 {
-    sub8x8_dct( &dct[ 0], pix1, i_pix1, pix2, i_pix2 );
-    sub8x8_dct( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 );
-    sub8x8_dct( &dct[ 8], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 );
-    sub8x8_dct( &dct[12], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
+    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
+    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
+    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 }

-static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 {
     int16_t d[4][4];
     int16_t tmp[4][4];
@@ -232,24 +232,24 @@ static void add4x4_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
         {
             p_dst[x] = clip_uint8( p_dst[x] + d[y][x] );
         }
-        p_dst += i_dst;
+        p_dst += FDEC_STRIDE;
     }
 }

-static void add8x8_idct( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] )
+static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
 {
-    add4x4_idct( p_dst, i_dst, dct[0] );
-    add4x4_idct( &p_dst[4], i_dst, dct[1] );
-    add4x4_idct( &p_dst[4*i_dst+0], i_dst, dct[2] );
-    add4x4_idct( &p_dst[4*i_dst+4], i_dst, dct[3] );
+    add4x4_idct( &p_dst[0], dct[0] );
+    add4x4_idct( &p_dst[4], dct[1] );
+    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
+    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 }

-static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
+static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
 {
-    add8x8_idct( &p_dst[0], i_dst, &dct[0] );
-    add8x8_idct( &p_dst[8], i_dst, &dct[4] );
-    add8x8_idct( &p_dst[8*i_dst], i_dst, &dct[8] );
-    add8x8_idct( &p_dst[8*i_dst+8], i_dst, &dct[12] );
+    add8x8_idct( &p_dst[0], &dct[0] );
+    add8x8_idct( &p_dst[8], &dct[4] );
+    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
+    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 }

 /****************************************************************************
@@ -283,12 +283,12 @@ static void add16x16_idct( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] )
     DST(7) = (a4>>2) - a7 ;\
 }

-static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 {
     int i;
     int16_t tmp[8][8];

-    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, i_pix1, pix2, i_pix2 );
+    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

 #define SRC(x) tmp[x][i]
 #define DST(x) tmp[x][i]
@@ -305,12 +305,12 @@ static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 #undef DST
 }

-static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
 {
-    sub8x8_dct8( dct[0], pix1, i_pix1, pix2, i_pix2 );
-    sub8x8_dct8( dct[1], &pix1[8], i_pix1, &pix2[8], i_pix2 );
-    sub8x8_dct8( dct[2], &pix1[8*i_pix1], i_pix1, &pix2[8*i_pix2], i_pix2 );
-    sub8x8_dct8( dct[3], &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 );
+    sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
+    sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
+    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 }

 #define IDCT8_1D {\
@@ -340,7 +340,7 @@ static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
     DST(7, b0 - b7);\
 }

-static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
+static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 {
     int i;
@@ -354,19 +354,19 @@ static void add8x8_idct8( uint8_t *dst, int i_dst, int16_t dct[8][8] )
 #undef DST

 #define SRC(x)     dct[i][x]
-#define DST(x,rhs) dst[i + x*i_dst] = clip_uint8( dst[i + x*i_dst] + ((rhs) >> 6) );
+#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
     for( i = 0; i < 8; i++ )
         IDCT8_1D
 #undef SRC
 #undef DST
 }

-static void add16x16_idct8( uint8_t *dst, int i_dst, int16_t dct[4][8][8] )
+static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
 {
-    add8x8_idct8( &dst[0], i_dst, dct[0] );
-    add8x8_idct8( &dst[8], i_dst, dct[1] );
-    add8x8_idct8( &dst[8*i_dst], i_dst, dct[2] );
-    add8x8_idct8( &dst[8*i_dst+8], i_dst, dct[3] );
+    add8x8_idct8( &dst[0], dct[0] );
+    add8x8_idct8( &dst[8], dct[1] );
+    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
+    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 }
@@ -91,20 +91,23 @@ static const int x264_dct8_weight2_zigzag[64] = {
 typedef struct
 {
-    void (*sub4x4_dct)    ( int16_t dct[4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add4x4_idct)   ( uint8_t *p_dst, int i_dst, int16_t dct[4][4] );
+    // pix1  stride = FENC_STRIDE
+    // pix2  stride = FDEC_STRIDE
+    // p_dst stride = FDEC_STRIDE
+    void (*sub4x4_dct)    ( int16_t dct[4][4],  uint8_t *pix1, uint8_t *pix2 );
+    void (*add4x4_idct)   ( uint8_t *p_dst, int16_t dct[4][4] );

-    void (*sub8x8_dct)    ( int16_t dct[4][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add8x8_idct)   ( uint8_t *p_dst, int i_dst, int16_t dct[4][4][4] );
+    void (*sub8x8_dct)    ( int16_t dct[4][4][4],  uint8_t *pix1, uint8_t *pix2 );
+    void (*add8x8_idct)   ( uint8_t *p_dst, int16_t dct[4][4][4] );

-    void (*sub16x16_dct)  ( int16_t dct[16][4][4],  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add16x16_idct) ( uint8_t *p_dst, int i_dst, int16_t dct[16][4][4] );
+    void (*sub16x16_dct)  ( int16_t dct[16][4][4],  uint8_t *pix1, uint8_t *pix2 );
+    void (*add16x16_idct)( uint8_t *p_dst, int16_t dct[16][4][4] );

-    void (*sub8x8_dct8)   ( int16_t dct[8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add8x8_idct8)  ( uint8_t *p_dst, int i_dst, int16_t dct[8][8] );
+    void (*sub8x8_dct8)   ( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 );
+    void (*add8x8_idct8)  ( uint8_t *p_dst, int16_t dct[8][8] );

-    void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
-    void (*add16x16_idct8)( uint8_t *p_dst, int i_dst, int16_t dct[4][8][8] );
+    void (*sub16x16_dct8) ( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 );
+    void (*add16x16_idct8)( uint8_t *p_dst, int16_t dct[4][8][8] );

     void (*dct4x4dc) ( int16_t d[4][4] );
     void (*idct4x4dc)( int16_t d[4][4] );
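The new function-pointer contract above means every pixel pointer handed to these hooks must already point into fixed-stride buffers. A hypothetical caller under that assumption (dctf_sketch_t, transform_8x8, fenc, and fdec are illustrative names, not x264 API):

#include <stdint.h>

#define FENC_STRIDE 16
#define FDEC_STRIDE 32

/* Abbreviated sketch of the hook table declared above. */
typedef struct
{
    void (*sub8x8_dct) ( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 );
    void (*add8x8_idct)( uint8_t *p_dst, int16_t dct[4][4][4] );
} dctf_sketch_t;

/* Transform one 8x8 block: read the source at stride 16, write the
 * reconstruction back at stride 32.  (Quant/dequant omitted.) */
static void transform_8x8( const dctf_sketch_t *dctf,
                           uint8_t *fenc, uint8_t *fdec )
{
    int16_t dct[4][4][4];
    dctf->sub8x8_dct( dct, fenc, fdec );
    dctf->add8x8_idct( fdec, dct );
}
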
@@ -214,24 +214,19 @@ cglobal x264_sub4x4_dct_mmx
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+;   void __cdecl x264_sub4x4_dct_mmx( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 ;-----------------------------------------------------------------------------
 x264_sub4x4_dct_mmx:
-    push    ebx
-    mov     eax, [esp+12]   ; pix1
-    mov     ebx, [esp+16]   ; i_pix1
-    mov     ecx, [esp+20]   ; pix2
-    mov     edx, [esp+24]   ; i_pix2
+    mov     eax, [esp+ 8]   ; pix1
+    mov     ecx, [esp+12]   ; pix2

     MMX_ZERO    mm7

     ; Load 4 lines
-    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [eax      ], [ecx]
-    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [eax+ebx  ], [ecx+edx]
-    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
-    add     eax, ebx
-    add     ecx, edx
-    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
+    MMX_LOAD_DIFF_4P    mm0, mm6, mm7, [eax+0*FENC_STRIDE], [ecx+0*FDEC_STRIDE]
+    MMX_LOAD_DIFF_4P    mm1, mm6, mm7, [eax+1*FENC_STRIDE], [ecx+1*FDEC_STRIDE]
+    MMX_LOAD_DIFF_4P    mm2, mm6, mm7, [eax+2*FENC_STRIDE], [ecx+2*FDEC_STRIDE]
+    MMX_LOAD_DIFF_4P    mm3, mm6, mm7, [eax+3*FENC_STRIDE], [ecx+3*FDEC_STRIDE]

     MMX_SUMSUB_BADC     mm3, mm0, mm2, mm1 ; mm3=s03  mm0=d03  mm2=s12  mm1=d12
@@ -246,32 +241,29 @@ x264_sub4x4_dct_mmx:
     MMX_SUMSUB_BA       mm1, mm3        ; mm1=s03+s12  mm3=s03-s12
     MMX_SUMSUB2_AB      mm2, mm4, mm0   ; mm2=2.d03+d12  mm0=d03-2.d12

-    mov     eax, [esp+ 8]   ; dct
+    mov     eax, [esp+ 4]   ; dct
     movq    [eax+ 0], mm1
     movq    [eax+ 8], mm2
     movq    [eax+16], mm3
     movq    [eax+24], mm0

-    pop     ebx
     ret

 cglobal x264_add4x4_idct_mmx

 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
+;   void __cdecl x264_add4x4_idct_mmx( uint8_t *p_dst, int16_t dct[4][4] )
 ;-----------------------------------------------------------------------------
 x264_add4x4_idct_mmx:
     ; Load dct coeffs
-    mov     eax, [esp+12]   ; dct
+    mov     eax, [esp+ 8]   ; dct
     movq    mm0, [eax+ 0]
     movq    mm1, [eax+ 8]
    movq    mm2, [eax+16]
     movq    mm3, [eax+24]

     mov     eax, [esp+ 4]   ; p_dst
-    mov     ecx, [esp+ 8]   ; i_dst
-    lea     edx, [ecx+ecx*2]

     picpush ebx
     picgetgot ebx
@@ -292,10 +284,10 @@ x264_add4x4_idct_mmx:
     MMX_ZERO            mm7
     movq                mm6, [x264_mmx_32 GOT_ebx]

-    MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [eax]
-    MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [eax+ecx]
-    MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [eax+ecx*2]
-    MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [eax+edx]
+    MMX_STORE_DIFF_4P   mm2, mm0, mm6, mm7, [eax+0*FDEC_STRIDE]
+    MMX_STORE_DIFF_4P   mm4, mm0, mm6, mm7, [eax+1*FDEC_STRIDE]
+    MMX_STORE_DIFF_4P   mm1, mm0, mm6, mm7, [eax+2*FDEC_STRIDE]
+    MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [eax+3*FDEC_STRIDE]

     picpop  ebx
     ret
@@ -336,32 +328,26 @@ cglobal x264_yidct8_mmx
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
+;   void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
 ;-----------------------------------------------------------------------------
 x264_pixel_sub_8x8_mmx:
-    push    ebx
-    push    ebp
-    mov     ebp, [esp+12]   ; diff
-    mov     eax, [esp+16]   ; pix1
-    mov     ebx, [esp+20]   ; i_pix1
-    mov     ecx, [esp+24]   ; pix2
-    mov     edx, [esp+28]   ; i_pix2
+    mov     edx, [esp+ 4]   ; diff
+    mov     eax, [esp+ 8]   ; pix1
+    mov     ecx, [esp+12]   ; pix2

     MMX_ZERO    mm7

 %assign disp 0
 %rep 8
     MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [eax], [ecx], mm7
-    movq    [ebp+disp], mm0
-    movq    [ebp+disp+8], mm1
-    add     eax, ebx
-    add     ecx, edx
+    movq    [edx+disp], mm0
+    movq    [edx+disp+8], mm1
+    add     eax, FENC_STRIDE
+    add     ecx, FDEC_STRIDE
 %assign disp disp+16
 %endrep

-    pop     ebp
-    pop     ebx
     ret

 ALIGN 16
@@ -545,12 +531,11 @@ x264_yidct8_mmx:
 ALIGN 16
 ;-----------------------------------------------------------------------------
-;   void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
+;   void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int16_t src[8][8] );
 ;-----------------------------------------------------------------------------
 x264_pixel_add_8x8_mmx:
-    mov     eax, [esp+04]   ; dst
-    mov     ecx, [esp+08]   ; i_dst
-    mov     edx, [esp+12]   ; src
+    mov     eax, [esp+4]    ; dst
+    mov     edx, [esp+8]    ; src

     MMX_ZERO    mm7
@@ -568,7 +553,7 @@ x264_pixel_add_8x8_mmx:
     paddw   mm1, mm3
     packuswb mm0, mm1
     movq    [eax], mm0
-    add     eax, ecx
+    add     eax, FDEC_STRIDE
 %assign disp disp+16
 %endrep

     ret
@@ -29,25 +29,24 @@
 #include <stdlib.h>
 #include <stdarg.h>

 #include "x264.h"
-#include "dct.h"
+#include "common/common.h"

-void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 {
-    x264_sub4x4_dct_mmx( dct[0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
-    x264_sub4x4_dct_mmx( dct[1], &pix1[4], i_pix1, &pix2[4], i_pix2 );
-    x264_sub4x4_dct_mmx( dct[2], &pix1[4*i_pix1+0], i_pix1, &pix2[4*i_pix2+0], i_pix2 );
-    x264_sub4x4_dct_mmx( dct[3], &pix1[4*i_pix1+4], i_pix1, &pix2[4*i_pix2+4], i_pix2 );
+    x264_sub4x4_dct_mmx( dct[0], &pix1[0], &pix2[0] );
+    x264_sub4x4_dct_mmx( dct[1], &pix1[4], &pix2[4] );
+    x264_sub4x4_dct_mmx( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
+    x264_sub4x4_dct_mmx( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 }

-void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 {
-    x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], i_pix1, &pix2[0], i_pix2 );
-    x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], i_pix1, &pix2[8], i_pix2 );