Commit 54e38917 authored by Holger Lubitz, committed by Fiona Glaser

Vastly faster SATD/SA8D/Hadamard_AC/SSD/DCT/IDCT

Heavily optimized for Core 2 and Nehalem, but performance should improve on all modern x86 CPUs.
16x16 SATD: +18% speed on K8(64bit), +22% on K10(32bit), +42% on Penryn(64bit), +44% on Nehalem(64bit), +50% on P4(32bit), +98% on Conroe(64bit)
Similar performance boosts in the SATD-like functions (SA8D, hadamard_ac), with somewhat smaller gains in DCT/IDCT/SSD.
Overall performance boost is up to ~15% on 64-bit Conroe.
parent 7501d950
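
For readers skimming the diff below: SATD (sum of absolute transformed differences) applies a 4x4 Hadamard transform to the block of pixel differences and sums the absolute values of the coefficients; the SSE2/SSSE3/SSE4 routines this commit speeds up are vectorized forms of that computation. The following C sketch of the 4x4 case is illustrative only (the function name is hypothetical, and it follows x264's reference convention of halving the raw sum; it is not any of the code in this commit):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative 4x4 SATD reference; not the actual x264 C fallback. */
static int satd_4x4_ref( const uint8_t *pix1, int stride1,
                         const uint8_t *pix2, int stride2 )
{
    int16_t d[4][4];
    int     t[4][4], sum = 0;

    /* 4x4 block of differences between source and prediction */
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            d[i][j] = pix1[i*stride1+j] - pix2[i*stride2+j];

    /* horizontal 4-point Hadamard transform of each row */
    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
        int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
        t[i][0] = s01 + s23;
        t[i][1] = s01 - s23;
        t[i][2] = d01 + d23;
        t[i][3] = d01 - d23;
    }

    /* vertical transform of each column, then sum of absolute values */
    for( int j = 0; j < 4; j++ )
    {
        int s01 = t[0][j] + t[1][j], d01 = t[0][j] - t[1][j];
        int s23 = t[2][j] + t[3][j], d23 = t[2][j] - t[3][j];
        sum += abs( s01 + s23 ) + abs( s01 - s23 )
             + abs( d01 + d23 ) + abs( d01 - d23 );
    }
    return sum >> 1; /* x264's SATD convention halves the raw Hadamard sum */
}

Larger SATD sizes (8x8, 16x16) and the satd_x3/x4 multi-candidate variants sum this 4x4 kernel over sub-blocks; SA8D is the analogous metric built on an 8x8 Hadamard transform.
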
@@ -446,6 +446,11 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
if( cpu&X264_CPU_SSSE3 )
{
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
}
......
@@ -426,7 +426,7 @@ SATD_X_DECL7()
SATD_X_DECL7( _mmxext )
SATD_X_DECL6( _sse2 )
SATD_X_DECL7( _ssse3 )
SATD_X_DECL6( _ssse3_phadd )
SATD_X_DECL7( _sse4 )
#endif
/****************************************************************************
@@ -667,11 +667,28 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
if( cpu&X264_CPU_SSE2 )
{
INIT5( ssd, _sse2slow );
INIT2_NAME( sad_aligned, sad, _sse2_aligned );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT6( satd, _sse2 );
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse2 );
@@ -679,9 +696,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT_ADS( _sse2 );
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( ssd, _sse2); /* faster for width 16 on p4 */
#ifdef ARCH_X86
INIT2( sad, _cache64_sse2 );
INIT2( sad_x3, _cache64_sse2 );
@@ -700,31 +717,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _sse2_misalign );
}
}
if( cpu&X264_CPU_SSE2 )
{
INIT5( ssd, _sse2 );
if( cpu&X264_CPU_SSE2_IS_FAST )
{
INIT6( satd, _sse2 );
INIT6( satd_x3, _sse2 );
INIT6( satd_x4, _sse2 );
}
else
{
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
}
INIT2_NAME( sad_aligned, sad, _sse2_aligned );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
}
if( cpu&X264_CPU_SSE2_IS_FAST && !(cpu&X264_CPU_CACHELINE_64) )
{
@@ -747,6 +739,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_SSSE3 )
{
INIT7( ssd, _ssse3 );
INIT7( satd, _ssse3 );
INIT7( satd_x3, _ssse3 );
INIT7( satd_x4, _ssse3 );
@@ -770,18 +763,23 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _cache64_ssse3 );
INIT2( sad_x4, _cache64_ssse3 );
}
if( cpu&X264_CPU_PHADD_IS_FAST )
if( !(cpu&X264_CPU_PHADD_IS_FAST) )
{
INIT6( satd, _ssse3_phadd );
INIT6( satd_x3, _ssse3_phadd );
INIT6( satd_x4, _ssse3_phadd );
INIT5( ssd, _sse2 ); /* on conroe, sse2 is faster for width8/16 */
}
}
if( cpu&X264_CPU_SSE4 )
{
pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sse4;
pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sse4;
INIT7( satd, _sse4 );
INIT7( satd_x3, _sse4 );
INIT7( satd_x4, _sse4 );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse4 );
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
}
#endif //HAVE_MMX
......
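
The x264_dct_init and x264_pixel_init hunks above follow x264's usual runtime-dispatch pattern: a table of function pointers is filled in order of increasing ISA level, so a later SSSE3 or SSE4 block simply overwrites the SSE2 entries when the CPU supports it, and flags such as X264_CPU_SSE2_IS_SLOW or X264_CPU_PHADD_IS_FAST steer which variant wins on a given microarchitecture. A minimal standalone sketch of that pattern, with hypothetical names that are not x264's actual API:

#include <stdint.h>

typedef int (*satd_fn)( const uint8_t *a, int sa, const uint8_t *b, int sb );
typedef struct { satd_fn satd_8x8; } my_pixel_funcs_t;

enum { MY_CPU_SSE2 = 1<<0, MY_CPU_SSSE3 = 1<<1, MY_CPU_SSE4 = 1<<2 };

/* Stand-ins for the real C/asm kernels. */
static int satd_8x8_c    ( const uint8_t *a, int sa, const uint8_t *b, int sb ) { (void)a;(void)sa;(void)b;(void)sb; return 0; }
static int satd_8x8_sse2 ( const uint8_t *a, int sa, const uint8_t *b, int sb ) { (void)a;(void)sa;(void)b;(void)sb; return 0; }
static int satd_8x8_ssse3( const uint8_t *a, int sa, const uint8_t *b, int sb ) { (void)a;(void)sa;(void)b;(void)sb; return 0; }
static int satd_8x8_sse4 ( const uint8_t *a, int sa, const uint8_t *b, int sb ) { (void)a;(void)sa;(void)b;(void)sb; return 0; }

void my_pixel_init( unsigned cpu, my_pixel_funcs_t *pf )
{
    pf->satd_8x8 = satd_8x8_c;                                /* portable fallback */
    if( cpu & MY_CPU_SSE2 )  pf->satd_8x8 = satd_8x8_sse2;
    if( cpu & MY_CPU_SSSE3 ) pf->satd_8x8 = satd_8x8_ssse3;   /* overrides SSE2 */
    if( cpu & MY_CPU_SSE4 )  pf->satd_8x8 = satd_8x8_sse4;    /* overrides SSSE3 */
}

The ordering of the if-blocks is what matters: the last matching assignment wins.
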
@@ -3,10 +3,11 @@
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;* Loren Merritt <lorenm@u.washington.edu> (misc)
;* Min Chen <chenm001.163.com> (converted to nasm)
;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Min Chen <chenm001.163.com>
;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -29,6 +30,7 @@
SECTION_RODATA
pw_32: times 8 dw 32
hsub_mul: times 8 db 1, -1
SECTION .text
@@ -340,26 +342,64 @@ global x264_add8x8_idct8_mmx.skip_prologue
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
INIT_XMM
%macro DCT_SUB8 1
cglobal x264_sub8x8_dct_%1, 3,3
add r2, 4*FDEC_STRIDE
global x264_sub8x8_dct_%1.skip_prologue
.skip_prologue:
%ifnidn %1, sse2
mova m7, [hsub_mul GLOBAL]
%endif
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 1,2
SWAP 2, 7
LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
UNSPILL r0, 1
SPILL r0, 7
SWAP 2, 7
UNSPILL r0, 2
DCT4_1D 0, 1, 2, 3, 7
TRANSPOSE2x4x4W 0, 1, 2, 3, 7
UNSPILL r0, 7
SPILL r0, 2
DCT4_1D 4, 5, 6, 7, 2
TRANSPOSE2x4x4W 4, 5, 6, 7, 2
UNSPILL r0, 2
SPILL r0, 6
DCT4_1D 0, 1, 2, 3, 6
UNSPILL r0, 6
STORE_DCT 0, 1, 2, 3, r0, 0
DCT4_1D 4, 5, 6, 7, 3
STORE_DCT 4, 5, 6, 7, r0, 64
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3
global x264_sub8x8_dct8_sse2.skip_prologue
cglobal x264_sub8x8_dct8_%1, 3,3
add r2, 4*FDEC_STRIDE
global x264_sub8x8_dct8_%1.skip_prologue
.skip_prologue:
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
%ifidn %1, sse2
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
SPILL r0, 0
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%else
mova m7, [hsub_mul GLOBAL]
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 0,1
SWAP 1, 7
LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
UNSPILL r0, 0,1
%endif
DCT8_1D 0,1,2,3,4,5,6,7,r0
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
@@ -367,11 +407,59 @@ global x264_sub8x8_dct8_sse2.skip_prologue
DCT8_1D 0,1,2,3,4,5,6,7,r0
SPILL r0, 1,2,3,5,7
ret
%endmacro
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8 sse2
%undef movdqa
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_sse2, 2,2
add r0, 4*FDEC_STRIDE
global x264_add8x8_idct_sse2.skip_prologue
.skip_prologue:
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
SBUTTERFLY qdq, 2, 3, 4
UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
SPILL r1, 0
SBUTTERFLY qdq, 4, 5, 0
SBUTTERFLY qdq, 6, 7, 0
UNSPILL r1,0
IDCT4_1D 0,1,2,3,r1
SPILL r1, 4
TRANSPOSE2x4x4W 0,1,2,3,4
UNSPILL r1, 4
IDCT4_1D 4,5,6,7,r1
SPILL r1, 0
TRANSPOSE2x4x4W 4,5,6,7,0
UNSPILL r1, 0
paddw m0, [pw_32 GLOBAL]
IDCT4_1D 0,1,2,3,r1
paddw m4, [pw_32 GLOBAL]
IDCT4_1D 4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
UNSPILL_SHUFFLE r1, 0,2, 6,7
DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2
add r0, 4*FDEC_STRIDE
global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
UNSPILL r1, 1,2,3,5,6,7
@@ -383,14 +471,10 @@ global x264_add8x8_idct8_sse2.skip_prologue
IDCT8_1D 0,1,2,3,4,5,6,7,r1
SPILL r1, 6,7
pxor m7, m7
STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*0]
STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*1]
STORE_DIFF m2, m6, m7, [r0+FDEC_STRIDE*2]
STORE_DIFF m3, m6, m7, [r0+FDEC_STRIDE*3]
STORE_DIFF m4, m6, m7, [r0+FDEC_STRIDE*4]
STORE_DIFF m5, m6, m7, [r0+FDEC_STRIDE*5]
UNSPILL_SHUFFLE r1, 0,1, 6,7
STORE_DIFF m0, m6, m7, [r0+FDEC_STRIDE*6]
STORE_DIFF m1, m6, m7, [r0+FDEC_STRIDE*7]
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
UNSPILL_SHUFFLE r1, 0,2, 6,7
DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
@@ -3,9 +3,10 @@
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;* Loren Merritt <lorenm@u.washington.edu> (dct8, misc)
;* Min Chen <chenm001.163.com> (converted to nasm)
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -27,18 +28,19 @@
SECTION_RODATA
pw_32: times 8 dw 32
hsub_mul: times 8 db 1, -1
SECTION .text
INIT_XMM
%macro DCT8_1D 10
SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
SUMSUB_BA m%5, m%4 ; %5=s34, %4=d34
SUMSUB_BA m%6, m%3 ; %6=s25, %3=d25
SUMSUB_BA m%7, m%2 ; %7=s16, %2=d16
SUMSUB_BA m%8, m%1 ; %8=s07, %1=d07
SUMSUB_BA m%5, m%8 ; %5=a0, %8=a2
SUMSUB_BA m%6, m%7 ; %6=a1, %7=a3
SUMSUB_BA m%6, m%7, m%10 ; %6=a1, %7=a3
SUMSUB_BA m%5, m%8, m%10 ; %5=a0, %8=a2
movdqa m%9, m%1
psraw m%9, 1
@@ -60,14 +62,14 @@ INIT_XMM
psubw m%1, m%3 ; %1=a5
psubw m%4, m%2 ; %4=a6
SUMSUB_BA m%6, m%5 ; %6=b0, %5=b4
movdqa m%2, m%10
psraw m%2, 2
paddw m%2, m%9 ; %2=b1
psraw m%9, 2
psubw m%9, m%10 ; %9=b7
SUMSUB_BA m%6, m%5, m%10 ; %6=b0, %5=b4
movdqa m%3, m%7
psraw m%3, 1
paddw m%3, m%8 ; %3=b2
@@ -83,41 +85,8 @@ INIT_XMM
SWAP %1, %6, %4, %7, %8, %9
%endmacro
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_sse2, 3,3,10
LOAD_DIFF m0, m8, m9, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m8, m9, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m8, m9, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m3, m8, m9, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
LOAD_DIFF m4, m8, m9, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
LOAD_DIFF m5, m8, m9, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
LOAD_DIFF m6, m8, m9, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
LOAD_DIFF m7, m8, m9, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
DCT8_1D 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
DCT8_1D 0,1,2,3,4,5,6,7,8,9
movdqa [r0+0x00], m0
movdqa [r0+0x10], m1
movdqa [r0+0x20], m2
movdqa [r0+0x30], m3
movdqa [r0+0x40], m4
movdqa [r0+0x50], m5
movdqa [r0+0x60], m6
movdqa [r0+0x70], m7
RET
%macro IDCT8_1D 10
SUMSUB_BA m%5, m%1 ; %5=a0, %1=a2
movdqa m%10, m%3
psraw m%3, 1
psubw m%3, m%7 ; %3=a4
psraw m%7, 1
paddw m%7, m%10 ; %7=a6
SUMSUB_BA m%5, m%1, m%9 ; %5=a0, %1=a2
movdqa m%9, m%2
psraw m%9, 1
@@ -125,6 +94,12 @@ cglobal x264_sub8x8_dct8_sse2, 3,3,10
paddw m%9, m%4
paddw m%9, m%6 ; %9=a7
movdqa m%10, m%3
psraw m%3, 1
psubw m%3, m%7 ; %3=a4
psraw m%7, 1
paddw m%7, m%10 ; %7=a6
movdqa m%10, m%6
psraw m%10, 1
paddw m%10, m%6
@@ -140,34 +115,108 @@ cglobal x264_sub8x8_dct8_sse2, 3,3,10
psubw m%2, m%4 ; %2=a3
psubw m%6, m%8 ; %6=a1
SUMSUB_BA m%7, m%5 ; %7=b0, %5=b6
SUMSUB_BA m%3, m%1 ; %3=b2, %1=b4
movdqa m%4, m%9
psraw m%4, 2
paddw m%4, m%6 ; %4=b1
psraw m%6, 2
psubw m%9, m%6 ; %9=b7
SUMSUB_BA m%7, m%5, m%6 ; %7=b0, %5=b6
SUMSUB_BA m%3, m%1, m%6; %3=b2, %1=b4
movdqa m%8, m%10
psraw m%8, 2
paddw m%8, m%2 ; %8=b3
psraw m%2, 2
psubw m%2, m%10 ; %2=b5
SUMSUB_BA m%9, m%7 ; %9=c0, %7=c7
SUMSUB_BA m%2, m%3 ; %2=c1, %3=c6
SUMSUB_BA m%8, m%1 ; %8=c2, %1=c5
SUMSUB_BA m%4, m%5 ; %4=c3, %5=c4
SUMSUB_BA m%9, m%7, m%6 ; %9=c0, %7=c7
SUMSUB_BA m%2, m%3, m%6 ; %2=c1, %3=c6
SUMSUB_BA m%8, m%1, m%6 ; %8=c2, %1=c5
SUMSUB_BA m%4, m%5, m%6 ; %4=c3, %5=c4
SWAP %1, %9, %6
SWAP %3, %8, %7
%endmacro
%macro DCT_SUB8 1
cglobal x264_sub8x8_dct_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul GLOBAL]
%endif
%ifdef WIN64
call .skip_prologue
RET
%endif
global x264_sub8x8_dct_%1.skip_prologue
.skip_prologue:
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
DCT4_1D 0, 1, 2, 3, 8
TRANSPOSE2x4x4W 0, 1, 2, 3, 8
DCT4_1D 4, 5, 6, 7, 8
TRANSPOSE2x4x4W 4, 5, 6, 7, 8
DCT4_1D 0, 1, 2, 3, 8
STORE_DCT 0, 1, 2, 3, r0, 0
DCT4_1D 4, 5, 6, 7, 8
STORE_DCT 4, 5, 6, 7, r0, 64
ret
;-----------------------------------------------------------------------------
; void x264_sub8x8_dct8_sse2( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal x264_sub8x8_dct8_%1, 3,3,11
add r2, 4*FDEC_STRIDE
%ifnidn %1, sse2
mova m7, [hsub_mul GLOBAL]
%endif
%ifdef WIN64
call .skip_prologue
RET
%endif
global x264_sub8x8_dct8_%1.skip_prologue
.skip_prologue:
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
DCT8_1D 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
DCT8_1D 0,1,2,3,4,5,6,7,8,9
movdqa [r0+0x00], m0
movdqa [r0+0x10], m1
movdqa [r0+0x20], m2
movdqa [r0+0x30], m3
movdqa [r0+0x40], m4
movdqa [r0+0x50], m5
movdqa [r0+0x60], m6
movdqa [r0+0x70], m7
ret
%endmacro
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSE2
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8 sse2
%undef movdqa
%undef punpcklqdq
%define LOAD_DIFF8x4 LOAD_DIFF8x4_SSSE3
DCT_SUB8 ssse3
;-----------------------------------------------------------------------------
; void x264_add8x8_idct8_sse2( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct8_sse2, 2,2,10
cglobal x264_add8x8_idct8_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global x264_add8x8_idct8_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20]
@@ -176,21 +225,53 @@ cglobal x264_add8x8_idct8_sse2, 2,2,10
movdqa m5, [r1+0x50]
movdqa m6, [r1+0x60]
movdqa m7, [r1+0x70]
IDCT8_1D 0,1,2,3,4,5,6,7,8,9
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32 GLOBAL] ; rounding for the >>6 at the end
IDCT8_1D 0,1,2,3,4,5,6,7,8,9
pxor m9, m9
STORE_DIFF m0, m8, m9, [r0+0*FDEC_STRIDE]
STORE_DIFF m1, m8, m9, [r0+1*FDEC_STRIDE]
STORE_DIFF m2, m8, m9, [r0+2*FDEC_STRIDE]
STORE_DIFF m3, m8, m9, [r0+3*FDEC_STRIDE]
STORE_DIFF m4, m8, m9, [r0+4*FDEC_STRIDE]
STORE_DIFF m5, m8, m9, [r0+5*FDEC_STRIDE]
STORE_DIFF m6, m8, m9, [r0+6*FDEC_STRIDE]
STORE_DIFF m7, m8, m9, [r0+7*FDEC_STRIDE]
RET
IDCT8_1D 0,1,2,3,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
;-----------------------------------------------------------------------------
; void x264_add8x8_idct_sse2( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
cglobal x264_add8x8_idct_sse2, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
%ifdef WIN64
call .skip_prologue
RET
%endif
global x264_add8x8_idct_sse2.skip_prologue
.skip_prologue:
SWAP 7, 9
mova m0, [r1+ 0]
mova m2, [r1+16]
mova m1, [r1+32]
mova m3, [r1+48]
SBUTTERFLY qdq, 0, 1, 4
SBUTTERFLY qdq, 2, 3, 4
mova m4, [r1+64]
mova m6, [r1+80]
mova m5, [r1+96]
mova m7, [r1+112]
SBUTTERFLY qdq, 4, 5, 8
SBUTTERFLY qdq, 6, 7, 8
IDCT4_1D 0,1,2,3,8,10
TRANSPOSE2x4x4W 0,1,2,3,8
IDCT4_1D 4,5,6,7,8,10
TRANSPOSE2x4x4W 4,5,6,7,8
paddw m0, [pw_32 GLOBAL]
IDCT4_1D 0,1,2,3,8,10
paddw m4, [pw_32 GLOBAL]
IDCT4_1D 4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
@@ -3,9 +3,9 @@
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,7 @@
SECTION_RODATA
pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000
hsub_mul: times 8 db 1, -1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
@@ -38,38 +39,39 @@ pb_1: times 8 db 1
SECTION .text
%macro HADAMARD4_1D 4
SUMSUB_BADC m%2, m%1, m%4, m%3
SUMSUB_BADC m%4, m%2, m%3, m%1
%macro WALSH4_1D 5
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
SWAP %1, %4, %3
%endmacro
%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000
movq m%3, m%4
paddw m%1, m%4
pxor m%1, m%4
psubw m%3, m%2
paddw m%2, m%4
pxor m%2, m%4
pavgw m%3, m%1
pavgw m%2, m%1
psubw m%3, m%4
psubw m%2, m%4
pxor m%3, m%4
pxor m%2, m%4
SWAP %1, %2, %3
%endmacro
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_dct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_dct4x4dc_mmx, 1,1
movq m0, [r0+ 0]
movq m1, [r0+ 8]
movq m2, [r0+16]
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
movq m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
HADAMARD4_1D 0,1,2,3
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
SUMSUB_BADC m1, m0, m3, m2
SWAP 0,1
SWAP 2,3
SUMSUB_BADC m1, m0, m3, m2, m4
SWAP 0, 1
SWAP 2, 3
SUMSUB_17BIT 0,2,4,7
SUMSUB_17BIT 1,3,5,7
movq [r0+0], m0
@@ -82,123 +84,78 @@ cglobal x264_dct4x4dc_mmx, 1,1
; void x264_idct4x4dc_mmx( int16_t d[4][4] )
;-----------------------------------------------------------------------------
cglobal x264_idct4x4dc_mmx, 1,1
movq m0, [r0+ 0]
movq m1, [r0+ 8]
movq m2, [r0+16]
movq m3, [r0+24]
HADAMARD4_1D 0,1,2,3
movq m3, [r0+24]
movq m2, [r0+16]
movq m1, [r0+ 8]
movq m0, [r0+ 0]
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4