Commit 4e7bd4a1 authored by Loren Merritt

slightly faster mmx dct



git-svn-id: svn://svn.videolan.org/x264/trunk@536 df754926-b1dd-0310-bc7b-ec298dee348c
parent 637470c0
Makefile
@@ -19,7 +19,7 @@ endif
# MMX/SSE optims
ifeq ($(ARCH),X86)
-SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict-c.c
+SRCS += common/i386/mc-c.c common/i386/predict-c.c
ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm \
common/i386/mc-a2.asm common/i386/predict-a.asm \
@@ -31,7 +31,7 @@ endif
# MMX/SSE optims
ifeq ($(ARCH),X86_64)
-SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict-c.c
+SRCS += common/i386/mc-c.c common/i386/predict-c.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
common/amd64/dct-a.asm
@@ -4,8 +4,9 @@
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*          Min Chen <chenm001.163.com> (converted to nasm)
+;*          Loren Merritt <lorenm@u.washington.edu> (dct8)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -464,3 +465,56 @@ x264_add8x8_idct8_sse2:
MMX_STORE_DIFF_8P xmm7, xmm14, xmm15, [parm1q+7*FDEC_STRIDE]
ret
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
; uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 6
ALIGN 16
cglobal %1
%1:
call %2
add parm1q, %3
add parm2q, %4-%5*FENC_STRIDE
add parm3q, %4-%5*FDEC_STRIDE
call %2
add parm1q, %3
add parm2q, %4*FENC_STRIDE-%6
add parm3q, %4*FDEC_STRIDE-%6
call %2
add parm1q, %3
add parm2q, %4-%5*FENC_STRIDE
add parm3q, %4-%5*FDEC_STRIDE
jmp %2
%endmacro
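
The SUB_NxN_DCT macro composes each NxN DCT from four calls to the half-size routine, adjusting the argument registers in place between calls; the final call is a `jmp`, so the callee's `ret` returns straight to the original caller and the composed function never sets up a frame of its own. Note that %3 is only the coefficient-pointer increment still needed after the callee's own advance (a composed sub8x8 leaves parm1q 96 bytes in through its three `add parm1q, %3`, so 32 more bytes reach the next 128-byte 8x8 block), and the %5/%6 terms likewise cancel the residue the callee leaves in the pixel pointers. A rough C analogue with illustrative names (the corrective bookkeeping disappears in C, where the callee cannot move the caller's locals):

#include <stdint.h>

#define FENC_STRIDE 16  /* x264's fixed per-macroblock buffer strides */
#define FDEC_STRIDE 32

typedef void (*sub_dct_fn)( int16_t *dct, uint8_t *pix1, uint8_t *pix2 );

/* One SUB_NxN_DCT instantiation: four half-size blocks in top-left,
 * top-right, bottom-left, bottom-right order.  `step` is the number of
 * coefficients per half block; `off` is the half-block pixel offset (%4). */
static void sub_NxN_dct( sub_dct_fn half, int16_t *dct,
                         uint8_t *pix1, uint8_t *pix2, int step, int off )
{
    half( dct + 0*step, pix1,                         pix2 );
    half( dct + 1*step, pix1 + off,                   pix2 + off );
    half( dct + 2*step, pix1 + off*FENC_STRIDE,       pix2 + off*FDEC_STRIDE );
    half( dct + 3*step, pix1 + off*FENC_STRIDE + off, pix2 + off*FDEC_STRIDE + off );
    /* the asm reaches this fourth call with a tail jmp instead of call+ret */
}
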
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 6
ALIGN 16
cglobal %1
%1:
call %2
add parm1q, %4-%5*FDEC_STRIDE
add parm2q, %3
call %2
add parm1q, %4*FDEC_STRIDE-%6
add parm2q, %3
call %2
add parm1q, %4-%5*FDEC_STRIDE
add parm2q, %3
jmp %2
%endmacro
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4, 0, 4
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4, 0, 4
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 32, 4, 4, 12
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 32, 4, 4, 12
SUB_NxN_DCT x264_sub16x16_dct8_sse2, x264_sub8x8_dct8_sse2, 128, 8, 0, 8
ADD_NxN_IDCT x264_add16x16_idct8_sse2, x264_add8x8_idct8_sse2, 128, 8, 0, 8
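
Reading the instantiation arguments: %3 is the extra byte advance of the coefficient pointer between calls, %4 the half-block pixel offset, and %5/%6 the pixel-pointer residue corrections (0 and %4 for leaf callees such as x264_sub4x4_dct_mmx; 4 and 12 when the callee is the composed x264_sub8x8_dct_mmx, which exits with its pixel pointers parked at its last quarter block). A small self-contained check (assumed semantics, with FENC_STRIDE = 16 as in x264) that replays the macro's pixel-pointer arithmetic for x264_sub16x16_dct_mmx and confirms the four calls land on the four 8x8 block origins:

#include <stdio.h>

int main( void )
{
    const int FENC_STRIDE = 16;
    const int INNER = 4*FENC_STRIDE + 4; /* residue left by x264_sub8x8_dct_mmx */
    int pix = 0;
    printf( "call 1 at %3d (expect   0)\n", pix );
    pix += INNER + 4 - 4*FENC_STRIDE;    /* add parm2q, %4-%5*FENC_STRIDE */
    printf( "call 2 at %3d (expect   8)\n", pix );
    pix += INNER + 4*FENC_STRIDE - 12;   /* add parm2q, %4*FENC_STRIDE-%6 */
    printf( "call 3 at %3d (expect 128)\n", pix );
    pix += INNER + 4 - 4*FENC_STRIDE;
    printf( "call 4 at %3d (expect 136)\n", pix );
    return 0;
}
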
common/i386/dct-a.asm
@@ -4,9 +4,10 @@
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
-;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
-;*          Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr> (initial version)
+;*          Min Chen <chenm001.163.com> (converted to nasm)
+;*          Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
+;*          Loren Merritt <lorenm@u.washington.edu> (misc)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -320,12 +321,6 @@ x264_add4x4_idct_mmx:
MMX_SUMSUB_BA %1, %2
%endmacro
-cglobal x264_pixel_sub_8x8_mmx
-cglobal x264_pixel_add_8x8_mmx
-cglobal x264_transpose_8x8_mmx
-cglobal x264_ydct8_mmx
-cglobal x264_yidct8_mmx
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
@@ -505,15 +500,15 @@ x264_yidct8_mmx:
movq mm2, [eax+disp+0*16] ; mm2 = d0
movq mm0, [eax+disp+4*16] ; mm0 = d4
MMX_SUMSUB_BA   mm0, mm2            ; mm0 = a0, mm2 = a2
-MMX_SUMSUB_BA   mm6, mm0            ; mm6 = f0, mm0 = f6
-MMX_SUMSUB_BA   mm4, mm2            ; mm4 = f2, mm2 = f4
+MMX_SUMSUB_BADC mm6, mm0, mm4, mm2  ; mm6 = f0, mm0 = f6
+                                    ; mm4 = f2, mm2 = f4
-MMX_SUMSUB_BA   mm7, mm6            ; mm7 = g0, mm6 = g7
-MMX_SUMSUB_BA   mm5, mm4            ; mm5 = g1, mm4 = g6
-MMX_SUMSUB_BA   mm3, mm2            ; mm3 = g2, mm2 = g5
-MMX_SUMSUB_BA   mm1, mm0            ; mm1 = g3, mm0 = g4
+MMX_SUMSUB_BADC mm7, mm6, mm5, mm4  ; mm7 = g0, mm6 = g7
+                                    ; mm5 = g1, mm4 = g6
+MMX_SUMSUB_BADC mm3, mm2, mm1, mm0  ; mm3 = g2, mm2 = g5
+                                    ; mm1 = g3, mm0 = g4
movq [eax+disp+0*16], mm7
movq [eax+disp+1*16], mm5
@@ -607,3 +602,96 @@ x264_transpose_8x8_mmx:
ret
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
ALIGN 16
cglobal x264_sub8x8_dct8_mmx
x264_sub8x8_dct8_mmx:
push dword [esp+12]
push dword [esp+12]
push dword [esp+12]
call x264_pixel_sub_8x8_mmx
call x264_ydct8_mmx
call x264_transpose_8x8_mmx
add esp, 12
jmp x264_ydct8_mmx
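
The triple `push dword [esp+12]` re-pushes the caller's three arguments: each push lowers esp by 4, so the same displacement picks up pix2, then pix1, then dct. After that, the 8x8 DCT is a pipeline of the primitives above, the classic row-column decomposition of a 2-D transform, with the last x264_ydct8_mmx reached by `jmp` on the caller's original argument. In C this is exactly the reference wrapper this commit deletes from dct-c.c, reproduced here as a sketch:

#include <stdint.h>

/* prototypes as declared for these asm routines in dct-c.c */
void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
void x264_transpose_8x8_mmx( int16_t src[8][8] );
void x264_ydct8_mmx( int16_t dct[8][8] );

void sub8x8_dct8_sketch( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
    x264_pixel_sub_8x8_mmx( (int16_t*)dct, pix1, pix2 ); /* spatial residual */
    x264_ydct8_mmx( dct );          /* 1-D 8-point DCT down each column */
    x264_transpose_8x8_mmx( dct );
    x264_ydct8_mmx( dct );          /* same routine now transforms the rows */
}
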
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
ALIGN 16
cglobal x264_add8x8_idct8_mmx
x264_add8x8_idct8_mmx:
mov eax, [esp+8]
add word [eax], 32
push eax
call x264_yidct8_mmx
call x264_transpose_8x8_mmx
call x264_yidct8_mmx
add esp, 4
jmp x264_pixel_add_8x8_mmx
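
`add word [eax], 32` folds the IDCT rounding constant into the DC coefficient before the transform: DC passes through both 1-D butterfly passes with unit gain, so the bias arrives as +32 on every residual sample just before the final `>>6` inside x264_pixel_add_8x8_mmx (the "including >>6 at the end" noted in dct-c.c below), converting truncation into round-to-nearest. A minimal illustration of that identity, not code from the patch:

#include <assert.h>

/* Residuals leave the 8x8 IDCT scaled by 64; pixel_add shifts them down
 * by 6.  Adding 32 beforehand rounds to nearest instead of toward -inf
 * (assuming the usual arithmetic right shift on signed values). */
static int descale_floor( int r64 ) { return r64 >> 6; }
static int descale_round( int r64 ) { return ( r64 + 32 ) >> 6; }

int main( void )
{
    assert( descale_floor( 100 ) ==  1 ); /* 100/64 = 1.56, truncated */
    assert( descale_round( 100 ) ==  2 ); /* rounded to nearest       */
    assert( descale_floor( -95 ) == -2 ); /* arithmetic shift floors  */
    assert( descale_round( -95 ) == -1 ); /* -1.48 rounds to -1       */
    return 0;
}
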
;-----------------------------------------------------------------------------
; void __cdecl x264_sub8x8_dct_mmx( int16_t dct[4][4][4],
; uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
%macro SUB_NxN_DCT 4
ALIGN 16
cglobal %1
%1:
mov edx, [esp+12]
mov ecx, [esp+ 8]
mov eax, [esp+ 4]
add edx, %4
add ecx, %4
add eax, %3
push edx
push ecx
push eax
call %2
add dword [esp+0], %3
add dword [esp+4], %4*FENC_STRIDE-%4
add dword [esp+8], %4*FDEC_STRIDE-%4
call %2
add dword [esp+0], %3
add dword [esp+4], %4
add dword [esp+8], %4
call %2
add esp, 12
jmp %2
%endmacro
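
On x86-32 the arguments live on the stack, so the macro pushes a private copy (pre-advanced to the second block), mutates that copy in place with `add dword [esp+...]` between calls, then discards it and tail-jumps with the caller's untouched arguments. That is why the top-left block is processed last, and why no %5/%6 corrections are needed here: stack copies, unlike registers, are not disturbed by the callee. A rough C analogue with illustrative names:

#include <stdint.h>

#define FENC_STRIDE 16  /* x264's fixed per-macroblock buffer strides */
#define FDEC_STRIDE 32

typedef void (*sub_dct_fn)( int16_t *dct, uint8_t *pix1, uint8_t *pix2 );

/* 32-bit SUB_NxN_DCT flow: calls 1-3 walk a mutable argument copy over
 * the top-right, bottom-left and bottom-right blocks; the final tail
 * call reuses the original arguments for the top-left block.  `step`
 * is the per-block coefficient count (%3 is that distance in bytes). */
static void sub_NxN_dct_x86( sub_dct_fn half, int16_t *dct,
                             uint8_t *pix1, uint8_t *pix2, int step, int off )
{
    int16_t *d  = dct + step;                 /* copies seeded one block in */
    uint8_t *p1 = pix1 + off, *p2 = pix2 + off;

    half( d, p1, p2 );                                    /* top-right    */
    d += step; p1 += off*(FENC_STRIDE-1); p2 += off*(FDEC_STRIDE-1);
    half( d, p1, p2 );                                    /* bottom-left  */
    d += step; p1 += off; p2 += off;
    half( d, p1, p2 );                                    /* bottom-right */
    half( dct, pix1, pix2 );       /* asm: add esp,12 / jmp %2, top-left  */
}
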
;-----------------------------------------------------------------------------
; void __cdecl x264_add8x8_idct_mmx( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
%macro ADD_NxN_IDCT 4
ALIGN 16
cglobal %1
%1:
mov ecx, [esp+8]
mov eax, [esp+4]
add ecx, %3
add eax, %4
push ecx
push eax
call %2
add dword [esp+0], %4*FDEC_STRIDE-%4
add dword [esp+4], %3
call %2
add dword [esp+0], %4
add dword [esp+4], %3
call %2
add esp, 8
jmp %2
%endmacro
SUB_NxN_DCT x264_sub8x8_dct_mmx, x264_sub4x4_dct_mmx, 32, 4
ADD_NxN_IDCT x264_add8x8_idct_mmx, x264_add4x4_idct_mmx, 32, 4
SUB_NxN_DCT x264_sub16x16_dct_mmx, x264_sub8x8_dct_mmx, 128, 8
ADD_NxN_IDCT x264_add16x16_idct_mmx, x264_add8x8_idct_mmx, 128, 8
SUB_NxN_DCT x264_sub16x16_dct8_mmx, x264_sub8x8_dct8_mmx, 128, 8
ADD_NxN_IDCT x264_add16x16_idct8_mmx, x264_add8x8_idct8_mmx, 128, 8
common/i386/dct-c.c (deleted)
/*****************************************************************************
* dct.c: h264 encoder library
*****************************************************************************
* Copyright (C) 2003 Laurent Aimar
* $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include <stdlib.h>
#include <stdarg.h>
#include "dct.h"
#include "common/common.h"
void x264_sub8x8_dct_mmx( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
{
x264_sub4x4_dct_mmx( dct[0], &pix1[0], &pix2[0] );
x264_sub4x4_dct_mmx( dct[1], &pix1[4], &pix2[4] );
x264_sub4x4_dct_mmx( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
x264_sub4x4_dct_mmx( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}
void x264_sub16x16_dct_mmx( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
{
x264_sub8x8_dct_mmx( &dct[ 0], &pix1[0], &pix2[0] );
x264_sub8x8_dct_mmx( &dct[ 4], &pix1[8], &pix2[8] );
x264_sub8x8_dct_mmx( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
x264_sub8x8_dct_mmx( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
/****************************************************************************
* addXxX_idct:
****************************************************************************/
void x264_add8x8_idct_mmx( uint8_t *p_dst, int16_t dct[4][4][4] )
{
x264_add4x4_idct_mmx( p_dst, dct[0] );
x264_add4x4_idct_mmx( &p_dst[4], dct[1] );
x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+0], dct[2] );
x264_add4x4_idct_mmx( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}
void x264_add16x16_idct_mmx( uint8_t *p_dst, int16_t dct[16][4][4] )
{
x264_add8x8_idct_mmx( &p_dst[0], &dct[0] );
x264_add8x8_idct_mmx( &p_dst[8], &dct[4] );
x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE], &dct[8] );
x264_add8x8_idct_mmx( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/***********************
* dct8/idct8 functions
***********************/
#ifdef ARCH_X86_64
void x264_sub16x16_dct8_sse2( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
x264_sub8x8_dct8_sse2( dct[0], pix1, pix2 );
x264_sub8x8_dct8_sse2( dct[1], pix1+8, pix2+8 );
x264_sub8x8_dct8_sse2( dct[2], pix1+8*FENC_STRIDE, pix2+8*FDEC_STRIDE );
x264_sub8x8_dct8_sse2( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
}
void x264_add16x16_idct8_sse2( uint8_t *p_dst, int16_t dct[4][8][8] )
{
x264_add8x8_idct8_sse2( p_dst, dct[0] );
x264_add8x8_idct8_sse2( p_dst+8, dct[1] );
x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE, dct[2] );
x264_add8x8_idct8_sse2( p_dst+8*FDEC_STRIDE+8, dct[3] );
}
#else // ARCH_X86
void x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, uint8_t *pix2 );
void x264_pixel_add_8x8_mmx( uint8_t *pix, uint16_t *diff );
void x264_transpose_8x8_mmx( int16_t src[8][8] );
void x264_ydct8_mmx( int16_t dct[8][8] );
void x264_yidct8_mmx( int16_t dct[8][8] );
inline void x264_sub8x8_dct8_mmx( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
x264_pixel_sub_8x8_mmx( (int16_t *)dct, pix1, pix2 );
x264_ydct8_mmx( dct );
x264_transpose_8x8_mmx( dct );
x264_ydct8_mmx( dct );
}
void x264_sub16x16_dct8_mmx( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
x264_sub8x8_dct8_mmx( dct[0], pix1, pix2 );
x264_sub8x8_dct8_mmx( dct[1], pix1+8, pix2+8 );
x264_sub8x8_dct8_mmx( dct[2], pix1+8*FENC_STRIDE, pix2+8*FDEC_STRIDE );
x264_sub8x8_dct8_mmx( dct[3], pix1+8*FENC_STRIDE+8, pix2+8*FDEC_STRIDE+8 );
}
inline void x264_add8x8_idct8_mmx( uint8_t *dst, int16_t dct[8][8] )
{
dct[0][0] += 32;
x264_yidct8_mmx( dct );
x264_transpose_8x8_mmx( dct );
x264_yidct8_mmx( dct );
x264_pixel_add_8x8_mmx( dst, (uint16_t *)dct ); // including >>6 at the end
}
void x264_add16x16_idct8_mmx( uint8_t *dst, int16_t dct[4][8][8] )
{
x264_add8x8_idct8_mmx( dst, dct[0] );
x264_add8x8_idct8_mmx( dst+8, dct[1] );
x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE, dct[2] );
x264_add8x8_idct8_mmx( dst+8*FDEC_STRIDE+8, dct[3] );
}
#endif