Commit 413d8fa9 authored by Loren Merritt

amd64 asm patch, part1.


git-svn-id: svn://svn.videolan.org/x264/trunk@212 df754926-b1dd-0310-bc7b-ec298dee348c
parent 7d35ba6b
;*****************************************************************************
;* cpu.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: cpu.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 32
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal x264_cpu_cpuid_test
cglobal x264_cpu_cpuid
cglobal x264_emms
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_cpu_cpuid_test( void ) ; returns 0 if CPUID is unsupported
;-----------------------------------------------------------------------------
x264_cpu_cpuid_test:
pushfd
push ebx
push ebp
push esi
push edi
pushfd
pop eax
mov ebx, eax
xor eax, 0x200000
push eax
popfd
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
popfd
ret
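; How the test works: bit 21 of EFLAGS is the ID flag, which is writable iff
; the CPU implements CPUID, so the xor of the flag values before and after the
; toggle is nonzero exactly when CPUID is available. A minimal C caller
; (a sketch, not part of this commit):
;
;   extern int x264_cpu_cpuid_test( void );
;   if( !x264_cpu_cpuid_test() )
;       return 0;   /* pre-486 CPU: no CPUID, hence no MMX/SSE */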
ALIGN 16
;-----------------------------------------------------------------------------
; int __cdecl x264_cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
x264_cpu_cpuid:
push ebp
mov ebp, esp
push ebx
push esi
push edi
mov eax, [ebp + 8]
cpuid
mov esi, [ebp + 12]
mov [esi], eax
mov esi, [ebp + 16]
mov [esi], ebx
mov esi, [ebp + 20]
mov [esi], ecx
mov esi, [ebp + 24]
mov [esi], edx
pop edi
pop esi
pop ebx
pop ebp
ret
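; Usage sketch in C (hypothetical harness; CPUID leaf 0 returns the vendor
; string in ebx/edx/ecx and the highest supported leaf in eax):
;
;   int eax, ebx, ecx, edx;
;   char vendor[13];
;   x264_cpu_cpuid( 0, &eax, &ebx, &ecx, &edx );
;   memcpy( vendor+0, &ebx, 4 );
;   memcpy( vendor+4, &edx, 4 );
;   memcpy( vendor+8, &ecx, 4 );
;   vendor[12] = '\0';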
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_emms( void )
;-----------------------------------------------------------------------------
x264_emms:
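; the mm registers alias the x87 register stack, so emms must be executed
; after any MMX code before the FPU is used again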
emms
ret
;*****************************************************************************
;* dct.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003 x264 project
;* $Id: dct.asm,v 1.1 2004/06/03 19:27:07 fenrir Exp $
;*
;* Authors: Min Chen <chenm001@163.com> (converted to nasm)
;* Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2004.04.28 ported all 4x4 functions to nasm (CM) *
;* *
;*****************************************************************************
BITS 32
;=============================================================================
; Macros and other preprocessor constants
;=============================================================================
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
%macro MMX_ZERO 1
pxor %1, %1
%endmacro
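; MMX_LOAD_DIFF_4P: %1 = 4 words of (%4 - %5), loading 4 bytes from each
; memory operand and zero-extending through %3 (must be 0); %2 is scratch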
%macro MMX_LOAD_DIFF_4P 5
movd %1, %4
punpcklbw %1, %3
movd %2, %5
punpcklbw %2, %3
psubw %1, %2
%endmacro
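; MMX_SUMSUB_BA: in %1=a, %2=b; out %1=a+b, %2=b-a (butterfly, no temp needed)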
%macro MMX_SUMSUB_BA 2
paddw %1, %2
paddw %2, %2
psubw %2, %1
%endmacro
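; MMX_SUMSUB_BADC: two butterflies at once: %1=a+b, %2=b-a, %3=c+d, %4=d-c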
%macro MMX_SUMSUB_BADC 4
paddw %1, %2
paddw %3, %4
paddw %2, %2
paddw %4, %4
psubw %2, %1
psubw %4, %3
%endmacro
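; MMX_SUMSUB2_AB: out %1 = 2a+b, %3 = a-2b (the +/-2 weights of the forward transform)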
%macro MMX_SUMSUB2_AB 3
movq %3, %1
paddw %1, %1
paddw %1, %2
psubw %3, %2
psubw %3, %2
%endmacro
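; MMX_SUMSUBD2_AB: out %1 = a + (b>>1), %4 = (a>>1) - b; %3 is scratch
; (the half-pel weights of the inverse transform)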
%macro MMX_SUMSUBD2_AB 4
movq %4, %1
movq %3, %2
psraw %2, $1
psraw %4, $1
paddw %1, %2
psubw %4, %3
%endmacro
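; SBUTTERFLYwd/SBUTTERFLYdq: interleave the word/dword lanes of %1 and %2;
; the low halves land in %1, the high halves in %3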
%macro SBUTTERFLYwd 3
movq %3, %1
punpcklwd %1, %2
punpckhwd %3, %2
%endmacro
%macro SBUTTERFLYdq 3
movq %3, %1
punpckldq %1, %2
punpckhdq %3, %2
%endmacro
;-----------------------------------------------------------------------------
; MMX_TRANSPOSE: transpose the 4x4 matrix of words whose rows are in
; %1,%2,%3,%4, using %5 as a temp; input rows ABCD come out as ADTC
; (i.e. the transposed rows end up in %1,%4,%5,%3)
;-----------------------------------------------------------------------------
%macro MMX_TRANSPOSE 5
SBUTTERFLYwd %1, %2, %5
SBUTTERFLYwd %3, %4, %2
SBUTTERFLYdq %1, %3, %4
SBUTTERFLYdq %5, %2, %3
%endmacro
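; MMX_STORE_DIFF_4P: %1 = clip255( ((%1 + %3) >> 6) + 4 bytes loaded from %5 ),
; stored back to %5; %3 is the rounding constant (32), %4 must be 0, %2 is scratch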
%macro MMX_STORE_DIFF_4P 5
paddw %1, %3
psraw %1, $6
movd %2, %5
punpcklbw %2, %4
paddsw %1, %2
packuswb %1, %1
movd %5, %1
%endmacro
;%macro
;%endmacro
;=============================================================================
; Local Data (Read Only)
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif
;-----------------------------------------------------------------------------
; Various memory constants (trigonometric values or rounding values)
;-----------------------------------------------------------------------------
ALIGN 16
x264_mmx_1:
dw 1, 1, 1, 1
x264_mmx_32:
dw 32, 32, 32, 32
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal x264_dct4x4dc_mmxext
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_dct4x4dc_mmxext( int16_t d[4][4] )
;-----------------------------------------------------------------------------
x264_dct4x4dc_mmxext:
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq mm6, [x264_mmx_1]
paddw mm0, mm6
paddw mm4, mm6
psraw mm0, 1
movq [eax+ 0], mm0
psraw mm4, 1
movq [eax+ 8], mm4
paddw mm1, mm6
paddw mm3, mm6
psraw mm1, 1
movq [eax+16], mm1
psraw mm3, 1
movq [eax+24], mm3
ret
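; Scalar reference for the routine above (a sketch of the same two butterfly
; passes plus the final (x+1)>>1 rounding; illustrative, not part of this commit):
;
;   for( i = 0; i < 4; i++ ) {
;       s01 = d[i][0] + d[i][1];   d01 = d[i][0] - d[i][1];
;       s23 = d[i][2] + d[i][3];   d23 = d[i][2] - d[i][3];
;       tmp[0][i] = s01 + s23;  tmp[1][i] = s01 - s23;
;       tmp[2][i] = d01 - d23;  tmp[3][i] = d01 + d23;
;   }
;   /* same butterflies on the rows of tmp, then store (x + 1) >> 1 */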
cglobal x264_idct4x4dc_mmxext
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_idct4x4dc_mmxext( int16_t d[4][4] )
;-----------------------------------------------------------------------------
x264_idct4x4dc_mmxext:
mov eax, [esp+ 4]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
MMX_SUMSUB_BADC mm1, mm0, mm3, mm2 ; mm1=s01 mm0=d01 mm3=s23 mm2=d23
MMX_SUMSUB_BADC mm3, mm1, mm2, mm0 ; mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23
MMX_TRANSPOSE mm3, mm1, mm0, mm2, mm4 ; in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0
MMX_SUMSUB_BADC mm2, mm3, mm0, mm4 ; mm2=s01 mm3=d01 mm0=s23 mm4=d23
MMX_SUMSUB_BADC mm0, mm2, mm4, mm3 ; mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23
MMX_TRANSPOSE mm0, mm2, mm3, mm4, mm1 ; in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3
movq [eax+ 0], mm0
movq [eax+ 8], mm4
movq [eax+16], mm1
movq [eax+24], mm3
ret
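; Scalar reference for the inverse DC transform above: identical butterflies,
; but stored without the rounding shift:
;
;   d[i][0] = s01 + s23;  d[i][1] = s01 - s23;
;   d[i][2] = d01 - d23;  d[i][3] = d01 + d23;   /* rows, then columns */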
cglobal x264_sub4x4_dct_mmxext
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
;-----------------------------------------------------------------------------
x264_sub4x4_dct_mmxext:
push ebx
mov eax, [esp+12] ; pix1
mov ebx, [esp+16] ; i_pix1
mov ecx, [esp+20] ; pix2
mov edx, [esp+24] ; i_pix2
MMX_ZERO mm7
; Load 4 lines
MMX_LOAD_DIFF_4P mm0, mm6, mm7, [eax ], [ecx]
MMX_LOAD_DIFF_4P mm1, mm6, mm7, [eax+ebx ], [ecx+edx]
MMX_LOAD_DIFF_4P mm2, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
add eax, ebx
add ecx, edx
MMX_LOAD_DIFF_4P mm3, mm6, mm7, [eax+ebx*2], [ecx+edx*2]
MMX_SUMSUB_BADC mm3, mm0, mm2, mm1 ; mm3=s03 mm0=d03 mm2=s12 mm1=d12
MMX_SUMSUB_BA mm2, mm3 ; mm2=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm0, mm1, mm4 ; mm0=2.d03+d12 mm4=d03-2.d12
; transpose in: mm2, mm0, mm3, mm4, out: mm2, mm4, mm1, mm3
MMX_TRANSPOSE mm2, mm0, mm3, mm4, mm1
MMX_SUMSUB_BADC mm3, mm2, mm1, mm4 ; mm3=s03 mm2=d03 mm1=s12 mm4=d12
MMX_SUMSUB_BA mm1, mm3 ; mm1=s03+s12 mm3=s03-s12
MMX_SUMSUB2_AB mm2, mm4, mm0 ; mm2=2.d03+d12 mm0=d03-2.d12
; transpose in: mm1, mm2, mm3, mm0, out: mm1, mm0, mm4, mm3
MMX_TRANSPOSE mm1, mm2, mm3, mm0, mm4
mov eax, [esp+ 8] ; dct
movq [eax+ 0], mm1
movq [eax+ 8], mm0
movq [eax+16], mm4
movq [eax+24], mm3
pop ebx
ret
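; Scalar sketch of the forward 4x4 transform computed above (rows, then
; columns; illustrative only):
;
;   /* per line, on the pixel differences d[] = pix1[] - pix2[] */
;   s03 = d0 + d3;  s12 = d1 + d2;  d03 = d0 - d3;  d12 = d1 - d2;
;   out0 = s03 + s12;      out1 = 2*d03 + d12;
;   out2 = s03 - s12;      out3 = d03 - 2*d12;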
cglobal x264_add4x4_idct_mmxext
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_add4x4_idct_mmxext( uint8_t *p_dst, int i_dst, int16_t dct[4][4] )
;-----------------------------------------------------------------------------
x264_add4x4_idct_mmxext:
; Load dct coeffs
mov eax, [esp+12] ; dct
movq mm0, [eax+ 0]
movq mm4, [eax+ 8]
movq mm3, [eax+16]
movq mm1, [eax+24]
mov eax, [esp+ 4] ; p_dst
mov ecx, [esp+ 8] ; i_dst
lea edx, [ecx+ecx*2]
; out:mm0, mm1, mm2, mm3
MMX_TRANSPOSE mm0, mm4, mm3, mm1, mm2
MMX_SUMSUB_BA mm2, mm0 ; mm2=s02 mm0=d02
MMX_SUMSUBD2_AB mm1, mm3, mm5, mm4 ; mm1=s13 mm4=d13 ( s13 = a + (b>>1), d13 = (a>>1) - b )
MMX_SUMSUB_BADC mm1, mm2, mm4, mm0 ; mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13
; in: mm1, mm4, mm0, mm2 out: mm1, mm2, mm3, mm0
MMX_TRANSPOSE mm1, mm4, mm0, mm2, mm3
MMX_SUMSUB_BA mm3, mm1 ; mm3=s02 mm1=d02
MMX_SUMSUBD2_AB mm2, mm0, mm5, mm4 ; mm2=s13 mm4=d13 ( s13 = a + (b>>1), d13 = (a>>1) - b )
MMX_SUMSUB_BADC mm2, mm3, mm4, mm1 ; mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13
MMX_ZERO mm7
movq mm6, [x264_mmx_32]
MMX_STORE_DIFF_4P mm2, mm0, mm6, mm7, [eax]
MMX_STORE_DIFF_4P mm4, mm0, mm6, mm7, [eax+ecx]
MMX_STORE_DIFF_4P mm1, mm0, mm6, mm7, [eax+ecx*2]
MMX_STORE_DIFF_4P mm3, mm0, mm6, mm7, [eax+edx]
ret
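; Scalar sketch of the inverse transform and store performed above
; (illustrative only):
;
;   /* per line */
;   s02 = m0 + m2;        d02 = m0 - m2;
;   s13 = m1 + (m3>>1);   d13 = (m1>>1) - m3;
;   out0 = s02 + s13;  out1 = d02 + d13;  out2 = d02 - d13;  out3 = s02 - s13;
;   /* rows, then columns, then dst[x] = clip255( dst[x] + ((out + 32) >> 6) ) */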
/*****************************************************************************
* dct.c: h264 encoder library
*****************************************************************************
* Copyright (C) 2003 Laurent Aimar
* $Id: dct-c.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include <stdlib.h>
#include <stdarg.h>
#include "x264.h"
#include "common/dct.h"
#include "dct.h"
#if 0
#define MMX_ZERO( MMZ ) \
asm volatile( "pxor " #MMZ ", " #MMZ "\n" :: )
/* MMP : diff, MMT: temp */
#define MMX_LOAD_DIFF_4P( MMP, MMT, MMZ, pix1, pix2 ) \
asm volatile( "movd (%0), " #MMP "\n" \
"punpcklbw " #MMZ ", " #MMP "\n" \
"movd (%1), " #MMT "\n" \
"punpcklbw " #MMZ ", " #MMT "\n" \
"psubw " #MMT ", " #MMP "\n" : : "r"(pix1), "r"(pix2) )
/* in: mma, mmb; out: mma = mma+mmb, mmb = mmb-mma */
#define MMX_SUMSUB_BA( MMA, MMB ) \
asm volatile( "paddw " #MMB ", " #MMA "\n"\
"paddw " #MMB ", " #MMB "\n"\
"psubw " #MMA ", " #MMB "\n" :: )
#define MMX_SUMSUB_BADC( MMA, MMB, MMC, MMD ) \
asm volatile( "paddw " #MMB ", " #MMA "\n"\
"paddw " #MMD ", " #MMC "\n"\
"paddw " #MMB ", " #MMB "\n"\
"paddw " #MMD ", " #MMD "\n"\
"psubw " #MMA ", " #MMB "\n"\
"psubw " #MMC ", " #MMD "\n" :: )
/* in: MMA, MMB; out: MMA = 2*MMA + MMB, MMT = MMA - 2*MMB */
#define MMX_SUMSUB2_AB( MMA, MMB, MMT ) \
asm volatile( "movq " #MMA ", " #MMT "\n" \
"paddw " #MMA ", " #MMA "\n" \
"paddw " #MMB ", " #MMA "\n" \
"psubw " #MMB ", " #MMT "\n" \
"psubw " #MMB ", " #MMT "\n" :: )
/* in: MMA, MMB; out: MMA = MMA + (MMB>>1), MMS = (MMA>>1) - MMB; MMT is scratch */
#define MMX_SUMSUBD2_AB( MMA, MMB, MMT, MMS ) \
asm volatile( "movq " #MMA ", " #MMS "\n" \
"movq " #MMB ", " #MMT "\n" \
"psraw $1 , " #MMB "\n" \
"psraw $1 , " #MMS "\n" \
"paddw " #MMB ", " #MMA "\n" \
"psubw " #MMT ", " #MMS "\n" :: )
#define SBUTTERFLYwd(a,b,t )\
asm volatile( "movq " #a ", " #t " \n\t" \
"punpcklwd " #b ", " #a " \n\t" \
"punpckhwd " #b ", " #t " \n\t" :: )
#define SBUTTERFLYdq(a,b,t )\
asm volatile( "movq " #a ", " #t " \n\t" \
"punpckldq " #b ", " #a " \n\t" \
"punpckhdq " #b ", " #t " \n\t" :: )
/* transpose: input rows ABCD, output rows in registers A,D,T,C */
#define MMX_TRANSPOSE( MMA, MMB, MMC, MMD, MMT ) \
SBUTTERFLYwd( MMA, MMB, MMT ); \
SBUTTERFLYwd( MMC, MMD, MMB ); \
SBUTTERFLYdq( MMA, MMC, MMD ); \
SBUTTERFLYdq( MMT, MMB, MMC )
#define MMX_STORE_DIFF_4P( MMP, MMT, MM32, MMZ, dst ) \
asm volatile( "paddw " #MM32 "," #MMP "\n" \
"psraw $6, " #MMP "\n" \
"movd (%0), " #MMT "\n" \
"punpcklbw " #MMZ ", " #MMT "\n" \
"paddsw " #MMT ", " #MMP "\n" \
"packuswb " #MMZ ", " #MMP "\n" \
"movd " #MMP ", (%0)\n" :: "r"(dst) )
#define UNUSED_LONGLONG( foo ) \
static const unsigned long long foo __asm__ (#foo) __attribute__((unused)) __attribute__((aligned(16)))
UNUSED_LONGLONG( x264_mmx_32 ) = 0x0020002000200020ULL;
UNUSED_LONGLONG( x264_mmx_1 ) = 0x0001000100010001ULL;
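/* packed-word constants: four 16-bit lanes of 32 and of 1, the rounding
 * terms used below; they mirror x264_mmx_32 / x264_mmx_1 in dct.asm */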
/*
 * XXX For all dct dc routines: the input may alias the output, so the
 * transform must work in place.
 */
void x264_dct4x4dc_mmxext( int16_t d[4][4] )
{
/* load DCT */
asm volatile(
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n" :: "r"(d) );
MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */
MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
/* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */
MMX_TRANSPOSE ( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */
MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
/* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */
MMX_TRANSPOSE ( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
asm volatile( "movq x264_mmx_1, %%mm6" :: );
/* Store back */
asm volatile(
"paddw %%mm6, %%mm0\n"
"paddw %%mm6, %%mm4\n"
"psraw $1, %%mm0\n"
"movq %%mm0, (%0)\n"
"psraw $1, %%mm4\n"
"movq %%mm4, 8(%0)\n"
"paddw %%mm6, %%mm1\n"
"paddw %%mm6, %%mm3\n"
"psraw $1, %%mm1\n"
"movq %%mm1, 16(%0)\n"
"psraw $1, %%mm3\n"
"movq %%mm3, 24(%0)\n" :: "r"(d) );
}
void x264_idct4x4dc_mmxext( int16_t d[4][4] )
{
/* load DCT */
asm volatile(
"movq (%0), %%mm0\n"
"movq 8(%0), %%mm1\n"
"movq 16(%0), %%mm2\n"
"movq 24(%0), %%mm3\n" :: "r"(d) );
MMX_SUMSUB_BADC( %%mm1, %%mm0, %%mm3, %%mm2 ); /* mm1=s01 mm0=d01 mm3=s23 mm2=d23 */
MMX_SUMSUB_BADC( %%mm3, %%mm1, %%mm2, %%mm0 ); /* mm3=s01+s23 mm1=s01-s23 mm2=d01+d23 mm0=d01-d23 */
/* in: mm3, mm1, mm0, mm2 out: mm3, mm2, mm4, mm0 */
MMX_TRANSPOSE( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 );
MMX_SUMSUB_BADC( %%mm2, %%mm3, %%mm0, %%mm4 ); /* mm2=s01 mm3=d01 mm0=s23 mm4=d23 */
MMX_SUMSUB_BADC( %%mm0, %%mm2, %%mm4, %%mm3 ); /* mm0=s01+s23 mm2=s01-s23 mm4=d01+d23 mm3=d01-d23 */
/* in: mm0, mm2, mm3, mm4 out: mm0, mm4, mm1, mm3 */
MMX_TRANSPOSE( %%mm0, %%mm2, %%mm3, %%mm4, %%mm1 );
/* Store back */
asm volatile(
"movq %%mm0, (%0)\n"
"movq %%mm4, 8(%0)\n"
"movq %%mm1, 16(%0)\n"
"movq %%mm3, 24(%0)\n" :: "r"(d) );
}
/****************************************************************************
* subXxX_dct:
****************************************************************************/
inline void x264_sub4x4_dct_mmxext( int16_t dct[4][4], uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
/* Reset mm7 */
MMX_ZERO( %%mm7 );
/* Load 4 lines */
MMX_LOAD_DIFF_4P( %%mm0, %%mm6, %%mm7, &pix1[0*i_pix1], &pix2[0*i_pix2] );
MMX_LOAD_DIFF_4P( %%mm1, %%mm6, %%mm7, &pix1[1*i_pix1], &pix2[1*i_pix2] );
MMX_LOAD_DIFF_4P( %%mm2, %%mm6, %%mm7, &pix1[2*i_pix1], &pix2[2*i_pix2] );