Commit c4ffed49 authored by Loren Merritt

MMX quantization functions, and optimization of the C versions.

about 3x faster quant_8x8, quant_4x4, quant_4x4_dc, and quant_2x2_dc. total speedup: 4-10%.
patch by Alexander Izvorski and Christian Heine.



git-svn-id: svn://svn.videolan.org/x264/trunk@293 df754926-b1dd-0310-bc7b-ec298dee348c
parent 16f423a0
......@@ -4,7 +4,8 @@ include config.mak
SRCS = common/mc.c common/predict.c common/pixel.c common/macroblock.c \
common/frame.c common/dct.c common/cpu.c common/cabac.c \
common/common.c common/mdate.c common/csp.c common/set.c\
common/common.c common/mdate.c common/csp.c common/set.c \
common/quant.c \
encoder/analyse.c encoder/me.c encoder/ratecontrol.c \
encoder/set.c encoder/macroblock.c encoder/cabac.c \
encoder/cavlc.c encoder/encoder.c encoder/eval.c
......@@ -20,7 +21,7 @@ SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC = common/i386/dct-a.asm common/i386/cpu-a.asm \
common/i386/pixel-a.asm common/i386/mc-a.asm \
common/i386/mc-a2.asm common/i386/predict-a.asm \
common/i386/pixel-sse2.asm
common/i386/pixel-sse2.asm common/i386/quant-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
endif
......@@ -30,7 +31,7 @@ SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
common/amd64/pixel-sse2.asm
common/amd64/pixel-sse2.asm common/amd64/quant-a.asm
OBJASM = $(ASMSRC:%.asm=%.o)
endif
......
;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 64

; cglobal <name>
;   Exports <name> as a global symbol. When PREFIX is defined the symbol is
;   exported with a leading underscore and <name> is redefined to the
;   underscored spelling, so later uses of <name> resolve to the same label.
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro
ALIGN 16
SECTION .text
; Entry points exported through cglobal. The "core16" routines multiply with
; 16-bit quant_mf values (QUANT16_1x4); the "core32" routines use the full
; 32-bit quant_mf values (QUANT32_1x4).
cglobal x264_quant_8x8_core16_mmx
cglobal x264_quant_4x4_core16_mmx
cglobal x264_quant_8x8_core32_mmx
cglobal x264_quant_4x4_core32_mmx
cglobal x264_quant_4x4_dc_core32_mmx
cglobal x264_quant_2x2_dc_core32_mmx
%macro QUANT_AC_START 0
;;; Argument setup for the AC quantizers.
;;; The pointers are used where they arrive: rdi = dct, rsi = quant_mf.
;;; Out: mm6 = i_qbits (shift count), mm7 = { f, f }
    movd        mm7, ecx            ; f in the low dword
    movd        mm6, edx            ; i_qbits
    punpckldq   mm7, mm7            ; replicate f into both dwords
%endmacro
%macro QUANT_DC_START 0
;;; Argument setup for the DC quantizers.
;;; In:  rdi = dct (used in place), esi = i_quant_mf, edx = i_qbits, ecx = f
;;; Out: mm5 = { mf, mf }, mm6 = i_qbits (shift count), mm7 = { f, f }
    movd        mm5, esi            ; i_quant_mf is a 32-bit int, so read esi
                                    ; (same width as the edx/ecx loads below)
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    punpckldq   mm5, mm5            ; replicate mf into both dwords
    punpckldq   mm7, mm7            ; replicate f into both dwords
%endmacro
%macro QUANT16_1x4 5
;;; Quantize four int16 coefficients in place with signed 16x16->32 multiplies:
;;;   coef > 0 :  coef =    ( f + coef*mf ) >> i_qbits
;;;   else     :  coef = - (( f - coef*mf ) >> i_qbits )
;;; Both variants are computed for every lane and then selected with a mask.
;;; %1 dct[y][x] (four words, read and written back)
;;; %2,%3 quant_mf[i_mf][y][x], entries must fit in int16
;;;       (two dwords each; narrowed to words with signed saturation)
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm5
movq mm0, %1 ; mm0 = 4 coefficients
movq mm1, %2 ; mm1 = mf for coefficients 0,1 (dwords)
movq mm2, %3 ; mm2 = mf for coefficients 2,3 (dwords)
packssdw mm1, mm2 ; mm1 = the 4 mf values as words
movq mm4, mm0
pxor mm5, mm5
pcmpgtw mm4, mm5 ; mm4 = per-word mask, all ones where coef > 0
movq mm2, mm0
pmullw mm0, mm1 ; low words of coef*mf
pmulhw mm2, mm1 ; high words of the signed products
movq mm1, mm0
punpcklwd mm0, mm2 ; mm0 = 32-bit products of coefficients 0,1
punpckhwd mm1, mm2 ; mm1 = 32-bit products of coefficients 2,3
movq mm2, %5
movq mm3, %5
psubd mm2, mm0 ; mm2 = f - product (coefficients 0,1)
psubd mm3, mm1 ; mm3 = f - product (coefficients 2,3)
paddd mm0, %5 ; mm0 = f + product (coefficients 0,1)
paddd mm1, %5 ; mm1 = f + product (coefficients 2,3)
psrad mm0, %4 ; arithmetic shift of all four sums/differences by i_qbits
psrad mm1, %4
psrad mm2, %4
psrad mm3, %4
packssdw mm0, mm1 ; mm0 = ( f + product ) >> i_qbits, as 4 words
packssdw mm2, mm3 ; mm2 = ( f - product ) >> i_qbits, as 4 words
pxor mm5, mm5
psubw mm5, mm2 ; mm5 = -(( f - product ) >> i_qbits)
pand mm0, mm4 ; keep the "coef > 0" result in lanes where the mask is set
pandn mm4, mm5 ; keep the negated result in the remaining lanes
por mm0, mm4 ; merge both selections
movq %1, mm0 ; store the quantized coefficients
%endmacro
%macro QUANT32_1x4 5
;;; Quantize four int16 coefficients in place with 32-bit multipliers:
;;;   coef = sign(coef) * ( ( f + abs(coef)*mf ) >> i_qbits )
;;; The low 32 bits of abs(coef)*mf are assembled from 16-bit partial products.
;;; NOTE: pmulhuw is not a base MMX instruction (it arrived with SSE / the
;;; extended MMX set), so code using this macro needs a CPU that supports it.
;;; %1 dct[y][x] (four words, read and written back)
;;; %2,%3 quant_mf[i_mf][y][x] (two dwords each; memory or an MMX register)
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm4
movq mm0, %1 ; mm0 = 4 coefficients
pxor mm4, mm4
pcmpgtw mm4, mm0 ; mm4 = sign(mm0): all ones in lanes where coef < 0
pxor mm0, mm4
psubw mm0, mm4 ; mm0 = abs(mm0)
movq mm1, mm0
punpcklwd mm0, mm0 ; duplicate the words for the upcoming
punpckhwd mm1, mm1 ; 32 bit multiplication
movq mm2, mm0 ; like in school ...
movq mm3, mm1
pmulhuw mm0, %2 ; ... multiply the parts ... (high words of the unsigned products)
pmulhuw mm1, %3
pmullw mm2, %2 ; low words of the same products
pmullw mm3, %3
pslld mm0, 16 ; ... shift ... (moves the carry of coef*mf_lo into the upper word)
pslld mm1, 16
paddd mm0, mm2 ; ... and add them
paddd mm1, mm3
paddd mm0, %5 ; round with f
paddd mm1, %5
psrad mm0, %4 ; shift right by i_qbits
psrad mm1, %4
packssdw mm0, mm1 ; pack & store
pxor mm0, mm4
psubw mm0, mm4 ; restore sign
movq %1, mm0
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8],
;                                 int i_qbits, int f )
; In:  rdi = dct, rsi = quant_mf, edx = i_qbits, ecx = f
; Quantizes the 64 coefficients in place, four per QUANT16_1x4 expansion.
; Each step consumes 8 bytes of dct and 16 bytes of quant_mf.
;-----------------------------------------------------------------------------
x264_quant_8x8_core16_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 16
    QUANT16_1x4 [rdi+quant_step*8], [rsi+quant_step*16], [rsi+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4],
;                                 int i_qbits, int f )
; In:  rdi = dct, rsi = quant_mf, edx = i_qbits, ecx = f
; Quantizes the 16 coefficients in place, four per QUANT16_1x4 expansion.
;-----------------------------------------------------------------------------
x264_quant_4x4_core16_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 4
    QUANT16_1x4 [rdi+quant_step*8], [rsi+quant_step*16], [rsi+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8],
;                                 int i_qbits, int f )
; In:  rdi = dct, rsi = quant_mf, edx = i_qbits, ecx = f
; Quantizes the 64 coefficients in place with the 32-bit multiplier path,
; four per QUANT32_1x4 expansion.
;-----------------------------------------------------------------------------
x264_quant_8x8_core32_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 16
    QUANT32_1x4 [rdi+quant_step*8], [rsi+quant_step*16], [rsi+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4],
;                                 int i_qbits, int f )
; In:  rdi = dct, rsi = quant_mf, edx = i_qbits, ecx = f
; Quantizes the 16 coefficients in place with the 32-bit multiplier path,
; four per QUANT32_1x4 expansion.
;-----------------------------------------------------------------------------
x264_quant_4x4_core32_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 4
    QUANT32_1x4 [rdi+quant_step*8], [rsi+quant_step*16], [rsi+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf,
;                                    int i_qbits, int f )
; In:  rdi = dct, esi = i_quant_mf, edx = i_qbits, ecx = f
; Quantizes the 16 coefficients in place; every coefficient uses the same
; multiplier, which QUANT_DC_START leaves replicated in mm5.
;-----------------------------------------------------------------------------
x264_quant_4x4_dc_core32_mmx:
    QUANT_DC_START
%assign quant_step 0
%rep 4
    QUANT32_1x4 [rdi+quant_step*8], mm5, mm5, mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf,
;                                    int i_qbits, int f )
; In:  rdi = dct, esi = i_quant_mf, edx = i_qbits, ecx = f
; The four coefficients fit in one quadword, so one QUANT32_1x4 handles them.
;-----------------------------------------------------------------------------
x264_quant_2x2_dc_core32_mmx:
    QUANT_DC_START
    QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
    ret
......@@ -46,6 +46,7 @@
#include "dct.h"
#include "cabac.h"
#include "csp.h"
#include "quant.h"
/****************************************************************************
* Macros
......@@ -486,6 +487,7 @@ struct x264_t
x264_mc_functions_t mc;
x264_dct_function_t dctf;
x264_csp_function_t csp;
x264_quant_function_t quantf;
/* vlc table for decoding purpose only */
x264_vlc_table_t *x264_coeff_token_lookup[5];
......
;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
BITS 32

; cglobal <name>
;   Exports <name> as a global symbol. When PREFIX is defined the symbol is
;   exported with a leading underscore and <name> is redefined to the
;   underscored spelling, so later uses of <name> resolve to the same label.
%macro cglobal 1
    %ifndef PREFIX
        global %1
    %else
        global _%1
        %define %1 _%1
    %endif
%endmacro
ALIGN 16
SECTION .text
; Entry points exported through cglobal. The "core16" routines multiply with
; 16-bit quant_mf values (QUANT16_1x4); the "core32" routines use the full
; 32-bit quant_mf values (QUANT32_1x4).
cglobal x264_quant_8x8_core16_mmx
cglobal x264_quant_4x4_core16_mmx
cglobal x264_quant_8x8_core32_mmx
cglobal x264_quant_4x4_core32_mmx
cglobal x264_quant_4x4_dc_core32_mmx
cglobal x264_quant_2x2_dc_core32_mmx
%macro QUANT_AC_START 0
;;; Argument setup for the AC quantizers; arguments are read from the stack.
;;; Out: eax = dct, ecx = quant_mf, mm6 = i_qbits, mm7 = { f, f }
    movd        mm7, [esp+16]       ; f in the low dword
    movd        mm6, [esp+12]       ; i_qbits (shift count)
    mov         ecx, [esp+ 8]       ; quant_mf
    mov         eax, [esp+ 4]       ; dct
    punpckldq   mm7, mm7            ; replicate f into both dwords
%endmacro
%macro QUANT_DC_START 0
;;; Argument setup for the DC quantizers; arguments are read from the stack.
;;; Out: eax = dct, mm5 = { mf, mf }, mm6 = i_qbits, mm7 = { f, f }
    movd        mm7, [esp+16]       ; f in the low dword
    movd        mm6, [esp+12]       ; i_qbits (shift count)
    movd        mm5, [esp+ 8]       ; i_quant_mf in the low dword
    mov         eax, [esp+ 4]       ; dct
    punpckldq   mm7, mm7            ; replicate f into both dwords
    punpckldq   mm5, mm5            ; replicate mf into both dwords
%endmacro
%macro QUANT16_1x4 5
;;; Quantize four int16 coefficients in place with signed 16x16->32 multiplies:
;;;   coef > 0 :  coef =    ( f + coef*mf ) >> i_qbits
;;;   else     :  coef = - (( f - coef*mf ) >> i_qbits )
;;; Both variants are computed for every lane and then selected with a mask.
;;; %1 dct[y][x] (four words, read and written back)
;;; %2,%3 quant_mf[i_mf][y][x], entries must fit in int16
;;;       (two dwords each; narrowed to words with signed saturation)
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm5
movq mm0, %1 ; mm0 = 4 coefficients
movq mm1, %2 ; mm1 = mf for coefficients 0,1 (dwords)
movq mm2, %3 ; mm2 = mf for coefficients 2,3 (dwords)
packssdw mm1, mm2 ; mm1 = the 4 mf values as words
movq mm4, mm0
pxor mm5, mm5
pcmpgtw mm4, mm5 ; mm4 = per-word mask, all ones where coef > 0
movq mm2, mm0
pmullw mm0, mm1 ; low words of coef*mf
pmulhw mm2, mm1 ; high words of the signed products
movq mm1, mm0
punpcklwd mm0, mm2 ; mm0 = 32-bit products of coefficients 0,1
punpckhwd mm1, mm2 ; mm1 = 32-bit products of coefficients 2,3
movq mm2, %5
movq mm3, %5
psubd mm2, mm0 ; mm2 = f - product (coefficients 0,1)
psubd mm3, mm1 ; mm3 = f - product (coefficients 2,3)
paddd mm0, %5 ; mm0 = f + product (coefficients 0,1)
paddd mm1, %5 ; mm1 = f + product (coefficients 2,3)
psrad mm0, %4 ; arithmetic shift of all four sums/differences by i_qbits
psrad mm1, %4
psrad mm2, %4
psrad mm3, %4
packssdw mm0, mm1 ; mm0 = ( f + product ) >> i_qbits, as 4 words
packssdw mm2, mm3 ; mm2 = ( f - product ) >> i_qbits, as 4 words
pxor mm5, mm5
psubw mm5, mm2 ; mm5 = -(( f - product ) >> i_qbits)
pand mm0, mm4 ; keep the "coef > 0" result in lanes where the mask is set
pandn mm4, mm5 ; keep the negated result in the remaining lanes
por mm0, mm4 ; merge both selections
movq %1, mm0 ; store the quantized coefficients
%endmacro
%macro QUANT32_1x4 5
;;; Quantize four int16 coefficients in place with 32-bit multipliers:
;;;   coef = sign(coef) * ( ( f + abs(coef)*mf ) >> i_qbits )
;;; The low 32 bits of abs(coef)*mf are assembled from 16-bit partial products.
;;; NOTE: pmulhuw is not a base MMX instruction (it arrived with SSE / the
;;; extended MMX set), so code using this macro needs a CPU that supports it.
;;; %1 dct[y][x] (four words, read and written back)
;;; %2,%3 quant_mf[i_mf][y][x] (two dwords each; memory or an MMX register)
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm4
movq mm0, %1 ; mm0 = 4 coefficients
pxor mm4, mm4
pcmpgtw mm4, mm0 ; mm4 = sign(mm0): all ones in lanes where coef < 0
pxor mm0, mm4
psubw mm0, mm4 ; mm0 = abs(mm0)
movq mm1, mm0
punpcklwd mm0, mm0 ; duplicate the words for the upcoming
punpckhwd mm1, mm1 ; 32 bit multiplication
movq mm2, mm0 ; like in school ...
movq mm3, mm1
pmulhuw mm0, %2 ; ... multiply the parts ... (high words of the unsigned products)
pmulhuw mm1, %3
pmullw mm2, %2 ; low words of the same products
pmullw mm3, %3
pslld mm0, 16 ; ... shift ... (moves the carry of coef*mf_lo into the upper word)
pslld mm1, 16
paddd mm0, mm2 ; ... and add them
paddd mm1, mm3
paddd mm0, %5 ; round with f
paddd mm1, %5
psrad mm0, %4 ; shift right by i_qbits
psrad mm1, %4
packssdw mm0, mm1 ; pack & store
pxor mm0, mm4
psubw mm0, mm4 ; restore sign
movq %1, mm0
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8],
;                                 int i_qbits, int f )
; Arguments are taken from the stack by QUANT_AC_START (eax = dct,
; ecx = quant_mf). Quantizes the 64 coefficients in place, four per
; QUANT16_1x4 expansion: 8 bytes of dct and 16 bytes of quant_mf per step.
;-----------------------------------------------------------------------------
x264_quant_8x8_core16_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 16
    QUANT16_1x4 [eax+quant_step*8], [ecx+quant_step*16], [ecx+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4],
;                                 int i_qbits, int f )
; Arguments are taken from the stack by QUANT_AC_START (eax = dct,
; ecx = quant_mf). Quantizes the 16 coefficients in place, four per
; QUANT16_1x4 expansion.
;-----------------------------------------------------------------------------
x264_quant_4x4_core16_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 4
    QUANT16_1x4 [eax+quant_step*8], [ecx+quant_step*16], [ecx+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8],
;                                 int i_qbits, int f )
; Arguments are taken from the stack by QUANT_AC_START (eax = dct,
; ecx = quant_mf). Quantizes the 64 coefficients in place with the 32-bit
; multiplier path, four per QUANT32_1x4 expansion.
;-----------------------------------------------------------------------------
x264_quant_8x8_core32_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 16
    QUANT32_1x4 [eax+quant_step*8], [ecx+quant_step*16], [ecx+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4],
;                                 int i_qbits, int f )
; Arguments are taken from the stack by QUANT_AC_START (eax = dct,
; ecx = quant_mf). Quantizes the 16 coefficients in place with the 32-bit
; multiplier path, four per QUANT32_1x4 expansion.
;-----------------------------------------------------------------------------
x264_quant_4x4_core32_mmx:
    QUANT_AC_START
%assign quant_step 0
%rep 4
    QUANT32_1x4 [eax+quant_step*8], [ecx+quant_step*16], [ecx+quant_step*16+8], mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf,
;                                    int i_qbits, int f )
; Arguments are taken from the stack by QUANT_DC_START (eax = dct).
; Quantizes the 16 coefficients in place; every coefficient uses the same
; multiplier, which QUANT_DC_START leaves replicated in mm5.
;-----------------------------------------------------------------------------
x264_quant_4x4_dc_core32_mmx:
    QUANT_DC_START
%assign quant_step 0
%rep 4
    QUANT32_1x4 [eax+quant_step*8], mm5, mm5, mm6, mm7
%assign quant_step quant_step+1
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf,
;                                    int i_qbits, int f )
; Arguments are taken from the stack by QUANT_DC_START (eax = dct).
; The four coefficients fit in one quadword, so one QUANT32_1x4 handles them.
;-----------------------------------------------------------------------------
x264_quant_2x2_dc_core32_mmx:
    QUANT_DC_START
    QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
    ret
/*****************************************************************************
* quant.c: h264 encoder library
*****************************************************************************
* Copyright (C) 2005 x264 project
*
* Authors: Christian Heine <sennindemokrit@gmx.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#include "common.h"
/* Assembly implementations (quant-a.asm). x264_quant_init() installs them in
 * place of the C versions when the CPU flags allow it. The "core16" variants
 * are only selected while every quant_mf entry is below 0x8000; otherwise the
 * "core32" variants are used. */
void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
/* Quantize one coefficient in place. Positive values are scaled by mf,
 * rounded with f and shifted right by i_qbits; all other values go through
 * the same computation on their magnitude and are negated afterwards.
 * Relies on `f` and `i_qbits` being in scope at the point of use. */
#define QUANT_ONE( coef, mf ) \
{ \
    (coef) = ( (coef) > 0 ) \
           ? ( ( f + (coef) * (mf) ) >> i_qbits ) \
           : - ( ( f - (coef) * (mf) ) >> i_qbits ); \
}
/* C reference quantizer for an 8x8 block: applies QUANT_ONE to each of the
 * 64 coefficients in place, pairing every coefficient with the quant_mf
 * entry at the same position. */
static void quant_8x8_core( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
{
    int16_t *p_coef = dct[0];
    const int *p_mf = quant_mf[0];
    int i_left;

    for( i_left = 64; i_left > 0; i_left-- )
    {
        QUANT_ONE( *p_coef, *p_mf );
        p_coef++;
        p_mf++;
    }
}
/* C reference quantizer for a 4x4 block: applies QUANT_ONE to each of the
 * 16 coefficients in place, pairing every coefficient with the quant_mf
 * entry at the same position. */
static void quant_4x4_core( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
{
    int16_t *p_coef = dct[0];
    const int *p_mf = quant_mf[0];
    int i_left;

    for( i_left = 16; i_left > 0; i_left-- )
    {
        QUANT_ONE( *p_coef, *p_mf );
        p_coef++;
        p_mf++;
    }
}
/* C reference quantizer for a 4x4 DC block: all 16 coefficients are
 * quantized in place with the single multiplier i_quant_mf. */
static void quant_4x4_dc_core( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f )
{
    int16_t *p_coef = dct[0];
    int16_t * const p_end = p_coef + 16;

    while( p_coef < p_end )
    {
        QUANT_ONE( *p_coef, i_quant_mf );
        p_coef++;
    }
}
/* C reference quantizer for a 2x2 DC block: the four coefficients are
 * quantized in place with the single multiplier i_quant_mf. */
static void quant_2x2_dc_core( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f )
{
    int16_t *p_coef = dct[0];
    int i;

    for( i = 0; i < 4; i++ )
        QUANT_ONE( p_coef[i], i_quant_mf );
}
/* Fill *pf with the quantization routines to use for this encoder.
 *
 * h:   encoder handle; its quant8_mf / quant4_mf tables are scanned to decide
 *      whether the 16-bit multiplier routines are usable.
 * cpu: X264_CPU_* capability flags.
 * pf:  function table to initialise.
 *
 * The C routines are always installed first. When assembly is available:
 *  - the "core16" routines use only base MMX instructions and are chosen for
 *    the AC quantizers while every multiplier is below 0x8000;
 *  - the "core32" routines use pmulhuw, which is not part of base MMX, so they
 *    are only installed when the CPU also reports X264_CPU_MMXEXT. On an
 *    MMX-only CPU the C versions stay in place for those entries.
 * The selection is reported through x264_log at debug level. */
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
    const char *name[4] = { "C", "C", "C", "C" };

    pf->quant_8x8_core = quant_8x8_core;
    pf->quant_4x4_core = quant_4x4_core;
    pf->quant_4x4_dc_core = quant_4x4_dc_core;
    pf->quant_2x2_dc_core = quant_2x2_dc_core;

#ifdef HAVE_MMXEXT
    if( cpu&X264_CPU_MMX )
    {
        int i;
        int b_wide8 = 0; /* some quant8_mf entry does not fit in int16 */
        int b_wide4 = 0; /* some quant4_mf entry does not fit in int16 */

        /* The tables are scanned as flat arrays; stop at the first hit. */
        for( i = 0; i < 2*6*8*8 && !b_wide8; i++ )
            b_wide8 = (***h->quant8_mf)[i] >= 0x8000;
        for( i = 0; i < 4*6*4*4 && !b_wide4; i++ )
            b_wide4 = (***h->quant4_mf)[i] >= 0x8000;

        if( !b_wide8 )
        {
            pf->quant_8x8_core = x264_quant_8x8_core16_mmx;
            name[0] = "16MMX";
        }
        if( !b_wide4 )
        {
            pf->quant_4x4_core = x264_quant_4x4_core16_mmx;
            name[1] = "16MMX";
        }

        /* Everything below executes pmulhuw and therefore needs MMXEXT. */
        if( cpu&X264_CPU_MMXEXT )
        {
            if( b_wide8 )
            {
                pf->quant_8x8_core = x264_quant_8x8_core32_mmx;
                name[0] = "32MMX";
            }
            if( b_wide4 )
            {
                pf->quant_4x4_core = x264_quant_4x4_core32_mmx;
                name[1] = "32MMX";
            }
            pf->quant_4x4_dc_core = x264_quant_4x4_dc_core32_mmx;
            pf->quant_2x2_dc_core = x264_quant_2x2_dc_core32_mmx;
            name[2] = name[3] = "32MMX";
        }
    }
#endif

    x264_log( h, X264_LOG_DEBUG, "using quant functions 8x8=%s 4x4=%s dc4x4=%s dc2x2=%s\n",
              name[0], name[1], name[2], name[3] );
}
/*****************************************************************************
* quant.h: h264 encoder library
*****************************************************************************
* Copyright (C) 2005 x264 project
*
* Authors: Christian Heine <sennindemokrit@gmx.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifndef _QUANT_H
#define _QUANT_H 1
/* Table of quantization entry points. Every routine quantizes dct in place
 * using the shift i_qbits and the rounding offset f. */
typedef struct
{
    /* AC quantizers: one multiplier per coefficient, taken from quant_mf. */
    void (*quant_8x8_core)( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f );
    void (*quant_4x4_core)( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
    /* DC quantizers: the single multiplier i_quant_mf is applied to every coefficient. */
    void (*quant_4x4_dc_core)( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f );
    void (*quant_2x2_dc_core)( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f );
} x264_quant_function_t;
/* Fills *pf with the quantization routines to use, based on the cpu flags
 * and on the quantization tables stored in h. */
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
#endif
......@@ -584,6 +584,7 @@ x264_t *x264_encoder_open ( x264_param_t *param )