Commit cfebeac1 authored by Loren Merritt

faster mmx quant 15bit, and add 16bit version. total speedup: ~0.3%

patch by Christian Heine.


git-svn-id: svn://svn.videolan.org/x264/trunk@298 df754926-b1dd-0310-bc7b-ec298dee348c
parent 49ac5e2f
......@@ -21,6 +21,16 @@
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
;* *
;*****************************************************************************
BITS 64
%macro cglobal 1
......@@ -36,184 +46,338 @@ ALIGN 16
SECTION .text
cglobal x264_quant_8x8_core16_mmx
cglobal x264_quant_4x4_core16_mmx
cglobal x264_quant_8x8_core32_mmx
cglobal x264_quant_4x4_core32_mmx
cglobal x264_quant_4x4_dc_core32_mmx
cglobal x264_quant_2x2_dc_core32_mmx
%macro QUANT_AC_START 0
;;; Argument setup for the AC quant entry points.
;;; rdi (dct) and rsi (quant_mf) are used exactly as they were passed in.
;;; out: mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro QUANT_DC_START 0
;;; Argument setup for the DC quant entry points.
;;; rdi (dct) is used exactly as it was passed in.
;;; out: mm5 = i_quant_mf in both doublewords
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    movd        mm5, esi            ; i_quant_mf is an int: take the 32-bit register
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    punpckldq   mm5, mm5            ; i_quant_mf in each dword
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
cglobal x264_quant_2x2_dc_core15_mmx
cglobal x264_quant_4x4_dc_core15_mmx
cglobal x264_quant_4x4_core15_mmx
cglobal x264_quant_8x8_core15_mmx
cglobal x264_quant_2x2_dc_core16_mmxext
cglobal x264_quant_4x4_dc_core16_mmxext
cglobal x264_quant_4x4_core16_mmxext
cglobal x264_quant_8x8_core16_mmxext
cglobal x264_quant_2x2_dc_core32_mmxext
cglobal x264_quant_4x4_dc_core32_mmxext
cglobal x264_quant_4x4_core32_mmxext
cglobal x264_quant_8x8_core32_mmxext
%macro MMX_QUANT_AC_START 0
;;; Argument setup for the AC quant entry points.
;;; rdi (&dct[0][0]) and rsi (&quant_mf[0][0]) are used exactly as passed in.
;;; out: mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro QUANT16_1x4 5
;;; Quantize four int16 coefficients in place.
;;; %1 dct[y][x]
;;; %2,%3 quant_mf[i_mf][y][x], entries must fit in int16
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm5
    movq        mm0, %1             ; four coefficients
    movq        mm1, %2
    movq        mm2, %3
    packssdw    mm1, mm2            ; four multipliers narrowed to words

    movq        mm4, mm0
    pxor        mm5, mm5
    pcmpgtw     mm4, mm5            ; mask: all-ones where coeff > 0

    movq        mm2, mm0
    pmullw      mm0, mm1            ; low halves of coeff * mf
    pmulhw      mm2, mm1            ; high halves (signed)
    movq        mm1, mm0
    punpcklwd   mm0, mm2            ; 32-bit products 0-1
    punpckhwd   mm1, mm2            ; 32-bit products 2-3

    movq        mm2, %5
    movq        mm3, %5
    psubd       mm2, mm0            ; f - product, used for coeff <= 0
    psubd       mm3, mm1
    paddd       mm0, %5             ; f + product, used for coeff > 0
    paddd       mm1, %5

    psrad       mm0, %4
    psrad       mm1, %4
    psrad       mm2, %4
    psrad       mm3, %4

    packssdw    mm0, mm1            ; ( f + product) >> i_qbits
    packssdw    mm2, mm3            ; ( f - product) >> i_qbits
    pxor        mm5, mm5
    psubw       mm5, mm2            ; -((f - product) >> i_qbits)

    pand        mm0, mm4            ; keep the first form where coeff > 0
    pandn       mm4, mm5            ; and the negated form elsewhere
    por         mm0, mm4
    movq        %1, mm0             ; store
%endmacro
%macro MMX_QUANT15_DC_START 0
;;; Argument setup for the 15-bit DC quant entry points.
;;; rdi (&dct[0][0]) is used exactly as it was passed in.
;;; out: mm5 = i_qmf in each word
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    movd        mm5, esi            ; i_qmf is an int: take the 32-bit register
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    punpcklwd   mm5, mm5
    punpcklwd   mm5, mm5            ; i_qmf in each word
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro QUANT32_1x4 5
;;; %1 dct[y][x]
;;; %2,%3 quant_mf[i_mf][y][x]
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm4
movq mm0, %1
%macro MMX_QUANT15_1x4 4
;;; Quantize four int16 coefficients in place.
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
    movq        mm0, %1             ; load dct coeffs
    pxor        mm4, mm4
    pcmpgtw     mm4, mm0            ; sign(coeff): all-ones where coeff < 0
    pxor        mm0, mm4
    psubw       mm0, mm4            ; abs(coeff)

    movq        mm2, mm0
    pmullw      mm0, %2             ; low halves of abs(coeff) * mf
    pmulhw      mm2, %2             ; high halves (signed multiply)

    movq        mm1, mm0
    punpcklwd   mm0, mm2            ; 32-bit products 0-1
    punpckhwd   mm1, mm2            ; 32-bit products 2-3

    paddd       mm0, %4             ; round with f
    paddd       mm1, %4
    psrad       mm0, %3
    psrad       mm1, %3

    packssdw    mm0, mm1            ; pack
    pxor        mm0, mm4            ; restore sign
    psubw       mm0, mm4
    movq        %1, mm0             ; store
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;
; Quantizes the four coefficients of dct in place with the single multiplier
; i_qmf. All four fit in one 64-bit load, so one macro step covers the block.
;-----------------------------------------------------------------------------
x264_quant_2x2_dc_core15_mmx:
    MMX_QUANT15_DC_START
    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;
; Quantizes dct in place, four coefficients (8 bytes) per step, every
; coefficient with the same multiplier i_qmf.
;-----------------------------------------------------------------------------
x264_quant_4x4_dc_core15_mmx:
    MMX_QUANT15_DC_START
%rep 4
    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
    add         rdi, byte 8         ; next four coefficients
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;
; Quantizes dct in place, four coefficients per step. The four matching
; 32-bit quant_mf entries are narrowed to words with signed saturation first.
;-----------------------------------------------------------------------------
x264_quant_4x4_core15_mmx:
    MMX_QUANT_AC_START
%rep 4
    movq        mm5, [rsi]          ; quant_mf entries 0-1 (dwords)
    packssdw    mm5, [rsi+8]        ; entries 0-3 as words
    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
    add         rsi, byte 16        ; four int multipliers consumed
    add         rdi, byte 8         ; four int16 coefficients consumed
%endrep
    ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;
; Quantizes dct in place in sixteen steps of four coefficients. The four
; matching 32-bit quant_mf entries are narrowed to words with signed
; saturation before each step.
;-----------------------------------------------------------------------------
x264_quant_8x8_core15_mmx:
    MMX_QUANT_AC_START
%rep 16
    movq        mm5, [rsi]          ; quant_mf entries 0-1 (dwords)
    packssdw    mm5, [rsi+8]        ; entries 0-3 as words
    MMX_QUANT15_1x4 [rdi], mm5, mm6, mm7
    add         rsi, byte 16        ; four int multipliers consumed
    add         rdi, byte 8         ; four int16 coefficients consumed
%endrep
    ret
; ============================================================================
%macro MMXEXT_QUANT16_DC_START 0
;;; Argument setup for the 16-bit DC quant entry points.
;;; rdi (&dct[0][0]) is used exactly as it was passed in.
;;; out: mm5 = i_qmf in each word
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    movd        mm5, esi            ; i_qmf is an int: take the 32-bit register
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    pshufw      mm5, mm5, 0         ; i_qmf in each word
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro MMXEXT_QUANT16_1x4 4
;;; Quantize four int16 coefficients in place with unsigned 16-bit multipliers.
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
    movq        mm0, %1             ; load dct coeffs
    pxor        mm4, mm4
    pcmpgtw     mm4, mm0            ; sign(coeff): all-ones where coeff < 0
    pxor        mm0, mm4
    psubw       mm0, mm4            ; abs(coeff)

    movq        mm2, mm0
    pmullw      mm0, %2             ; low halves of abs(coeff) * mf
    pmulhuw     mm2, %2             ; high halves (unsigned multiply)

    movq        mm1, mm0
    punpcklwd   mm0, mm2            ; 32-bit products 0-1
    punpckhwd   mm1, mm2            ; 32-bit products 2-3

    paddd       mm0, %4             ; round with f
    paddd       mm1, %4
    psrad       mm0, %3
    psrad       mm1, %3

    packssdw    mm0, mm1            ; pack
    pxor        mm0, mm4            ; restore sign
    psubw       mm0, mm4
    movq        %1, mm0             ; store
%endmacro
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;
; Quantizes the four coefficients of dct in place with the single multiplier
; i_qmf; one macro step covers the whole block.
;-----------------------------------------------------------------------------
x264_quant_2x2_dc_core16_mmxext:
    MMXEXT_QUANT16_DC_START
    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
    ret
ALIGN 16
;;; void x264_quant_8x8_core16_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
;;; NOTE(review): this entry point has no loop or ret of its own; after
;;; QUANT_AC_START it falls through into x264_quant_4x4_dc_core16_mmxext.
;;; The label is kept only because the symbol is still declared with cglobal;
;;; confirm whether it should be dropped together with that declaration.
x264_quant_8x8_core16_mmx:
    QUANT_AC_START
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;
; Quantizes dct in place, four coefficients (8 bytes) per step, every
; coefficient with the same multiplier i_qmf. rsi carries the scalar i_qmf
; here and must not be dereferenced.
;-----------------------------------------------------------------------------
x264_quant_4x4_dc_core16_mmxext:
    MMXEXT_QUANT16_DC_START
%rep 4
    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
    add         rdi, byte 8         ; next four coefficients
%endrep
    ret
ALIGN 16
;;; void x264_quant_4x4_core16_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
;;; NOTE(review): this entry point has no loop or ret of its own; after
;;; QUANT_AC_START it falls through into x264_quant_4x4_core16_mmxext.
;;; The label is kept only because the symbol is still declared with cglobal;
;;; confirm whether it should be dropped together with that declaration.
x264_quant_4x4_core16_mmx:
    QUANT_AC_START
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;
; Quantizes dct in place, four coefficients per step. Each step gathers the
; low words of four 32-bit quant_mf entries into mm5; this assumes the high
; word of every entry is zero (the multiplier fits in uint16_t).
;-----------------------------------------------------------------------------
x264_quant_4x4_core16_mmxext:
    MMX_QUANT_AC_START
%rep 4
    pshufw      mm5, [rsi], 10110001b   ; ( 0, q0,  0, q1)
    paddw       mm5, [rsi+8]            ; (q2, q0, q3, q1)
    pshufw      mm5, mm5, 10001101b     ; (q0, q1, q2, q3)
    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
    add         rsi, byte 16            ; four int multipliers consumed
    add         rdi, byte 8             ; four int16 coefficients consumed
%endrep
    ret
ALIGN 16
;;; void x264_quant_8x8_core32_mmx( int16_t dct[8][8], int quant_mf[8][8], int i_qbits, int f )
;;; NOTE(review): this entry point has no loop or ret of its own; after
;;; QUANT_AC_START it falls through into x264_quant_8x8_core16_mmxext.
;;; The label is kept only because the symbol is still declared with cglobal;
;;; confirm whether it should be dropped together with that declaration.
x264_quant_8x8_core32_mmx:
    QUANT_AC_START
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;
; Quantizes dct in place in sixteen steps of four coefficients. Each step
; gathers the low words of four 32-bit quant_mf entries into mm5; this assumes
; the high word of every entry is zero (the multiplier fits in uint16_t).
;-----------------------------------------------------------------------------
x264_quant_8x8_core16_mmxext:
    MMX_QUANT_AC_START
%rep 16
    pshufw      mm5, [rsi], 10110001b   ; ( 0, q0,  0, q1)
    paddw       mm5, [rsi+8]            ; (q2, q0, q3, q1)
    pshufw      mm5, mm5, 10001101b     ; (q0, q1, q2, q3)
    MMXEXT_QUANT16_1x4 [rdi], mm5, mm6, mm7
    add         rsi, byte 16            ; four int multipliers consumed
    add         rdi, byte 8             ; four int16 coefficients consumed
%endrep
    ret
%macro MMX_QUANT32_DC_START 0
;;; Argument setup for the 32-bit DC quant entry points.
;;; rdi (&dct[0][0]) is used exactly as it was passed in.
;;; out: mm5 = i_qmf in both doublewords
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    movd        mm5, esi            ; i_qmf is an int: take the 32-bit register
    movd        mm6, edx            ; i_qbits
    movd        mm7, ecx            ; f
    punpckldq   mm5, mm5            ; i_qmf in each dword
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro MMXEXT_QUANT32_1x4 5
;;; Quantize four int16 coefficients in place with 32-bit multipliers.
;;; %1 (m64) dct[y][x], read and written
;;; %2,%3 (m64/mmx) two doubleword multipliers each: %2 for coefficients 0-1,
;;;       %3 for coefficients 2-3 (quant_mf[y][x], or i_qmf in each doubleword)
;;; %4 (mmx) i_qbits, used as the shift count
;;; %5 (mmx) f as doublewords
;;; trashes mm0-mm4
    movq        mm0, %1             ; load dct coeffs
    pxor        mm4, mm4
    pcmpgtw     mm4, mm0            ; sign mask: all-ones where coeff < 0
    pxor        mm0, mm4
    psubw       mm0, mm4            ; abs(coeff)

    movq        mm1, mm0
    punpcklwd   mm0, mm0            ; each coefficient in both words of a
    punpckhwd   mm1, mm1            ; dword, ready for the 32-bit multiply

    ; 16 x 32 -> 32 bit multiply built from 16-bit partial products
    movq        mm2, mm0
    movq        mm3, mm1
    pmulhuw     mm0, %2             ; high halves of the partial products
    pmulhuw     mm1, %3
    pmullw      mm2, %2             ; low halves of the partial products
    pmullw      mm3, %3
    pslld       mm0, 16             ; move the carry part into the upper word
    pslld       mm1, 16
    paddd       mm0, mm2            ; low 32 bits of abs(coeff) * multiplier
    paddd       mm1, mm3

    paddd       mm0, %5             ; round with f
    paddd       mm1, %5
    psrad       mm0, %4
    psrad       mm1, %4

    packssdw    mm0, mm1            ; pack to int16_t
    pxor        mm0, mm4            ; restore sign
    psubw       mm0, mm4
    movq        %1, mm0             ; store
%endmacro
ALIGN 16
;;; void x264_quant_4x4_core32_mmx( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f )
;;; NOTE(review): only QUANT_AC_START follows this label before the next one,
;;; and there is no ret, so execution falls through into
;;; x264_quant_2x2_dc_core32_mmxext below. Confirm whether this entry point is
;;; still meant to exist.
x264_quant_4x4_core32_mmx:
QUANT_AC_START
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;
; Quantizes the four coefficients of dct in place with the single multiplier
; i_qmf; mm5 (i_qmf in each doubleword) is passed for both multiplier operands.
;-----------------------------------------------------------------------------
x264_quant_2x2_dc_core32_mmxext:
MMX_QUANT32_DC_START
MMXEXT_QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;
; Quantizes dct in place, four coefficients (8 bytes) per step, every
; coefficient with the same multiplier i_qmf. rsi carries the scalar i_qmf
; here and must not be dereferenced.
;-----------------------------------------------------------------------------
x264_quant_4x4_dc_core32_mmxext:
    MMX_QUANT32_DC_START
%rep 4
    MMXEXT_QUANT32_1x4 [rdi], mm5, mm5, mm6, mm7
    add         rdi, byte 8         ; next four coefficients
%endrep
    ret
ALIGN 16
;;; void x264_quant_4x4_dc_core32_mmx( int16_t dct[4][4], int i_quant_mf, int i_qbits, int f )
;;; NOTE(review): this entry point has no loop or ret of its own; after
;;; QUANT_DC_START it falls through into x264_quant_4x4_core32_mmxext.
;;; The label is kept only because the symbol is still declared with cglobal;
;;; confirm whether it should be dropped together with that declaration.
x264_quant_4x4_dc_core32_mmx:
    QUANT_DC_START
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;
; Quantizes dct in place, four coefficients per step, each with its own
; 32-bit multiplier read from quant_mf.
;-----------------------------------------------------------------------------
x264_quant_4x4_core32_mmxext:
    MMX_QUANT_AC_START
%rep 4
    MMXEXT_QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
    add         rdi, byte 8         ; four int16 coefficients consumed
    add         rsi, byte 16        ; four int multipliers consumed
%endrep
    ret
ALIGN 16
;;; void x264_quant_2x2_dc_core32_mmx( int16_t dct[2][2], int i_quant_mf, int i_qbits, int f )
;;; NOTE(review): this entry point has no body or ret of its own; after
;;; QUANT_DC_START it falls through into x264_quant_8x8_core32_mmxext.
;;; The label is kept only because the symbol is still declared with cglobal;
;;; confirm whether it should be dropped together with that declaration.
x264_quant_2x2_dc_core32_mmx:
    QUANT_DC_START
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;
; Quantizes dct in place in sixteen steps of four coefficients, each
; coefficient with its own 32-bit multiplier read from quant_mf.
;-----------------------------------------------------------------------------
x264_quant_8x8_core32_mmxext:
    MMX_QUANT_AC_START
%rep 16
    MMXEXT_QUANT32_1x4 [rdi], [rsi], [rsi+8], mm6, mm7
    add         rdi, byte 8         ; four int16 coefficients consumed
    add         rsi, byte 16        ; four int multipliers consumed
%endrep
    ret
......@@ -21,6 +21,16 @@
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
;* *
;*****************************************************************************
BITS 32
%macro cglobal 1
......@@ -36,184 +46,338 @@ ALIGN 16
SECTION .text
cglobal x264_quant_8x8_core16_mmx
cglobal x264_quant_4x4_core16_mmx
cglobal x264_quant_8x8_core32_mmx
cglobal x264_quant_4x4_core32_mmx
cglobal x264_quant_4x4_dc_core32_mmx
cglobal x264_quant_2x2_dc_core32_mmx
%macro QUANT_AC_START 0
;;; Argument setup for the AC quant entry points; all arguments are read from
;;; the stack.
;;; out: eax = dct, ecx = quant_mf
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    mov         eax, [esp+ 4]       ; dct
    mov         ecx, [esp+ 8]       ; quant_mf
    movd        mm6, [esp+12]       ; i_qbits
    movd        mm7, [esp+16]       ; f
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro QUANT_DC_START 0
;;; Argument setup for the DC quant entry points; all arguments are read from
;;; the stack.
;;; out: eax = dct
;;;      mm5 = i_quant_mf in both doublewords
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    mov         eax, [esp+ 4]       ; dct
    movd        mm5, [esp+ 8]       ; i_quant_mf
    movd        mm6, [esp+12]       ; i_qbits
    movd        mm7, [esp+16]       ; f
    punpckldq   mm5, mm5            ; i_quant_mf in each dword
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
cglobal x264_quant_2x2_dc_core15_mmx
cglobal x264_quant_4x4_dc_core15_mmx
cglobal x264_quant_4x4_core15_mmx
cglobal x264_quant_8x8_core15_mmx
cglobal x264_quant_2x2_dc_core16_mmxext
cglobal x264_quant_4x4_dc_core16_mmxext
cglobal x264_quant_4x4_core16_mmxext
cglobal x264_quant_8x8_core16_mmxext
cglobal x264_quant_2x2_dc_core32_mmxext
cglobal x264_quant_4x4_dc_core32_mmxext
cglobal x264_quant_4x4_core32_mmxext
cglobal x264_quant_8x8_core32_mmxext
%macro MMX_QUANT_AC_START 0
;;; Argument setup for the AC quant entry points; all arguments are read from
;;; the stack.
;;; out: eax = &dct[0][0], ecx = &quant_mf[0][0]
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    mov         eax, [esp+ 4]       ; &dct[0][0]
    mov         ecx, [esp+ 8]       ; &quant_mf[0][0]
    movd        mm6, [esp+12]       ; i_qbits
    movd        mm7, [esp+16]       ; f
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro QUANT16_1x4 5
;;; Quantize four int16 coefficients in place.
;;; %1 dct[y][x]
;;; %2,%3 quant_mf[i_mf][y][x], entries must fit in int16
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm5
    movq        mm0, %1             ; four coefficients
    movq        mm1, %2
    movq        mm2, %3
    packssdw    mm1, mm2            ; four multipliers narrowed to words

    movq        mm4, mm0
    pxor        mm5, mm5
    pcmpgtw     mm4, mm5            ; mask: all-ones where coeff > 0

    movq        mm2, mm0
    pmullw      mm0, mm1            ; low halves of coeff * mf
    pmulhw      mm2, mm1            ; high halves (signed)
    movq        mm1, mm0
    punpcklwd   mm0, mm2            ; 32-bit products 0-1
    punpckhwd   mm1, mm2            ; 32-bit products 2-3

    movq        mm2, %5
    movq        mm3, %5
    psubd       mm2, mm0            ; f - product, used for coeff <= 0
    psubd       mm3, mm1
    paddd       mm0, %5             ; f + product, used for coeff > 0
    paddd       mm1, %5

    psrad       mm0, %4
    psrad       mm1, %4
    psrad       mm2, %4
    psrad       mm3, %4

    packssdw    mm0, mm1            ; ( f + product) >> i_qbits
    packssdw    mm2, mm3            ; ( f - product) >> i_qbits
    pxor        mm5, mm5
    psubw       mm5, mm2            ; -((f - product) >> i_qbits)

    pand        mm0, mm4            ; keep the first form where coeff > 0
    pandn       mm4, mm5            ; and the negated form elsewhere
    por         mm0, mm4
    movq        %1, mm0             ; store
%endmacro
%macro MMX_QUANT15_DC_START 0
;;; Argument setup for the 15-bit DC quant entry points; all arguments are
;;; read from the stack.
;;; out: eax = &dct[0][0]
;;;      mm5 = i_qmf in each word
;;;      mm6 = i_qbits in the low doubleword
;;;      mm7 = f in both doublewords
    mov         eax, [esp+ 4]       ; &dct[0][0]
    movd        mm5, [esp+ 8]       ; i_qmf
    movd        mm6, [esp+12]       ; i_qbits
    movd        mm7, [esp+16]       ; f
    punpcklwd   mm5, mm5
    punpcklwd   mm5, mm5            ; i_qmf in each word
    punpckldq   mm7, mm7            ; f in each dword
%endmacro
%macro QUANT32_1x4 5
;;; %1 dct[y][x]
;;; %2,%3 quant_mf[i_mf][y][x]
;;; %4 i_qbits
;;; %5 f as doublewords
;;; trashes mm0-mm4
movq mm0, %1
%macro MMX_QUANT15_1x4 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
movq mm0, %1 ; load dct coeffs
pxor mm4, mm4
pcmpgtw mm4, mm0 ; mm4 = sign(mm0)
pcmpgtw mm4, mm0 ; sign(coeff)
pxor mm0, mm4
psubw mm0, mm4 ; mm0 = abs(mm0)
psubw mm0, mm4 ; abs(coeff)
movq mm2, mm0
pmullw mm0, %2
pmulhw mm2, %2
movq mm1, mm0
punpcklwd mm0, mm0 ; duplicate the words for the upcomming