Commit 10265a0c authored by Loren Merritt's avatar Loren Merritt

2x faster quant. 2% overall.

side effects:
not bit-identical to the previous algorithm.
while the new algorithm covers a wider range of cqms than the previous one did,
I couldn't find a good way to fallback to a general version for the extreme
cqms. so now it refuses to encode extreme cqms instead of just being slower.
lays a framework for custom deadzone matrices, though I didn't add an api.


git-svn-id: svn://svn.videolan.org/x264/trunk@642 df754926-b1dd-0310-bc7b-ec298dee348c
parent b37ac36e
......@@ -3,8 +3,7 @@
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;* Christian Heine <sennindemokrit@gmx.net>
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
......@@ -21,16 +20,6 @@
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
;* *
;*****************************************************************************
BITS 64
%include "amd64inc.asm"
......@@ -40,394 +29,106 @@ pd_1: times 2 dd 1
SECTION .text
%macro MMX_QUANT_AC_START 0
; mov rdi, rdi ; &dct[0][0]
; mov rsi, rsi ; &quant_mf[0][0]
movd mm6, parm3d ; i_qbits
movd mm7, parm4d ; f
punpckldq mm7, mm7 ; f in each dword
%macro MMX_QUANT_DC_START 0
movd mm6, parm2d ; mf
movd mm7, parm3d ; bias
pshufw mm6, mm6, 0
pshufw mm7, mm7, 0
%endmacro
%macro MMX_QUANT15_DC_START 0
; mov rdi, rdi ; &dct[0][0]
movd mm5, parm2d ; i_qmf
movd mm6, parm3d ; i_qbits
movd mm7, parm4d ; f
punpcklwd mm5, mm5
punpcklwd mm5, mm5 ; i_qmf in each word
punpckldq mm7, mm7 ; f in each dword
%macro SSE2_QUANT_DC_START 0
movd xmm6, parm2d ; mf
movd xmm7, parm3d ; bias
pshuflw xmm6, xmm6, 0
pshuflw xmm7, xmm7, 0
punpcklqdq xmm6, xmm6
punpcklqdq xmm7, xmm7
%endmacro
%macro SSE2_QUANT_AC_START 0
movd xmm6, parm3d ; i_qbits
movd xmm7, parm4d ; f
pshufd xmm7, xmm7, 0 ; f in each dword
%macro QUANT_ONE 5
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
mov%1 %2m0, %3 ; load dct coeffs
pxor %2m1, %2m1
pcmpgtw %2m1, %2m0 ; sign(coeff)
pxor %2m0, %2m1
psubw %2m0, %2m1 ; abs(coeff)
paddusw %2m0, %5 ; round
pmulhuw %2m0, %4 ; divide
pxor %2m0, %2m1 ; restore sign
psubw %2m0, %2m1
mov%1 %3, %2m0 ; store
%endmacro
%macro SSE2_QUANT15_DC_START 0
movd xmm5, parm2d ; i_qmf
movd xmm6, parm3d ; i_qbits
movd xmm7, parm4d ; f
pshuflw xmm5, xmm5, 0
punpcklqdq xmm5, xmm5 ; i_qmf in each word
pshufd xmm7, xmm7, 0 ; f in each dword
%macro MMX_QUANT_1x4 3
QUANT_ONE q, m, %1, %2, %3
%endmacro
%macro MMX_QUANT15_1x4 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
movq mm0, %1 ; load dct coeffs
pxor mm4, mm4
pcmpgtw mm4, mm0 ; sign(coeff)
pxor mm0, mm4
psubw mm0, mm4 ; abs(coeff)
movq mm2, mm0
pmullw mm0, %2
pmulhw mm2, %2
movq mm1, mm0
punpcklwd mm0, mm2
punpckhwd mm1, mm2
paddd mm0, %4 ; round with f
paddd mm1, %4
psrad mm0, %3
psrad mm1, %3
packssdw mm0, mm1 ; pack
pxor mm0, mm4 ; restore sign
psubw mm0, mm4
movq %1, mm0 ; store
%macro SSE2_QUANT_1x8 3
QUANT_ONE dqa, xm, %1, %2, %3
%endmacro
%macro SSSE3_QUANT15_1x8 4
%macro SSSE3_QUANT_1x8 3
movdqa xmm0, %1 ; load dct coeffs
movdqa xmm4, xmm0 ; save sign
movdqa xmm1, xmm0 ; save sign
pabsw xmm0, xmm0
movdqa xmm2, xmm0
pmullw xmm0, %2
pmulhw xmm2, %2
movdqa xmm1, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm1, xmm2
paddd xmm0, %4 ; round with f
paddd xmm1, %4
psrad xmm0, %3
psrad xmm1, %3
packssdw xmm0, xmm1 ; pack
psignw xmm0, xmm4 ; restore sign
paddusw xmm0, %3 ; round
pmulhuw xmm0, %2 ; divide
psignw xmm0, xmm1 ; restore sign
movdqa %1, xmm0 ; store
%endmacro
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
; void x264_quant_2x2_dc_mmxext( int16_t dct[4], int mf, int bias )
;-----------------------------------------------------------------------------
cglobal x264_quant_2x2_dc_core15_mmx
MMX_QUANT15_DC_START
MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_dc_core15_mmx
MMX_QUANT15_DC_START
%rep 4
MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
add parm1q, byte 8
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core15_mmx( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_core15_mmx
MMX_QUANT_AC_START
%rep 4
movq mm5, [parm2q]
packssdw mm5, [parm2q+8]
MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
add parm2q, byte 16
add parm1q, byte 8
%endrep
cglobal x264_quant_2x2_dc_mmxext
MMX_QUANT_DC_START
MMX_QUANT_1x4 [parm1q], mm6, mm7
ret
%macro QUANT_SSE 1
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core15_mmx( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
; void x264_quant_4x4_dc_sse2( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
cglobal x264_quant_8x8_core15_mmx
MMX_QUANT_AC_START
%rep 16
movq mm5, [parm2q]
packssdw mm5, [parm2q+8]
MMX_QUANT15_1x4 [parm1q], mm5, mm6, mm7
add parm2q, byte 16
add parm1q, byte 8
cglobal x264_quant_4x4_dc_%1
SSE2_QUANT_DC_START
%assign x 0
%rep 2
QUANT_1x8 [parm1q+x], xmm6, xmm7
%assign x (x+16)
%endrep
ret
%ifdef HAVE_SSE3
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core15_ssse3( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_dc_core15_ssse3
SSE2_QUANT15_DC_START
SSSE3_QUANT15_1x8 [parm1q], xmm5, xmm6, xmm7
SSSE3_QUANT15_1x8 [parm1q+16], xmm5, xmm6, xmm7
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core15_ssse3( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
; void x264_quant_4x4_sse2( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_core15_ssse3
SSE2_QUANT_AC_START
cglobal x264_quant_4x4_%1
%assign x 0
%rep 2
movdqa xmm5, [parm2q+32*x]
packssdw xmm5, [parm2q+32*x+16]
SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
%assign x x+1
QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
%assign x (x+16)
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core15_ssse3( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
; void x264_quant_8x8_sse2( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
;-----------------------------------------------------------------------------
cglobal x264_quant_8x8_core15_ssse3
SSE2_QUANT_AC_START
cglobal x264_quant_8x8_%1
%assign x 0
%rep 8
movdqa xmm5, [parm2q+32*x]
packssdw xmm5, [parm2q+32*x+16]
SSSE3_QUANT15_1x8 [parm1q+16*x], xmm5, xmm6, xmm7
%assign x x+1
%endrep
ret
%endif ; HAVE_SSE3
; ============================================================================
%macro MMXEXT_QUANT16_DC_START 0
; mov rdi, rdi ; &dct[0][0]
movd mm5, parm2d ; i_qmf
movd mm6, parm3d ; i_qbits
movd mm7, parm4d ; f
pshufw mm5, mm5, 0 ; i_qmf in each word
punpckldq mm7, mm7 ; f in each dword
%endmacro
%macro MMXEXT_QUANT16_1x4 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
movq mm0, %1 ; load dct coeffs
pxor mm4, mm4
pcmpgtw mm4, mm0 ; sign(coeff)
pxor mm0, mm4
psubw mm0, mm4 ; abs(coeff)
movq mm2, mm0
pmullw mm0, %2
pmulhuw mm2, %2
movq mm1, mm0
punpcklwd mm0, mm2
punpckhwd mm1, mm2
paddd mm0, %4 ; round with f
paddd mm1, %4
psrad mm0, %3
psrad mm1, %3
packssdw mm0, mm1 ; pack
pxor mm0, mm4 ; restore sign
psubw mm0, mm4
movq %1, mm0 ; store
%endmacro
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core16_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_2x2_dc_core16_mmxext
MMXEXT_QUANT16_DC_START
MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core16_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_dc_core16_mmxext
MMXEXT_QUANT16_DC_START
%rep 4
MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
add parm1q, byte 8
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core16_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_core16_mmxext
MMX_QUANT_AC_START
%rep 4
pshufw mm5, [parm2q], 10110001b
paddw mm5, [parm2q+8]
pshufw mm5, mm5, 10001101b
MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
add parm2q, byte 16
add parm1q, byte 8
QUANT_1x8 [parm1q+x], [parm2q+x], [parm3q+x]
%assign x (x+16)
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core16_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_8x8_core16_mmxext
MMX_QUANT_AC_START
%rep 16
pshufw mm5, [parm2q], 10110001b
paddw mm5, [parm2q+8]
pshufw mm5, mm5, 10001101b
MMXEXT_QUANT16_1x4 [parm1q], mm5, mm6, mm7
add parm2q, byte 16
add parm1q, byte 8
%endrep
ret
%macro MMX_QUANT32_DC_START 0
; mov rdi, rdi ; &dct[0][0]
movd mm5, parm2d ; i_qmf
movd mm6, parm3d ; i_qbits
movd mm7, parm4d ; f
punpckldq mm5, mm5 ; i_qmf in each dword
punpckldq mm7, mm7 ; f in each dword
%endmacro
%macro MMXEXT_QUANT32_1x4 5
;;; %1 (m64) dct[y][x]
;;; %2,%3 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;;; %4 (mmx) i_qbits in the low quadword
;;; %5 (mmx) f as doublewords
;;; trashes mm0-mm4
movq mm0, %1 ; load dct coeffs
pxor mm4, mm4
pcmpgtw mm4, mm0 ; sign(mm0)
pxor mm0, mm4
psubw mm0, mm4 ; abs(mm0)
movq mm1, mm0
punpcklwd mm0, mm0 ; duplicate the words for the upcomming
punpckhwd mm1, mm1 ; 32 bit multiplication
movq mm2, mm0 ; like in school ...
movq mm3, mm1
pmulhuw mm0, %2 ; ... multiply the parts ...
pmulhuw mm1, %3
pmullw mm2, %2
pmullw mm3, %3
pslld mm0, 16 ; ... shift ...
pslld mm1, 16
paddd mm0, mm2 ; ... and add them
paddd mm1, mm3
paddd mm0, %5 ; round with f
paddd mm1, %5
psrad mm0, %4
psrad mm1, %4
packssdw mm0, mm1 ; pack to int16_t
pxor mm0, mm4 ; restore sign
psubw mm0, mm4
movq %1, mm0 ; store
%endmacro
;-----------------------------------------------------------------------------
; void x264_quant_2x2_dc_core32_mmxext( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_2x2_dc_core32_mmxext
MMX_QUANT32_DC_START
MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_core32_mmxext( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_dc_core32_mmxext
MMX_QUANT32_DC_START
%rep 4
MMXEXT_QUANT32_1x4 [parm1q], mm5, mm5, mm6, mm7
add parm1q, byte 8
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_quant_4x4_core32_mmxext( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_core32_mmxext
MMX_QUANT_AC_START
%rep 4
MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
add parm1q, byte 8
add parm2q, byte 16
%endrep
ret
;-----------------------------------------------------------------------------
; void x264_quant_8x8_core32_mmxext( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_8x8_core32_mmxext
MMX_QUANT_AC_START
%rep 16
MMXEXT_QUANT32_1x4 [parm1q], [parm2q], [parm2q+8], mm6, mm7
add parm1q, byte 8
add parm2q, byte 16
%endrep
%define QUANT_1x8 SSE2_QUANT_1x8
QUANT_SSE sse2
%ifdef HAVE_SSE3
%define QUANT_1x8 SSSE3_QUANT_1x8
QUANT_SSE ssse3
%endif
ret
;=============================================================================
......
......@@ -341,10 +341,12 @@ struct x264_t
int (*dequant4_mf[4])[4][4]; /* [4][6][4][4] */
int (*dequant8_mf[2])[8][8]; /* [2][6][8][8] */
int (*quant4_mf[4])[4][4]; /* [4][6][4][4] */
int (*quant8_mf[2])[8][8]; /* [2][6][8][8] */
int (*unquant4_mf[4])[16]; /* [4][52][16] */
int (*unquant8_mf[2])[64]; /* [2][52][64] */
uint16_t (*quant4_mf[4])[16]; /* [4][52][16] */
uint16_t (*quant8_mf[2])[64]; /* [2][52][64] */
uint16_t (*quant4_bias[4])[16]; /* [4][52][16] */
uint16_t (*quant8_bias[2])[64]; /* [2][52][64] */
uint32_t nr_residual_sum[2][64];
uint32_t nr_offset[2][64];
......@@ -436,9 +438,6 @@ struct x264_t
int b_interlaced;
/* Inverted luma quantization deadzone */
int i_luma_deadzone[2]; // {inter, intra}
/* Allowed qpel MV range to stay within the picture + emulated edge pixels */
int mv_min[2];
int mv_max[2];
......
......@@ -3,8 +3,7 @@
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Alex Izvorski <aizvorksi@gmail.com>
;* Christian Heine <sennindemokrit@gmx.net>
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
......@@ -21,16 +20,6 @@
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************
;*****************************************************************************
;* *
;* Revision history: *
;* *
;* 2005.07.26 quant 4x4 & 8x8 MMX functions (AI) *
;* 2005.09.04 quant MMXEXT (added precision) and DC (CH) *
;* 2005.09.21 faster MMX and added MMXEXT16 (CH) *
;* *
;*****************************************************************************
BITS 32
%include "i386inc.asm"
......@@ -40,313 +29,151 @@ pd_1: times 2 dd 1
SECTION .text
%macro MMX_QUANT_AC_START 0
mov eax, [esp+ 4] ; &dct[0][0]
mov ecx, [esp+ 8] ; &quant_mf[0][0]
movd mm6, [esp+12] ; i_qbits
movd mm7, [esp+16] ; f
punpckldq mm7, mm7 ; f in each dword
%macro QUANT_AC_START 0
mov eax, [esp+ 4] ; dct
mov ecx, [esp+ 8] ; mf
mov edx, [esp+12] ; bias
%endmacro
%macro MMX_QUANT15_DC_START 0
mov eax, [esp+ 4] ; &dct[0][0]
movd mm5, [esp+ 8] ; i_qmf
movd mm6, [esp+12] ; i_qbits
movd mm7, [esp+16] ; f
punpcklwd mm5, mm5
punpcklwd mm5, mm5 ; i_qmf in each word
punpckldq mm7, mm7 ; f in each dword
%macro MMX_QUANT_DC_START 0
mov eax, [esp+ 4] ; dct
movd mm6, [esp+ 8] ; mf
movd mm7, [esp+12] ; bias
pshufw mm6, mm6, 0
pshufw mm7, mm7, 0
%endmacro
%macro MMX_QUANT15_1x4 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as int16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
movq mm0, %1 ; load dct coeffs
pxor mm4, mm4
pcmpgtw mm4, mm0 ; sign(coeff)
pxor mm0, mm4
psubw mm0, mm4 ; abs(coeff)
movq mm2, mm0
pmullw mm0, %2
pmulhw mm2, %2
movq mm1, mm0
punpcklwd mm0, mm2
punpckhwd mm1, mm2
paddd mm0, %4 ; round with f
paddd mm1, %4
psrad mm0, %3
psrad mm1, %3
packssdw mm0, mm1 ; pack
pxor mm0, mm4 ; restore sign
psubw mm0, mm4
movq %1, mm0 ; store
%macro SSE2_QUANT_DC_START 0
mov eax, [esp+ 4] ; dct
movd xmm6, [esp+ 8] ; mf
movd xmm7, [esp+12] ; bias
pshuflw xmm6, xmm6, 0
pshuflw xmm7, xmm7, 0
punpcklqdq xmm6, xmm6
punpcklqdq xmm7, xmm7
%endmacro
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_2x2_dc_core15_mmx( int16_t dct[2][2],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_2x2_dc_core15_mmx
MMX_QUANT15_DC_START
MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
ret
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_dc_core15_mmx( int16_t dct[4][4],
; int const i_qmf, int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_dc_core15_mmx
MMX_QUANT15_DC_START
%rep 4
MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
add eax, byte 8
%endrep
ret
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_4x4_core15_mmx( int16_t dct[4][4],
; int const quant_mf[4][4], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_4x4_core15_mmx
MMX_QUANT_AC_START
%rep 4
movq mm5, [ecx]
packssdw mm5, [ecx+8]
MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
add ecx, byte 16
add eax, byte 8
%endrep
ret
;-----------------------------------------------------------------------------
; void __cdecl x264_quant_8x8_core15_mmx( int16_t dct[8][8],
; int const quant_mf[8][8], int const i_qbits, int const f );
;-----------------------------------------------------------------------------
cglobal x264_quant_8x8_core15_mmx
MMX_QUANT_AC_START
%rep 16
movq mm5, [ecx]
packssdw mm5, [ecx+8]
MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
add ecx, byte 16
add eax, byte 8
%endrep
ret
; ============================================================================
%macro MMXEXT_QUANT16_DC_START 0
mov eax, [esp+ 4] ; &dct[0][0]
movd mm5, [esp+ 8] ; i_qmf
movd mm6, [esp+12] ; i_qbits
movd mm7, [esp+16] ; f
pshufw mm5, mm5, 0 ; i_qmf in each word
punpckldq mm7, mm7 ; f in each dword
%macro QUANT_ONE 5
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) mf[y][x] or mf[0][0] (as uint16_t)
;;; %3 (m64/mmx) bias[y][x] or bias[0][0] (as uint16_t)
mov%1 %2m0, %3 ; load dct coeffs
pxor %2m1, %2m1
pcmpgtw %2m1, %2m0 ; sign(coeff)
pxor %2m0, %2m1
psubw %2m0, %2m1 ; abs(coeff)
paddusw %2m0, %5 ; round
pmulhuw %2m0, %4 ; divide
pxor %2m0, %2m1 ; restore sign
psubw %2m0, %2m1
mov%1 %3, %2m0 ; store
%endmacro
%macro MMX_QUANT_1x4 3
QUANT_ONE q, m, %1, %2, %3
%endmacro
%macro SSE2_QUANT_1x8 3
QUANT_ONE dqa, xm, %1, %2, %3
%endmacro
%macro MMXEXT_QUANT16_1x4 4
;;; %1 (m64) dct[y][x]
;;; %2 (m64/mmx) quant_mf[y][x] or quant_mf[0][0] (as uint16_t)
;;; %3 (mmx) i_qbits in the low doubleword
;;; %4 (mmx) f as doublewords
;;; trashes mm0-mm2,mm4
movq mm0, %1 ; load dct coeffs
pxor mm4, mm4
pcmpgtw mm4, mm0 ; sign(coeff)
pxor mm0, mm4
psubw mm0, mm4 ; abs(coeff)
movq mm2, mm0
pmullw mm0, %2
pmulhuw mm2, %2
movq mm1, mm0
punpcklwd mm0, mm2
punpckhwd mm1, mm2
paddd mm0, %4 ; round with f
paddd mm1, %4
psrad mm0, %3
psrad mm1, %3
packssdw mm0, mm1 ; pack
pxor mm0, mm4 ; restore sign
psubw mm0, mm4
movq %1, mm0 ; store
%macro SSSE3_QUANT_1x8 3
movdqa xmm0, %1 ; load dct coeffs
movdqa xmm1, xmm0 ; save sign
pabsw xmm0, xmm0
paddusw xmm0, %3 ; round
pmulhuw xmm0, %2 ; divide