Commit 2d05702f authored by Loren Merritt

mmx 8x8 dct.

On a K8: sub16x16_dct8 3806->1461, add16x16_idct8 4852->1297 cycles. total speedup: 1-3%.
patch by Christian Heine (sennindemokrit at gmx dot net)


git-svn-id: svn://svn.videolan.org/x264/trunk@290 df754926-b1dd-0310-bc7b-ec298dee348c
parent 2e5b0b93
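
For reference, each row and column pass added below computes the standard H.264 8x8 transform butterfly that the register comments document. A rough C sketch of the forward 1-D step, using the s/d/a/dst names from those comments (the function name dct8_1d is only for illustration, not code from this commit):

    #include <stdint.h>

    /* one forward 8-point pass; applied once per row and once per column */
    static void dct8_1d( int16_t d[8] )
    {
        /* even half: sums/differences of mirrored pairs */
        int s07 = d[0]+d[7], s16 = d[1]+d[6], s25 = d[2]+d[5], s34 = d[3]+d[4];
        int d07 = d[0]-d[7], d16 = d[1]-d[6], d25 = d[2]-d[5], d34 = d[3]-d[4];
        int a0 = s07+s34, a1 = s16+s25, a2 = s07-s34, a3 = s16-s25;
        /* odd half */
        int a4 = d16 + d25 + d07 + (d07>>1);
        int a5 = d07 - d34 - d25 - (d25>>1);
        int a6 = d07 + d34 - d16 - (d16>>1);
        int a7 = d16 - d25 + d34 + (d34>>1);
        d[0] = a0 + a1;        d[4] = a0 - a1;
        d[2] = a2 + (a3>>1);   d[6] = (a2>>1) - a3;
        d[1] = a4 + (a7>>2);   d[3] = a5 + (a6>>2);
        d[5] = a6 - (a5>>2);   d[7] = (a4>>2) - a7;
    }

The inverse follows the e/f/g steps commented in x264_xidct8/x264_yidct8, with the final >>6 done in the column pass.
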
@@ -139,11 +139,15 @@ SECTION .rodata
;-----------------------------------------------------------------------------
ALIGN 16
x264_mmx_1:
dw 1, 1, 1, 1
x264_mmx_32:
dw 32, 32, 32, 32
x264_mmx_1: dw 1, 1, 1, 1
x264_mmx_32: dw 32, 32, 32, 32
x264_mmx_PPNN: dw 1, 1, -1, -1
x264_mmx_PNPN: dw 1, -1, 1, -1
x264_mmx_PNNP: dw 1, -1, -1, 1
x264_mmx_PPPN: dw 1, 1, 1, -1
x264_mmx_PPNP: dw 1, 1, -1, 1
x264_mmx_2121: dw 2, 1, 2, 1
x264_mmx_p2n2p1p1: dw 2, -2, 1, 1
;=============================================================================
; Code
@@ -306,3 +310,409 @@ x264_add4x4_idct_mmxext:
ret
; =============================================================================
; 8x8 Transform
; =============================================================================
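; ( sub8x8_dct8 / add8x8_idct8, registered in x264_dct_init() further below,
;   combine these helpers: a pixel_sub/pixel_add pass plus the x- and y-
;   transform passes; the combining wrappers are outside this hunk )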
; -----------------------------------------------------------------------------
; input: 2x8 unsigned bytes (%5,%6), zero (%7); output: difference words (%1 = low, %2 = high); %3,%4 are clobbered as temporaries
; -----------------------------------------------------------------------------
%macro MMX_LOAD_DIFF_8P 7
movq %1, %5
movq %2, %1
punpcklbw %1, %7
punpckhbw %2, %7
movq %3, %6
movq %4, %3
punpcklbw %3, %7
punpckhbw %4, %7
psubw %1, %3
psubw %2, %4
%endmacro
%macro MMX_LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
movq %2, %3
movq %1, %4
MMX_SUMSUB_BA %1, %2
%endmacro
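; -----------------------------------------------------------------------------
; input: 8 packed pixels (%3), 2x4 words to add (%4 low, %5 high), zero (%6)
; output: %3 = clip_uint8( pixel + word ); %1,%2 are clobbered as temporaries
; -----------------------------------------------------------------------------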
%macro MMX_STORE_DIFF_8P 6
movq %1, %3
movq %2, %1
punpcklbw %1, %6
punpckhbw %2, %6
paddw %1, %4
paddw %2, %5
packuswb %1, %2
movq %3, %1
%endmacro
cglobal x264_pixel_sub_8x8_mmx
cglobal x264_xdct8_mmxext
cglobal x264_ydct8_mmx
cglobal x264_ydct8_sse2
cglobal x264_xidct8_mmxext
cglobal x264_yidct8_mmx
cglobal x264_yidct8_sse2
cglobal x264_pixel_add_8x8_mmx
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
;-----------------------------------------------------------------------------
x264_pixel_sub_8x8_mmx:
; mov rdi, rdi ; diff
; mov rsi, rsi ; pix1
movsxd rdx, edx ; i_pix1
; mov rcx, rcx ; pix2
movsxd r8, r8d ; i_pix2
MMX_ZERO mm7
%assign disp 0
%rep 8
MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [rsi], [rcx], mm7
movq [rdi+disp], mm0
movq [rdi+disp+8], mm1
add rsi, rdx
add rcx, r8
%assign disp disp+16
%endrep
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xdct8_mmxext:
movq mm5, [x264_mmx_PPNN]
movq mm6, [x264_mmx_PNNP]
movq mm4, [x264_mmx_PPPN]
movq mm7, [x264_mmx_PPNP]
;-------------------------------------------------------------------------
; horizontal dct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
%assign disp 0
%rep 8
movq mm0, [rdi+disp]
movq mm1, [rdi+disp+8]
pshufw mm2, mm1, 00011011b
movq mm1, mm0
paddw mm0, mm2 ; (low)s07/s16/s25/s34(high)
psubw mm1, mm2 ; (low)d07/d16/d25/d34(high)
pshufw mm2, mm0, 00011011b ; (low)s34/s25/s16/s07(high)
pmullw mm0, mm5 ; (low)s07/s16/-s25/-s34(high)
paddw mm0, mm2 ; (low)a0/a1/a3/a2(high)
movq mm3, mm1
psraw mm1, 1 ; (low)d07/d16/d25/d34(high) (x>>1)
pshufw mm2, mm3, 10110001b ; (low)d16/d07/d34/d25(high)
paddw mm1, mm3 ; (low)d07/d16/d25/d34(high) (x+(x>>1))
pshufw mm3, mm2, 00011011b ; (low)d25/d34/d07/d16(high)
pmullw mm2, mm5 ; (low)d16/d07/-d34/-d25(high)
pmullw mm1, mm6 ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
paddw mm3, mm2
paddw mm1, mm3 ; (low)a4/a6/a5/a7(high)
pshufw mm2, mm0, 11001001b ; (low)a1/a3/a0/a2(high)
pshufw mm0, mm0, 10011100b ; (low)a0/a2/a1/a3(high)
pmullw mm2, [x264_mmx_2121]
pmullw mm0, mm5 ; (low)a0/a2/-a1/-a3(high)
psraw mm2, 1 ; (low)a1/a3>>1/a0/a2>>1(high)
paddw mm0, mm2 ; (low)dst0/dst2/dst4/dst6(high)
pshufw mm1, mm1, 00100111b ; (low)a7/a6/a5/a4(high)
pshufw mm2, mm1, 00011011b ; (low)a4/a5/a6/a7(high)
psraw mm1, 2 ; (low)a7>>2/a6>>2/a5>>2/a4>>2(high)
pmullw mm2, mm4 ; (low)a4/a5/a6/-a7(high)
pmullw mm1, mm7 ; (low)a7>>2/a6>>2/-a5>>2/a4>>2(high)
paddw mm1, mm2 ; (low)dst1/dst3/dst5/dst7(high)
movq mm2, mm0
punpcklwd mm0, mm1 ; (low)dst0/dst1/dst2/dst3(high)
punpckhwd mm2, mm1 ; (low)dst4/dst5/dst6/dst7(high)
movq [rdi+disp], mm0
movq [rdi+disp+8], mm2
%assign disp disp+16
%endrep
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_ydct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_ydct8_mmx:
;-------------------------------------------------------------------------
; vertical dct ( compute 4 columns at a time -> 2 loops )
;-------------------------------------------------------------------------
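; ( MMX_SUMSUB_BA a, b leaves a = a+b and b = b-a; each column goes through
;   the same butterfly as each row in x264_xdct8_mmxext above )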
%assign disp 0
%rep 2
MMX_LOADSUMSUB mm2, mm3, [rdi+disp+0*16], [rdi+disp+7*16] ; mm2 = s07, mm3 = d07
MMX_LOADSUMSUB mm1, mm5, [rdi+disp+1*16], [rdi+disp+6*16] ; mm1 = s16, mm5 = d16
MMX_LOADSUMSUB mm0, mm6, [rdi+disp+2*16], [rdi+disp+5*16] ; mm0 = s25, mm6 = d25
MMX_LOADSUMSUB mm4, mm7, [rdi+disp+3*16], [rdi+disp+4*16] ; mm4 = s34, mm7 = d34
MMX_SUMSUB_BA mm4, mm2 ; mm4 = a0, mm2 = a2
MMX_SUMSUB_BA mm0, mm1 ; mm0 = a1, mm1 = a3
MMX_SUMSUB_BA mm0, mm4 ; mm0 = dst0, mm4 = dst4
movq [rdi+disp+0*16], mm0
movq [rdi+disp+4*16], mm4
movq mm0, mm1 ; a3
psraw mm0, 1 ; a3>>1
paddw mm0, mm2 ; a2 + (a3>>1)
psraw mm2, 1 ; a2>>1
psubw mm2, mm1 ; (a2>>1) - a3
movq [rdi+disp+2*16], mm0
movq [rdi+disp+6*16], mm2
movq mm0, mm6
psraw mm0, 1
paddw mm0, mm6 ; d25+(d25>>1)
movq mm1, mm3
psubw mm1, mm7
psubw mm1, mm0 ; a5 = d07-d34-(d25+(d25>>1))
movq mm0, mm5
psraw mm0, 1
paddw mm0, mm5 ; d16+(d16>>1)
movq mm2, mm3
paddw mm2, mm7
psubw mm2, mm0 ; a6 = d07+d34-(d16+(d16>>1))
movq mm0, mm3
psraw mm0, 1
paddw mm0, mm3 ; d07+(d07>>1)
paddw mm0, mm5
paddw mm0, mm6 ; a4 = d16+d25+(d07+(d07>>1))
movq mm3, mm7
psraw mm3, 1
paddw mm3, mm7 ; d34+(d34>>1)
paddw mm3, mm5
psubw mm3, mm6 ; a7 = d16-d25+(d34+(d34>>1))
movq mm7, mm3
psraw mm7, 2
paddw mm7, mm0 ; a4 + (a7>>2)
movq mm6, mm2
psraw mm6, 2
paddw mm6, mm1 ; a5 + (a6>>2)
psraw mm0, 2
psraw mm1, 2
psubw mm0, mm3 ; (a4>>2) - a7
psubw mm2, mm1 ; a6 - (a5>>2)
movq [rdi+disp+1*16], mm7
movq [rdi+disp+3*16], mm6
movq [rdi+disp+5*16], mm2
movq [rdi+disp+7*16], mm0
%assign disp disp+8
%endrep
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_xidct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xidct8_mmxext:
movq mm4, [x264_mmx_PPNN]
movq mm5, [x264_mmx_PNPN]
movq mm6, [x264_mmx_PPNP]
movq mm7, [x264_mmx_PPPN]
;-------------------------------------------------------------------------
; horizontal idct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
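; ( same inverse butterfly as the column pass in x264_yidct8_mmx below;
;   e0/e2/e4/e6 here correspond to a0/a2/a4/a6 there, and the final >>6
;   is applied only in the column pass )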
%assign disp 0
%rep 8
pshufw mm0, [rdi+disp], 11011000b ; (low)d0,d2,d1,d3(high)
pshufw mm2, [rdi+disp+8], 11011000b ; (low)d4,d6,d5,d7(high)
movq mm1, mm0
punpcklwd mm0, mm2 ; (low)d0,d4,d2,d6(high)
punpckhwd mm1, mm2 ; (low)d1,d5,d3,d7(high)
pshufw mm2, mm0, 10110001b ; (low)d4,d0,d6,d2(high)
pmullw mm0, [x264_mmx_p2n2p1p1]; (low)2*d0,-2*d4,d2,d6(high)
pmullw mm2, mm6 ; (low)d4,d0,-d6,d2(high)
psraw mm0, 1 ; (low)d0,-d4,d2>>1,d6>>1(high)
paddw mm0, mm2 ; (low)e0,e2,e4,e6(high)
movq mm3, mm1 ; (low)d1,d5,d3,d7(high)
psraw mm1, 1 ; (low)d1>>1,d5>>1,d3>>1,d7>>1(high)
pshufw mm2, mm3, 10110001b ; (low)d5,d1,d7,d3(high)
paddw mm1, mm3 ; (low)d1+(d1>>1),d5+(d5>>1),d3+(d3>>1),d7+(d7>>1)(high)
pshufw mm3, mm2, 00011011b ; (low)d3,d7,d1,d5(high)
pmullw mm1, mm4 ; (low)d1+(d1>>1),d5+(d5>>1),-d3-(d3>>1),-d7-(d7>>1)(high)
pmullw mm2, mm5 ; (low)d5,-d1,d7,-d3(high)
paddw mm1, mm3
paddw mm1, mm2 ; (low)e7,e5,e3,e1(high)
pshufw mm2, mm0, 00011011b ; (low)e6,e4,e2,e0(high)
pmullw mm0, mm4 ; (low)e0,e2,-e4,-e6(high)
pshufw mm3, mm1, 00011011b ; (low)e1,e3,e5,e7(high)
psraw mm1, 2 ; (low)e7>>2,e5>>2,e3>>2,e1>>2(high)
pmullw mm3, mm6 ; (low)e1,e3,-e5,e7(high)
pmullw mm1, mm7 ; (low)e7>>2,e5>>2,e3>>2,-e1>>2(high)
paddw mm0, mm2 ; (low)f0,f2,f4,f6(high)
paddw mm1, mm3 ; (low)f1,f3,f5,f7(high)
pshufw mm3, mm0, 00011011b ; (low)f6,f4,f2,f0(high)
pshufw mm2, mm1, 00011011b ; (low)f7,f5,f3,f1(high)
psubw mm3, mm1
paddw mm0, mm2
movq [rdi+disp], mm0
movq [rdi+disp+8], mm3
%assign disp disp+16
%endrep
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_yidct8_mmx( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_yidct8_mmx:
;-------------------------------------------------------------------------
; vertical idct ( compute 4 columns at a time -> 2 loops )
;-------------------------------------------------------------------------
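; Reference butterfly per column (d0..d7 = input words, matching the
; register comments below):
;   a0 = d0 + d4              a2 = d0 - d4
;   a4 = (d2>>1) - d6         a6 = d2 + (d6>>1)
;   e1 = d5 - d3 - d7 - (d7>>1)       e3 = d1 - d3 + d7 - (d3>>1)
;   e5 = d7 - d1 + d5 + (d5>>1)       e7 = d1 + d3 + d5 + (d1>>1)
;   f0 = a0 + a6    f2 = a2 + a4    f4 = a2 - a4    f6 = a0 - a6
;   f1 = e1 + (e7>>2)   f3 = e3 + (e5>>2)   f5 = (e3>>2) - e5   f7 = e7 - (e1>>2)
;   g0 = f0 + f7    g1 = f2 + f5    g2 = f3 + f4    g3 = f1 + f6
;   g4 = f6 - f1    g5 = f4 - f3    g6 = f2 - f5    g7 = f0 - f7
;   row[i] = g[i] >> 6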
%assign disp 0
%rep 2
movq mm1, [rdi+disp+1*16] ; mm1 = d1
movq mm3, [rdi+disp+3*16] ; mm3 = d3
movq mm5, [rdi+disp+5*16] ; mm5 = d5
movq mm7, [rdi+disp+7*16] ; mm7 = d7
movq mm4, mm7
psraw mm4, 1
movq mm0, mm5
psubw mm0, mm7
psubw mm0, mm4
psubw mm0, mm3 ; mm0 = e1
movq mm6, mm3
psraw mm6, 1
movq mm2, mm7
psubw mm2, mm6
psubw mm2, mm3
paddw mm2, mm1 ; mm2 = e3
movq mm4, mm5
psraw mm4, 1
paddw mm4, mm5
paddw mm4, mm7
psubw mm4, mm1 ; mm4 = e5
movq mm6, mm1
psraw mm6, 1
paddw mm6, mm1
paddw mm6, mm5
paddw mm6, mm3 ; mm6 = e7
movq mm1, mm0
movq mm3, mm4
movq mm5, mm2
movq mm7, mm6
psraw mm6, 2
psraw mm3, 2
psraw mm5, 2
psraw mm0, 2
paddw mm1, mm6 ; mm1 = f1
paddw mm3, mm2 ; mm3 = f3
psubw mm5, mm4 ; mm5 = f5
psubw mm7, mm0 ; mm7 = f7
movq mm2, [rdi+disp+2*16] ; mm2 = d2
movq mm6, [rdi+disp+6*16] ; mm6 = d6
movq mm4, mm2
movq mm0, mm6
psraw mm4, 1
psraw mm6, 1
psubw mm4, mm0 ; mm4 = a4
paddw mm6, mm2 ; mm6 = a6
movq mm2, [rdi+disp+0*16] ; mm2 = d0
movq mm0, [rdi+disp+4*16] ; mm0 = d4
MMX_SUMSUB_BA mm0, mm2 ; mm0 = a0, mm2 = a2
MMX_SUMSUB_BA mm6, mm0 ; mm6 = f0, mm0 = f6
MMX_SUMSUB_BA mm4, mm2 ; mm4 = f2, mm2 = f4
MMX_SUMSUB_BA mm7, mm6 ; mm7 = g0, mm6 = g7
MMX_SUMSUB_BA mm5, mm4 ; mm5 = g1, mm4 = g6
MMX_SUMSUB_BA mm3, mm2 ; mm3 = g2, mm2 = g5
MMX_SUMSUB_BA mm1, mm0 ; mm1 = g3, mm0 = g4
psraw mm7, 6
psraw mm6, 6
psraw mm5, 6
psraw mm4, 6
psraw mm3, 6
psraw mm2, 6
psraw mm1, 6
psraw mm0, 6
movq [rdi+disp+0*16], mm7
movq [rdi+disp+1*16], mm5
movq [rdi+disp+2*16], mm3
movq [rdi+disp+3*16], mm1
movq [rdi+disp+4*16], mm0
movq [rdi+disp+5*16], mm2
movq [rdi+disp+6*16], mm4
movq [rdi+disp+7*16], mm6
%assign disp disp+8
%endrep
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_add_8x8_mmx( uint8_t *dst, int i_dst, int16_t src[8][8] );
;-----------------------------------------------------------------------------
x264_pixel_add_8x8_mmx:
; mov rdi, rdi ; dst
movsxd rsi, esi ; i_dst
; mov rdx, rdx ; src
MMX_ZERO mm7
%assign disp 0
%rep 8
MMX_STORE_DIFF_8P mm0, mm1, [rdi], [rdx+disp], [rdx+disp+8], mm7
add rdi, rsi
%assign disp disp+16
%endrep
ret
@@ -412,6 +412,12 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct = x264_add8x8_idct_mmxext;
dctf->add16x16_idct = x264_add16x16_idct_mmxext;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmxext;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmxext;
dctf->add8x8_idct8 = x264_add8x8_idct8_mmxext;
dctf->add16x16_idct8 = x264_add16x16_idct8_mmxext;
dctf->dct4x4dc = x264_dct4x4dc_mmxext;
dctf->idct4x4dc = x264_idct4x4dc_mmxext;
}
@@ -6,6 +6,7 @@
;*
;* Authors: Min Chen <chenm001.163.com> (converted to nasm)
;* Laurent Aimar <fenrir@via.ecp.fr> (initial version)
;* Christian Heine <sennindemokrit@gmx.net> (dct8/idct8 functions)
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -27,6 +28,7 @@
;* Revision history: *
;* *
;* 2004.04.28 portab all 4x4 function to nasm (CM) *
;* 2005.08.24 added mmxext optimized dct8/idct8 functions (CH) *
;* *
;*****************************************************************************
@@ -121,9 +123,6 @@ BITS 32
movd %5, %1
%endmacro
;%macro
;%endmacro
;=============================================================================
; Local Data (Read Only)
;=============================================================================
@@ -139,11 +138,15 @@ SECTION .rodata data align=16
;-----------------------------------------------------------------------------
ALIGN 16
x264_mmx_1:
dw 1, 1, 1, 1
x264_mmx_32:
dw 32, 32, 32, 32
x264_mmx_1: dw 1, 1, 1, 1
x264_mmx_32: dw 32, 32, 32, 32
x264_mmx_PPNN: dw 1, 1, -1, -1
x264_mmx_PNPN: dw 1, -1, 1, -1
x264_mmx_PNNP: dw 1, -1, -1, 1
x264_mmx_PPPN: dw 1, 1, 1, -1
x264_mmx_PPNP: dw 1, 1, -1, 1
x264_mmx_2121: dw 2, 1, 2, 1
x264_mmx_p2n2p1p1: dw 2, -2, 1, 1
;=============================================================================
; Code
@@ -311,3 +314,421 @@ x264_add4x4_idct_mmxext:
ret
; =============================================================================
; 8x8 Transform
; =============================================================================
; -----------------------------------------------------------------------------
; input: 2x8 unsigned bytes (%5,%6), zero (%7); output: difference words (%1 = low, %2 = high); %3,%4 are clobbered as temporaries
; -----------------------------------------------------------------------------
%macro MMX_LOAD_DIFF_8P 7
movq %1, %5
movq %2, %1
punpcklbw %1, %7
punpckhbw %2, %7
movq %3, %6
movq %4, %3
punpcklbw %3, %7
punpckhbw %4, %7
psubw %1, %3
psubw %2, %4
%endmacro
%macro MMX_LOADSUMSUB 4 ; returns %1=%3+%4, %2=%3-%4
movq %2, %3
movq %1, %4
MMX_SUMSUB_BA %1, %2
%endmacro
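; -----------------------------------------------------------------------------
; input: 8 packed pixels (%3), 2x4 words to add (%4 low, %5 high), zero (%6)
; output: %3 = clip_uint8( pixel + word ); %1,%2 are clobbered as temporaries
; -----------------------------------------------------------------------------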
%macro MMX_STORE_DIFF_8P 6
movq %1, %3
movq %2, %1
punpcklbw %1, %6
punpckhbw %2, %6
paddw %1, %4
paddw %2, %5
packuswb %1, %2
movq %3, %1
%endmacro
cglobal x264_pixel_sub_8x8_mmx
cglobal x264_xdct8_mmxext
cglobal x264_ydct8_mmx
cglobal x264_ydct8_sse2
cglobal x264_xidct8_mmxext
cglobal x264_yidct8_mmx
cglobal x264_yidct8_sse2
cglobal x264_pixel_add_8x8_mmx
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_pixel_sub_8x8_mmx( int16_t *diff, uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 );
;-----------------------------------------------------------------------------
x264_pixel_sub_8x8_mmx:
push ebx
push ebp
mov ebp, [esp+12] ; diff
mov eax, [esp+16] ; pix1
mov ebx, [esp+20] ; i_pix1
mov ecx, [esp+24] ; pix2
mov edx, [esp+28] ; i_pix2
MMX_ZERO mm7
%assign disp 0
%rep 8
MMX_LOAD_DIFF_8P mm0, mm1, mm2, mm3, [eax], [ecx], mm7
movq [ebp+disp], mm0
movq [ebp+disp+8], mm1
add eax, ebx
add ecx, edx
%assign disp disp+16
%endrep
pop ebp
pop ebx
ret
ALIGN 16
;-----------------------------------------------------------------------------
; void __cdecl x264_xdct8_mmxext( int16_t dest[8][8] );
;-----------------------------------------------------------------------------
x264_xdct8_mmxext:
mov eax, [esp+04] ; dest
movq mm5, [x264_mmx_PPNN]
movq mm6, [x264_mmx_PNNP]
movq mm4, [x264_mmx_PPPN]
movq mm7, [x264_mmx_PPNP]
;-------------------------------------------------------------------------
; horizontal dct ( compute 1 row at a time -> 8 loops )
;-------------------------------------------------------------------------
%assign disp 0
%rep 8
movq mm0, [eax+disp]
movq mm1, [eax+disp+8]
pshufw mm2, mm1, 00011011b
movq mm1, mm0
paddw mm0, mm2 ; (low)s07/s16/s25/s34(high)
psubw mm1, mm2 ; (low)d07/d16/d25/d34(high)
pshufw mm2, mm0, 00011011b ; (low)s34/s25/s16/s07(high)
pmullw mm0, mm5 ; (low)s07/s16/-s25/-s34(high)
paddw mm0, mm2 ; (low)a0/a1/a3/a2(high)
movq mm3, mm1
psraw mm1, 1 ; (low)d07/d16/d25/d34(high) (x>>1)
pshufw mm2, mm3, 10110001b ; (low)d16/d07/d34/d25(high)
paddw mm1, mm3 ; (low)d07/d16/d25/d34(high) (x+(x>>1))
pshufw mm3, mm2, 00011011b ; (low)d25/d34/d07/d16(high)
pmullw mm2, mm5 ; (low)d16/d07/-d34/-d25(high)
pmullw mm1, mm6 ; (low)d07/-d16/-d25/d34(high) (x+(x>>1))
paddw mm3, mm2
paddw mm1, mm3 ; (low)a4/a6/a5/a7(high)
pshufw mm2, mm0, 11001001b ; (low)a1/a3/a0/a2(high)
pshufw mm0, mm0, 10011100b ; (low)a0/a2/a1/a3(high)
pmullw mm2, [x264_mmx_2121]
pmullw mm0, mm5 ; (low)a0/a2/-a1/-a3(high)
psraw mm2, 1 ; (low)a1/a3>>1/a0/a2>>1(high)
paddw mm0, mm2 ; (low)dst0/dst2/dst4/dst6(high)