Commit 681b3944 authored by Loren Merritt
Browse files

amd64 mmx for some intra pred functions



git-svn-id: svn://svn.videolan.org/x264/trunk@429 df754926-b1dd-0310-bc7b-ec298dee348c
parent e1d852d2
......@@ -29,7 +29,7 @@ endif
# MMX/SSE optims
ifeq ($(ARCH),X86_64)
SRCS += common/i386/mc-c.c common/i386/dct-c.c common/amd64/predict.c
SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
......
......@@ -3,6 +3,8 @@
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
......@@ -26,23 +28,139 @@ BITS 64
%include "amd64inc.asm"
;=============================================================================
; Macros
;=============================================================================
; SAVE_0_1 addr
; Stores 16 bytes at [addr]: mm0 goes to [addr], mm1 goes to [addr + 8].
; The two stores touch disjoint bytes, so their order does not matter.
%macro SAVE_0_1 1
    movq        [%1 + 8], mm1
    movq        [%1],     mm0
%endmacro
; SAVE_0_0 addr
; Stores 16 bytes at [addr] by writing mm0 to both [addr] and [addr + 8].
%macro SAVE_0_0 1
    movq        [%1 + 8], mm0
    movq        [%1],     mm0
%endmacro
SECTION .rodata align=16

; Packed 64-bit constants used as MMX memory operands by the code below.
ALIGN 8
pw_2:       times 4 dw 2                ; four words, each 2
pw_8:       times 4 dw 8                ; four words, each 8
pb_1:       times 8 db 1                ; eight bytes, each 1
pw_3210:    dw 0, 1, 2, 3               ; word n holds n (lowest word first)
;=============================================================================
; Code
;=============================================================================
SECTION .text
; Symbols exported from this file. cglobal is not a NASM built-in; it is
; presumably a macro supplied by the included amd64inc.asm - confirm there.
; predict_8x8c_v_mmx and predict_16x16_v_mmx are declared here but their
; bodies are not part of the visible hunks.
cglobal predict_8x8_v_mmxext
cglobal predict_8x8_dc_core_mmxext
cglobal predict_8x8c_v_mmx
cglobal predict_8x8c_dc_core_mmxext
cglobal predict_8x8c_p_core_mmx
cglobal predict_16x16_p_core_mmx
cglobal predict_16x16_v_mmx
cglobal predict_16x16_dc_core_mmxext
cglobal predict_16x16_dc_top_mmxext
; PRED8x8_LOWPASS dst, src
; Per-byte 3-tap low-pass filter over eight pixels at once.
; In:    mm1 = left-hand neighbours, mm2 = right-hand neighbours,
;        %2  = centre pixels (register or memory operand)
; Out:   %1  = (mm1 + 2*%2 + mm2 + 2) >> 2 for each byte
; Clobb: mm1, mm2, mm3
%macro PRED8x8_LOWPASS 2
; keep a copy of the left neighbours for the parity computation below
movq mm3, mm1
; pavgb rounds up: mm1 = (left + right + 1) >> 1
pavgb mm1, mm2
; mm2 = left ^ right; its low bit says whether left + right was odd
pxor mm2, mm3
movq %1 , %2
pand mm2, [pb_1 GLOBAL]
; remove the round-up again so that mm1 = (left + right) >> 1 exactly;
; otherwise the second pavgb would round twice
psubusb mm1, mm2
pavgb %1 , mm1 ; %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%endmacro
; PRED8x8_LOAD_TOP
; Loads the eight pixels of the row above the block and low-pass filters
; them, substituting edge pixels for neighbours that are flagged as missing.
; In:    parm1q = src, parm2d = i_stride (C int), parm3d = i_neighbors
; Out:   mm0    = filtered top row
;        parm1q = src - i_stride (address of the row above)
;        parm2q = i_stride sign-extended to 64 bits
; Clobb: mm1, mm2, mm3, eax, parm3d, flags
; Defines the function-local labels .have_topleft / .have_topright, so it
; can be expanded at most once per global label.
%macro PRED8x8_LOAD_TOP 0
    ; i_stride is passed as a 32-bit int; the upper half of the 64-bit
    ; register is not guaranteed by the ABI, so extend before using parm2q.
    movsxd      parm2q, parm2d
    sub         parm1q, parm2q
    and         parm3d, 12              ; keep only the bits tested below (8 and 4)
    movq        mm1, [parm1q-1]         ; t[-1..6]
    movq        mm2, [parm1q+1]         ; t[1..8]
    cmp         parm3d, byte 8
    jge         .have_topleft           ; bit 8 set: t[-1] is usable
    mov         al, [parm1q]
    mov         ah, [parm1q]
    pinsrw      mm1, eax, 0             ; bytes 0,1 <- t[0],t[0] (replaces t[-1])
.have_topleft:
    and         parm3d, byte 4
    jne         .have_topright          ; bit 4 set: t[8] is usable
    mov         al, [parm1q+7]
    mov         ah, [parm1q+7]
    pinsrw      mm2, eax, 3             ; bytes 6,7 <- t[7],t[7] (replaces t[8])
.have_topright:
    PRED8x8_LOWPASS mm0, [parm1q]
%endmacro
;-----------------------------------------------------------------------------
; void predict_8x8_v_mmxext( uint8_t *src, int i_stride, int i_neighbors )
;
; Vertical prediction: every one of the 8 rows of the block receives the
; filtered top row produced by PRED8x8_LOAD_TOP.
; Clobb: parm1q, parm3d, rax, mm0-mm3, flags
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_v_mmxext:
    PRED8x8_LOAD_TOP                            ; mm0 = row, parm1q = src - stride
    ; rows reachable from (src - stride) with a plain scale factor
    movq        [parm1q + 8*parm2q], mm0        ; row 7
    movq        [parm1q + 4*parm2q], mm0        ; row 3
    movq        [parm1q + 2*parm2q], mm0        ; row 1
    movq        [parm1q + parm2q],   mm0        ; row 0
    lea         rax, [parm2q + 2*parm2q]        ; rax = 3 * stride
    add         parm1q, rax                     ; parm1q = src + 2 * stride
    ; remaining rows, addressed from row 2
    movq        [parm1q + 4*parm2q], mm0        ; row 6
    movq        [parm1q + rax],      mm0        ; row 5
    movq        [parm1q + 2*parm2q], mm0        ; row 4
    movq        [parm1q],            mm0        ; row 2
    ret
;-----------------------------------------------------------------------------
; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, uint8_t *pix_left );
;
; DC prediction: fills the 8x8 block with
;     ( sum(filtered top row) + sum(filtered pix_left[0..7]) + 8 ) >> 4
; pix_left[-1] and pix_left[8] are read as filter neighbours.
; Clobb: parm1q, parm3d, rax, mm0-mm4, flags
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8_dc_core_mmxext:
    ; filter the left column into mm4
    movq        mm2, [parm4q+1]
    movq        mm1, [parm4q-1]
    PRED8x8_LOWPASS mm4, [parm4q]
    ; filter the top row into mm0 (also moves parm1q to src - stride)
    PRED8x8_LOAD_TOP
    pxor        mm1, mm1
    psadbw      mm4, mm1                        ; sum of the left bytes
    psadbw      mm0, mm1                        ; sum of the top bytes
    paddw       mm0, mm4
    paddw       mm0, [pw_8 GLOBAL]              ; rounding
    psrlw       mm0, 4
    pshufw      mm0, mm0, 0                     ; broadcast the dc word
    packuswb    mm0, mm0                        ; dc in all eight bytes
    ; rows reachable from (src - stride) with a plain scale factor
    movq        [parm1q + 8*parm2q], mm0        ; row 7
    movq        [parm1q + 4*parm2q], mm0        ; row 3
    movq        [parm1q + 2*parm2q], mm0        ; row 1
    movq        [parm1q + parm2q],   mm0        ; row 0
    lea         rax, [parm2q + 2*parm2q]        ; rax = 3 * stride
    add         parm1q, rax                     ; parm1q = src + 2 * stride
    movq        [parm1q + 4*parm2q], mm0        ; row 6
    movq        [parm1q + rax],      mm0        ; row 5
    movq        [parm1q + 2*parm2q], mm0        ; row 4
    movq        [parm1q],            mm0        ; row 2
    ret
;-----------------------------------------------------------------------------
;
......@@ -68,6 +186,154 @@ predict_8x8c_v_mmx :
ret
;-----------------------------------------------------------------------------
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int i_stride, int s2, int s3 )
;
; Chroma DC prediction. With s0 = sum of top[0..3] and s1 = sum of top[4..7]
; (both computed here), the four 4x4 quadrants are filled with:
;     top-left     dc0 = (s0 + s2) >> 3
;     top-right    dc1 = (s1 + 2) >> 2
;     bottom-left  dc2 = s3 >> 2
;     bottom-right dc3 = (s1 + s3 + 2) >> 3
; NOTE(review): no rounding term is added to s2/s3 here, so the caller
; presumably folds it into those arguments - confirm against the C wrapper.
; In:    parm1q = src, parm2d = i_stride, parm3d = s2, parm4d = s3
; Clobb: parm1q, parm2q, rax, mm0-mm5
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8c_dc_core_mmxext:
    ; i_stride is a 32-bit int; the ABI leaves the upper register half
    ; undefined, so extend it before it is used in 64-bit addressing.
    movsxd      parm2q, parm2d
    sub         parm1q, parm2q                  ; parm1q = row above the block
    lea         rax, [parm2q + 2*parm2q]        ; rax = 3 * stride
    movq        mm0, [parm1q]
    pxor        mm1, mm1
    pxor        mm2, mm2
    punpckhbw   mm1, mm0                        ; top[4..7] interleaved with zero bytes
    punpcklbw   mm0, mm2                        ; top[0..3] interleaved with zero bytes
    psadbw      mm1, mm2                        ; s1
    psadbw      mm0, mm2                        ; s0
    movd        mm4, parm3d
    movd        mm5, parm4d
    paddw       mm0, mm4
    pshufw      mm2, mm5, 0                     ; s3 in every word
    psrlw       mm0, 3
    paddw       mm1, [pw_2 GLOBAL]
    movq        mm3, mm2
    pshufw      mm1, mm1, 0
    pshufw      mm0, mm0, 0                     ; dc0 (w)
    paddw       mm3, mm1
    psrlw       mm3, 3                          ; dc3 (w)
    psrlw       mm2, 2                          ; dc2 (w)
    psrlw       mm1, 2                          ; dc1 (w)
    packuswb    mm0, mm1                        ; dc0,dc1 (b)
    packuswb    mm2, mm3                        ; dc2,dc3 (b)
    movq        [parm1q + parm2q],   mm0        ; 0
    movq        [parm1q + 2*parm2q], mm0        ; 1
    movq        [parm1q + rax],      mm0        ; 2
    movq        [parm1q + 4*parm2q], mm0        ; 3
    lea         parm1q, [parm1q + 4*parm2q]
    movq        [parm1q + parm2q],   mm2        ; 4
    movq        [parm1q + 2*parm2q], mm2        ; 5
    movq        [parm1q + rax],      mm2        ; 6
    movq        [parm1q + 4*parm2q], mm2        ; 7
    ret
;-----------------------------------------------------------------------------
; void predict_8x8c_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c )
;
; Plane prediction for an 8x8 block. Using 16-bit saturating arithmetic,
; pixel (x, y) is written as (i00 + b*x + c*y) >> 5 clamped to 0..255.
; In:    parm1q = src, parm2d = i_stride, parm3d = i00, parm4d = b, parm5d = c
; Clobb: parm1q, parm2q, eax, mm0-mm2, mm4-mm7, flags
; NOTE(review): pshufw and pmaxsw are MMXEXT instructions although the
; symbol carries an _mmx suffix.
;-----------------------------------------------------------------------------
ALIGN 16
predict_8x8c_p_core_mmx:
    ; i_stride is a 32-bit int; the ABI leaves the upper register half
    ; undefined, so extend it before it is added to the 64-bit pointer.
    movsxd      parm2q, parm2d
    movd        mm0, parm3d
    movd        mm2, parm4d
    movd        mm4, parm5d
    pshufw      mm0, mm0, 0                     ; i00 in every word
    pshufw      mm2, mm2, 0                     ; b   in every word
    pshufw      mm4, mm4, 0                     ; c   in every word
    movq        mm1, mm2
    pmullw      mm2, [pw_3210 GLOBAL]           ; {0*b, 1*b, 2*b, 3*b}
    psllw       mm1, 2                          ; 4*b
    paddsw      mm0, mm2        ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
    paddsw      mm1, mm0        ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
    pxor        mm5, mm5                        ; zero, lower clamp bound
    mov         eax, 8                          ; row counter
ALIGN 4
.loop:
    movq        mm6, mm0
    movq        mm7, mm1
    psraw       mm6, 5
    psraw       mm7, 5
    pmaxsw      mm6, mm5
    pmaxsw      mm7, mm5
    packuswb    mm6, mm7                        ; saturate to bytes
    movq        [parm1q], mm6
    paddsw      mm0, mm4                        ; advance one row: += c
    paddsw      mm1, mm4
    add         parm1q, parm2q
    dec         eax
    jg          .loop
    nop
    ret
;-----------------------------------------------------------------------------
; void predict_16x16_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c )
;
; Plane prediction for a 16x16 block. Using 16-bit saturating arithmetic,
; pixel (x, y) is written as (i00 + b*x + c*y) >> 5 clamped to 0..255.
; In:    parm1q = src, parm2d = i_stride, parm3d = i00, parm4d = b, parm5d = c
; Clobb: parm1q, parm2q, eax, mm0-mm7, flags
; NOTE(review): pshufw and pmaxsw are MMXEXT instructions although the
; symbol carries an _mmx suffix.
;-----------------------------------------------------------------------------
ALIGN 16
predict_16x16_p_core_mmx:
    ; i_stride is a 32-bit int; the ABI leaves the upper register half
    ; undefined, so extend it before it is added to the 64-bit pointer.
    movsxd      parm2q, parm2d
    movd        mm0, parm3d
    movd        mm2, parm4d
    movd        mm4, parm5d
    pshufw      mm0, mm0, 0                     ; i00 in every word
    pshufw      mm2, mm2, 0                     ; b   in every word
    pshufw      mm4, mm4, 0                     ; c   in every word
    movq        mm5, mm2
    movq        mm1, mm2
    pmullw      mm5, [pw_3210 GLOBAL]           ; {0*b, 1*b, 2*b, 3*b}
    psllw       mm2, 3                          ; 8*b
    psllw       mm1, 2                          ; 4*b
    movq        mm3, mm2
    paddsw      mm0, mm5        ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
    paddsw      mm1, mm0        ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
    paddsw      mm2, mm0        ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
    paddsw      mm3, mm1        ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
    pxor        mm5, mm5                        ; zero, lower clamp bound
    mov         eax, 16                         ; row counter
ALIGN 4
.loop:
    ; left 8 pixels of the row
    movq        mm6, mm0
    movq        mm7, mm1
    psraw       mm6, 5
    psraw       mm7, 5
    pmaxsw      mm6, mm5
    pmaxsw      mm7, mm5
    packuswb    mm6, mm7
    movq        [parm1q], mm6
    ; right 8 pixels of the row
    movq        mm6, mm2
    movq        mm7, mm3
    psraw       mm6, 5
    psraw       mm7, 5
    pmaxsw      mm6, mm5
    pmaxsw      mm7, mm5
    packuswb    mm6, mm7
    movq        [parm1q+8], mm6
    ; advance one row: every accumulator += c
    paddsw      mm0, mm4
    paddsw      mm1, mm4
    paddsw      mm2, mm4
    paddsw      mm3, mm4
    add         parm1q, parm2q
    dec         eax
    jg          .loop
    nop
    ret
;-----------------------------------------------------------------------------
;
; void predict_16x16_v_mmx( uint8_t *src, int i_stride )
......@@ -103,3 +369,48 @@ predict_16x16_v_mmx :
SAVE_0_1 (parm1q + rax) ; 15
ret
;-----------------------------------------------------------------------------
;
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_stride, int i_dc_left )
;
;-----------------------------------------------------------------------------
; PRED16x16_DC addend, shift
; Fills a 16x16 block with the byte
;     ( sum of the 16 pixels above the block + addend ) >> shift
; In:    parm1q = src, parm2d = i_stride (C int)
;        %1 = MMX register or memory operand whose low word is added to the sum
;        %2 = right-shift count
; Clobb: parm1q, parm2q, eax, r8, mm0, mm1, flags
; Defines the function-local label .loop, so it can be expanded at most once
; per global label.
%macro PRED16x16_DC 2
    ; i_stride is a 32-bit int; the ABI leaves the upper register half
    ; undefined, so extend it before it is used in 64-bit addressing.
    movsxd      parm2q, parm2d
    sub         parm1q, parm2q                  ; parm1q <-- line -1
    pxor        mm0, mm0
    pxor        mm1, mm1
    psadbw      mm0, [parm1q]                   ; sum of top[0..7]
    psadbw      mm1, [parm1q + 8]               ; sum of top[8..15]
    paddusw     mm0, mm1
    paddusw     mm0, %1
    psrlw       mm0, %2                         ; dc
    pshufw      mm0, mm0, 0
    lea         r8, [parm2q + 2*parm2q]         ; r8 <-- 3 * stride
    packuswb    mm0, mm0                        ; dc in bytes
    mov         eax, 4                          ; 4 iterations of 4 rows each
ALIGN 4
.loop:
    SAVE_0_0 (parm1q + parm2q)                  ; 0
    SAVE_0_0 (parm1q + 2 * parm2q)              ; 1
    SAVE_0_0 (parm1q + r8 )                     ; 2
    SAVE_0_0 (parm1q + 4 * parm2q)              ; 3
    dec         eax
    lea         parm1q, [parm1q + 4 * parm2q]   ; lea leaves the flags from dec intact
    jg          .loop
    nop
%endmacro
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_stride, int i_dc_left )
; Fills the 16x16 block with (sum of the 16 pixels above it + i_dc_left) >> 5.
; NOTE(review): no rounding term is added here, so i_dc_left presumably
; already contains it - confirm against the C caller.
ALIGN 16
predict_16x16_dc_core_mmxext:
; low word of mm2 = i_dc_left, consumed as the macro's addend
movd mm2, parm3d
PRED16x16_DC mm2, 5
ret
; Fills the 16x16 block with (sum of the 16 pixels above it + 8) >> 4.
; NOTE(review): only parm1q/parm2q are read, so the C prototype is presumably
; ( uint8_t *src, int i_stride ) - confirm against the caller.
ALIGN 16
predict_16x16_dc_top_mmxext:
PRED16x16_DC [pw_8 GLOBAL], 4
ret
/*****************************************************************************
* predict.c: h264 encoder
*****************************************************************************
* Copyright (C) 2006 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110 USA
*****************************************************************************/
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include "common/predict.h"
#include "common/i386/predict.h"
extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
/****************************************************************************
* 16x16 prediction for intra luma block
****************************************************************************/
/* Writes the 64-bit pattern v over 16 rows of 16 bytes starting at src.
 * Expects `int i`, `uint8_t *src` and `int i_stride` in the enclosing scope;
 * on exit i == 16 and src has been advanced by 16 rows. */
#define PREDICT_16x16_DC(v) \
    for( i = 0; i < 16; i++, src += i_stride )\
    {\
        uint64_t *p = (uint64_t*)src;\
        p[0] = v;\
        p[1] = v;\
    }
/* DC prediction from both edges: fills the 16x16 block with
 * (sum of the 16 left pixels + sum of the 16 top pixels + 16) >> 5. */
static void predict_16x16_dc( uint8_t *src, int i_stride )
{
    const uint8_t *top  = src - i_stride;
    const uint8_t *left = src - 1;
    uint32_t sum = 16;              /* rounding term */
    uint64_t dc;
    int i;

    for( i = 0; i < 16; i++ )
    {
        sum += left[i * i_stride];
        sum += top[i];
    }

    dc = ( sum >> 5 ) * 0x0101010101010101ULL;  /* replicate into 8 bytes */
    PREDICT_16x16_DC(dc);
}
/* DC prediction from the left edge only: fills the 16x16 block with
 * (sum of the 16 left pixels + 8) >> 4. */
static void predict_16x16_dc_left( uint8_t *src, int i_stride )
{
    const uint8_t *left = src - 1;
    uint32_t sum = 8;               /* rounding term */
    uint64_t dc;
    int i;

    for( i = 0; i < 16; i++ )
        sum += left[i * i_stride];

    dc = ( sum >> 4 ) * 0x0101010101010101ULL;  /* replicate into 8 bytes */
    PREDICT_16x16_DC(dc);
}
/* Horizontal prediction: each of the 16 rows is filled with the pixel
 * immediately to its left (src[-1] of that row). */
static void predict_16x16_h( uint8_t *src, int i_stride )
{
    int row;
    for( row = 0; row < 16; row++, src += i_stride )
    {
        /* replicate the left neighbour into all 8 bytes of a 64-bit word */
        const uint64_t fill = src[-1] * 0x0101010101010101ULL;
        uint64_t *dst = (uint64_t*)src;
        dst[0] = fill;
        dst[1] = fill;
    }
}
/****************************************************************************
* 8x8 prediction for intra chroma block
****************************************************************************/
/* Chroma DC prediction from the left edge: rows 0-3 are filled with the
 * rounded mean of left pixels 0-3, rows 4-7 with that of left pixels 4-7. */
static void predict_8x8c_dc_left( uint8_t *src, int i_stride )
{
    const uint8_t *left = src - 1;
    uint32_t sum_upper = 2, sum_lower = 2;      /* rounding terms */
    uint64_t fill_upper, fill_lower;
    int row;

    for( row = 0; row < 4; row++ )
    {
        sum_upper += left[row * i_stride];
        sum_lower += left[(row + 4) * i_stride];
    }
    fill_upper = ( sum_upper >> 2 ) * 0x0101010101010101ULL;
    fill_lower = ( sum_lower >> 2 ) * 0x0101010101010101ULL;

    for( row = 0; row < 8; row++, src += i_stride )
        *(uint64_t*)src = row < 4 ? fill_upper : fill_lower;
}
/* Chroma DC prediction from the top edge: the low four bytes of every row
 * get the rounded mean of top pixels 0-3, the high four bytes that of top
 * pixels 4-7 (byte placement follows the 64-bit store). */
static void predict_8x8c_dc_top( uint8_t *src, int i_stride )
{
    const uint8_t *top = src - i_stride;
    uint32_t sum_lo = 2, sum_hi = 2;            /* rounding terms */
    uint64_t fill;
    int col, row;

    for( col = 0; col < 4; col++ )
    {
        sum_lo += top[col];
        sum_hi += top[col + 4];
    }
    fill = ( sum_lo >> 2 ) * 0x01010101
         + ( sum_hi >> 2 ) * 0x0101010100000000ULL;

    for( row = 0; row < 8; row++, src += i_stride )
        *(uint64_t*)src = fill;
}
/* Chroma horizontal prediction: each of the 8 rows is filled with the pixel
 * immediately to its left (src[-1] of that row). */
static void predict_8x8c_h( uint8_t *src, int i_stride )
{
    int row;
    for( row = 0; row < 8; row++, src += i_stride )
    {
        /* replicate the left neighbour into all 8 bytes */
        const uint64_t fill = src[-1] * 0x0101010101010101ULL;
        *(uint64_t*)src = fill;
    }
}
/****************************************************************************
* Exported functions:
****************************************************************************/
/* Installs this file's 16x16 luma predictors into pf. Only the V, H, DC and
 * DC_LEFT slots are written; every other entry of pf is left as the caller
 * set it. V uses the external assembly routine, the rest are the C
 * functions above. */
void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
{
    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
    pf[I_PRED_16x16_DC]      = predict_16x16_dc;
    pf[I_PRED_16x16_H]       = predict_16x16_h;
    pf[I_PRED_16x16_V]       = predict_16x16_v_mmx;
}
/* Installs this file's 8x8 chroma predictors into pf. Only the V, H, DC_LEFT
 * and DC_TOP slots are written; every other entry of pf is left as the
 * caller set it. V uses the external assembly routine, the rest are the C
 * functions above. */
void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] )
{
    pf[I_PRED_CHROMA_DC_TOP]  = predict_8x8c_dc_top;
    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
    pf[I_PRED_CHROMA_H]       = predict_8x8c_h;
    pf[I_PRED_CHROMA_V]       = predict_8x8c_v_mmx;
}
/* Init hook for the 8x8 luma predictor table. The body is empty: this file
 * installs no 8x8 predictors, so pf is returned exactly as the caller set
 * it up. */
void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] )
{
}
......@@ -91,7 +91,7 @@ cglobal predict_16x16_dc_top_mmxext
jge .have_topleft
mov al, [edx]
mov ah, [edx]
pinsrw mm1, ax, 0
pinsrw mm1, eax, 0
mov eax, [picesp + 12]
.have_topleft:
......@@ -99,7 +99,7 @@ cglobal predict_16x16_dc_top_mmxext
jne .have_topright
mov al, [edx+7]
mov ah, [edx+7]
pinsrw mm2, ax, 3
pinsrw mm2, eax, 3
.have_topright:
PRED8x8_LOWPASS mm0, [edx]
......@@ -133,7 +133,7 @@ predict_8x8_v_mmxext:
;-----------------------------------------------------------------------------
;
; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, int i_dc_left );
; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, uint8_t *pix_left );
;
;-----------------------------------------------------------------------------
......@@ -264,13 +264,9 @@ predict_8x8c_p_core_mmx:
mov edx, [picesp + 4]
mov ecx, [picesp + 8]
movd mm0, [picesp +12]
movd mm2, [picesp +16]
movd mm4, [picesp +20]
pshufw mm0, mm0, 0
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
pshufw mm0, [picesp +12], 0
pshufw mm2, [picesp +16], 0
pshufw mm4, [picesp +20], 0
movq mm1, mm2
pmullw mm2, [pw_3210 GLOBAL]
psllw mm1, 2
......@@ -314,13 +310,9 @@ predict_16x16_p_core_mmx:
mov edx, [picesp + 4]
mov ecx, [picesp + 8]
movd mm0, [picesp +12]
movd mm2, [picesp +16]
movd mm4, [picesp +20]
pshufw mm0, mm0, 0 ; FIXME shuf these directly from memory
pshufw mm2, mm2, 0 ; if there is stack alignment?
pshufw mm4, mm4, 0
pshufw mm0, [picesp +12], 0
pshufw mm2, [picesp +16], 0
pshufw mm4, [picesp +20], 0
movq mm5, mm2
movq mm1, mm2
pmullw mm5, [pw_3210 GLOBAL]
......
......@@ -21,14 +21,8 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include "common/clip1.h"
#include "common/common.h"
#include "common/clip1.h"
#include "predict.h"
extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
......@@ -132,27 +126,130 @@ static void predict_8x8_dc( uint8_t *src, int i_stride, int i_neighbor )
predict_8x8_dc_core_mmxext( src, i_stride, i_neighbor, l+1 );
}