Commit 19d07afa authored by Loren Merritt's avatar Loren Merritt
Browse files

x86 mmx for some intra pred functions



git-svn-id: svn://svn.videolan.org/x264/trunk@416 df754926-b1dd-0310-bc7b-ec298dee348c
parent d2dada76
......@@ -33,7 +33,7 @@ extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
/****************************************************************************
* 16x16 prediction for intra block
* 16x16 prediction for intra luma block
****************************************************************************/
#define PREDICT_16x16_DC(v) \
......@@ -169,3 +169,7 @@ void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] )
pf[I_PRED_CHROMA_DC_TOP ]= predict_8x8c_dc_top;
}
/* 8x8 luma intra prediction init: no mmxext implementations wired up yet
 * in this port — intentionally left empty (keeps the init API uniform). */
void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] )
{
}
......@@ -3,6 +3,8 @@
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
......@@ -33,25 +35,143 @@ BITS 32
%endif
%endmacro
SECTION .data
;=============================================================================
; Macros
;=============================================================================
; Store mm0:mm1 to the 16 consecutive bytes at address %1 (one 16-wide row).
%macro SAVE_0_1 1
movq [%1] , mm0 ; low 8 bytes of the row
movq [%1 + 8] , mm1 ; high 8 bytes of the row
%endmacro
; Store mm0 twice to the 16 consecutive bytes at address %1
; (fills a 16-wide row with a value already splatted in mm0).
%macro SAVE_0_0 1
movq [%1] , mm0
movq [%1 + 8] , mm0
%endmacro
SECTION .rodata data align=16
ALIGN 8
pw_2: times 4 dw 2 ; word rounding constant for (x+2)>>2
pw_8: times 4 dw 8 ; word rounding constant for (x+8)>>4
pb_1: times 8 db 1 ; per-byte 1: rounding-bit mask in PRED8x8_LOWPASS
pw_3210: ; words {0,1,2,3}: per-lane multipliers for plane prediction
dw 0
dw 1
dw 2
dw 3
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal predict_8x8_v_mmxext
cglobal predict_8x8_dc_core_mmxext
cglobal predict_8x8c_v_mmx
cglobal predict_8x8c_dc_core_mmxext
cglobal predict_8x8c_p_core_mmx
cglobal predict_16x16_p_core_mmx
cglobal predict_16x16_v_mmx
cglobal predict_16x16_dc_core_mmxext
cglobal predict_16x16_dc_top_mmxext
; PRED8x8_LOWPASS dst, center
; Per-byte lowpass filter: %1 = (left + 2*center + right + 2) >> 2, where
; mm1 holds the left-shifted neighbors and mm2 the right-shifted neighbors
; on entry, and %2 is a memory operand with the center pixels.
; Uses the pavgb trick: avg(center, avg(l,r) - ((l^r)&1)) gives exact
; rounding without widening to words. Clobbers mm1, mm2, mm3.
%macro PRED8x8_LOWPASS 2
movq mm3, mm1 ; save left neighbors
pavgb mm1, mm2 ; mm1 = (l + r + 1) >> 1
pxor mm2, mm3 ; mm2 = l ^ r
movq %1 , %2 ; %1 = center pixels
pand mm2, [pb_1] ; isolate the rounding bit (l^r)&1
psubusb mm1, mm2 ; mm1 = (l + r) >> 1 (round toward zero)
pavgb %1 , mm1 ; %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%endmacro
; PRED8x8_LOAD_TOP
; Load and lowpass-filter the 8 pixels above the current 8x8 block.
; In:  stack args of the enclosing function:
;      [esp+4] = src, [esp+8] = i_stride, [esp+12] = i_neighbors
; Out: edx = &src[-i_stride] (top row), ecx = i_stride,
;      mm0 = filtered top row.
; When the top-left / top-right neighbor is unavailable (bit 3 / bit 2 of
; i_neighbors clear — presumably MB_TOPLEFT/MB_TOPRIGHT, verify against
; caller), the corresponding edge pixel is replicated before filtering.
; Clobbers eax, mm1, mm2, mm3.
%macro PRED8x8_LOAD_TOP 0
mov edx, [esp + 4] ; edx = src
mov ecx, [esp + 8] ; ecx = i_stride
mov eax, [esp +12] ; eax = i_neighbors
sub edx, ecx ; edx = &src[-stride] (row above the block)
and eax, 12 ; keep the topleft|topright flag bits
movq mm1, [edx-1] ; top row shifted left (reads the topleft pixel)
movq mm2, [edx+1] ; top row shifted right (reads the topright pixel)
cmp eax, byte 8 ; bit 3 set => topleft available
jge .have_topleft
mov al, [edx] ; no topleft: replicate top[0]
mov ah, [edx]
pinsrw mm1, ax, 0 ; patch the low word of the left-shifted row
mov eax, [esp +12] ; reload flags (eax was clobbered above)
.have_topleft:
and eax, byte 4 ; bit 2 set => topright available
jne .have_topright
mov al, [edx+7] ; no topright: replicate top[7]
mov ah, [edx+7]
pinsrw mm2, ax, 3 ; patch the high word of the right-shifted row
.have_topright:
PRED8x8_LOWPASS mm0, [edx] ; mm0 = filtered top row
%endmacro
;-----------------------------------------------------------------------------
;
; void predict_8x8_v_mmxext( uint8_t *src, int i_stride, int i_neighbors )
;
;-----------------------------------------------------------------------------
ALIGN 16
; Vertical prediction: replicate the filtered top row into all 8 rows.
; Rows are written out of order to reuse the 1x/2x/4x/8x stride scalings.
predict_8x8_v_mmxext:
PRED8x8_LOAD_TOP ; edx = &src[-stride], ecx = stride, mm0 = filtered top
lea eax, [ecx + 2*ecx] ; eax = 3*stride
movq [edx + ecx], mm0 ; 0
movq [edx + 2*ecx], mm0 ; 1
movq [edx + 4*ecx], mm0 ; 3
movq [edx + 8*ecx], mm0 ; 7
add edx, eax ; edx += 3*stride
movq [edx], mm0 ; 2
movq [edx + 2*ecx], mm0 ; 4
movq [edx + eax], mm0 ; 5
movq [edx + 4*ecx], mm0 ; 6
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, int i_dc_left );
;
;-----------------------------------------------------------------------------
ALIGN 16
; DC prediction: dc = (sum(filtered top) + sum(filtered left) + 8) >> 4,
; splatted over the whole 8x8 block. pix_left points into a 10-entry
; array so that [eax-1] and [eax+1] are valid (edges pre-padded by caller).
predict_8x8_dc_core_mmxext:
mov eax, [esp + 16] ; eax = pix_left
movq mm1, [eax-1] ; left column shifted up
movq mm2, [eax+1] ; left column shifted down
PRED8x8_LOWPASS mm4, [eax] ; mm4 = filtered left column
PRED8x8_LOAD_TOP ; mm0 = filtered top row; edx/ecx set
pxor mm1, mm1 ; zero for psadbw
psadbw mm0, mm1 ; horizontal sum of filtered top
psadbw mm4, mm1 ; horizontal sum of filtered left
paddw mm0, [pw_8] ; +8 rounding
paddw mm0, mm4
psrlw mm0, 4 ; dc = (sum_top + sum_left + 8) >> 4
pshufw mm0, mm0, 0
packuswb mm0, mm0 ; splat dc into 8 bytes
lea eax, [ecx + 2*ecx] ; eax = 3*stride
movq [edx + ecx], mm0 ; 0
movq [edx + 2*ecx], mm0 ; 1
movq [edx + 4*ecx], mm0 ; 3
movq [edx + 8*ecx], mm0 ; 7
add edx, eax ; edx += 3*stride
movq [edx], mm0 ; 2
movq [edx + 2*ecx], mm0 ; 4
movq [edx + eax], mm0 ; 5
movq [edx + 4*ecx], mm0 ; 6
ret
;-----------------------------------------------------------------------------
;
......@@ -80,6 +200,163 @@ predict_8x8c_v_mmx :
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8c_dc_core_mmxext( uint8_t *src, int i_stride, int s2, int s3 )
;
;-----------------------------------------------------------------------------
ALIGN 16
; Chroma 8x8 DC: compute the four 4x4 sub-block DC values and fill the block.
; In: [esp+4]=src, [esp+8]=i_stride,
;     [esp+12]=s2 (4 + sum of left pixels, rows 0-3),
;     [esp+16]=s3 (2 + sum of left pixels, rows 4-7).
; Top sums s0 (top[0..3]) and s1 (top[4..7]) are computed here; then
;   dc0 = (s0+s2)>>3, dc1 = (s1+2)>>2, dc2 = s3>>2, dc3 = (s1+s3+2)>>3.
predict_8x8c_dc_core_mmxext:
mov edx, [esp + 4]
mov ecx, [esp + 8]
sub edx, ecx ; edx = &src[-stride] (top row)
lea eax, [ecx + 2*ecx] ; eax = 3*stride
movq mm0, [edx] ; top row, 8 bytes
pxor mm1, mm1
pxor mm2, mm2
punpckhbw mm1, mm0 ; top[4..7] into the byte lanes of mm1
punpcklbw mm0, mm2 ; top[0..3] spread over mm0
psadbw mm1, mm2 ; s1
psadbw mm0, mm2 ; s0
paddw mm0, [esp + 12] ; low word += s2 (only word 0 is used below)
pshufw mm2, [esp + 16], 0 ; splat s3
psrlw mm0, 3 ; (s0+s2)>>3
paddw mm1, [pw_2] ; s1 + 2
movq mm3, mm2 ; mm3 = s3 (splatted)
pshufw mm1, mm1, 0 ; splat s1+2
pshufw mm0, mm0, 0 ; dc0 (w)
paddw mm3, mm1 ; s1 + s3 + 2
psrlw mm3, 3 ; dc3 (w)
psrlw mm2, 2 ; dc2 (w)
psrlw mm1, 2 ; dc1 (w)
packuswb mm0, mm1 ; dc0,dc1 (b) — left/right halves of rows 0-3
packuswb mm2, mm3 ; dc2,dc3 (b) — left/right halves of rows 4-7
movq [edx + ecx], mm0 ; 0
movq [edx + 2*ecx], mm0 ; 1
movq [edx + eax], mm0 ; 2
movq [edx + 4*ecx], mm0 ; 3
lea edx, [edx + 4*ecx] ; advance to the lower half
movq [edx + ecx], mm2 ; 4
movq [edx + 2*ecx], mm2 ; 5
movq [edx + eax], mm2 ; 6
movq [edx + 4*ecx], mm2 ; 7
ret
;-----------------------------------------------------------------------------
;
; void predict_8x8c_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c )
;
;-----------------------------------------------------------------------------
ALIGN 16
; Chroma plane prediction core: pixel (x,y) = clip((i00 + b*x + c*y) >> 5).
; Keeps the 8 per-column predictors as words in mm0:mm1 and adds c (mm4)
; once per row; clamping is pmaxsw-with-0 then packuswb (clips to 0..255).
predict_8x8c_p_core_mmx:
mov edx, [esp + 4] ; edx = src
mov ecx, [esp + 8] ; ecx = i_stride
movd mm0, [esp +12] ; i00
movd mm2, [esp +16] ; b
movd mm4, [esp +20] ; c
pshufw mm0, mm0, 0 ; splat to all 4 words
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
movq mm1, mm2
pmullw mm2, [pw_3210] ; b * {0,1,2,3}
psllw mm1, 2 ; 4*b
paddsw mm0, mm2 ; mm0 = {i+0*b, i+1*b, i+2*b, i+3*b}
paddsw mm1, mm0 ; mm1 = {i+4*b, i+5*b, i+6*b, i+7*b}
pxor mm5, mm5 ; zero, for clamping negatives
mov eax, 8 ; row counter
ALIGN 4
.loop:
movq mm6, mm0
movq mm7, mm1
psraw mm6, 5 ; arithmetic >>5
psraw mm7, 5
pmaxsw mm6, mm5 ; clamp low end to 0
pmaxsw mm7, mm5
packuswb mm6, mm7 ; clamp high end to 255, pack 8 pixels
movq [edx], mm6
paddsw mm0, mm4 ; next row: += c
paddsw mm1, mm4
add edx, ecx
dec eax
jg .loop
nop ; padding
ret
;-----------------------------------------------------------------------------
;
; void predict_16x16_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c )
;
;-----------------------------------------------------------------------------
ALIGN 16
; Luma plane prediction core: pixel (x,y) = clip((i00 + b*x + c*y) >> 5).
; The 16 per-column predictors are kept as words in mm0..mm3 (4 columns
; each) and advanced by c (mm4) once per row.
predict_16x16_p_core_mmx:
mov edx, [esp + 4] ; edx = src
mov ecx, [esp + 8] ; ecx = i_stride
movd mm0, [esp +12] ; i00
movd mm2, [esp +16] ; b
movd mm4, [esp +20] ; c
pshufw mm0, mm0, 0 ; FIXME shuf these directly from memory
pshufw mm2, mm2, 0 ; if there is stack alignment?
pshufw mm4, mm4, 0
movq mm5, mm2
movq mm1, mm2
pmullw mm5, [pw_3210] ; b * {0,1,2,3}
psllw mm2, 3 ; 8*b
psllw mm1, 2 ; 4*b
movq mm3, mm2
paddsw mm0, mm5 ; mm0 = {i+ 0*b, i+ 1*b, i+ 2*b, i+ 3*b}
paddsw mm1, mm0 ; mm1 = {i+ 4*b, i+ 5*b, i+ 6*b, i+ 7*b}
paddsw mm2, mm0 ; mm2 = {i+ 8*b, i+ 9*b, i+10*b, i+11*b}
paddsw mm3, mm1 ; mm3 = {i+12*b, i+13*b, i+14*b, i+15*b}
pxor mm5, mm5 ; zero, for clamping negatives
mov eax, 16 ; row counter
ALIGN 4
.loop:
movq mm6, mm0 ; left half of the row
movq mm7, mm1
psraw mm6, 5 ; arithmetic >>5
psraw mm7, 5
pmaxsw mm6, mm5 ; clamp low end to 0
pmaxsw mm7, mm5
packuswb mm6, mm7 ; clamp high end to 255, pack pixels 0-7
movq [edx], mm6
movq mm6, mm2 ; right half of the row
movq mm7, mm3
psraw mm6, 5
psraw mm7, 5
pmaxsw mm6, mm5
pmaxsw mm7, mm5
packuswb mm6, mm7 ; pack pixels 8-15
movq [edx+8], mm6
paddsw mm0, mm4 ; next row: += c
paddsw mm1, mm4
paddsw mm2, mm4
paddsw mm3, mm4
add edx, ecx
dec eax
jg .loop
nop ; padding
ret
;-----------------------------------------------------------------------------
;
; void predict_16x16_v_mmx( uint8_t *src, int i_stride )
......@@ -118,3 +395,50 @@ predict_16x16_v_mmx :
SAVE_0_1 (edx + eax) ; 15
ret
;-----------------------------------------------------------------------------
;
; void predict_16x16_dc_core_mmxext( uint8_t *src, int i_stride, int i_dc_left )
;
;-----------------------------------------------------------------------------
; PRED16x16_DC rounding_source, shift
; dc = (sum of the 16 top pixels + %1) >> %2, splatted over all 16 rows.
; %1 is a memory operand: either [esp+12] (caller-computed left sum with
; rounding folded in, shift 5) or [pw_8] (top-only DC, shift 4).
%macro PRED16x16_DC 2
mov edx, [esp + 4] ; edx = src
mov ecx, [esp + 8] ; ecx = i_stride
sub edx, ecx ; edx <-- line -1
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [edx] ; sum of top[0..7]
psadbw mm1, [edx + 8] ; sum of top[8..15]
paddusw mm0, mm1
paddusw mm0, %1 ; FIXME is stack alignment guaranteed?
psrlw mm0, %2 ; dc
push edi ; edi is callee-saved
pshufw mm0, mm0, 0
lea eax, [ecx + 2*ecx] ; eax <-- 3* stride
packuswb mm0, mm0 ; dc in bytes
mov edi, 4 ; 4 iterations x 4 rows = 16 rows
ALIGN 4
.loop:
SAVE_0_0 (edx + ecx) ; 0
SAVE_0_0 (edx + 2 * ecx) ; 1
SAVE_0_0 (edx + eax) ; 2
SAVE_0_0 (edx + 4 * ecx) ; 3
dec edi
lea edx, [edx + 4 * ecx] ; lea preserves the flags set by dec
jg .loop
pop edi
ret
%endmacro
ALIGN 16
; full DC: left sum (+rounding) passed as 3rd arg; average of 32 pixels
predict_16x16_dc_core_mmxext:
PRED16x16_DC [esp+12], 5
ALIGN 16
; top-only DC: average of the 16 top pixels, +8 to round
predict_16x16_dc_top_mmxext:
PRED16x16_DC [pw_8], 4
......@@ -27,11 +27,110 @@
#include <inttypes.h>
#endif
#include "common/predict.h"
#include "common/clip1.h"
#include "common/common.h"
#include "predict.h"
extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
extern void predict_16x16_dc_core_mmxext( uint8_t *src, int i_stride, int i_dc_left );
extern void predict_16x16_dc_top_mmxext( uint8_t *src, int i_stride );
extern void predict_16x16_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c );
extern void predict_8x8c_p_core_mmx( uint8_t *src, int i_stride, int i00, int b, int c );
extern void predict_8x8c_dc_core_mmxext( uint8_t *src, int i_stride, int s2, int s3 );
extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
extern void predict_8x8_v_mmxext( uint8_t *src, int i_stride, int i_neighbors );
extern void predict_8x8_dc_core_mmxext( uint8_t *src, int i_stride, int i_neighbors, uint8_t *pix_left );
/* 16x16 plane prediction: derive the gradients b (horizontal) and c
 * (vertical) plus the scaled corner value from the neighbouring pixels,
 * then let the mmx core fill the block. */
static void predict_16x16_p( uint8_t *src, int i_stride )
{
    const uint8_t *top  = &src[-i_stride];
    const uint8_t *left = &src[-1];
    int h = 0, v = 0;
    int k;

    for( k = 1; k <= 8; k++ )
    {
        h += k * ( top[7+k] - top[7-k] );
        v += k * ( left[(7+k)*i_stride] - left[(7-k)*i_stride] );
    }

    {
        int a = 16 * ( left[15*i_stride] + top[15] );
        int b = ( 5 * h + 32 ) >> 6;
        int c = ( 5 * v + 32 ) >> 6;
        predict_16x16_p_core_mmx( src, i_stride, a - 7*(b+c) + 16, b, c );
    }
}
/* 8x8 chroma plane prediction: compute the plane parameters from the
 * neighbouring pixels and hand off to the mmx core. */
static void predict_8x8c_p( uint8_t *src, int i_stride )
{
    const uint8_t *top  = &src[-i_stride];
    const uint8_t *left = &src[-1];
    int h = 0, v = 0;
    int k;

    for( k = 1; k <= 4; k++ )
    {
        h += k * ( top[3+k] - top[3-k] );
        v += k * ( left[(3+k)*i_stride] - left[(3-k)*i_stride] );
    }

    {
        int a = 16 * ( left[7*i_stride] + top[7] );
        int b = ( 17 * h + 16 ) >> 5;
        int c = ( 17 * v + 16 ) >> 5;
        predict_8x8c_p_core_mmx( src, i_stride, a - 3*(b+c) + 16, b, c );
    }
}
/* 16x16 DC prediction: sum the 16 left-neighbour pixels (plus the final
 * rounding term, 16 = half of 32 samples) and let the mmxext core add the
 * top row and splat the result. */
static void predict_16x16_dc( uint8_t *src, int i_stride )
{
    uint32_t sum = 16;
    int y;

    for( y = 0; y < 16; y++ )
        sum += src[y * i_stride - 1];

    predict_16x16_dc_core_mmxext( src, i_stride, sum );
}
/* 8x8 chroma DC prediction: pre-sum the left column in two 4-pixel halves
 * (s2 for rows 0-3 with its rounding term, s3 for rows 4-7) and let the
 * mmxext core combine them with the top row. */
static void predict_8x8c_dc( uint8_t *src, int i_stride )
{
    const uint8_t *left = &src[-1];
    int s2 = 4, s3 = 2;
    int y;

    for( y = 0; y < 4; y++ )
    {
        s2 += left[y * i_stride];
        s3 += left[(y+4) * i_stride];
    }

    predict_8x8c_dc_core_mmxext( src, i_stride, s2, s3 );
}
#define SRC(x,y) src[(x)+(y)*i_stride]
/* 8x8 luma DC prediction: gather the left-neighbour column into a padded
 * 10-entry array (so the asm core can read one byte before and after it)
 * and call the mmxext core. l[0] falls back to the first left pixel when
 * the top-left neighbour is unavailable. */
static void predict_8x8_dc( uint8_t *src, int i_stride, int i_neighbor )
{
    uint8_t l[10];
    int y;

    for( y = 0; y < 8; y++ )
        l[y+1] = SRC(-1,y);
    l[0] = (i_neighbor & MB_TOPLEFT) ? SRC(-1,-1) : l[1];
    l[9] = l[8];

    predict_8x8_dc_core_mmxext( src, i_stride, i_neighbor, l+1 );
}
/****************************************************************************
* Exported functions:
......@@ -39,10 +138,21 @@ extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
/* Hook up the mmx/mmxext 16x16 intra predictors. */
void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
{
    pf[I_PRED_16x16_DC]     = predict_16x16_dc;
    pf[I_PRED_16x16_DC_TOP] = predict_16x16_dc_top_mmxext;
    pf[I_PRED_16x16_P]      = predict_16x16_p;
    pf[I_PRED_16x16_V]      = predict_16x16_v_mmx;
}
/* Hook up the mmx/mmxext 8x8 chroma intra predictors. */
void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] )
{
    pf[I_PRED_CHROMA_DC] = predict_8x8c_dc;
    pf[I_PRED_CHROMA_P]  = predict_8x8c_p;
    pf[I_PRED_CHROMA_V]  = predict_8x8c_v_mmx;
}
/* Hook up the mmxext 8x8 luma intra predictors. */
void x264_predict_8x8_init_mmxext( x264_predict8x8_t pf[12] )
{
    pf[I_PRED_8x8_DC] = predict_8x8_dc;
    pf[I_PRED_8x8_V]  = predict_8x8_v_mmxext;
}
......@@ -26,5 +26,6 @@
void x264_predict_16x16_init_mmxext ( x264_predict_t pf[7] );
void x264_predict_8x8c_init_mmxext ( x264_predict_t pf[7] );
void x264_predict_8x8_init_mmxext ( x264_predict8x8_t pf[12] );
#endif
......@@ -26,6 +26,7 @@
#include "common.h"
#include "clip1.h"
#ifdef _MSC_VER
#undef HAVE_MMXEXT /* not finished now */
......@@ -34,16 +35,8 @@
# include "i386/predict.h"
#endif
/* Branchless-ish clamp helper for values destined for a byte:
 * negatives become 0; values > 255 become -1 (all-ones), which reads as
 * 255 once stored into a uint8_t. Returns a unchanged when 0 <= a <= 255. */
static inline int clip_uint8( int a )
{
    if (a&(~255))
        return (-a)>>31;
    else
        return a;
}
/****************************************************************************
* 16x16 prediction for intra block DC, H, V, P
* 16x16 prediction for intra luma block
****************************************************************************/
#define PREDICT_16x16_DC(v) \
......@@ -62,7 +55,6 @@ static void predict_16x16_dc( uint8_t *src, int i_stride )
uint32_t dc = 0;
int i;
/* calculate DC value */
for( i = 0; i < 16; i++ )
{
dc += src[-1 + i * i_stride];
......@@ -162,13 +154,11 @@ static void predict_16x16_p( uint8_t *src, int i_stride )
for( y = 0; y < 16; y++ )
{
int pix = i00;
for( x = 0; x < 16; x++ )
{
int pix;
pix = (i00+b*x)>>5;
src[x] = clip_uint8( pix );
src[x] = x264_clip_uint8( pix>>5 );
pix += b;
}
src += i_stride;
i00 += c;
......@@ -335,10 +325,11 @@ static void predict_8x8c_p( uint8_t *src, int i_stride )
for( y = 0; y < 8; y++ )
{
int pix = i00;
for( x = 0; x < 8; x++ )
{
int pix = (i00 +b*x) >> 5;
src[x] = clip_uint8( pix );
src[x] = x264_clip_uint8( pix>>5 );
pix += b;
}
src += i_stride;
i00 += c;
......@@ -881,6 +872,13 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12] )
pf[I_PRED_8x8_DC_LEFT]= predict_8x8_dc_left;
pf[I_PRED_8x8_DC_TOP] = predict_8x8_dc_top;
pf[I_PRED_8x8_DC_128] = predict_8x8_dc_128;
#ifdef HAVE_MMXEXT
if( cpu&X264_CPU_MMXEXT )
{
x264_predict_8x8_init_mmxext( pf );
}
#endif
}
void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
......
......@@ -499,12 +499,18 @@ static int check_intra( int cpu_ref, int cpu_new )
for( i = 0; i < 12; i++ )
INTRA_TEST( predict_4x4, i );
for( i = 0; i < 12; i++ )
INTRA_TEST( predict_8x8, i, 0xf );
for( i = 0; i < 7; i++ )
INTRA_TEST( predict_8x8c, i );
for( i = 0; i < 7; i++ )
INTRA_TEST( predict_16x16, i );
for( i = 0; i < 12; i++ )
</