Commit d2dada76 authored by Loren Merritt
Browse files

cosmetics: reorganize intra prediction dsp



git-svn-id: svn://svn.videolan.org/x264/trunk@415 df754926-b1dd-0310-bc7b-ec298dee348c
parent 791495e3
......@@ -28,7 +28,7 @@ endif
# MMX/SSE optims
ifeq ($(ARCH),X86_64)
SRCS += common/i386/mc-c.c common/i386/dct-c.c common/i386/predict.c
SRCS += common/i386/mc-c.c common/i386/dct-c.c common/amd64/predict.c
ASMSRC = common/amd64/dct-a.asm common/amd64/cpu-a.asm \
common/amd64/pixel-a.asm common/amd64/mc-a.asm \
common/amd64/mc-a2.asm common/amd64/predict-a.asm \
......
/*****************************************************************************
* predict.c: h264 encoder
*****************************************************************************
* Copyright (C) 2006 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110 USA
*****************************************************************************/
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include "common/predict.h"
#include "common/i386/predict.h"
extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
/****************************************************************************
* 16x16 prediction for intra block
****************************************************************************/
/* Fill 16 rows with the 64-bit pattern `v`, two 8-byte stores per row.
 * Expands in the caller's scope and uses the caller's `int i`,
 * `uint8_t *src` and `int i_stride`; `src` is advanced by 16 rows and
 * `i` is left at 16 as side effects.
 * The do/while(0) wrapper makes the expansion a single statement (safe in
 * an unbraced if/else), and `v` is evaluated exactly once. */
#define PREDICT_16x16_DC(v) \
    do {\
        const uint64_t fill_ = (v);\
        for( i = 0; i < 16; i++ )\
        {\
            uint64_t *p = (uint64_t*)src;\
            *p++ = fill_;\
            *p++ = fill_;\
            src += i_stride;\
        }\
    } while( 0 )
/* 16x16 DC prediction: fill the block with the rounded mean of the 16 pixels
 * in the row above it and the 16 pixels in the column to its left. */
static void predict_16x16_dc( uint8_t *src, int i_stride )
{
    const uint8_t *top = src - i_stride;
    const uint8_t *left = src - 1;
    uint32_t sum = 0;
    uint64_t dc;
    int i;

    /* accumulate the 32 neighbouring pixels */
    for( i = 0; i < 16; i++ )
        sum += top[i];
    for( i = 0; i < 16; i++ )
        sum += left[i * i_stride];

    /* rounded average, replicated into every byte of a 64-bit word */
    dc = 0x0101010101010101ULL * (( sum + 16 ) >> 5);
    PREDICT_16x16_DC(dc);
}
/* 16x16 DC prediction from the left column only: fill the block with the
 * rounded mean of the 16 pixels immediately to its left. */
static void predict_16x16_dc_left( uint8_t *src, int i_stride )
{
    const uint8_t *left = src - 1;
    uint32_t sum = 0;
    uint64_t dc;
    int i;

    for( i = 0; i < 16; i++ )
        sum += left[i * i_stride];

    /* rounded average, replicated into every byte of a 64-bit word */
    dc = 0x0101010101010101ULL * (( sum + 8 ) >> 4);
    PREDICT_16x16_DC(dc);
}
/* 16x16 horizontal prediction: every row is filled with the pixel
 * immediately to its left, using two 64-bit stores per row. */
static void predict_16x16_h( uint8_t *src, int i_stride )
{
    int row;

    for( row = 0; row < 16; row++ )
    {
        uint8_t *line = src + row * i_stride;
        uint64_t *dst = (uint64_t*)line;
        /* left neighbour replicated into all 8 bytes */
        const uint64_t fill = line[-1] * 0x0101010101010101ULL;

        dst[0] = fill;
        dst[1] = fill;
    }
}
/****************************************************************************
* 8x8 prediction for intra chroma block
****************************************************************************/
/* 8x8 chroma DC prediction from the left column only: rows 0-3 are filled
 * with the rounded mean of left pixels 0-3, rows 4-7 with the rounded mean
 * of left pixels 4-7. One 64-bit store per row. */
static void predict_8x8c_dc_left( uint8_t *src, int i_stride )
{
    uint32_t sum_upper = 0, sum_lower = 0;
    uint64_t fill[2];
    int row;

    for( row = 0; row < 4; row++ )
    {
        sum_upper += src[row * i_stride - 1];
        sum_lower += src[(row + 4) * i_stride - 1];
    }

    fill[0] = 0x0101010101010101ULL * (( sum_upper + 2 ) >> 2);
    fill[1] = 0x0101010101010101ULL * (( sum_lower + 2 ) >> 2);

    /* row >> 2 selects the upper (0) or lower (1) half */
    for( row = 0; row < 8; row++ )
    {
        *(uint64_t*)src = fill[row >> 2];
        src += i_stride;
    }
}
/* 8x8 chroma DC prediction from the top row only: one 64-bit word is built
 * whose low 32 bits replicate the rounded mean of top pixels 0-3 and whose
 * high 32 bits replicate the rounded mean of top pixels 4-7; that word is
 * stored to each of the 8 rows. */
static void predict_8x8c_dc_top( uint8_t *src, int i_stride )
{
    const uint8_t *top = src - i_stride;
    uint32_t sum_lo = 0, sum_hi = 0;
    uint64_t word;
    int k;

    for( k = 0; k < 4; k++ )
    {
        sum_lo += top[k];
        sum_hi += top[k + 4];
    }

    word  = (( sum_lo + 2 ) >> 2) * 0x0000000001010101ULL;
    word += (( sum_hi + 2 ) >> 2) * 0x0101010100000000ULL;

    for( k = 0; k < 8; k++ )
        *(uint64_t*)( src + k * i_stride ) = word;
}
/* 8x8 chroma horizontal prediction: every row is filled with the pixel
 * immediately to its left, using one 64-bit store per row. */
static void predict_8x8c_h( uint8_t *src, int i_stride )
{
    int row;

    for( row = 0; row < 8; row++ )
    {
        uint8_t *line = src + row * i_stride;
        const uint64_t left = line[-1];

        *(uint64_t*)line = left * 0x0101010101010101ULL;
    }
}
/****************************************************************************
* Exported functions:
****************************************************************************/
/* Install the 16x16 intra predictors provided by this file into the
 * dispatch table `pf`. Only the V, H, DC and DC_LEFT slots are written;
 * every other entry of `pf` is left as the caller set it. */
void x264_predict_16x16_init_mmxext( x264_predict_t pf[7] )
{
    /* vertical mode: external routine (declared extern in this file) */
    pf[I_PRED_16x16_V] = predict_16x16_v_mmx;

    /* remaining modes: C versions defined in this file */
    pf[I_PRED_16x16_H] = predict_16x16_h;
    pf[I_PRED_16x16_DC] = predict_16x16_dc;
    pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left;
}
/* Install the 8x8 chroma intra predictors provided by this file into the
 * dispatch table `pf`. Only the V, H, DC_LEFT and DC_TOP slots are written;
 * every other entry of `pf` is left as the caller set it. */
void x264_predict_8x8c_init_mmxext( x264_predict_t pf[7] )
{
    /* vertical mode: external routine (declared extern in this file) */
    pf[I_PRED_CHROMA_V] = predict_8x8c_v_mmx;

    /* remaining modes: C versions defined in this file */
    pf[I_PRED_CHROMA_H] = predict_8x8c_h;
    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
    pf[I_PRED_CHROMA_DC_TOP] = predict_8x8c_dc_top;
}
......@@ -33,12 +33,6 @@ BITS 32
%endif
%endmacro
;=============================================================================
; Read only data
;=============================================================================
SECTION .rodata data align=16
SECTION .data
;=============================================================================
......@@ -68,28 +62,22 @@ cglobal predict_16x16_v_mmx
; predict_8x8c_v_mmx
; Loads the quadword located one stride before the first stack argument and
; stores that same quadword to eight consecutive rows (the trailing numbers
; on the movq lines are the row indices being written).
; The C side declares it as:
;   void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
; Uses edx (row pointer), ecx (stride) and mm0 (the copied row).
; NOTE(review): this listing comes from a diff rendered without +/- markers.
; Adjacent line pairs that differ only in their trailing comment (esi vs. edx)
; look like the before/after versions of ONE instruction, and the commented
; push/pop lines look like the removed side of the diff -- confirm against
; the real file before assembling.
ALIGN 16
predict_8x8c_v_mmx :
;push edi
;push esi
; edx = first argument, ecx = second argument
mov edx , [esp + 4]
mov ecx , [esp + 8]
sub edx , ecx ; esi <-- line -1
sub edx , ecx ; edx <-- line -1
; mm0 = the 8 bytes of the row above the block
movq mm0 , [edx]
movq [edx + ecx] , mm0 ; 0
movq [edx + 2 * ecx] , mm0 ; 1
movq [edx + 4 * ecx] , mm0 ; 3
movq [edx + 8 * ecx] , mm0 ; 7
add edx , ecx ; esi <-- line 0
add edx , ecx ; edx <-- line 0
movq [edx + 2 * ecx] , mm0 ; 2
movq [edx + 4 * ecx] , mm0 ; 4
lea edx , [edx + 4 * ecx] ; esi <-- line 4
lea edx , [edx + 4 * ecx] ; edx <-- line 4
movq [edx + ecx] , mm0 ; 5
movq [edx + 2 * ecx] , mm0 ; 6
;pop esi
;pop edi
ret
;-----------------------------------------------------------------------------
......@@ -101,18 +89,13 @@ predict_8x8c_v_mmx :
ALIGN 16
predict_16x16_v_mmx :
;push edi
;push esi
mov edx, [esp + 4]
mov ecx, [esp + 8]
sub edx, ecx ; esi <-- line -1
sub edx, ecx ; edx <-- line -1
movq mm0, [edx]
movq mm1, [edx + 8]
mov eax, ecx
shl eax, 1
add eax, ecx ; eax <-- 3* stride
lea eax, [ecx + 2*ecx] ; eax <-- 3* stride
SAVE_0_1 (edx + ecx) ; 0
SAVE_0_1 (edx + 2 * ecx) ; 1
......@@ -121,21 +104,17 @@ predict_16x16_v_mmx :
SAVE_0_1 (edx + 2 * eax) ; 5
SAVE_0_1 (edx + 8 * ecx) ; 7
SAVE_0_1 (edx + 4 * eax) ; 11
add edx, ecx ; esi <-- line 0
add edx, ecx ; edx <-- line 0
SAVE_0_1 (edx + 4 * ecx) ; 4
SAVE_0_1 (edx + 2 * eax) ; 6
SAVE_0_1 (edx + 8 * ecx) ; 8
SAVE_0_1 (edx + 4 * eax) ; 12
lea edx, [edx + 8 * ecx] ; esi <-- line 8
lea edx, [edx + 8 * ecx] ; edx <-- line 8
SAVE_0_1 (edx + ecx) ; 9
SAVE_0_1 (edx + 2 * ecx) ; 10
lea edx, [edx + 4 * ecx] ; esi <-- line 12
lea edx, [edx + 4 * ecx] ; edx <-- line 12
SAVE_0_1 (edx + ecx) ; 13
SAVE_0_1 (edx + 2 * ecx) ; 14
SAVE_0_1 (edx + eax) ; 15
;pop esi
;pop edi
ret
......@@ -21,391 +21,28 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
*****************************************************************************/
/* XXX: the predict_4x4 functions are inspired by the ffmpeg h264 decoder
 */
#ifdef HAVE_STDINT_H
#include <stdint.h>
#else
#include <inttypes.h>
#endif
#include <stdlib.h>
#include <stdarg.h>
#include "x264.h" /* for keyword inline */
#include "common/predict.h"
#include "predict.h"
/* Clamp `a` to the range of an unsigned 8-bit pixel.
 * Returns 0 for negative input, 255 for input above 255, and `a` otherwise.
 * Written as plain comparisons so it does not depend on right-shifting a
 * negative value (implementation-defined) or on negating INT_MIN (undefined),
 * and so the int result really is in [0,255] instead of -1 on overflow. */
static inline int clip_uint8( int a )
{
    if( a < 0 )
        return 0;
    if( a > 255 )
        return 255;
    return a;
}
/****************************************************************************
* 16x16 prediction for intra block DC, H, V, P
****************************************************************************/
/* 16x16 DC prediction: fill the block with the rounded mean of the 16 pixels
 * above it and the 16 pixels to its left, four 32-bit stores per row. */
static void predict_16x16_dc( uint8_t *src, int i_stride )
{
    uint32_t sum = 0;
    uint32_t fill;
    int row, k;

    /* accumulate the 32 neighbouring pixels */
    for( k = 0; k < 16; k++ )
    {
        sum += src[k - i_stride];
        sum += src[k * i_stride - 1];
    }

    /* rounded average replicated into all 4 bytes */
    fill = (( sum + 16 ) >> 5) * 0x01010101;

    for( row = 0; row < 16; row++ )
    {
        uint32_t *dst = (uint32_t*)( src + row * i_stride );
        for( k = 0; k < 4; k++ )
            dst[k] = fill;
    }
}
/* 16x16 DC prediction from the left column only: fill the block with the
 * rounded mean of the 16 pixels immediately to its left. */
static void predict_16x16_dc_left( uint8_t *src, int i_stride )
{
    uint32_t sum = 0;
    uint32_t fill;
    int row;

    for( row = 0; row < 16; row++ )
        sum += src[row * i_stride - 1];

    /* rounded average replicated into all 4 bytes */
    fill = (( sum + 8 ) >> 4) * 0x01010101;

    for( row = 0; row < 16; row++ )
    {
        uint32_t *dst = (uint32_t*)( src + row * i_stride );
        dst[0] = fill;
        dst[1] = fill;
        dst[2] = fill;
        dst[3] = fill;
    }
}
/* 16x16 DC prediction from the top row only: fill the block with the
 * rounded mean of the 16 pixels immediately above it. */
static void predict_16x16_dc_top( uint8_t *src, int i_stride )
{
    const uint8_t *top = src - i_stride;
    uint32_t sum = 0;
    uint32_t fill;
    int k, row;

    for( k = 0; k < 16; k++ )
        sum += top[k];

    /* rounded average replicated into all 4 bytes */
    fill = (( sum + 8 ) >> 4) * 0x01010101;

    for( row = 0; row < 16; row++ )
    {
        uint32_t *dst = (uint32_t*)( src + row * i_stride );
        for( k = 0; k < 4; k++ )
            dst[k] = fill;
    }
}
/* 16x16 DC prediction without neighbours: fill the block with the
 * constant value 0x80 (128). */
static void predict_16x16_dc_128( uint8_t *src, int i_stride )
{
    int row;

    for( row = 0; row < 16; row++ )
    {
        uint32_t *dst = (uint32_t*)src;
        dst[0] = dst[1] = dst[2] = dst[3] = 0x80808080;
        src += i_stride;
    }
}
/* 16x16 horizontal prediction: every row is filled with the pixel
 * immediately to its left, using four 32-bit stores per row. */
static void predict_16x16_h( uint8_t *src, int i_stride )
{
    int i;

    for( i = 0; i < 16; i++ )
    {
        /* The multiply must be unsigned: src[-1] promotes to int and
         * 0x01010101 * 255 does not fit in a signed 32-bit int. */
        const uint32_t v = 0x01010101U * src[-1];
        uint32_t *p = (uint32_t*)src;

        *p++ = v;
        *p++ = v;
        *p++ = v;
        *p++ = v;

        src += i_stride;
    }
}
extern void predict_16x16_v_mmx( uint8_t *src, int i_stride );
/****************************************************************************
* 8x8 prediction for intra chroma block DC, H, V, P
****************************************************************************/
/* 8x8 chroma DC prediction without neighbours: fill the block with the
 * constant value 0x80 (128). */
static void predict_8x8c_dc_128( uint8_t *src, int i_stride )
{
    int row;

    for( row = 0; row < 8; row++ )
    {
        uint32_t *dst = (uint32_t*)src;
        dst[0] = dst[1] = 0x80808080;
        src += i_stride;
    }
}
/* 8x8 chroma DC prediction from the left column only: rows 0-3 are filled
 * with the rounded mean of left pixels 0-3, rows 4-7 with the rounded mean
 * of left pixels 4-7. */
static void predict_8x8c_dc_left( uint8_t *src, int i_stride )
{
    uint32_t sum_upper = 0, sum_lower = 0;
    uint32_t fill[2];
    int row;

    for( row = 0; row < 4; row++ )
    {
        sum_upper += src[row * i_stride - 1];
        sum_lower += src[(row + 4) * i_stride - 1];
    }

    fill[0] = (( sum_upper + 2 ) >> 2) * 0x01010101;
    fill[1] = (( sum_lower + 2 ) >> 2) * 0x01010101;

    /* row >> 2 selects the upper (0) or lower (1) half */
    for( row = 0; row < 8; row++ )
    {
        uint32_t *dst = (uint32_t*)src;
        dst[0] = fill[row >> 2];
        dst[1] = fill[row >> 2];
        src += i_stride;
    }
}
/* 8x8 chroma DC prediction from the top row only: the left 4 columns are
 * filled with the rounded mean of top pixels 0-3, the right 4 columns with
 * the rounded mean of top pixels 4-7. */
static void predict_8x8c_dc_top( uint8_t *src, int i_stride )
{
    const uint8_t *top = src - i_stride;
    uint32_t sum_l = 0, sum_r = 0;
    uint32_t fill_l, fill_r;
    int k;

    for( k = 0; k < 4; k++ )
    {
        sum_l += top[k];
        sum_r += top[k + 4];
    }

    fill_l = (( sum_l + 2 ) >> 2) * 0x01010101;
    fill_r = (( sum_r + 2 ) >> 2) * 0x01010101;

    for( k = 0; k < 8; k++ )
    {
        uint32_t *dst = (uint32_t*)( src + k * i_stride );
        dst[0] = fill_l;
        dst[1] = fill_r;
    }
}
/* 8x8 chroma DC prediction using both neighbours. The block is split into
 * four 4x4 quadrants, each filled with its own DC value:
 *   top-left     : mean of top pixels 0-3 and left pixels 0-3
 *   top-right    : mean of top pixels 4-7
 *   bottom-left  : mean of left pixels 4-7
 *   bottom-right : mean of top pixels 4-7 and left pixels 4-7
 * The sums are unsigned so that replicating an average >= 128 with
 * `* 0x01010101` is an unsigned multiply instead of a signed-int overflow. */
static void predict_8x8c_dc( uint8_t *src, int i_stride )
{
    int y;
    uint32_t s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    uint32_t dc0, dc1, dc2, dc3;
    int i;

    /* First do :
          s0 s1
       s2
       s3
    */
    for( i = 0; i < 4; i++ )
    {
        s0 += src[i - i_stride];
        s1 += src[i + 4 - i_stride];
        s2 += src[-1 + i * i_stride];
        s3 += src[-1 + (i+4)*i_stride];
    }

    /* now calculate
       dc0 dc1
       dc2 dc3
     */
    dc0 = (( s0 + s2 + 4 ) >> 3)*0x01010101;
    dc1 = (( s1 + 2 ) >> 2)*0x01010101;
    dc2 = (( s3 + 2 ) >> 2)*0x01010101;
    dc3 = (( s1 + s3 + 4 ) >> 3)*0x01010101;

    /* upper half */
    for( y = 0; y < 4; y++ )
    {
        uint32_t *p = (uint32_t*)src;
        *p++ = dc0;
        *p++ = dc1;
        src += i_stride;
    }

    /* lower half */
    for( y = 0; y < 4; y++ )
    {
        uint32_t *p = (uint32_t*)src;
        *p++ = dc2;
        *p++ = dc3;
        src += i_stride;
    }
}
/* 8x8 chroma horizontal prediction: every row is filled with the pixel
 * immediately to its left, using two 32-bit stores per row. */
static void predict_8x8c_h( uint8_t *src, int i_stride )
{
    int i;

    for( i = 0; i < 8; i++ )
    {
        /* The multiply must be unsigned: src[-1] promotes to int and
         * 0x01010101 * 255 does not fit in a signed 32-bit int. */
        uint32_t v = 0x01010101U * src[-1];
        uint32_t *p = (uint32_t*)src;

        *p++ = v;
        *p++ = v;

        src += i_stride;
    }
}
extern void predict_8x8c_v_mmx( uint8_t *src, int i_stride );
/****************************************************************************
* 4x4 prediction for intra luma block
****************************************************************************/
/* 4x4 DC prediction without neighbours: fill the block with the constant
 * value 0x80 (128), one 32-bit store per row. */
static void predict_4x4_dc_128( uint8_t *src, int i_stride )
{
    int row;

    for( row = 0; row < 4; row++ )
        *(uint32_t*)( src + row * i_stride ) = 0x80808080;
}
/* 4x4 DC prediction from the left column only: fill the block with the
 * rounded mean of the 4 pixels immediately to its left.
 * The replication constant is unsigned because the average is an int and
 * avg * 0x01010101 overflows a signed 32-bit int once avg >= 128. */
static void predict_4x4_dc_left( uint8_t *src, int i_stride )
{
    int y;
    uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+
                     src[-1+2*i_stride] + src[-1+3*i_stride] + 2 ) >> 2)*0x01010101U;

    for( y = 0; y < 4; y++ )
    {
        uint32_t *p = (uint32_t*)src;
        *p = dc;
        src += i_stride;
    }
}
/* 4x4 DC prediction from the top row only: fill the block with the rounded
 * mean of the 4 pixels immediately above it.
 * The replication constant is unsigned because the average is an int and
 * avg * 0x01010101 overflows a signed 32-bit int once avg >= 128. */
static void predict_4x4_dc_top( uint8_t *src, int i_stride )
{
    int y;
    uint32_t dc = (( src[0 - i_stride] + src[1 - i_stride] +
                     src[2 - i_stride] + src[3 - i_stride] + 2 ) >> 2)*0x01010101U;

    for( y = 0; y < 4; y++ )
    {
        uint32_t *p = (uint32_t*)src;
        *p = dc;
        src += i_stride;
    }
}
/* 4x4 DC prediction: fill the block with the rounded mean of the 4 pixels
 * to its left and the 4 pixels above it.
 * The replication constant is unsigned because the average is an int and
 * avg * 0x01010101 overflows a signed 32-bit int once avg >= 128. */
static void predict_4x4_dc( uint8_t *src, int i_stride )
{
    int y;
    uint32_t dc = (( src[-1+0*i_stride] + src[-1+i_stride]+
                     src[-1+2*i_stride] + src[-1+3*i_stride] +
                     src[0 - i_stride]  + src[1 - i_stride] +
                     src[2 - i_stride]  + src[3 - i_stride] + 4 ) >> 3)*0x01010101U;

    for( y = 0; y < 4; y++ )
    {
        uint32_t *p = (uint32_t*)src;
        *p = dc;
        src += i_stride;
    }
}
/* 4x4 horizontal prediction: every row is filled with the pixel immediately
 * to its left, one 32-bit store per row. */
static void predict_4x4_h( uint8_t *src, int i_stride )
{
    int i;

    for( i = 0; i < 4; i++ )
    {
        uint32_t *p = (uint32_t*)src;
        /* The multiply must be unsigned: src[-1] promotes to int and
         * 0x01010101 * 255 does not fit in a signed 32-bit int. */
        *p = 0x01010101U*src[-1];
        src += i_stride;
    }
}
static void predict_4x4_v( uint8_t *src, int i_stride )
{
uint32_t top = *((uint32_t*)&src[-i_stride]);
int i;