Commit 42d57caa authored by Fiona Glaser

Merge avg and avg_weight

avg_weight no longer has to be special-cased in the code; faster weightb
parent b7d27eaa
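In short: bipred call sites used to branch between the plain and the weighted average; now there is a single avg table whose functions take the bipred weight directly, with 32 (an even 32/32 split, since the implicit weights sum to 64) acting as the neutral value. Condensed from the hunks below, with dst standing in for the p_fdec expression used there:

    /* before: every bipred call site special-cased weighted prediction */
    if( h->param.analyse.b_weighted_bipred )
        h->mc.avg_weight[i_mode]( dst, FDEC_STRIDE, src0, i_stride0, src1, i_stride1, weight );
    else
        h->mc.avg[i_mode]( dst, FDEC_STRIDE, src0, i_stride0, src1, i_stride1 );

    /* after: one call; weight == 32 takes the plain-average fast path
     * inside the avg implementation itself */
    h->mc.avg[i_mode]( dst, FDEC_STRIDE, src0, i_stride0, src1, i_stride1, weight );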
@@ -556,6 +556,7 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
@@ -570,48 +571,24 @@ static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int he
mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
if( h->mb.b_interlaced & i_ref0 )
mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
if( h->param.analyse.b_weighted_bipred )
{
const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
}
else
{
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1 );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
}
h->mc.mc_chroma( tmp1, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
}
static void x264_mb_mc_direct8x8( x264_t *h, int x, int y )
......
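The chroma half of x264_mb_mc_01xywh follows the same pattern: both references are now motion-compensated into the tmp0/tmp1 scratch buffers and blended with the merged avg call, rather than compensating list 0 straight into p_fdec and weighting list 1 over it. Schematically, per chroma plane (ref0_cb, ref1_cb and fdec_cb abbreviate the p_fref/p_fdec expressions above; the Cr plane repeats this with plane index 5 and i_stride[2]):

    h->mc.mc_chroma( tmp0, 16, ref0_cb, h->mb.pic.i_stride[1], mvx0, mvy0, 2*width, 2*height );
    h->mc.mc_chroma( tmp1, 16, ref1_cb, h->mb.pic.i_stride[1], mvx1, mvy1, 2*width, 2*height );
    h->mc.avg[i_mode+3]( fdec_cb, FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );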
@@ -64,25 +64,6 @@ static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_
}
}
#define PIXEL_AVG_C( name, width, height ) \
static void name( uint8_t *pix1, int i_stride_pix1, \
uint8_t *pix2, int i_stride_pix2, \
uint8_t *pix3, int i_stride_pix3 ) \
{ \
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
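The identity that makes the merge safe: with the two weights summing to 64, op_scale2 at weight 32 reduces to the ordinary rounded average, since (a*32 + b*32 + 32) >> 6 == (a + b + 1) >> 1 for all pixel values. A throwaway program to check that exhaustively:

    #include <assert.h>

    int main( void )
    {
        for( int a = 0; a < 256; a++ )
            for( int b = 0; b < 256; b++ )
                /* op_scale2 with i_weight1 == i_weight2 == 32 vs. rounded average */
                assert( ((a*32 + b*32 + 32) >> 6) == ((a + b + 1) >> 1) );
        return 0;
    }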
@@ -113,28 +94,28 @@ static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1,
op_scale2(15);
}
}
#undef op_scale2
#define PIXEL_AVG_WEIGHT_C( width, height ) \
static void pixel_avg_weight_##width##x##height( \
uint8_t *pix1, int i_stride_pix1, \
uint8_t *pix2, int i_stride_pix2, \
uint8_t *pix3, int i_stride_pix3, int i_weight1 ) \
#define PIXEL_AVG_C( name, width, height ) \
static void name( uint8_t *pix1, int i_stride_pix1, \
uint8_t *pix2, int i_stride_pix2, \
uint8_t *pix3, int i_stride_pix3, int weight ) \
{ \
pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, i_weight1 ); \
if( weight == 32 )\
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
else\
pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, weight ); \
}
PIXEL_AVG_WEIGHT_C(16,16)
PIXEL_AVG_WEIGHT_C(16,8)
PIXEL_AVG_WEIGHT_C(8,16)
PIXEL_AVG_WEIGHT_C(8,8)
PIXEL_AVG_WEIGHT_C(8,4)
PIXEL_AVG_WEIGHT_C(4,8)
PIXEL_AVG_WEIGHT_C(4,4)
PIXEL_AVG_WEIGHT_C(4,2)
PIXEL_AVG_WEIGHT_C(2,4)
PIXEL_AVG_WEIGHT_C(2,2)
#undef op_scale2
#undef PIXEL_AVG_WEIGHT_C
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
static void mc_copy( uint8_t *src, int i_src_stride, uint8_t *dst, int i_dst_stride, int i_width, int i_height )
{
@@ -358,17 +339,6 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
pf->avg_weight[PIXEL_16x16]= pixel_avg_weight_16x16;
pf->avg_weight[PIXEL_16x8] = pixel_avg_weight_16x8;
pf->avg_weight[PIXEL_8x16] = pixel_avg_weight_8x16;
pf->avg_weight[PIXEL_8x8] = pixel_avg_weight_8x8;
pf->avg_weight[PIXEL_8x4] = pixel_avg_weight_8x4;
pf->avg_weight[PIXEL_4x8] = pixel_avg_weight_4x8;
pf->avg_weight[PIXEL_4x4] = pixel_avg_weight_4x4;
pf->avg_weight[PIXEL_4x2] = pixel_avg_weight_4x2;
pf->avg_weight[PIXEL_2x4] = pixel_avg_weight_2x4;
pf->avg_weight[PIXEL_2x2] = pixel_avg_weight_2x2;
pf->copy[PIXEL_16x16] = mc_copy_w16;
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
......
@@ -45,8 +45,7 @@ typedef struct
int mvx, int mvy,
int i_width, int i_height );
void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int );
void (*avg_weight[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
......
@@ -36,30 +36,10 @@ sw_64: dd 64
SECTION .text
;=============================================================================
; pixel avg
; weighted prediction
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride );
;-----------------------------------------------------------------------------
%macro AVGH 3
%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
cglobal x264_pixel_avg_%1x%2_%3,0,0
mov eax, %2
%ifidn %3, sse2
test dword r4m, 15
jnz x264_pixel_avg_w%1_mmxext
%endif
jmp x264_pixel_avg_w%1_%3
%assign function_align 16
%endmacro
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
; int height );
;-----------------------------------------------------------------------------
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%ifdef ARCH_X86_64
%define t0 r0
%define t1 r1
@@ -69,7 +49,6 @@ cglobal x264_pixel_avg_%1x%2_%3,0,0
%define t5 r5
%macro AVG_START 1
cglobal %1, 6,7
.height_loop:
%endmacro
%else
%define t0 r1
@@ -86,10 +65,100 @@ cglobal x264_pixel_avg_%1x%2_%3,0,0
mov t3, r3m
mov t4, r4m
mov t5, r5m
.height_loop:
%endmacro
%endif
%macro SPLATW 2 ; broadcast the low word of %2 across all lanes of %1
%if mmsize==16
pshuflw %1, %2, 0
movlhps %1, %1
%else
pshufw %1, %2, 0
%endif
%endmacro
%macro BIWEIGHT 3 ; dst, src1, src2
movh m0, %2        ; load pixels from src1
movh m1, %3        ; load pixels from src2
punpcklbw m0, m7   ; unpack bytes to words (m7 = 0)
punpcklbw m1, m7
pmullw m0, m4      ; * weight_dst
pmullw m1, m5      ; * weight_src (= 64 - weight_dst)
paddw m0, m1
paddw m0, m6       ; + 32 rounding
psraw m0, 6        ; >> 6, since the weights sum to 64
pmaxsw m0, m7      ; clamp below at zero
packuswb m0, m0    ; saturate words back to bytes
movh %1, m0        ; store to dst
%endmacro
%macro BIWEIGHT_START 0
movd m4, r6m
SPLATW m4, m4 ; weight_dst
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
pxor m7, m7
.height_loop:
%endmacro
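Note where .height_loop went: AVG_START used to emit the label itself, but the weighted kernels need the BIWEIGHT_START setup (weight broadcast, rounding constant) to run once, before the loop, so the label is now emitted after the per-function setup: by BIWEIGHT_START here, and explicitly by each plain kernel further down. Resulting shape of a weighted kernel (schematic only; N and cpu stand for the width and instruction set):

    AVG_START x264_pixel_avg_weight_wN_cpu ; argument loading only, no label
    BIWEIGHT_START                         ; weight setup, then emits .height_loop:
        ; ... two rows of weighted averaging per iteration ...
        sub eax, 2
        jg .height_loop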
INIT_MMX
;-----------------------------------------------------------------------------
; void x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2
AVG_START x264_pixel_avg_weight_w%2_%1
BIWEIGHT_START
%assign x 0
%rep %2*2/mmsize
BIWEIGHT [t0+x], [t2+x], [t4+x]
BIWEIGHT [t0+x+t1], [t2+x+t3], [t4+x+t5]
%assign x x+mmsize/2
%endrep
lea t0, [t0+t1*2]
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
sub eax, 2
jg .height_loop
REP_RET
%endmacro
AVG_WEIGHT mmxext, 4
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
AVG_WEIGHT sse2, 8
AVG_WEIGHT sse2, 16
;=============================================================================
; pixel avg
;=============================================================================
;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 3
cglobal x264_pixel_avg_%1x%2_%3,0,0
mov eax, %2                          ; height
cmp dword r6m, 32                    ; weight 32 = even split = plain average
jne x264_pixel_avg_weight_w%1_mmxext
%if mmsize == 16 && %1 == 16
test dword r4m, 15                   ; is src2 16-byte aligned?
jz x264_pixel_avg_w%1_sse2
%endif
jmp x264_pixel_avg_w%1_mmxext
%endmacro
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
; int height, int weight );
;-----------------------------------------------------------------------------
%macro AVG_END 0
sub eax, 2
lea t4, [t4+t5*2]
@@ -99,7 +168,10 @@ cglobal x264_pixel_avg_%1x%2_%3,0,0
REP_RET
%endmacro
INIT_MMX
AVG_START x264_pixel_avg_w4_mmxext
.height_loop:
movd mm0, [t2]
movd mm1, [t2+t3]
pavgb mm0, [t4]
@@ -113,6 +185,7 @@ AVGH 4, 4, mmxext
AVGH 4, 2, mmxext
AVG_START x264_pixel_avg_w8_mmxext
.height_loop:
movq mm0, [t2]
movq mm1, [t2+t3]
pavgb mm0, [t4]
@@ -126,6 +199,7 @@ AVGH 8, 8, mmxext
AVGH 8, 4, mmxext
AVG_START x264_pixel_avg_w16_mmxext
.height_loop:
movq mm0, [t2 ]
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
@@ -144,6 +218,7 @@ AVGH 16, 16, mmxext
AVGH 16, 8, mmxext
AVG_START x264_pixel_avg_w16_sse2
.height_loop:
movdqu xmm0, [t2]
movdqu xmm1, [t2+t3]
pavgb xmm0, [t4]
@@ -152,8 +227,12 @@ AVG_START x264_pixel_avg_w16_sse2
movdqa [t0+t1], xmm1
AVG_END
INIT_XMM
AVGH 16, 16, sse2
AVGH 16, 8, sse2
AVGH 8, 16, sse2
AVGH 8, 8, sse2
AVGH 8, 4, sse2
@@ -476,90 +555,6 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
%macro SPLATW 2
%if mmsize==16
pshuflw %1, %2, 0
movlhps %1, %1
%else
pshufw %1, %2, 0
%endif
%endmacro
%macro BIWEIGHT 3
movh m0, %2
movh m1, %3
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m5
paddw m0, m1
paddw m0, m6
psraw m0, 6
pmaxsw m0, m7
packuswb m0, m0
movh %1, m0
%endmacro
%macro BIWEIGHT_START 1
movd m4, r6m
SPLATW m4, m4 ; weight_dst
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
pxor m7, m7
%if %1
%define t0 r6d
mov r6d, r7m
%endif
.height_loop:
%endmacro
INIT_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_4x4_mmxext, 6,6
BIWEIGHT_START 0
BIWEIGHT [r0 ], [r2 ], [r4 ]
BIWEIGHT [r0+r1 ], [r2+r3 ], [r4+r5 ]
BIWEIGHT [r0+r1*2], [r2+r3*2], [r4+r5*2]
add r0, r1
add r2, r3
add r4, r5
BIWEIGHT [r0+r1*2], [r2+r3*2], [r4+r5*2]
RET
%macro AVG_WEIGHT 2
cglobal x264_pixel_avg_weight_w%2_%1, 6,7
BIWEIGHT_START 1
%assign x 0
%rep %2*2/mmsize
BIWEIGHT [r0+x], [r2+x], [r4+x]
BIWEIGHT [r0+x+r1], [r2+x+r3], [r4+x+r5]
%assign x x+mmsize/2
%endrep
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
lea r4, [r4+r5*2]
sub t0, 2
jg .height_loop
REP_RET
%endmacro
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
AVG_WEIGHT sse2, 8
AVG_WEIGHT sse2, 16
;=============================================================================
; prefetch
;=============================================================================
......
@@ -30,27 +30,25 @@
#include "mc.h"
/* NASM functions */
extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x4_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
@@ -87,23 +85,6 @@ PIXEL_AVG_WALL(cache64_mmxext)
PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
#define AVG_WEIGHT(W,H,name) \
static void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int i_weight_dst ) \
{ \
x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src1, i_src1, src2, i_src2, i_weight_dst, H ); \
}
AVG_WEIGHT(16,16,mmxext)
AVG_WEIGHT(16,8,mmxext)
AVG_WEIGHT(8,16,mmxext)
AVG_WEIGHT(8,8,mmxext)
AVG_WEIGHT(8,4,mmxext)
AVG_WEIGHT(16,16,sse2)
AVG_WEIGHT(16,8,sse2)
AVG_WEIGHT(8,16,sse2)
AVG_WEIGHT(8,8,sse2)
AVG_WEIGHT(8,4,sse2)
#define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
{\
@@ -268,14 +249,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_mmxext;
pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_mmxext;
pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_mmxext;
pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_mmxext;
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_mmxext;
pf->avg_weight[PIXEL_4x4] = x264_pixel_avg_weight_4x4_mmxext;
// avg_weight_4x8 is rare and 4x2 is not used
pf->plane_copy = x264_plane_copy_mmxext;
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
@@ -311,11 +284,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_sse2;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_sse2;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
......
@@ -1476,11 +1476,7 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
{ \
if( h->param.analyse.b_weighted_bipred ) \
h->mc.avg_weight[size]( pix, stride, src1, stride1, src2, stride2, \
h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
else \
h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2 ); \
h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
}
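Since WEIGHTED_AVG no longer consults b_weighted_bipred at all, correctness rests on h->mb.bipred_weight holding the neutral value whenever weighted bipred is disabled. That initialization lives outside this diff; the assumed invariant, sketched rather than quoted:

    /* assumed invariant, not code from this commit: with weighted bipred off,
     * every table entry is the neutral 32, so avg() always takes its
     * plain-average path */
    if( !h->param.analyse.b_weighted_bipred )
        h->mb.bipred_weight[i_ref0][i_ref1] = 32;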
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
@@ -1489,7 +1485,6 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
uint8_t *src0, *src1;
int stride0 = 16, stride1 = 16;
int weight;
x264_me_t m;
int i_ref, i_mvc;
@@ -1559,7 +1554,6 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
x264_macroblock_cache_ref( h, 0, 0, 4, 4, 1, a->l1.i_ref );
/* get cost of BI mode */
weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
src0 = h->mc.get_ref( pix0, &stride0,
h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
@@ -1567,10 +1561,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
if( h->param.analyse.b_weighted_bipred )
h->mc.avg_weight[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, weight );
else
h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1 );
h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ REF_COST( 0, a->l0.i_ref )
@@ -1713,7 +1704,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, src[0], stride[0], src[1], stride[1] );
h->mc.avg[PIXEL_8x8]( pix[0], 8, src[0], stride[0], src[1], stride[1], h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
@@ -1777,8 +1768,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, src[0], stride[0