Commit b7d27eaa authored by Fiona Glaser

Rewrite avg/avg_weight to take two source pointers

This allows the use of get_ref instead of mc_luma almost everywhere for bipred
parent c4f3dabe
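
The substance of the change shows up first in pixel_avg_wxh below: the old averaging kernels read the first prediction back out of dst, so the list0 prediction always had to be written into the destination before the average could run. Passing both sources explicitly makes dst write-only, which is what lets get_ref hand back a pointer straight into the reference frame instead of forcing a copy through mc_luma. (Chroma still goes through mc_chroma, hence the "almost" above.) A minimal before/after sketch of the C kernel -- the _old/_new suffixes are mine, the bodies are taken from the hunk further down:

    /* old: in-place average -- the first source is dst itself */
    static void pixel_avg_wxh_old( uint8_t *dst, int i_dst,
                                   uint8_t *src, int i_src, int width, int height )
    {
        int x, y;
        for( y = 0; y < height; y++, dst += i_dst, src += i_src )
            for( x = 0; x < width; x++ )
                dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
    }

    /* new: two explicit sources, each with its own stride; dst is write-only */
    static void pixel_avg_wxh_new( uint8_t *dst, int i_dst,
                                   uint8_t *src1, int i_src1,
                                   uint8_t *src2, int i_src2, int width, int height )
    {
        int x, y;
        for( y = 0; y < height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
            for( x = 0; x < width; x++ )
                dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
    }
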
@@ -554,47 +554,63 @@ static inline void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
static inline void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
{
const int i8 = x264_scan8[0]+x+8*y;
const int i_ref0 = h->mb.cache.ref[0][i8];
const int i_ref1 = h->mb.cache.ref[1][i8];
const int mvx0 = x264_clip3( h->mb.cache.mv[0][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
const int mvx1 = x264_clip3( h->mb.cache.mv[1][i8][0], h->mb.mv_min[0], h->mb.mv_max[0] );
int mvy0 = x264_clip3( h->mb.cache.mv[0][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] );
DECLARE_ALIGNED_16( uint8_t tmp[16*16] );
int i_mode = x264_size2pixel[height][width];
x264_mb_mc_0xywh( h, x, y, width, height );
h->mc.mc_luma( tmp, 16, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
int i_mode = x264_size2pixel[height][width];
int i_stride0 = 16, i_stride1 = 16;
DECLARE_ALIGNED_16( uint8_t tmp0[16*16] );
DECLARE_ALIGNED_16( uint8_t tmp1[16*16] );
uint8_t *src0, *src1;
src0 = h->mc.get_ref( tmp0, &i_stride0, h->mb.pic.p_fref[0][i_ref0], h->mb.pic.i_stride[0],
mvx0 + 4*4*x, mvy0 + 4*4*y, 4*width, 4*height );
src1 = h->mc.get_ref( tmp1, &i_stride1, h->mb.pic.p_fref[1][i_ref1], h->mb.pic.i_stride[0],
mvx1 + 4*4*x, mvy1 + 4*4*y, 4*width, 4*height );
if( h->mb.b_interlaced & i_ref0 )
mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[0][i_ref0][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fref[0][i_ref0][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx0, mvy0, 2*width, 2*height );
if( h->param.analyse.b_weighted_bipred )
{
const int i_ref0 = h->mb.cache.ref[0][i8];
const int weight = h->mb.bipred_weight[i_ref0][i_ref1];
h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16, weight );
h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
h->mc.avg_weight[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1, weight );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16, weight );
h->mc.avg_weight[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, weight );
}
else
{
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE, tmp, 16 );
h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
h->mc.avg[i_mode]( &h->mb.pic.p_fdec[0][4*y*FDEC_STRIDE+4*x], FDEC_STRIDE,
src0, i_stride0, src1, i_stride1 );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][4][2*y*h->mb.pic.i_stride[1]+2*x], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
h->mc.mc_chroma( tmp, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
h->mc.mc_chroma( tmp0, 16, &h->mb.pic.p_fref[1][i_ref1][5][2*y*h->mb.pic.i_stride[2]+2*x], h->mb.pic.i_stride[2],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp, 16 );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16 );
}
}
@@ -49,25 +49,27 @@ static inline void pixel_avg( uint8_t *dst, int i_dst_stride,
}
}
static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height )
static inline void pixel_avg_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height )
{
int x, y;
for( y = 0; y < height; y++ )
{
for( x = 0; x < width; x++ )
{
dst[x] = ( dst[x] + src[x] + 1 ) >> 1;
dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
}
src1 += i_src1;
src2 += i_src2;
dst += i_dst;
src += i_src;
}
}
#define PIXEL_AVG_C( name, width, height ) \
static void name( uint8_t *pix1, int i_stride_pix1, \
uint8_t *pix2, int i_stride_pix2 ) \
uint8_t *pix2, int i_stride_pix2, \
uint8_t *pix3, int i_stride_pix3 ) \
{ \
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height ); \
pixel_avg_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height ); \
}
PIXEL_AVG_C( pixel_avg_16x16, 16, 16 )
PIXEL_AVG_C( pixel_avg_16x8, 16, 8 )
@@ -83,11 +85,13 @@ PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
/* Implicit weighted bipred only:
* assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64 */
#define op_scale2(x) dst[x] = x264_clip_uint8( (dst[x]*i_weight1 + src[x]*i_weight2 + (1<<5)) >> 6 )
static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 ){
#define op_scale2(x) dst[x] = x264_clip_uint8( (src1[x]*i_weight1 + src2[x]*i_weight2 + (1<<5)) >> 6 )
static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int width, int height, int i_weight1 )
{
int y;
const int i_weight2 = 64 - i_weight1;
for(y=0; y<height; y++, dst += i_dst, src += i_src){
for( y = 0; y<height; y++, dst += i_dst, src1 += i_src1, src2 += i_src2 )
{
op_scale2(0);
op_scale2(1);
if(width==2) continue;
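
As the comment above states, the implicit-bipred kernel bakes in log2_denom = 5, offset = 0, and weight1 + weight2 = 64, so with i_weight1 = 32 the weighted form must degenerate to the plain rounded average. A quick standalone check of that identity (clip_uint8 here is a stand-in for x264_clip_uint8):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t clip_uint8( int x ) /* stand-in for x264_clip_uint8 */
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    static uint8_t biweight( uint8_t s1, uint8_t s2, int i_weight1 )
    {
        int i_weight2 = 64 - i_weight1;
        return clip_uint8( ( s1*i_weight1 + s2*i_weight2 + (1<<5) ) >> 6 );
    }

    int main( void )
    {
        /* weight 32/32: (s1*32 + s2*32 + 32) >> 6 == (s1 + s2 + 1) >> 1 */
        assert( biweight( 100, 200, 32 ) == ( (100 + 200 + 1) >> 1 ) ); /* both 150 */
        return 0;
    }
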
@@ -113,9 +117,10 @@ static inline void pixel_avg_weight_wxh( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int width, int height, int i_weight1 )
#define PIXEL_AVG_WEIGHT_C( width, height ) \
static void pixel_avg_weight_##width##x##height( \
uint8_t *pix1, int i_stride_pix1, \
uint8_t *pix2, int i_stride_pix2, int i_weight1 ) \
uint8_t *pix2, int i_stride_pix2, \
uint8_t *pix3, int i_stride_pix3, int i_weight1 ) \
{ \
pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, width, height, i_weight1 ); \
pixel_avg_weight_wxh( pix1, i_stride_pix1, pix2, i_stride_pix2, pix3, i_stride_pix3, width, height, i_weight1 ); \
}
PIXEL_AVG_WEIGHT_C(16,16)
@@ -45,8 +45,8 @@ typedef struct
int mvx, int mvy,
int i_width, int i_height );
void (*avg[10])( uint8_t *dst, int, uint8_t *src, int );
void (*avg_weight[10])( uint8_t *dst, int, uint8_t *src, int, int i_weight );
void (*avg[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int );
void (*avg_weight[10])( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( uint8_t *dst, int, uint8_t *src, int, int i_height );
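
With the extra source pointer in the table, the bipred call pattern this commit is after becomes uniform: run get_ref once per list -- it returns either a pointer directly into the reference plane (updating the stride it was passed) or the interpolated block in the caller's temporary -- then combine the two results in a single call. A sketch of that pattern, with h, the refs, and the motion vectors named as in the analysis code further down:

    DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
    DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
    int stride0 = 16, stride1 = 16;
    uint8_t *src0, *src1;

    /* get_ref writes to the temporary only when interpolation is needed;
     * otherwise it returns a frame pointer and the frame's native stride */
    src0 = h->mc.get_ref( pix0, &stride0, h->mb.pic.p_fref[0][i_ref0],
                          h->mb.pic.i_stride[0], mvx0, mvy0, 16, 16 );
    src1 = h->mc.get_ref( pix1, &stride1, h->mb.pic.p_fref[1][i_ref1],
                          h->mb.pic.i_stride[0], mvx1, mvy1, 16, 16 );

    /* one average, regardless of which list (if either) needed a copy */
    h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1 );
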
@@ -41,19 +41,23 @@ SECTION .text
;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src, int src_stride );
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride );
;-----------------------------------------------------------------------------
%macro AVGH 3
%assign function_align 8 ; the whole function fits in 8 bytes, so a larger align just wastes space
cglobal x264_pixel_avg_%1x%2_%3
cglobal x264_pixel_avg_%1x%2_%3,0,0
mov eax, %2
%ifidn %3, sse2
test dword r4m, 15
jnz x264_pixel_avg_w%1_mmxext
%endif
jmp x264_pixel_avg_w%1_%3
%assign function_align 16
%endmacro
;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
; uint8_t *src, int src_stride,
; uint8_t *src1, int src1_stride, uint8_t *src2, int src2_stride,
; int height );
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
@@ -61,8 +65,10 @@ cglobal x264_pixel_avg_%1x%2_%3
%define t1 r1
%define t2 r2
%define t3 r3
%define t4 r4
%define t5 r5
%macro AVG_START 1
cglobal %1, 4,5
cglobal %1, 6,7
.height_loop:
%endmacro
%else
@@ -70,18 +76,23 @@ cglobal x264_pixel_avg_%1x%2_%3
%define t1 r2
%define t2 r3
%define t3 r4
%define t4 r5
%define t5 r6
%macro AVG_START 1
cglobal %1, 0,5
cglobal %1, 0,7
mov t0, r0m
mov t1, r1m
mov t2, r2m
mov t3, r3m
mov t4, r4m
mov t5, r5m
.height_loop:
%endmacro
%endif
%macro AVG_END 0
sub eax, 2
lea t4, [t4+t5*2]
lea t2, [t2+t3*2]
lea t0, [t0+t1*2]
jg .height_loop
@@ -91,8 +102,8 @@ cglobal x264_pixel_avg_%1x%2_%3
AVG_START x264_pixel_avg_w4_mmxext
movd mm0, [t2]
movd mm1, [t2+t3]
pavgb mm0, [t0]
pavgb mm1, [t0+t1]
pavgb mm0, [t4]
pavgb mm1, [t4+t5]
movd [t0], mm0
movd [t0+t1], mm1
AVG_END
@@ -104,8 +115,8 @@ AVGH 4, 2, mmxext
AVG_START x264_pixel_avg_w8_mmxext
movq mm0, [t2]
movq mm1, [t2+t3]
pavgb mm0, [t0]
pavgb mm1, [t0+t1]
pavgb mm0, [t4]
pavgb mm1, [t4+t5]
movq [t0], mm0
movq [t0+t1], mm1
AVG_END
@@ -119,10 +130,10 @@ AVG_START x264_pixel_avg_w16_mmxext
movq mm1, [t2+8]
movq mm2, [t2+t3 ]
movq mm3, [t2+t3+8]
pavgb mm0, [t0 ]
pavgb mm1, [t0+8]
pavgb mm2, [t0+t1 ]
pavgb mm3, [t0+t1+8]
pavgb mm0, [t4 ]
pavgb mm1, [t4+8]
pavgb mm2, [t4+t5 ]
pavgb mm3, [t4+t5+8]
movq [t0 ], mm0
movq [t0+8], mm1
movq [t0+t1 ], mm2
@@ -135,8 +146,8 @@ AVGH 16, 8, mmxext
AVG_START x264_pixel_avg_w16_sse2
movdqu xmm0, [t2]
movdqu xmm1, [t2+t3]
pavgb xmm0, [t0]
pavgb xmm1, [t0+t1]
pavgb xmm0, [t4]
pavgb xmm1, [t4+t5]
movdqa [t0], xmm0
movdqa [t0+t1], xmm1
AVG_END
@@ -480,9 +491,9 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
%endif
%endmacro
%macro BIWEIGHT 2
movh m0, %1
movh m1, %2
%macro BIWEIGHT 3
movh m0, %2
movh m1, %3
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
@@ -496,52 +507,46 @@ COPY_W16_SSE2 x264_mc_copy_w16_aligned_sse2, movdqa
%endmacro
%macro BIWEIGHT_START 1
%ifidn r4m, r4d
movd m4, r4m
movd m4, r6m
SPLATW m4, m4 ; weight_dst
%else
SPLATW m4, r4m
%endif
mova m5, [pw_64 GLOBAL]
psubw m5, m4 ; weight_src
mova m6, [pw_32 GLOBAL] ; rounding
pxor m7, m7
%if %1
%ifidn r5m, r5d
%define t0 r5d
%else
%define t0 r4d
mov r4d, r5m
%endif
%define t0 r6d
mov r6d, r7m
%endif
.height_loop:
%endmacro
INIT_MMX
;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src, int, int i_weight, int )
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int, uint8_t *src2, int, int i_weight, int )
;-----------------------------------------------------------------------------
cglobal x264_pixel_avg_weight_4x4_mmxext, 4,4
cglobal x264_pixel_avg_weight_4x4_mmxext, 6,6
BIWEIGHT_START 0
BIWEIGHT [r0 ], [r2 ]
BIWEIGHT [r0+r1 ], [r2+r3 ]
BIWEIGHT [r0+r1*2], [r2+r3*2]
BIWEIGHT [r0 ], [r2 ], [r4 ]
BIWEIGHT [r0+r1 ], [r2+r3 ], [r4+r5 ]
BIWEIGHT [r0+r1*2], [r2+r3*2], [r4+r5*2]
add r0, r1
add r2, r3
BIWEIGHT [r0+r1*2], [r2+r3*2]
add r4, r5
BIWEIGHT [r0+r1*2], [r2+r3*2], [r4+r5*2]
RET
%macro AVG_WEIGHT 2
cglobal x264_pixel_avg_weight_w%2_%1, 4,5
cglobal x264_pixel_avg_weight_w%2_%1, 6,7
BIWEIGHT_START 1
%assign x 0
%rep %2*2/mmsize
BIWEIGHT [r0+x], [r2+x]
BIWEIGHT [r0+x+r1], [r2+x+r3]
BIWEIGHT [r0+x], [r2+x], [r4+x]
BIWEIGHT [r0+x+r1], [r2+x+r3], [r4+x+r5]
%assign x x+mmsize/2
%endrep
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
lea r4, [r4+r5*2]
sub t0, 2
jg .height_loop
REP_RET
@@ -30,27 +30,27 @@
#include "mc.h"
/* NASM functions */
extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_16x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_8x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_pixel_avg_4x2_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int );
extern void x264_mc_copy_w4_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w8_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_mmx( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
extern void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_4x4_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
extern void x264_pixel_avg_weight_w8_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w8_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_sse2( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_pixel_avg_weight_w16_mmxext( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int );
extern void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
extern void x264_prefetch_ref_mmxext( uint8_t *, int, int );
extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
@@ -88,9 +88,9 @@ PIXEL_AVG_WALL(cache64_sse2)
PIXEL_AVG_WALL(sse2)
#define AVG_WEIGHT(W,H,name) \
static void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
static void x264_pixel_avg_weight_ ## W ## x ## H ## _##name( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2, int i_weight_dst ) \
{ \
x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src, i_src, i_weight_dst, H ); \
x264_pixel_avg_weight_w ## W ## _##name( dst, i_dst, src1, i_src1, src2, i_src2, i_weight_dst, H ); \
}
AVG_WEIGHT(16,16,mmxext)
@@ -311,14 +311,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_sse2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_sse2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_sse2;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
}
pf->avg_weight[PIXEL_16x16] = x264_pixel_avg_weight_16x16_sse2;
pf->avg_weight[PIXEL_16x8] = x264_pixel_avg_weight_16x8_sse2;
pf->avg_weight[PIXEL_8x16] = x264_pixel_avg_weight_8x16_sse2;
pf->avg_weight[PIXEL_8x8] = x264_pixel_avg_weight_8x8_sse2;
pf->avg_weight[PIXEL_8x4] = x264_pixel_avg_weight_8x4_sse2;
pf->hpel_filter = x264_hpel_filter_sse2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
pf->mc_chroma = x264_mc_chroma_sse2;
@@ -1474,21 +1474,21 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
}
}
#define WEIGHTED_AVG( size, pix1, stride1, src2, stride2 ) \
{ \
if( h->param.analyse.b_weighted_bipred ) \
h->mc.avg_weight[size]( pix1, stride1, src2, stride2, \
h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
else \
h->mc.avg[size]( pix1, stride1, src2, stride2 ); \
}
#define WEIGHTED_AVG( size, pix, stride, src1, stride1, src2, stride2 ) \
{ \
if( h->param.analyse.b_weighted_bipred ) \
h->mc.avg_weight[size]( pix, stride, src1, stride1, src2, stride2, \
h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref] ); \
else \
h->mc.avg[size]( pix, stride, src1, stride1, src2, stride2 ); \
}
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
DECLARE_ALIGNED_16( uint8_t pix0[16*16] );
DECLARE_ALIGNED_16( uint8_t pix1[16*16] );
DECLARE_ALIGNED_16( uint8_t pix2[16*16] );
uint8_t *src2;
int stride2 = 16;
uint8_t *src0, *src1;
int stride0 = 16, stride1 = 16;
int weight;
x264_me_t m;
@@ -1560,40 +1560,19 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
/* get cost of BI mode */
weight = h->mb.bipred_weight[a->l0.i_ref][a->l1.i_ref];
if ( (*(uint32_t*)a->l0.me16x16.mv & 0x10001) == 0 )
{
/* l0 reference is halfpel, so get_ref on it will make it faster */
src2 =
h->mc.get_ref( pix2, &stride2,
h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
16, 16 );
h->mc.mc_luma( pix1, 16,
h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
16, 16 );
weight = 64 - weight;
}
else
{
/* if l0 was qpel, we'll use get_ref on l1 instead */
h->mc.mc_luma( pix1, 16,
h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1],
16, 16 );
src2 =
h->mc.get_ref( pix2, &stride2,
h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1],
16, 16 );
}
src0 = h->mc.get_ref( pix0, &stride0,
h->mb.pic.p_fref[0][a->l0.i_ref], h->mb.pic.i_stride[0],
a->l0.me16x16.mv[0], a->l0.me16x16.mv[1], 16, 16 );
src1 = h->mc.get_ref( pix1, &stride1,
h->mb.pic.p_fref[1][a->l1.i_ref], h->mb.pic.i_stride[0],
a->l1.me16x16.mv[0], a->l1.me16x16.mv[1], 16, 16 );
if( h->param.analyse.b_weighted_bipred )
h->mc.avg_weight[PIXEL_16x16]( pix1, 16, src2, stride2, weight );
h->mc.avg_weight[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1, weight );
else
h->mc.avg[PIXEL_16x16]( pix1, 16, src2, stride2 );
h->mc.avg[PIXEL_16x16]( pix0, 16, src0, stride0, src1, stride1 );
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix1, 16 )
a->i_cost16x16bi = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix0, 16 )
+ REF_COST( 0, a->l0.i_ref )
+ REF_COST( 1, a->l1.i_ref )
+ a->l0.me16x16.cost_mv
@@ -1709,6 +1688,8 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
const int y8 = i/2;
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {8,8};
uint8_t *src[2];
for( l = 0; l < 2; l++ )
{
@@ -1727,13 +1708,12 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
x264_macroblock_cache_mv_ptr( h, 2*x8, 2*y8, 2, 2, l, m->mv );
/* BI mode */
h->mc.mc_luma( pix[l], 8, m->p_fref, m->i_stride[0],
m->mv[0], m->mv[1], 8, 8 );
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
m->mv[0], m->mv[1], 8, 8 );
i_part_cost_bi += m->cost_mv;
/* FIXME: ref cost */
}
WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, pix[1], 8 );
WEIGHTED_AVG( PIXEL_8x8, pix[0], 8, src[0], stride[0], src[1], stride[1] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_8x8]( a->l0.me8x8[i].p_fenc[0], FENC_STRIDE, pix[0], 8 )
+ a->i_lambda * i_sub_mb_b_cost_table[D_BI_8x8];
a->l0.me8x8[i].cost += a->i_lambda * i_sub_mb_b_cost_table[D_L0_8x8];
@@ -1759,7 +1739,7 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
uint8_t **p_fref[2] =
{ h->mb.pic.p_fref[0][a->l0.i_ref],
h->mb.pic.p_fref[1][a->l1.i_ref] };
DECLARE_ALIGNED_16( uint8_t pix[2][16*8] );
DECLARE_ALIGNED_4( int16_t mvc[2][2] );
int i, l;
@@ -1770,6 +1750,8 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
{
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {16,16};
uint8_t *src[2];
/* TODO: check only the list(s) that were used in b8x8? */
for( l = 0; l < 2; l++ )
@@ -1790,13 +1772,13 @@ static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a )
x264_me_search( h, m, mvc, 2 );
/* BI mode */
h->mc.mc_luma( pix[l], 16, m->p_fref, m->i_stride[0],
m->mv[0], m->mv[1], 16, 8 );
src[l] = h->mc.get_ref( pix[l], &stride[l], m->p_fref, m->i_stride[0],
m->mv[0], m->mv[1], 16, 8 );
/* FIXME: ref cost */
i_part_cost_bi += m->cost_mv;
}
WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, pix[1], 16 );
WEIGHTED_AVG( PIXEL_16x8, pix[0], 16, src[0], stride[0], src[1], stride[1] );
i_part_cost_bi += h->pixf.mbcmp[PIXEL_16x8]( a->l0.me16x8[i].p_fenc[0], FENC_STRIDE, pix[0], 16 );
i_part_cost = a->l0.me16x8[i].cost;
@@ -1839,6 +1821,8 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )
{
int i_part_cost;
int i_part_cost_bi = 0;
int stride[2] = {8,8};
uint8_t *src[2];
for( l = 0; l < 2; l++ )
{
@@ -1858,13 +1842,13 @@ static void x264_mb_analyse_inter_b8x16( x264_t *h, x264_mb_analysis_t *a )