Commit 3131a19c authored by Henrik Gramner's avatar Henrik Gramner Committed by Fiona Glaser

Fix incorrect zero-extension assumptions in x86_64 asm

Some x264 asm assumed that the high 32 bits of registers containing "int" values would be zero.
This is almost always the case, and it seems to work with gcc, but it is *not* guaranteed by the ABI.
As a result, it breaks with some other compilers, like Clang, that take advantage of this in optimizations.
Accordingly, fix all x86 code by using intptr_t instead of int or using movsxd where neccessary.
Also add checkasm hack to detect when assembly functions incorrectly assumes that 32-bit integers are zero-extended to 64-bit.
parent d52d0b1e
......@@ -32,7 +32,7 @@
// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
// They also use nothing above armv5te, but we don't care about pre-armv6
// void prefetch_ref( uint8_t *pix, int stride, int parity )
// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
function x264_prefetch_ref_arm
sub r2, r2, #1
add r0, r0, #64
......@@ -51,8 +51,8 @@ function x264_prefetch_ref_arm
bx lr
.endfunc
// void prefetch_fenc( uint8_t *pix_y, int stride_y,
// uint8_t *pix_uv, int stride_uv, int mb_x )
// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
// uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
function x264_prefetch_fenc_arm
ldr ip, [sp]
push {lr}
......@@ -78,7 +78,7 @@ function x264_prefetch_fenc_arm
.endfunc
// void *x264_memcpy_aligned( void * dst, const void * src, size_t n )
// void *x264_memcpy_aligned( void *dst, const void *src, size_t n )
function x264_memcpy_aligned_neon
orr r3, r0, r1, lsr #1
movrel ip, memcpy_table
......@@ -158,9 +158,9 @@ memzero_loop:
.endfunc
// void pixel_avg( uint8_t *dst, int dst_stride,
// uint8_t *src1, int src1_stride,
// uint8_t *src2, int src2_stride, int weight );
// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
// uint8_t *src1, intptr_t src1_stride,
// uint8_t *src2, intptr_t src2_stride, int weight );
.macro AVGH w h
function x264_pixel_avg_\w\()x\h\()_neon
ldr ip, [sp, #8]
......@@ -455,7 +455,7 @@ avg2_w20_loop:
.endif
.endm
// void mc_weight( uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst, intptr_t dst_stride,
// const x264_weight_t *weight, int height )
function x264_mc_weight_w20_neon
weight_prologue full
......@@ -744,7 +744,7 @@ weight_simple offsetadd, vqadd.u8
weight_simple offsetsub, vqsub.u8
// void mc_copy( uint8_t *dst, int dst_stride, uint8_t *src, int src_stride, int height )
// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
function x264_mc_copy_w4_neon
ldr ip, [sp]
copy_w4_loop:
......@@ -810,8 +810,8 @@ copy_w16_aligned_loop:
.endfunc
// void x264_mc_chroma_neon( uint8_t *dst, int i_dst_stride,
// uint8_t *src, int i_src_stride,
// void x264_mc_chroma_neon( uint8_t *dst, intptr_t i_dst_stride,
// uint8_t *src, intptr_t i_src_stride,
// int dx, int dy, int i_width, int i_height );
function x264_mc_chroma_neon
push {r4-r6, lr}
......@@ -1052,7 +1052,7 @@ mc_chroma_w8:
.endfunc
// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, int stride, int width)
// hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, int width )
function x264_hpel_filter_v_neon
ldr ip, [sp]
sub r1, r1, r3, lsl #1
......@@ -1266,7 +1266,7 @@ filter_h_loop:
// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv,
// uint8_t *dstc, int src_stride, int dst_stride, int width,
// uint8_t *dstc, intptr_t src_stride, intptr_t dst_stride, int width,
// int height )
function x264_frame_init_lowres_core_neon
push {r4-r10,lr}
......
......@@ -26,33 +26,33 @@
#include "common/common.h"
#include "mc.h"
void x264_prefetch_ref_arm( uint8_t *, int, int );
void x264_prefetch_fenc_arm( uint8_t *, int, uint8_t *, int, int );
void *x264_memcpy_aligned_neon( void * dst, const void * src, size_t n );
void x264_memzero_aligned_neon( void *dst, int n );
void x264_pixel_avg_16x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_16x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_8x16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_8x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_8x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_4x8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_4x4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg_4x2_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int );
void x264_pixel_avg2_w4_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w8_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w16_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int );
void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
void x264_memzero_aligned_neon( void *dst, size_t n );
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
#define MC_WEIGHT(func)\
void x264_mc_weight_w20##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int );\
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
\
static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, int, uint8_t *, int, const x264_weight_t *, int ) =\
static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
{\
x264_mc_weight_w4##func##_neon,\
x264_mc_weight_w4##func##_neon,\
......@@ -67,15 +67,15 @@ MC_WEIGHT(_nodenom)
MC_WEIGHT(_offsetadd)
MC_WEIGHT(_offsetsub)
void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
......@@ -101,7 +101,7 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
w->weightfn = x264_mc_wtab_neon;
}
static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
{
NULL,
x264_pixel_avg2_w4_neon,
......@@ -111,7 +111,7 @@ static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, in
x264_pixel_avg2_w20_neon,
};
static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int, int ) =
static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
{
NULL,
x264_mc_copy_w4_neon,
......@@ -123,13 +123,13 @@ static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, int, uint8_t *, int,
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
uint8_t *src[4], int i_src_stride,
static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
if ( (mvy&3) == 3 ) // explict if() to force conditional add
src1 += i_src_stride;
......@@ -149,13 +149,13 @@ static void mc_luma_neon( uint8_t *dst, int i_dst_stride,
x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
}
static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
uint8_t *src[4], int i_src_stride,
static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
int i_width, int i_height, const x264_weight_t *weight )
{
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
int offset = (mvy>>2)*i_src_stride + (mvx>>2);
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
if ( (mvy&3) == 3 ) // explict if() to force conditional add
src1 += i_src_stride;
......@@ -183,9 +183,9 @@ static uint8_t *get_ref_neon( uint8_t *dst, int *i_dst_stride,
}
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int stride, int width, int height, int16_t *buf )
intptr_t stride, int width, int height, int16_t *buf )
{
int realign = (intptr_t)src & 15;
intptr_t realign = (intptr_t)src & 15;
src -= realign;
dstv -= realign;
dstc -= realign;
......
......@@ -39,11 +39,11 @@
DECL_PIXELS( int, name, suffix, ( uint8_t *, int, uint8_t *, int ) )
#define DECL_X4( name, suffix ) \
DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int * ) )
DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
int x264_pixel_sad_4x4_armv6( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x8_armv6( uint8_t *, int, uint8_t *, int );
int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
DECL_X1( sad, neon )
DECL_X1( sad_aligned, neon )
......@@ -52,21 +52,21 @@ DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
int x264_pixel_sa8d_8x8_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_16x16_neon( uint8_t *, int, uint8_t *, int );
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon( uint8_t *, int );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, int );
int x264_pixel_var2_8x8_neon( uint8_t *, int, uint8_t *, int, int * );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_8x16_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_16x8_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, int );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
const uint8_t *, int,
int sums[2][4]);
void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
const uint8_t *, intptr_t,
int sums[2][4] );
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
#endif
......@@ -76,7 +76,7 @@ static const int8_t i_tc0_table[52+12*3][4] =
#define tc0_table(x) i_tc0_table[(x)+24]
/* From ffmpeg */
static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc0 )
static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc0 )
{
int p2 = pix[-3*xstride];
int p1 = pix[-2*xstride];
......@@ -107,7 +107,7 @@ static ALWAYS_INLINE void deblock_edge_luma_c( pixel *pix, int xstride, int alph
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
}
static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
static inline void deblock_luma_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
......@@ -120,21 +120,21 @@ static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alp
deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_luma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
for( int d = 0; d < 8; d++, pix += stride )
deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
}
static void deblock_v_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_v_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, stride, 1, alpha, beta, tc0 );
}
static void deblock_h_luma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_luma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_luma_c( pix, 1, stride, alpha, beta, tc0 );
}
static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int alpha, int beta, int8_t tc )
static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, intptr_t xstride, int alpha, int beta, int8_t tc )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
......@@ -148,7 +148,7 @@ static ALWAYS_INLINE void deblock_edge_chroma_c( pixel *pix, int xstride, int al
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
}
static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
......@@ -163,24 +163,24 @@ static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride,
deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_chroma_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 1, 2, stride, alpha, beta, tc0 );
}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_v_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_chroma_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
}
static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void deblock_h_chroma_422_c( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 );
}
static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta )
static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
int p2 = pix[-3*xstride];
int p1 = pix[-2*xstride];
......@@ -219,26 +219,26 @@ static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, in
}
}
}
static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
static inline void deblock_luma_intra_c( pixel *pix, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
for( int d = 0; d < 16; d++, pix += ystride )
deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
}
static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
static void deblock_h_luma_intra_mbaff_c( pixel *pix, intptr_t ystride, int alpha, int beta )
{
for( int d = 0; d < 8; d++, pix += ystride )
deblock_edge_luma_intra_c( pix, 1, alpha, beta );
}
static void deblock_v_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
static void deblock_v_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, stride, 1, alpha, beta );
}
static void deblock_h_luma_intra_c( pixel *pix, int stride, int alpha, int beta )
static void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride, int alpha, int beta )
static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, intptr_t xstride, int alpha, int beta )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
......@@ -251,25 +251,25 @@ static ALWAYS_INLINE void deblock_edge_chroma_intra_c( pixel *pix, int xstride,
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
}
static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta )
static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, intptr_t xstride, intptr_t ystride, int alpha, int beta )
{
for( int d = 0; d < height; d++, pix += ystride-2 )
for( int e = 0; e < width; e++, pix++ )
deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
}
static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
static void deblock_h_chroma_intra_mbaff_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 4, 2, stride, alpha, beta );
}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
static void deblock_v_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
}
static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
static void deblock_h_chroma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
}
static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta )
static void deblock_h_chroma_422_intra_c( pixel *pix, intptr_t stride, int alpha, int beta )
{
deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
}
......@@ -303,7 +303,8 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
}
}
static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
int a, int b, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp + a;
int index_b = i_qp + b;
......@@ -322,7 +323,8 @@ static ALWAYS_INLINE void deblock_edge( x264_t *h, pixel *pix, int i_stride, uin
pf_inter( pix, i_stride, alpha, beta, tc );
}
static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
static ALWAYS_INLINE void deblock_edge_intra( x264_t *h, pixel *pix, intptr_t i_stride, uint8_t bS[4], int i_qp,
int a, int b, int b_chroma, x264_deblock_intra_t pf_intra )
{
int index_a = i_qp + a;
int index_b = i_qp + b;
......@@ -631,30 +633,30 @@ void x264_macroblock_deblock( x264_t *h )
}
#if HAVE_MMX
void x264_deblock_v_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
......@@ -668,32 +670,32 @@ void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#if ARCH_X86
void x264_deblock_h_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#if HIGH_BIT_DEPTH
void x264_deblock_v_luma_mmx2( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_mmx2( pixel *pix, int stride, int alpha, int beta );
void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#else
// FIXME this wrapper has a significant cpu cost
static void x264_deblock_v_luma_mmx2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
static void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 );
x264_deblock_v8_luma_mmx2( pix+8, stride, alpha, beta, tc0+2 );
}
static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha, int beta )
static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta )
{
x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta );
x264_deblock_v8_luma_intra_mmx2( pix+8, stride, alpha, beta );
......@@ -703,15 +705,15 @@ static void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, int stride, int alpha,
#endif
#if ARCH_PPC
void x264_deblock_v_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC
#if HAVE_ARMV6
void x264_deblock_v_luma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_h_luma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_v_chroma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_h_chroma_neon( uint8_t *, int, int, int, int8_t * );
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha</