Commit ebc334f8 authored by Loren Merritt, committed by Fiona Glaser

Optimize predict_8x8_filter and incidentally remove a valgrind false-positive

parent 94493149
......@@ -39,8 +39,8 @@ void x264_predict_8x8c_h_neon( uint8_t *src );
void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_16x16_dc_neon( uint8_t *src );
void x264_predict_16x16_dc_top_neon( uint8_t *src );
......
......@@ -350,8 +350,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
for( int i = 0; i <= 4*PARAM_INTERLACED; i++ )
for( int j = 0; j < (CHROMA444 ? 3 : 2); j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
CHECKED_MALLOC( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
h->intra_border_backup[i][j] += 16;
if( !PARAM_INTERLACED )
h->intra_border_backup[1][j] = h->intra_border_backup[i][j];
......
......@@ -499,7 +499,7 @@ SATD_X_DECL7( _neon )
#endif // !HIGH_BIT_DEPTH
#define INTRA_MBCMP_8x8( mbcmp, cpu )\
void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[33], int res[3] )\
void x264_intra_##mbcmp##_x3_8x8##cpu( pixel *fenc, pixel edge[36], int res[3] )\
{\
ALIGNED_ARRAY_16( pixel, pix, [8*FDEC_STRIDE] );\
x264_predict_8x8_v_c( pix, edge );\
......
......@@ -117,9 +117,9 @@ typedef struct
void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] );
void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec , int res[3] );
void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[33], int res[3] );
void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[33], int res[3] );
void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[33], int res[3] );
void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
......
......@@ -2009,7 +2009,7 @@ SATD_X( 4x4 )
#define INTRA_MBCMP_8x8( mbcmp )\
void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[33], int res[3] )\
void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] )\
{\
ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\
x264_predict_8x8_v_c( pix, edge );\
......
......@@ -458,7 +458,7 @@ static void x264_predict_4x4_hu_c( pixel *src )
#define PT(x) \
edge[16+x] = F2(SRC(x-1,-1), SRC(x,-1), SRC(x+1,-1));
static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbor, int i_filters )
static void x264_predict_8x8_filter_c( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
{
/* edge[7..14] = l7..l0
* edge[15] = lt
......@@ -525,30 +525,30 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo
src += FDEC_STRIDE; \
}
/* 8x8 DC prediction from a constant rather than from neighbouring pixels.
 * The value handed to PREDICT_8x8_DC is 1 << (BIT_DEPTH-1), i.e. the midpoint
 * of a BIT_DEPTH-bit range, wrapped in PIXEL_SPLAT_X4.
 * `edge` is not referenced directly in this body.
 * NOTE(review): PREDICT_8x8_DC and PIXEL_SPLAT_X4 are defined outside this
 * view; presumably the former stores the splatted value across the 8x8 block
 * at `src` -- confirm against the macro definitions. */
static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
}
/* 8x8 DC prediction using only the left neighbours.
 * The DC value is the rounded mean of l0..l7: (sum + 4) >> 3.
 * NOTE(review): PREDICT_8x8_LOAD_LEFT is defined outside this view; presumably
 * it declares l0..l7 from `edge` (the layout comment in
 * x264_predict_8x8_filter_c lists edge[7..14] = l7..l0) -- confirm. */
static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_LEFT
/* +4 before >>3 rounds the 8-sample average to nearest. */
pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3 );
PREDICT_8x8_DC( dc );
}
/* 8x8 DC prediction using only the top neighbours.
 * The DC value is the rounded mean of t0..t7: (sum + 4) >> 3.
 * NOTE(review): PREDICT_8x8_LOAD_TOP is defined outside this view; presumably
 * it declares t0..t7 from `edge` -- confirm against the macro definition. */
static void x264_predict_8x8_dc_top_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_dc_top_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_TOP
/* +4 before >>3 rounds the 8-sample average to nearest. */
pixel4 dc = PIXEL_SPLAT_X4( (t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3 );
PREDICT_8x8_DC( dc );
}
/* 8x8 DC prediction using both the left and the top neighbours.
 * The DC value is the rounded mean of the sixteen samples l0..l7 and t0..t7:
 * (sum + 8) >> 4.
 * NOTE(review): PREDICT_8x8_LOAD_LEFT / PREDICT_8x8_LOAD_TOP are defined
 * outside this view; presumably they declare l0..l7 and t0..t7 from `edge`
 * -- confirm against the macro definitions. */
void x264_predict_8x8_dc_c( pixel *src, pixel edge[33] )
void x264_predict_8x8_dc_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_LEFT
PREDICT_8x8_LOAD_TOP
/* +8 before >>4 rounds the 16-sample average to nearest. */
pixel4 dc = PIXEL_SPLAT_X4( (l0+l1+l2+l3+l4+l5+l6+l7+t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4 );
PREDICT_8x8_DC( dc );
}
void x264_predict_8x8_h_c( pixel *src, pixel edge[33] )
void x264_predict_8x8_h_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_LEFT
#define ROW(y) MPIXEL_X4( src+y*FDEC_STRIDE+0 ) =\
......@@ -556,7 +556,7 @@ void x264_predict_8x8_h_c( pixel *src, pixel edge[33] )
ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
void x264_predict_8x8_v_c( pixel *src, pixel edge[33] )
void x264_predict_8x8_v_c( pixel *src, pixel edge[36] )
{
pixel4 top[2] = { MPIXEL_X4( edge+16 ),
MPIXEL_X4( edge+20 ) };
......@@ -566,7 +566,7 @@ void x264_predict_8x8_v_c( pixel *src, pixel edge[33] )
MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top[1];
}
}
static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_TOPRIGHT
......@@ -586,7 +586,7 @@ static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[33] )
SRC(6,7)=SRC(7,6)= F2(t13,t14,t15);
SRC(7,7)= F2(t14,t15,t15);
}
static void x264_predict_8x8_ddr_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_ddr_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_LEFT
......@@ -608,7 +608,7 @@ static void x264_predict_8x8_ddr_c( pixel *src, pixel edge[33] )
SRC(7,0)= F2(t5,t6,t7);
}
static void x264_predict_8x8_vr_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_vr_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_LEFT
......@@ -636,7 +636,7 @@ static void x264_predict_8x8_vr_c( pixel *src, pixel edge[33] )
SRC(7,1)= F2(t5,t6,t7);
SRC(7,0)= F1(t6,t7);
}
static void x264_predict_8x8_hd_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_hd_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_LEFT
......@@ -663,7 +663,7 @@ static void x264_predict_8x8_hd_c( pixel *src, pixel edge[33] )
SRC_X4(4,1)= pack_pixel_2to4(p9,p10);
SRC_X4(4,0)= pack_pixel_2to4(p10,p11);
}
static void x264_predict_8x8_vl_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_vl_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_TOP
PREDICT_8x8_LOAD_TOPRIGHT
......@@ -690,7 +690,7 @@ static void x264_predict_8x8_vl_c( pixel *src, pixel edge[33] )
SRC(7,6)= F1(t10,t11);
SRC(7,7)= F2(t10,t11,t12);
}
static void x264_predict_8x8_hu_c( pixel *src, pixel edge[33] )
static void x264_predict_8x8_hu_c( pixel *src, pixel edge[36] )
{
PREDICT_8x8_LOAD_LEFT
int p1 = pack_pixel_1to2(F1(l0,l1), F2(l0,l1,l2));
......
......@@ -28,8 +28,8 @@
#define X264_PREDICT_H
typedef void (*x264_predict_t)( pixel *src );
typedef void (*x264_predict8x8_t)( pixel *src, pixel edge[33] );
typedef void (*x264_predict_8x8_filter_t) ( pixel *src, pixel edge[33], int i_neighbor, int i_filters );
typedef void (*x264_predict8x8_t)( pixel *src, pixel edge[36] );
typedef void (*x264_predict_8x8_filter_t) ( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
enum intra_chroma_pred_e
{
......@@ -109,9 +109,9 @@ enum intra8x8_pred_e
I_PRED_8x8_DC_128 = 11,
};
void x264_predict_8x8_dc_c ( pixel *src, pixel edge[33] );
void x264_predict_8x8_h_c ( pixel *src, pixel edge[33] );
void x264_predict_8x8_v_c ( pixel *src, pixel edge[33] );
void x264_predict_8x8_dc_c ( pixel *src, pixel edge[36] );
void x264_predict_8x8_h_c ( pixel *src, pixel edge[36] );
void x264_predict_8x8_v_c ( pixel *src, pixel edge[36] );
void x264_predict_4x4_dc_c ( pixel *src );
void x264_predict_4x4_h_c ( pixel *src );
void x264_predict_4x4_v_c ( pixel *src );
......
......@@ -37,6 +37,8 @@ pw_m3: times 8 dw -3
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
shuf_fixtr: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
shuf_nop: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
SECTION .text
......@@ -596,9 +598,9 @@ cglobal predict_4x4_dc_mmx2, 1,4
%macro PREDICT_FILTER 5
;-----------------------------------------------------------------------------
;void predict_8x8_filter( pixel *src, pixel edge[33], int i_neighbor, int i_filters )
;void predict_8x8_filter( pixel *src, pixel edge[36], int i_neighbor, int i_filters )
;-----------------------------------------------------------------------------
cglobal predict_8x8_filter, 4,5,7
cglobal predict_8x8_filter, 4,6,6
add r0, 0x58*SIZEOF_PIXEL
%define src r0-0x58*SIZEOF_PIXEL
%ifndef ARCH_X86_64
......@@ -609,10 +611,13 @@ cglobal predict_8x8_filter, 4,5,7
%define t1 r1
%define t4 r4
%endif
test r3b, 0x01
test r3b, 1
je .check_top
mov t4d, r2d
and t4d, 8
neg t4
mova m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL]
punpckh%1%2 m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
punpckh%1%2 m0, [src+0*FDEC_STRIDEB-8*SIZEOF_PIXEL+t4*(FDEC_STRIDEB/8)]
mova m1, [src+2*FDEC_STRIDEB-8*SIZEOF_PIXEL]
punpckh%1%2 m1, [src+1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
punpckh%2%3 m1, m0
......@@ -628,74 +633,79 @@ cglobal predict_8x8_filter, 4,5,7
mova m2, m3
PALIGNR m4, m0, 7*SIZEOF_PIXEL, m0
PALIGNR m1, m2, 1*SIZEOF_PIXEL, m2
test r2b, 0x08
je .fix_lt_1
.do_left:
mova m0, m4
PRED8x8_LOWPASS %1, m2, m1, m4, m3, m5
mova [t1+8*SIZEOF_PIXEL], m2
mova m4, m0
PRED8x8_LOWPASS %1, m1, m3, m0, m4, m5
movd t4, m1
movzx t4d, pixel [src+7*FDEC_STRIDEB-1*SIZEOF_PIXEL]
movzx r5d, pixel [src+6*FDEC_STRIDEB-1*SIZEOF_PIXEL]
lea t4d, [t4*3+2]
add t4d, r5d
shr t4d, 2
mov [t1+7*SIZEOF_PIXEL], t4%1
.check_top:
test r3b, 0x02
test r3b, 2
je .done
.check_top:
%if SIZEOF_PIXEL==1 && cpuflag(ssse3)
INIT_XMM cpuname
movu m3, [src-1*FDEC_STRIDEB]
movhps m0, [src-1*FDEC_STRIDEB-8]
test r2b, 8
je .fix_lt_2
.do_top:
and r2d, 4
%ifdef PIC
lea r3, [shuf_fixtr]
pshufb m3, [r3+r2*4]
%else
pshufb m3, [shuf_fixtr+r2*4] ; neighbor&MB_TOPRIGHT ? shuf_nop : shuf_fixtr
%endif
psrldq m1, m3, 15
PALIGNR m2, m3, m0, 15, m0
PALIGNR m1, m3, 1, m5
PRED8x8_LOWPASS %1, m0, m2, m1, m3, m5
mova [t1+16*SIZEOF_PIXEL], m0
psrldq m0, 15
movd [t1+32*SIZEOF_PIXEL], m0
.done:
REP_RET
.fix_lt_2:
pslldq m0, m3, 15
jmp .do_top
%else
mova m0, [src-1*FDEC_STRIDEB-8*SIZEOF_PIXEL]
mova m3, [src-1*FDEC_STRIDEB]
mova m1, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
mova m2, m3
mova m4, m3
PALIGNR m2, m0, 7*SIZEOF_PIXEL, m0
PALIGNR m1, m4, 1*SIZEOF_PIXEL, m4
test r2b, 0x08
test r2b, 8
je .fix_lt_2
test r2b, 0x04
test r2b, 4
je .fix_tr_1
.do_top:
PRED8x8_LOWPASS %1, m4, m2, m1, m3, m5
PALIGNR m2, m3, m0, 7*SIZEOF_PIXEL, m0
PALIGNR m0, m1, m3, 1*SIZEOF_PIXEL, m5
PRED8x8_LOWPASS %1, m4, m2, m0, m3, m5
mova [t1+16*SIZEOF_PIXEL], m4
test r3b, 0x04
test r3b, 4
je .done
test r2b, 0x04
je .fix_tr_2
mova m0, [src-1*FDEC_STRIDEB+8*SIZEOF_PIXEL]
mova m2, m0
mova m4, m0
psrl%4 m5, m0, 7*%5
mova m2, m1
mova m4, m1
psrl%4 m5, m1, 7*%5
PALIGNR m2, m3, 7*SIZEOF_PIXEL, m3
PALIGNR m5, m4, 1*SIZEOF_PIXEL, m4
PRED8x8_LOWPASS %1, m1, m2, m5, m0, m4
jmp .do_topright
.fix_tr_2:
punpckh%1%2 m3, m3
pshuf%2 m1, m3, q3333
.do_topright:
mova [t1+24*SIZEOF_PIXEL], m1
psrl%4 m1, 7*%5
movd t4, m1
mov [t1+32*SIZEOF_PIXEL], t4%1
PRED8x8_LOWPASS %1, m0, m2, m5, m1, m4
mova [t1+24*SIZEOF_PIXEL], m0
psrl%4 m0, 7*%5
movd [t1+32*SIZEOF_PIXEL], m0
.done:
REP_RET
.fix_lt_1:
pxor m5, m3, m4
psrl%4 m5, 7*%5
psll%4 m5, 6*%5
pxor m1, m5
jmp .do_left
.fix_lt_2:
pxor m5, m3, m2
psll%4 m5, 7*%5
psrl%4 m5, 7*%5
pxor m2, m5
test r2b, 0x04
psll%4 m0, m3, 7*%5
test r2b, 4
jne .do_top
.fix_tr_1:
pxor m5, m3, m1
psrl%4 m5, 7*%5
psll%4 m5, 7*%5
pxor m1, m5
punpckh%1%2 m1, m3, m3
pshuf%2 m1, m1, q3333
jmp .do_top
%endif
%endmacro
%ifdef HIGH_BIT_DEPTH
......@@ -731,7 +741,7 @@ PREDICT_8x8_V
%endif
;-----------------------------------------------------------------------------
; void predict_8x8_h( pixel *src, pixel edge[33] )
; void predict_8x8_h( pixel *src, pixel edge[36] )
;-----------------------------------------------------------------------------
%macro PREDICT_8x8_H 2
cglobal predict_8x8_h, 2,2
......
......@@ -55,40 +55,40 @@
void x264_predict_8x8c_h_mmx2( uint8_t *src );
void x264_predict_8x8c_h_sse2( pixel *src );
void x264_predict_8x8c_h_ssse3( uint8_t *src );
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[33] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[33] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[33] );
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[33] );
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[33] );
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[33], int i_neighbor, int i_filters );
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_v_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vl_avx( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vr_ssse3( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
void x264_predict_4x4_ddl_mmx2( pixel *src );
void x264_predict_4x4_ddl_sse2( uint16_t *src );
void x264_predict_4x4_ddl_avx( uint16_t *src );
......@@ -368,7 +368,7 @@ static void x264_predict_8x8c_dc_left( uint8_t *src )
t=g; g+=h; h-=t;
#define INTRA_SA8D_X3(cpu)\
void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[33], int res[3] )\
void x264_intra_sa8d_x3_8x8_##cpu( uint8_t *fenc, uint8_t edge[36], int res[3] )\
{\
PREDICT_8x8_LOAD_TOP\
PREDICT_8x8_LOAD_LEFT\
......
......@@ -424,7 +424,7 @@ INIT_XMM avx
INTRA_SADx3_4x4
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[36], int res[3]);
;-----------------------------------------------------------------------------
;m0 = DC
......
......@@ -834,7 +834,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
/* 8x8 prediction selection */
if( flags & X264_ANALYSE_I8x8 )
{
ALIGNED_ARRAY_16( pixel, edge,[33] );
ALIGNED_ARRAY_16( pixel, edge,[36] );
x264_pixel_cmp_t sa8d = (h->pixf.mbcmp[0] == h->pixf.satd[0]) ? h->pixf.sa8d[PIXEL_8x8] : h->pixf.mbcmp[PIXEL_8x8];
int i_satd_thresh = a->i_mbrd ? COST_MAX : X264_MIN( i_satd_inter, a->i_satd_i16x16 );
......
......@@ -171,7 +171,7 @@ void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
ALIGNED_ARRAY_16( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_16( pixel, edge_buf,[33] );
ALIGNED_ARRAY_16( pixel, edge_buf,[36] );
if( !edge )
{
......@@ -519,7 +519,7 @@ void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_m
h->predict_4x4[i_mode]( p_dst );
}
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[33] )
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] )
{
int stride = h->fenc->i_stride[p] << MB_INTERLACED;
pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
......
......@@ -43,7 +43,7 @@ int x264_macroblock_probe_skip( x264_t *h, int b_bidir );
void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[33] );
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
void x264_macroblock_encode ( x264_t *h );
......
......@@ -622,7 +622,7 @@ skip_motionest:
lowres_intra_mb:
if( !fenc->b_intra_calculated )
{
ALIGNED_ARRAY_16( pixel, edge,[33] );
ALIGNED_ARRAY_16( pixel, edge,[36] );
pixel *pix = &pix1[8+FDEC_STRIDE - 1];
pixel *src = &fenc->lowres[0][i_pel_offset - 1];
const int intra_penalty = 5 * a->i_lambda;
......
......@@ -244,7 +244,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
x264_pixel_function_t pixel_asm;
x264_predict8x8_t predict_8x8[9+3];
x264_predict_8x8_filter_t predict_8x8_filter;
ALIGNED_16( pixel edge[33] );
ALIGNED_16( pixel edge[36] );
uint16_t cost_mv[32];
int ret = 0, ok, used_asm;
......@@ -1784,8 +1784,8 @@ static int check_quant( int cpu_ref, int cpu_new )
static int check_intra( int cpu_ref, int cpu_new )
{
int ret = 0, ok = 1, used_asm = 0;
ALIGNED_16( pixel edge[33] );
ALIGNED_16( pixel edge2[33] );
ALIGNED_16( pixel edge[36] );
ALIGNED_16( pixel edge2[36] );
ALIGNED_16( pixel fdec[FDEC_STRIDE*20] );
struct
{
......@@ -1864,10 +1864,16 @@ static int check_intra( int cpu_ref, int cpu_new )
used_asm = 1;
for( int i = 0; i < 32; i++ )
{
memcpy( edge2, edge, 33 * sizeof(pixel) );
call_c(ip_c.predict_8x8_filter, pbuf1+48, edge, (i&24)>>1, i&7);
call_a(ip_a.predict_8x8_filter, pbuf1+48, edge2, (i&24)>>1, i&7);
if( memcmp( edge, edge2, 33 * sizeof(pixel) ) )
if( !(i&7) || ((i&MB_TOPRIGHT) && !(i&MB_TOP)) )
continue;
int neighbor = (i&24)>>1;
memset( edge, 0, sizeof(edge) );
memset( edge2, 0, sizeof(edge2) );
call_c( ip_c.predict_8x8_filter, pbuf1+48, edge, neighbor, i&7 );
call_a( ip_a.predict_8x8_filter, pbuf1+48, edge2, neighbor, i&7 );
if( !(neighbor&MB_TOPLEFT) )
edge[15] = edge2[15] = 0;
if( memcmp( edge+7, edge2+7, (i&MB_TOPRIGHT ? 26 : i&MB_TOP ? 17 : 8) * sizeof(pixel) ) )
{
fprintf( stderr, "predict_8x8_filter : [FAILED] %d %d\n", (i&24)>>1, i&7);
ok = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment