Commit bcf540a8 authored by Fiona Glaser

Merge array_non_zero into zigzag_sub

Faster lossless, cleaner code.
SSSE3 version of zigzag_sub_4x4_field, faster lossless interlaced coding.
parent 5394872f
...@@ -553,13 +553,13 @@ void x264_dct_init_weights( void ) ...@@ -553,13 +553,13 @@ void x264_dct_init_weights( void )
ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7) ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
#define ZIGZAG4_FRAME\ #define ZIGZAG4_FRAME\
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\ ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\ ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\ ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
#define ZIGZAG4_FIELD\ #define ZIGZAG4_FIELD\
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\ ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\ ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\ ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3) ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
...@@ -576,6 +576,7 @@ static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] ) ...@@ -576,6 +576,7 @@ static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
#undef ZIG #undef ZIG
#define ZIG(i,y,x) level[i] = dct[0][x*4+y]; #define ZIG(i,y,x) level[i] = dct[0][x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] ) static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{ {
...@@ -596,6 +597,7 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] ) ...@@ -596,6 +597,7 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
int oe = x+y*FENC_STRIDE;\ int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\ int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\ level[i] = p_src[oe] - p_dst[od];\
nz |= level[i];\
} }
#define COPY4x4\ #define COPY4x4\
*(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\ *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
...@@ -612,27 +614,59 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] ) ...@@ -612,27 +614,59 @@ static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
*(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\ *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
*(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE); *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
int nz = 0;
ZIGZAG4_FRAME
COPY4x4
return !!nz;
}
/* Residual of one 4x4 block, stored in field scan order.
 *
 * ZIGZAG4_FIELD expands to one ZIGDC/ZIG entry per output index; with the
 * ZIG definition in effect here each entry stores p_src - p_dst for its
 * (x,y) position into level[i] and ORs that value into nz. At this point
 * ZIGDC is still an alias of ZIG, so index 0 is handled like the others.
 * COPY4x4 then copies the source rows over p_dst.
 *
 * level: receives the 16 differences.
 * p_src: source pixels, addressed with FENC_STRIDE.
 * p_dst: destination pixels, addressed with FDEC_STRIDE; overwritten by
 *        COPY4x4 after the differences have been taken.
 * Returns 1 if any stored difference was nonzero, 0 otherwise. */
static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
int nz = 0; /* accumulated by the ZIG expansions below */
ZIGZAG4_FIELD
COPY4x4
return !!nz; /* collapse the OR of all differences to 0/1 */
}
/* Redefine the handler for the first scan position for the sub_4x4ac
 * variants that follow: the difference at that position is written to *dc
 * and level[0] is stored as 0. Unlike ZIG, this expansion does not touch
 * nz, so the value returned by the sub_4x4ac functions depends only on the
 * remaining 15 entries. Relies on p_src, p_dst, level and dc being in scope
 * at the expansion site. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
*dc = p_src[oe] - p_dst[od];\
level[0] = 0;\
}
static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
{ {
int nz = 0;
ZIGZAG4_FRAME ZIGZAG4_FRAME
COPY4x4 COPY4x4
return !!nz;
} }
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ) static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
{ {
int nz = 0;
ZIGZAG4_FIELD ZIGZAG4_FIELD
COPY4x4 COPY4x4
return !!nz;
} }
static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst ) static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{ {
int nz = 0;
ZIGZAG8_FRAME ZIGZAG8_FRAME
COPY8x8 COPY8x8
return !!nz;
} }
static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst ) static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{ {
int nz = 0;
ZIGZAG8_FIELD ZIGZAG8_FIELD
COPY8x8 COPY8x8
return !!nz;
} }
#undef ZIG #undef ZIG
...@@ -661,9 +695,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -661,9 +695,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = zigzag_scan_4x4_field; pf->scan_4x4 = zigzag_scan_4x4_field;
pf->sub_8x8 = zigzag_sub_8x8_field; pf->sub_8x8 = zigzag_sub_8x8_field;
pf->sub_4x4 = zigzag_sub_4x4_field; pf->sub_4x4 = zigzag_sub_4x4_field;
pf->sub_4x4ac = zigzag_sub_4x4ac_field;
#ifdef HAVE_MMX #ifdef HAVE_MMX
if( cpu&X264_CPU_MMXEXT ) if( cpu&X264_CPU_MMXEXT )
pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext; pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
if( cpu&X264_CPU_SSSE3 )
{
pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
}
#endif #endif
#ifdef ARCH_PPC #ifdef ARCH_PPC
...@@ -677,6 +717,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -677,6 +717,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->scan_4x4 = zigzag_scan_4x4_frame; pf->scan_4x4 = zigzag_scan_4x4_frame;
pf->sub_8x8 = zigzag_sub_8x8_frame; pf->sub_8x8 = zigzag_sub_8x8_frame;
pf->sub_4x4 = zigzag_sub_4x4_frame; pf->sub_4x4 = zigzag_sub_4x4_frame;
pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
#ifdef HAVE_MMX #ifdef HAVE_MMX
if( cpu&X264_CPU_MMX ) if( cpu&X264_CPU_MMX )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx; pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
...@@ -687,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced ) ...@@ -687,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
if( cpu&X264_CPU_SSSE3 ) if( cpu&X264_CPU_SSSE3 )
{ {
pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3; pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3; pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
if( cpu&X264_CPU_SHUFFLE_IS_FAST ) if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3; pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
......
...@@ -118,8 +118,9 @@ typedef struct ...@@ -118,8 +118,9 @@ typedef struct
{ {
void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] ); void (*scan_8x8)( int16_t level[64], int16_t dct[8][8] );
void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] ); void (*scan_4x4)( int16_t level[16], int16_t dct[4][4] );
void (*sub_8x8)( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst ); int (*sub_8x8) ( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst );
void (*sub_4x4)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst ); int (*sub_4x4) ( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst );
int (*sub_4x4ac)( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc );
void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz ); void (*interleave_8x8_cavlc)( int16_t *dst, int16_t *src, uint8_t *nnz );
} x264_zigzag_function_t; } x264_zigzag_function_t;
......
...@@ -401,8 +401,8 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x, ...@@ -401,8 +401,8 @@ static ALWAYS_INLINE void x264_macroblock_cache_intra8x8_pred( x264_t *h, int x,
cache[0] = cache[1] = cache[8] = cache[9] = i_mode; cache[0] = cache[1] = cache[8] = cache[9] = i_mode;
} }
#define array_non_zero(a) array_non_zero_int(a, sizeof(a)) #define array_non_zero(a) array_non_zero_int(a, sizeof(a))
#define array_non_zero_int array_non_zero_int_c #define array_non_zero_int array_non_zero_int
static ALWAYS_INLINE int array_non_zero_int_c( void *v, int i_count ) static ALWAYS_INLINE int array_non_zero_int( void *v, int i_count )
{ {
union {uint16_t s[4]; uint64_t l;} *x = v; union {uint16_t s[4]; uint64_t l;} *x = v;
if(i_count == 8) if(i_count == 8)
......
...@@ -31,6 +31,8 @@ pw_32: times 8 dw 32 ...@@ -31,6 +31,8 @@ pw_32: times 8 dw 32
pw_8000: times 8 dw 0x8000 pw_8000: times 8 dw 0x8000
hsub_mul: times 8 db 1, -1 hsub_mul: times 8 db 1, -1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
pb_subacmask: dw 0,-1,-1,-1,-1,-1,-1,-1
pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 pb_idctdc_unpack: db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
...@@ -780,7 +782,12 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3 ...@@ -780,7 +782,12 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst ) ; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8 %macro ZIGZAG_SUB_4x4 2
%ifidn %1, ac
cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 4,4,8
%else
cglobal x264_zigzag_sub_4x4%1_%2_ssse3, 3,3,8
%endif
movd xmm0, [r1+0*FENC_STRIDE] movd xmm0, [r1+0*FENC_STRIDE]
movd xmm1, [r1+1*FENC_STRIDE] movd xmm1, [r1+1*FENC_STRIDE]
movd xmm2, [r1+2*FENC_STRIDE] movd xmm2, [r1+2*FENC_STRIDE]
...@@ -799,7 +806,11 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8 ...@@ -799,7 +806,11 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
punpckldq xmm6, xmm7 punpckldq xmm6, xmm7
punpcklqdq xmm0, xmm2 punpcklqdq xmm0, xmm2
punpcklqdq xmm4, xmm6 punpcklqdq xmm4, xmm6
%ifidn %2, frame
movdqa xmm7, [pb_sub4frame GLOBAL] movdqa xmm7, [pb_sub4frame GLOBAL]
%else
movdqa xmm7, [pb_sub4field GLOBAL]
%endif
pshufb xmm0, xmm7 pshufb xmm0, xmm7
pshufb xmm4, xmm7 pshufb xmm4, xmm7
pxor xmm6, xmm6 pxor xmm6, xmm6
...@@ -811,9 +822,28 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8 ...@@ -811,9 +822,28 @@ cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3,8
punpckhbw xmm5, xmm6 punpckhbw xmm5, xmm6
psubw xmm0, xmm4 psubw xmm0, xmm4
psubw xmm1, xmm5 psubw xmm1, xmm5
%ifidn %1, ac
movd r2d, xmm0
pand xmm0, [pb_subacmask GLOBAL]
%endif
movdqa [r0], xmm0 movdqa [r0], xmm0
pxor xmm2, xmm2
movdqa [r0+16], xmm1 movdqa [r0+16], xmm1
por xmm0, xmm1
pcmpeqb xmm0, xmm2
pmovmskb eax, xmm0
%ifidn %1, ac
mov [r3], r2w
%endif
sub eax, 0xffff
shr eax, 31
RET RET
%endmacro
ZIGZAG_SUB_4x4 , frame
ZIGZAG_SUB_4x4 ac, frame
ZIGZAG_SUB_4x4 , field
ZIGZAG_SUB_4x4 ac, field
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz ) ; void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz )
......
...@@ -68,7 +68,10 @@ void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] ); ...@@ -68,7 +68,10 @@ void x264_zigzag_scan_8x8_frame_mmxext( int16_t level[64], int16_t dct[8][8] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] ); void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] ); void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] ); void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
void x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz ); void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
......
...@@ -74,39 +74,6 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t ...@@ -74,39 +74,6 @@ static inline int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t
sum += output[0] + output[1] + output[2] + output[3]; sum += output[0] + output[1] + output[2] + output[3];
return sum; return sum;
} }
/* MMX override of array_non_zero_int.
 *
 * Only the i_count == 128 case is specialised; every other size is forwarded
 * to array_non_zero_int_c unchanged.
 *
 * v:       start of the buffer to test.
 * i_count: size passed by the array_non_zero() wrapper (sizeof of the array).
 * Returns 1 if any of the 128 bytes at v is nonzero, 0 otherwise
 * (or whatever array_non_zero_int_c returns for other sizes). */
#undef array_non_zero_int
#define array_non_zero_int array_non_zero_int_mmx
static ALWAYS_INLINE int array_non_zero_int_mmx( void *v, int i_count )
{
if(i_count == 128)
{
int nonzero = 0;
/* OR together sixteen 8-byte loads (offsets 0..120) into mm0, then pack
 * the four 16-bit lanes down to bytes so the whole accumulator fits in the
 * 32 bits moved out by movd. The pack saturates, so a nonzero lane stays
 * nonzero. */
asm(
"movq (%1), %%mm0 \n"
"por 8(%1), %%mm0 \n"
"por 16(%1), %%mm0 \n"
"por 24(%1), %%mm0 \n"
"por 32(%1), %%mm0 \n"
"por 40(%1), %%mm0 \n"
"por 48(%1), %%mm0 \n"
"por 56(%1), %%mm0 \n"
"por 64(%1), %%mm0 \n"
"por 72(%1), %%mm0 \n"
"por 80(%1), %%mm0 \n"
"por 88(%1), %%mm0 \n"
"por 96(%1), %%mm0 \n"
"por 104(%1), %%mm0 \n"
"por 112(%1), %%mm0 \n"
"por 120(%1), %%mm0 \n"
"packsswb %%mm0, %%mm0 \n"
"movd %%mm0, %0 \n"
:"=r"(nonzero)
/* the "m" operand tells the compiler the asm reads the 64 int16_t at v */
:"r"(v), "m"(*(struct {int16_t x[64];} *)v)
);
return !!nonzero;
}
else return array_non_zero_int_c( v, i_count );
}
#define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext #define x264_cabac_amvd_sum x264_cabac_amvd_sum_mmxext
static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop) static ALWAYS_INLINE uint32_t x264_cabac_amvd_sum_mmxext(int16_t *mvdleft, int16_t *mvdtop)
{ {
......
...@@ -134,8 +134,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp ) ...@@ -134,8 +134,7 @@ void x264_mb_encode_i4x4( x264_t *h, int idx, int i_qp )
if( h->mb.b_lossless ) if( h->mb.b_lossless )
{ {
h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst ); nz = h->zigzagf.sub_4x4( h->dct.luma4x4[idx], p_src, p_dst );
nz = array_non_zero( h->dct.luma4x4[idx] );
h->mb.cache.non_zero_count[x264_scan8[idx]] = nz; h->mb.cache.non_zero_count[x264_scan8[idx]] = nz;
h->mb.i_cbp_luma |= nz<<(idx>>2); h->mb.i_cbp_luma |= nz<<(idx>>2);
return; return;
...@@ -171,8 +170,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp ) ...@@ -171,8 +170,7 @@ void x264_mb_encode_i8x8( x264_t *h, int idx, int i_qp )
if( h->mb.b_lossless ) if( h->mb.b_lossless )
{ {
h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst ); nz = h->zigzagf.sub_8x8( h->dct.luma8x8[idx], p_src, p_dst );
nz = array_non_zero( h->dct.luma8x8[idx] );
STORE_8x8_NNZ(idx,nz); STORE_8x8_NNZ(idx,nz);
h->mb.i_cbp_luma |= nz<<idx; h->mb.i_cbp_luma |= nz<<idx;
return; return;
...@@ -211,10 +209,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp ) ...@@ -211,10 +209,7 @@ static void x264_mb_encode_i16x16( x264_t *h, int i_qp )
{ {
int oe = block_idx_xy_fenc[i]; int oe = block_idx_xy_fenc[i];
int od = block_idx_xy_fdec[i]; int od = block_idx_xy_fdec[i];
h->zigzagf.sub_4x4( h->dct.luma4x4[i], p_src+oe, p_dst+od ); nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[i], p_src+oe, p_dst+od, &dct_dc4x4[0][block_idx_yx_1d[i]] );
dct_dc4x4[0][block_idx_yx_1d[i]] = h->dct.luma4x4[i][0];
h->dct.luma4x4[i][0] = 0;
nz = array_non_zero( h->dct.luma4x4[i] );
h->mb.cache.non_zero_count[x264_scan8[i]] = nz; h->mb.cache.non_zero_count[x264_scan8[i]] = nz;
h->mb.i_cbp_luma |= nz; h->mb.i_cbp_luma |= nz;
} }
...@@ -349,10 +344,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp ) ...@@ -349,10 +344,7 @@ void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
{ {
int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE; int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE; int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od ); nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*4], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
h->dct.chroma_dc[ch][i] = h->dct.luma4x4[16+i+ch*4][0];
h->dct.luma4x4[16+i+ch*4][0] = 0;
nz = array_non_zero( h->dct.luma4x4[16+i+ch*4] );
h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz; h->mb.cache.non_zero_count[x264_scan8[16+i+ch*4]] = nz;
h->mb.i_cbp_chroma |= nz; h->mb.i_cbp_chroma |= nz;
} }
...@@ -664,20 +656,18 @@ void x264_macroblock_encode( x264_t *h ) ...@@ -664,20 +656,18 @@ void x264_macroblock_encode( x264_t *h )
{ {
int x = 8*(i8x8&1); int x = 8*(i8x8&1);
int y = 8*(i8x8>>1); int y = 8*(i8x8>>1);
h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8], nz = h->zigzagf.sub_8x8( h->dct.luma8x8[i8x8],
h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE, h->mb.pic.p_fenc[0]+x+y*FENC_STRIDE,
h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE ); h->mb.pic.p_fdec[0]+x+y*FDEC_STRIDE );
nz = array_non_zero( h->dct.luma8x8[i8x8] );
STORE_8x8_NNZ(i8x8,nz); STORE_8x8_NNZ(i8x8,nz);
h->mb.i_cbp_luma |= nz << i8x8; h->mb.i_cbp_luma |= nz << i8x8;
} }
else else
for( i4x4 = 0; i4x4 < 16; i4x4++ ) for( i4x4 = 0; i4x4 < 16; i4x4++ )
{ {
h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4], nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4x4],
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4], h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4x4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] ); h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4x4] );
nz = array_non_zero( h->dct.luma4x4[i4x4] );
h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz; h->mb.cache.non_zero_count[x264_scan8[i4x4]] = nz;
h->mb.i_cbp_luma |= nz << (i4x4>>2); h->mb.i_cbp_luma |= nz << (i4x4>>2);
} }
...@@ -993,8 +983,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) ...@@ -993,8 +983,7 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
int i4; int i4;
if( h->mb.b_transform_8x8 ) if( h->mb.b_transform_8x8 )
{ {
h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec ); nnz8x8 = h->zigzagf.sub_8x8( h->dct.luma8x8[i8], p_fenc, p_fdec );
nnz8x8 = array_non_zero( h->dct.luma8x8[i8] );
STORE_8x8_NNZ(i8,nnz8x8); STORE_8x8_NNZ(i8,nnz8x8);
} }
else else
...@@ -1002,21 +991,20 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 ) ...@@ -1002,21 +991,20 @@ void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
for( i4 = i8*4; i4 < i8*4+4; i4++ ) for( i4 = i8*4; i4 < i8*4+4; i4++ )
{ {
int nz; int nz;
h->zigzagf.sub_4x4( h->dct.luma4x4[i4], nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4],
h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4], h->mb.pic.p_fenc[0]+block_idx_xy_fenc[i4],
h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] ); h->mb.pic.p_fdec[0]+block_idx_xy_fdec[i4] );
nz = array_non_zero( h->dct.luma4x4[i4] );
h->mb.cache.non_zero_count[x264_scan8[i4]] = nz; h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
nnz8x8 |= nz; nnz8x8 |= nz;
} }
} }
for( ch = 0; ch < 2; ch++ ) for( ch = 0; ch < 2; ch++ )
{ {
int16_t dc;
p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE; p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE; p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
h->zigzagf.sub_4x4( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec ); nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*4], p_fenc, p_fdec, &dc );
h->dct.luma4x4[16+i8+ch*4][0] = 0; h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = nz;
h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*4]] = array_non_zero( h->dct.luma4x4[16+i8+ch*4] );
} }
} }
else else
...@@ -1121,8 +1109,8 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 ) ...@@ -1121,8 +1109,8 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 )
if( h->mb.b_lossless ) if( h->mb.b_lossless )
{ {
h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec ); nz = h->zigzagf.sub_4x4( h->dct.luma4x4[i4], p_fenc, p_fdec );
h->mb.cache.non_zero_count[x264_scan8[i4]] = array_non_zero( h->dct.luma4x4[i4] ); h->mb.cache.non_zero_count[x264_scan8[i4]] = nz;
} }
else else
{ {
......
...@@ -643,13 +643,14 @@ static int check_dct( int cpu_ref, int cpu_new ) ...@@ -643,13 +643,14 @@ static int check_dct( int cpu_ref, int cpu_new )
#define TEST_ZIGZAG_SUB( name, t1, t2, size ) \ #define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \ if( zigzag_asm.name != zigzag_ref.name ) \
{ \ { \
int nz_a, nz_c; \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\ set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \ used_asm = 1; \
memcpy( buf3, buf1, 16*FDEC_STRIDE ); \ memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
memcpy( buf4, buf1, 16*FDEC_STRIDE ); \ memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
call_c1( zigzag_c.name, t1, buf2, buf3 ); \ nz_c = call_c1( zigzag_c.name, t1, buf2, buf3 ); \
call_a1( zigzag_asm.name, t2, buf2, buf4 ); \ nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4 ); \
if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \ if( memcmp( t1, t2, size*sizeof(int16_t) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
{ \ { \
ok = 0; \ ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \ fprintf( stderr, #name " [FAILED]\n" ); \
...@@ -658,6 +659,35 @@ static int check_dct( int cpu_ref, int cpu_new ) ...@@ -658,6 +659,35 @@ static int check_dct( int cpu_ref, int cpu_new )
call_a2( zigzag_asm.name, t2, buf2, buf4 ); \ call_a2( zigzag_asm.name, t2, buf2, buf4 ); \
} }
/* Compare an optimised sub_4x4ac implementation against the C one.
 *
 * Skipped entirely when zigzag_asm.name and zigzag_ref.name are the same
 * pointer. Otherwise two iterations are run:
 *   i == 0: the top-left 4x4 of both destination buffers is filled from
 *           buf2, the same buffer passed as the source, so every difference
 *           is zero and the nz == 0 path is exercised;
 *   i == 1: the 4x4 is filled from buf1 instead.
 * After each pair of calls the test fails if level[1..15], the destination
 * buffers, the returned nz flags or the extracted DC values differ.
 * level[0] itself is not compared.
 *
 * name:   member of the zigzag function tables to test.
 * t1, t2: level output arrays for the C and asm calls respectively.
 * Uses i, j, ok, used_asm, interlace and buf1..buf4 from the enclosing
 * scope. The trailing call_c2/call_a2 pair is presumably the timing run;
 * confirm against the harness definitions. */
#define TEST_ZIGZAG_SUBAC( name, t1, t2 ) \
if( zigzag_asm.name != zigzag_ref.name ) \
{ \
int nz_a, nz_c; \
int16_t dc_a, dc_c; \
set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" );\
used_asm = 1; \
for( i = 0; i < 2; i++ ) \
{ \
memcpy( buf3, buf2, 16*FDEC_STRIDE ); \
memcpy( buf4, buf2, 16*FDEC_STRIDE ); \
for( j = 0; j < 4; j++ ) \
{ \
memcpy( buf3 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
memcpy( buf4 + j*FDEC_STRIDE, (i?buf1:buf2) + j*FENC_STRIDE, 4 ); \
} \
nz_c = call_c1( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
nz_a = call_a1( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
if( memcmp( t1+1, t2+1, 15*sizeof(int16_t) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a || dc_c != dc_a ) \
{ \
ok = 0; \
fprintf( stderr, #name " [FAILED]\n" ); \
break; \
} \
} \
call_c2( zigzag_c.name, t1, buf2, buf3, &dc_c ); \
call_a2( zigzag_asm.name, t2, buf2, buf4, &dc_a ); \
}
#define TEST_INTERLEAVE( name, t1, t2, dct, size ) \ #define TEST_INTERLEAVE( name, t1, t2, dct, size ) \
if( zigzag_asm.name != zigzag_ref.name ) \ if( zigzag_asm.name != zigzag_ref.name ) \
{ \ { \
...@@ -687,6 +717,7 @@ static int check_dct( int cpu_ref, int cpu_new ) ...@@ -687,6 +717,7 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
report( "zigzag_frame :" ); report( "zigzag_frame :" );
interlace = 1; interlace = 1;
...@@ -698,6 +729,7 @@ static int check_dct( int cpu_ref, int cpu_new ) ...@@ -698,6 +729,7 @@ static int check_dct( int cpu_ref, int cpu_new )
TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 ); TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64 );
TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 ); TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16 );
TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 ); TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16 );
TEST_ZIGZAG_SUBAC( sub_4x4ac, level1, level2 );
report( "zigzag_field :" ); report( "zigzag_field :" );
ok = 1; used_asm = 0; ok = 1; used_asm = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment