Commit 75d92705 authored by Oskar Arvidsson, committed by Fiona Glaser

x86: combined SA8D/SATD dsp function

The speedup is most apparent for 8-bit (~30%), but 10-bit sees some
improvement as well (~12%).
64-bit only for now.
parent 790c648d
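
For context, the new sa8d_satd function pointer returns both metrics packed
into a single uint64_t: the 16x16 SA8D cost in the low 32 bits and the 16x16
SATD cost in the high 32 bits, which is how the x264_mb_analyse_transform()
and check_pixel() hunks below unpack it. A minimal C sketch of the equivalent
computation using the existing per-metric pointers (the helper name
sa8d_satd_16x16_ref is illustrative only; pixel, x264_pixel_function_t and
PIXEL_16x16 come from x264's common/pixel.h):

    /* Sketch: what the merged call computes, expressed with the two
     * separate metric pointers. SATD lands in the high 32 bits,
     * SA8D in the low 32 bits. */
    static uint64_t sa8d_satd_16x16_ref( x264_pixel_function_t *pixf,
                                         pixel *pix1, intptr_t stride1,
                                         pixel *pix2, intptr_t stride2 )
    {
        uint32_t sa8d = pixf->sa8d[PIXEL_16x16]( pix1, stride1, pix2, stride2 );
        uint32_t satd = pixf->satd[PIXEL_16x16]( pix1, stride1, pix2, stride2 );
        return (uint64_t)satd << 32 | sa8d;
    }

Callers then split the result as x264_mb_analyse_transform() does:
i_cost8 = (uint32_t)cost (SA8D) and i_cost4 = (uint32_t)(cost >> 32) (SATD).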
@@ -370,7 +370,6 @@ static int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, pixel *pix2, int
return (sum+2)>>2;
}
static NOINLINE uint64_t pixel_hadamard_ac( pixel *pix, intptr_t stride )
{
sum2_t tmp[32];
@@ -881,6 +880,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
@@ -938,6 +938,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
@@ -953,6 +956,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
#endif
}
if( cpu&X264_CPU_AVX )
{
@@ -971,6 +977,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
#endif
}
if( cpu&X264_CPU_XOP )
{
@@ -1056,6 +1065,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse2;
#endif
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
@@ -1153,6 +1163,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
pixf->asd8 = x264_pixel_asd8_ssse3;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_ssse3;
#endif
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
@@ -1183,6 +1196,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_sse4;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_sse4;
#endif
}
if( cpu&X264_CPU_AVX )
@@ -1211,6 +1227,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx;
#endif
}
if( cpu&X264_CPU_XOP )
@@ -1232,6 +1251,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
}
#endif //HAVE_MMX
@@ -90,6 +90,7 @@ typedef struct
x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
int (*vsad)( pixel *, intptr_t, int );
int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t (*var[4])( pixel *pix, intptr_t stride );
int (*var2[4])( pixel *pix1, intptr_t stride1,
@@ -136,6 +136,7 @@ cextern pw_ppppmmmm
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
cextern pw_pmmpzzzz
cextern pd_1
cextern hsub_mul
;=============================================================================
@@ -1719,6 +1720,157 @@ cglobal pixel_sa8d_16x16, 4,7
%endif ; !ARCH_X86_64
%endmacro ; SA8D
;=============================================================================
; SA8D_SATD
;=============================================================================
; %1-%4: sa8d output regs (m0,m1,m2,m3,m4,m5,m8,m9)
; m10: satd result
; m6, m11-15: tmp regs
%macro SA8D_SATD_8x4 4
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
LOAD_SUMSUB_8x4P_SSSE3 %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
HADAMARD4_V %1, %2, %3, %4, 6
pabsw m12, m%1 ; doing the abs first is a slight advantage
pabsw m14, m%3
pabsw m13, m%2
pabsw m15, m%4
HADAMARD 1, max, 12, 14, 6, 11
paddw m10, m12
HADAMARD 1, max, 13, 15, 6, 11
paddw m10, m13
%else
LOAD_DIFF_8x4P %1, %2, %3, %4, 6, 11, 7, r0, r2, 1
HADAMARD 0, sumsub, %1, %2, 6
HADAMARD 0, sumsub, %3, %4, 6
SBUTTERFLY wd, %1, %2, 6
SBUTTERFLY wd, %3, %4, 6
HADAMARD2_2D %1, %3, %2, %4, 6, dq
mova m12, m%1
mova m13, m%2
mova m14, m%3
mova m15, m%4
HADAMARD 0, sumsub, %1, %2, 6
HADAMARD 0, sumsub, %3, %4, 6
SBUTTERFLY qdq, 12, 13, 6
HADAMARD 0, amax, 12, 13, 6
SBUTTERFLY qdq, 14, 15, 6
paddw m10, m12
HADAMARD 0, amax, 14, 15, 6
paddw m10, m14
%endif
%endmacro ; SA8D_SATD_8x4
; %1: add spilled regs?
; %2: spill regs?
%macro SA8D_SATD_ACCUM 2
%if HIGH_BIT_DEPTH
pmaddwd m10, [pw_1]
HADDUWD m0, m1
%if %1
paddd m10, temp1
paddd m0, temp0
%endif
%if %2
mova temp1, m10
pxor m10, m10
%endif
%elif %1
paddw m0, temp0
%endif
%if %2
mova temp0, m0
%endif
%endmacro
%macro SA8D_SATD 0
cglobal pixel_sa8d_satd_8x8_internal
SA8D_SATD_8x4 0, 1, 2, 3
SA8D_SATD_8x4 4, 5, 8, 9
; complete sa8d
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
SUMSUB_BADC w, 0, 4, 1, 5, 12
HADAMARD 2, sumsub, 0, 4, 12, 11
HADAMARD 2, sumsub, 1, 5, 12, 11
SUMSUB_BADC w, 2, 8, 3, 9, 12
HADAMARD 2, sumsub, 2, 8, 12, 11
HADAMARD 2, sumsub, 3, 9, 12, 11
HADAMARD 1, amax, 0, 4, 12, 11
HADAMARD 1, amax, 1, 5, 12, 4
HADAMARD 1, amax, 2, 8, 12, 4
HADAMARD 1, amax, 3, 9, 12, 4
%else ; sse2
HADAMARD2_2D 0, 4, 2, 8, 6, qdq, amax
HADAMARD2_2D 1, 5, 3, 9, 6, qdq, amax
%endif
; create sa8d sub results
paddw m1, m2
paddw m0, m3
paddw m0, m1
SAVE_MM_PERMUTATION
ret
;-------------------------------------------------------------------------------
; uint64_t pixel_sa8d_satd_16x16( pixel *, intptr_t, pixel *, intptr_t )
;-------------------------------------------------------------------------------
cglobal pixel_sa8d_satd_16x16, 4,8,16,SIZEOF_PIXEL*mmsize
%define temp0 [rsp+0*mmsize]
%define temp1 [rsp+1*mmsize]
FIX_STRIDES r1, r3
%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
mova m7, [hmul_8p]
%endif
lea r4, [3*r1]
lea r5, [3*r3]
lea r6, [r2+8*SIZEOF_PIXEL]
lea r7, [r0+8*SIZEOF_PIXEL]
pxor m10, m10
call pixel_sa8d_satd_8x8_internal
SA8D_SATD_ACCUM 0, 1
call pixel_sa8d_satd_8x8_internal
SA8D_SATD_ACCUM 1, 1
mov r0, r7
mov r2, r6
call pixel_sa8d_satd_8x8_internal
SA8D_SATD_ACCUM 1, 1
call pixel_sa8d_satd_8x8_internal
SA8D_SATD_ACCUM 1, 0
; xop already has fast horizontal sums
%if cpuflag(sse4) && notcpuflag(xop) && HIGH_BIT_DEPTH==0
pmaddwd m10, [pw_1]
HADDUWD m0, m1
phaddd m0, m10 ; sa8d1 sa8d2 satd1 satd2
pshufd m1, m0, q2301 ; sa8d2 sa8d1 satd2 satd1
paddd m0, m1 ; sa8d sa8d satd satd
movd r0d, m0
pextrd eax, m0, 2
%else
%if HIGH_BIT_DEPTH
HADDD m0, m1
HADDD m10, m2
%else
HADDUW m0, m1
HADDW m10, m2
%endif
movd r0d, m0
movd eax, m10
%endif
add r0d, 1
shl rax, 32
shr r0d, 1
or rax, r0
RET
%endmacro ; SA8D_SATD
;=============================================================================
; INTRA SATD
;=============================================================================
@@ -3817,6 +3969,9 @@ SA8D
INIT_XMM sse2
SA8D
SATDS_SSE2
%if ARCH_X86_64
SA8D_SATD
%endif
%if HIGH_BIT_DEPTH == 0
INTRA_SA8D_SSE2
%endif
@@ -3836,6 +3991,9 @@ INIT_XMM ssse3
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
%if ARCH_X86_64
SA8D_SATD
%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
@@ -3854,6 +4012,9 @@ INIT_XMM sse4
SATDS_SSE2
SA8D
HADAMARD_AC_SSE2
%if ARCH_X86_64
SA8D_SATD
%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
@@ -3862,6 +4023,9 @@ INTRA8_X9
INIT_XMM avx
SATDS_SSE2
SA8D
%if ARCH_X86_64
SA8D_SATD
%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
INTRA8_X9
@@ -3872,6 +4036,9 @@ HADAMARD_AC_SSE2
INIT_XMM xop
SATDS_SSE2
SA8D
%if ARCH_X86_64
SA8D_SATD
%endif
%if HIGH_BIT_DEPTH == 0
INTRA_X9
; no xop INTRA8_X9. it's slower than avx on bulldozer. dunno why.
@@ -162,6 +162,11 @@ int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height );
int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x264_pixel_sa8d_satd_16x16_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
@@ -294,16 +294,24 @@
%endif
%endmacro
%macro HADDUWD 2
%if cpuflag(xop)
vphadduwd %1, %1
%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
%endif
%endmacro
%macro HADDUW 2
%if cpuflag(xop) && mmsize == 16
vphadduwq %1, %1
movhlps %2, %1
paddd %1, %2
%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
HADDUWD %1, %2
HADDD %1, %2
%endif
%endmacro
@@ -2836,12 +2836,28 @@ static inline void x264_mb_analyse_transform( x264_t *h )
int plane_count = CHROMA444 && h->mb.b_chroma_me ? 3 : 1;
int i_cost8 = 0, i_cost4 = 0;
for( int p = 0; p < plane_count; p++ )
/* Not all platforms have a merged SATD function */
if( h->pixf.sa8d_satd[PIXEL_16x16] )
{
i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
h->mb.pic.p_fdec[p], FDEC_STRIDE );
i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
h->mb.pic.p_fdec[p], FDEC_STRIDE );
uint64_t cost = 0;
for( int p = 0; p < plane_count; p++ )
{
cost += h->pixf.sa8d_satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
h->mb.pic.p_fdec[p], FDEC_STRIDE );
}
i_cost8 = (uint32_t)cost;
i_cost4 = (uint32_t)(cost >> 32);
}
else
{
for( int p = 0; p < plane_count; p++ )
{
i_cost8 += h->pixf.sa8d[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
h->mb.pic.p_fdec[p], FDEC_STRIDE );
i_cost4 += h->pixf.satd[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE,
h->mb.pic.p_fdec[p], FDEC_STRIDE );
}
}
h->mb.b_transform_8x8 = i_cost8 < i_cost4;
@@ -338,6 +338,43 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL( satd, 0 );
TEST_PIXEL( sa8d, 1 );
ok = 1, used_asm = 0;
if( pixel_asm.sa8d_satd[PIXEL_16x16] != pixel_ref.sa8d_satd[PIXEL_16x16] )
{
set_func_name( "sa8d_satd_%s", pixel_names[PIXEL_16x16] );
used_asm = 1;
for( int j = 0; j < 64; j++ )
{
uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf1, 16, pbuf2, 64 );
uint64_t res_a = call_a( pixel_asm.sa8d_satd[PIXEL_16x16], pbuf1, (intptr_t)16, pbuf2, (intptr_t)64 );
uint32_t cost8_a = res_a;
uint32_t cost4_a = res_a >> 32;
if( cost8_a != cost8_c || cost4_a != cost4_c )
{
ok = 0;
fprintf( stderr, "sa8d_satd [%d]: (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
cost8_c, cost4_c, cost8_a, cost4_a );
break;
}
}
for( int j = 0; j < 0x1000 && ok; j += 256 )
{
uint32_t cost8_c = pixel_c.sa8d[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
uint32_t cost4_c = pixel_c.satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
uint64_t res_a = pixel_asm.sa8d_satd[PIXEL_16x16]( pbuf3+j, 16, pbuf4+j, 16 );
uint32_t cost8_a = res_a;
uint32_t cost4_a = res_a >> 32;
if( cost8_a != cost8_c || cost4_a != cost4_c )
{
ok = 0;
fprintf( stderr, "sa8d_satd [%d]: overflow (%d,%d) != (%d,%d) [FAILED]\n", PIXEL_16x16,
cost8_c, cost4_c, cost8_a, cost4_a );
}
}
}
report( "pixel sa8d_satd :" );
#define TEST_PIXEL_X( N ) \
ok = 1; used_asm = 0; \
for( int i = 0; i < 7; i++ ) \