Compare revisions (videolan/x264)

Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2)
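(Overview: the hunks below rewrite the ads/ESA SIMD kernels in pixel-a.asm with shared SSE2/SSSE3/AVX/AVX2 code paths for both bit depths, including a widened ".nz" fallback for thresholds that do not fit in 16 bits, and drop the "!defined( __MACH__ )" guards around the CABAC, coeff_level_run and ads function pointers, enabling those assembly paths on macOS; the guards presumably dated from code that was not position-independent, which Mach-O x86-64 rejects. The checkasm ads test is extended to cover the new value ranges. Old lines appear directly above their replacements, as GitLab renders inline diffs.)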
common/bitstream.c
@@ -109,7 +109,7 @@ void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
pf->nal_escape = nal_escape_c;
#if HAVE_MMX
#if ARCH_X86_64 && !defined( __MACH__ )
#if ARCH_X86_64
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
@@ -122,7 +122,7 @@ void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
#if ARCH_X86_64 && !defined( __MACH__ )
#if ARCH_X86_64
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
common/pixel.c
@@ -888,7 +888,6 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
@@ -961,9 +960,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT7( sad, _ssse3 );
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
#endif
INIT6( satd, _ssse3 );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
@@ -1003,9 +1000,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs don't benefit from an aligned version */
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
#endif
INIT6( satd, _avx );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1043,6 +1038,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT2_NAME( sad_aligned, sad, _avx2 );
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
INIT_ADS( _avx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
@@ -1201,9 +1197,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
}
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
#endif
if( cpu&X264_CPU_SLOW_ATOM )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
@@ -1286,9 +1280,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
#endif
INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
@@ -1341,9 +1333,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
INIT2( hadamard_ac, _avx2 );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx2 );
#endif
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
common/quant.c
@@ -648,7 +648,6 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
@@ -660,7 +659,6 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
}
#endif
}
if( cpu&X264_CPU_SSE4 )
@@ -711,10 +709,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
common/x86/pixel-a.asm
@@ -5220,13 +5220,13 @@ ASD8
shl r2d, 1
%endmacro
%macro ADS_END 1 ; unroll_size
add r1, 8*%1
add r3, 8*%1
add r6, 4*%1
sub r0d, 4*%1
jg .loop
WIN64_RESTORE_XMM
%macro ADS_END 1-2 .loop ; unroll_size, loop_label
add r1, 2*%1
add r3, 2*%1
add r6, %1
sub r0d, %1
jg %2
WIN64_RESTORE_XMM_INTERNAL
%if mmsize==32
vzeroupper
%endif
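(Note on ADS_END: it now takes an optional loop label, used by the .nz paths below, and its parameter changed meaning from an unroll count to the number of mask bytes stored per iteration, so r6 advances by %1 while the uint16_t pointers r1/r3 advance by 2*%1 bytes. It also calls WIN64_RESTORE_XMM_INTERNAL, which the x86inc.asm hunk further down stubs out on non-Windows targets.)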
@@ -5243,105 +5243,220 @@ ASD8
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
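(For orientation, a rough C model of what the ads kernels compute, inferred from the signature above and not part of the diff; the asm writes a per-candidate byte mask, nonzero when the estimate beats thresh, which pixel_ads_mvs later compacts into the mvs list:

    int nmv = 0;
    for( int i = 0; i < width; i++, sums++ )
    {
        int ads = abs( enc_dc[0] - sums[0] )
                + abs( enc_dc[1] - sums[8] )
                + abs( enc_dc[2] - sums[delta] )
                + abs( enc_dc[3] - sums[delta+8] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
)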
INIT_MMX mmx2
cglobal pixel_ads4, 5,7
mova m6, [r0]
mova m4, [r0+8]
pshufw m7, m6, 0
pshufw m6, m6, q2222
pshufw m5, m4, 0
pshufw m4, m4, q2222
%if HIGH_BIT_DEPTH
%macro ADS_XMM 0
%if ARCH_X86_64
cglobal pixel_ads4, 5,7,9
%else
cglobal pixel_ads4, 5,7,8
%endif
%if mmsize >= 32
vpbroadcastd m7, [r0+ 0]
vpbroadcastd m6, [r0+ 4]
vpbroadcastd m5, [r0+ 8]
vpbroadcastd m4, [r0+12]
%else
mova m4, [r0]
pshufd m7, m4, 0
pshufd m6, m4, q1111
pshufd m5, m4, q2222
pshufd m4, m4, q3333
%endif
%if ARCH_X86_64
SPLATD m8, r6m
%endif
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+16]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
movu m2, [r1+r2]
movu m3, [r1+r2+16]
psubw m2, m5
psubw m3, m4
paddw m0, m1
ABSW m2, m2, m1
ABSW m3, m3, m1
paddw m0, m2
paddw m0, m3
pshufw m1, r6m, 0
paddusw m0, [r3]
psubusw m1, m0
packsswb m1, m1
%if cpuflag(avx)
pmovzxwd m0, [r1]
pmovzxwd m1, [r1+16]
%else
movh m0, [r1]
movh m1, [r1+16]
pxor m3, m3
punpcklwd m0, m3
punpcklwd m1, m3
%endif
psubd m0, m7
psubd m1, m6
ABSD m0, m0, m2
ABSD m1, m1, m3
%if cpuflag(avx)
pmovzxwd m2, [r1+r2]
pmovzxwd m3, [r1+r2+16]
paddd m0, m1
%else
movh m2, [r1+r2]
movh m3, [r1+r2+16]
paddd m0, m1
pxor m1, m1
punpcklwd m2, m1
punpcklwd m3, m1
%endif
psubd m2, m5
psubd m3, m4
ABSD m2, m2, m1
ABSD m3, m3, m1
paddd m0, m2
paddd m0, m3
%if cpuflag(avx)
pmovzxwd m1, [r3]
%else
movh m1, [r3]
pxor m3, m3
punpcklwd m1, m3
%endif
paddd m0, m1
%if ARCH_X86_64
psubd m1, m8, m0
%else
SPLATD m1, r6m
psubd m1, m0
%endif
packssdw m1, m1
%if mmsize == 32
vpermq m1, m1, q3120
packuswb m1, m1
movq [r6], xm1
%else
packuswb m1, m1
movd [r6], m1
ADS_END 1
%endif
ADS_END mmsize/4
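(High-bit-depth note: the rewritten kernels widen each uint16_t input to 32 bits, with pmovzxwd on AVX or movh plus punpcklwd against zero otherwise, and accumulate the absolute differences and mv cost as dwords, since sums of BIT_DEPTH+6-bit values can overflow 16-bit saturating arithmetic.)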
cglobal pixel_ads2, 5,7
mova m6, [r0]
pshufw m5, r6m, 0
pshufw m7, m6, 0
pshufw m6, m6, q2222
cglobal pixel_ads2, 5,7,8
%if mmsize >= 32
vpbroadcastd m7, [r0+0]
vpbroadcastd m6, [r0+4]
vpbroadcastd m5, r6m
%else
movq m6, [r0]
movd m5, r6m
pshufd m7, m6, 0
pshufd m6, m6, q1111
pshufd m5, m5, 0
%endif
pxor m4, m4
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+r2]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
paddw m0, m1
paddusw m0, [r3]
mova m4, m5
psubusw m4, m0
packsswb m4, m4
movd [r6], m4
ADS_END 1
cglobal pixel_ads1, 5,7
pshufw m7, [r0], 0
pshufw m6, r6m, 0
%if cpuflag(avx)
pmovzxwd m0, [r1]
pmovzxwd m1, [r1+r2]
pmovzxwd m2, [r3]
%else
movh m0, [r1]
movh m1, [r1+r2]
movh m2, [r3]
punpcklwd m0, m4
punpcklwd m1, m4
punpcklwd m2, m4
%endif
psubd m0, m7
psubd m1, m6
ABSD m0, m0, m3
ABSD m1, m1, m3
paddd m0, m1
paddd m0, m2
psubd m1, m5, m0
packssdw m1, m1
%if mmsize == 32
vpermq m1, m1, q3120
packuswb m1, m1
movq [r6], xm1
%else
packuswb m1, m1
movd [r6], m1
%endif
ADS_END mmsize/4
cglobal pixel_ads1, 5,7,8
%if mmsize >= 32
vpbroadcastd m7, [r0]
vpbroadcastd m6, r6m
%else
movd m7, [r0]
movd m6, r6m
pshufd m7, m7, 0
pshufd m6, m6, 0
%endif
pxor m5, m5
ADS_START
.loop:
movu m0, [r1]
movu m1, [r1+8]
psubw m0, m7
psubw m1, m7
ABSW m0, m0, m2
ABSW m1, m1, m3
paddusw m0, [r3]
paddusw m1, [r3+8]
mova m4, m6
mova m5, m6
psubusw m4, m0
psubusw m5, m1
packsswb m4, m5
mova [r6], m4
ADS_END 2
movu m1, [r1]
movu m3, [r3]
punpcklwd m0, m1, m5
punpckhwd m1, m5
punpcklwd m2, m3, m5
punpckhwd m3, m5
psubd m0, m7
psubd m1, m7
ABSD m0, m0, m4
ABSD m1, m1, m4
paddd m0, m2
paddd m1, m3
psubd m2, m6, m0
psubd m3, m6, m1
packssdw m2, m3
packuswb m2, m2
%if mmsize == 32
vpermq m2, m2, q3120
mova [r6], xm2
%else
movq [r6], m2
%endif
ADS_END mmsize/2
%endmacro
INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
ADS_XMM
INIT_XMM avx
ADS_XMM
INIT_YMM avx2
ADS_XMM
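(The same ADS_XMM body is assembled once per instruction set: each INIT_XMM/INIT_YMM invocation resets mmsize and the cpuflag() state, so the conditional branches above pick the right instructions for the sse2, ssse3, avx and avx2 variants at assembly time.)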
%else ; !HIGH_BIT_DEPTH
%macro ADS_XMM 0
%if mmsize==32
%if ARCH_X86_64 && mmsize == 16
cglobal pixel_ads4, 5,7,12
%elif ARCH_X86_64 && mmsize != 8
cglobal pixel_ads4, 5,7,9
%else
cglobal pixel_ads4, 5,7,8
%endif
test dword r6m, 0xffff0000
%if mmsize >= 32
vpbroadcastw m7, [r0+ 0]
vpbroadcastw m6, [r0+ 4]
vpbroadcastw m5, [r0+ 8]
vpbroadcastw m4, [r0+12]
%else
cglobal pixel_ads4, 5,7,12
mova m4, [r0]
pshuflw m7, m4, q0000
pshuflw m6, m4, q2222
pshufhw m5, m4, q0000
pshufhw m4, m4, q2222
%elif mmsize == 16
mova m4, [r0]
pshuflw m7, m4, 0
pshuflw m6, m4, q2222
pshufhw m5, m4, 0
pshufhw m4, m4, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
punpckhqdq m5, m5
punpckhqdq m4, m4
%else
mova m6, [r0]
mova m4, [r0+8]
pshufw m7, m6, 0
pshufw m6, m6, q2222
pshufw m5, m4, 0
pshufw m4, m4, q2222
%endif
%if ARCH_X86_64 && mmsize == 16
movd m8, r6m
SPLATW m8, m8
jnz .nz
ADS_START
%if ARCH_X86_64 && mmsize == 16
movu m10, [r1]
movu m11, [r1+r2]
SPLATW m8, r6m
.loop:
psubw m0, m10, m7
movu m10, [r1+16]
@@ -5360,7 +5475,9 @@ cglobal pixel_ads4, 5,7,12
paddusw m0, m9
psubusw m1, m8, m0
%else
ADS_START
%if ARCH_X86_64 && mmsize != 8
SPLATW m8, r6m
%endif
.loop:
movu m0, [r1]
movu m1, [r1+16]
@@ -5378,81 +5495,196 @@ cglobal pixel_ads4, 5,7,12
paddw m0, m2
paddw m0, m3
movu m2, [r3]
%if mmsize==32
vpbroadcastw m1, r6m
%if ARCH_X86_64 && mmsize != 8
mova m1, m8
%else
movd m1, r6m
pshuflw m1, m1, 0
punpcklqdq m1, m1
SPLATW m1, r6m
%endif
paddusw m0, m2
psubusw m1, m0
%endif ; ARCH
packsswb m1, m1
%if mmsize==32
%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
ADS_END mmsize/8
ADS_END mmsize/2
.nz:
ADS_START
%if ARCH_X86_64 && mmsize == 16
movu m10, [r1]
movu m11, [r1+r2]
SPLATD m8, r6m
.loop_nz:
psubw m0, m10, m7
movu m10, [r1+16]
psubw m1, m10, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
psubw m2, m11, m5
movu m11, [r1+r2+16]
paddw m0, m1
psubw m3, m11, m4
movu m9, [r3]
ABSW m2, m2, m1
ABSW m3, m3, m1
paddw m0, m2
paddw m0, m3
pxor m3, m3
mova m2, m0
mova m1, m9
punpcklwd m0, m3
punpcklwd m9, m3
punpckhwd m2, m3
punpckhwd m1, m3
paddd m0, m9
paddd m2, m1
psubd m1, m8, m0
psubd m3, m8, m2
packssdw m1, m3
packuswb m1, m1
%else
%if ARCH_X86_64 && mmsize != 8
SPLATD m8, r6m
%endif
.loop_nz:
movu m0, [r1]
movu m1, [r1+16]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m2
ABSW m1, m1, m3
movu m2, [r1+r2]
movu m3, [r1+r2+16]
psubw m2, m5
psubw m3, m4
paddw m0, m1
ABSW m2, m2, m1
ABSW m3, m3, m1
paddw m0, m2
paddw m0, m3
%if mmsize == 32
movu m1, [r3]
%else
movh m1, [r3]
%endif
pxor m3, m3
mova m2, m0
punpcklwd m0, m3
punpcklwd m1, m3
punpckhwd m2, m3
paddd m0, m1
%if mmsize == 32
movu m1, [r3]
punpckhwd m1, m3
%else
movh m1, [r3+mmsize/2]
punpcklwd m1, m3
%endif
paddd m2, m1
%if ARCH_X86_64 && mmsize != 8
mova m1, m8
%else
SPLATD m1, r6m
%endif
mova m3, m1
psubd m1, m0
psubd m3, m2
packssdw m1, m3
packuswb m1, m1
%endif ; ARCH
%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
ADS_END mmsize/2, .loop_nz
cglobal pixel_ads2, 5,7,8
%if mmsize==32
test dword r6m, 0xffff0000
%if mmsize >= 32
vpbroadcastw m7, [r0+0]
vpbroadcastw m6, [r0+4]
vpbroadcastw m5, r6m
%else
movq m6, [r0]
movd m5, r6m
pshuflw m7, m6, 0
pshuflw m6, m6, q2222
pshuflw m5, m5, 0
%elif mmsize == 16
movq m6, [r0]
pshuflw m7, m6, 0
pshuflw m6, m6, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
punpcklqdq m5, m5
%else
mova m6, [r0]
pshufw m7, m6, 0
pshufw m6, m6, q2222
%endif
jnz .nz
ADS_START
SPLATW m5, r6m
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r3]
psubw m0, m7
psubw m1, m6
movu m4, [r3]
ABSW m0, m0, m2
ABSW m1, m1, m3
ABSW m0, m0, m3
ABSW m1, m1, m4
paddw m0, m1
paddusw m0, m4
paddusw m0, m2
psubusw m1, m5, m0
packsswb m1, m1
%if mmsize==32
%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
ADS_END mmsize/8
cglobal pixel_ads1, 5,7,8
%if mmsize==32
vpbroadcastw m7, [r0]
vpbroadcastw m6, r6m
ADS_END mmsize/2
.nz:
ADS_START
SPLATD m5, r6m
pxor m4, m4
.loop_nz:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r3]
psubw m0, m7
psubw m1, m6
ABSW m0, m0, m3
ABSW m1, m1, m3
paddw m0, m1
punpckhwd m3, m2, m4
punpckhwd m1, m0, m4
punpcklwd m2, m4
punpcklwd m0, m4
paddd m1, m3
paddd m0, m2
psubd m3, m5, m1
psubd m2, m5, m0
packssdw m2, m3
packuswb m2, m2
%if mmsize == 32
vpermq m2, m2, q3120
mova [r6], xm2
%else
movd m7, [r0]
movd m6, r6m
pshuflw m7, m7, 0
pshuflw m6, m6, 0
punpcklqdq m7, m7
punpcklqdq m6, m6
movh [r6], m2
%endif
ADS_END mmsize/2, .loop_nz
cglobal pixel_ads1, 5,7,8
test dword r6m, 0xffff0000
SPLATW m7, [r0]
jnz .nz
ADS_START
SPLATW m6, r6m
.loop:
movu m0, [r1]
movu m1, [r1+mmsize]
psubw m0, m7
psubw m1, m7
movu m2, [r3]
movu m3, [r3+mmsize]
psubw m0, m7
psubw m1, m7
ABSW m0, m0, m4
ABSW m1, m1, m5
paddusw m0, m2
@@ -5460,13 +5692,52 @@ cglobal pixel_ads1, 5,7,8
psubusw m4, m6, m0
psubusw m5, m6, m1
packsswb m4, m5
%if mmsize==32
%if mmsize == 32
vpermq m4, m4, q3120
%endif
mova [r6], m4
ADS_END mmsize/4
ADS_END mmsize
.nz:
ADS_START
SPLATD m6, r6m
pxor m5, m5
.loop_nz:
movu m0, [r1]
movu m1, [r1+mmsize]
movu m2, [r3]
psubw m0, m7
psubw m1, m7
ABSW m0, m0, m3
ABSW m1, m1, m4
punpckhwd m3, m2, m5
punpckhwd m4, m0, m5
punpcklwd m2, m5
punpcklwd m0, m5
paddd m4, m3
paddd m0, m2
psubd m3, m6, m4
movu m4, [r3+mmsize]
psubd m2, m6, m0
packssdw m2, m3
punpckhwd m0, m1, m5
punpckhwd m3, m4, m5
punpcklwd m1, m5
punpcklwd m4, m5
paddd m0, m3
paddd m1, m4
psubd m3, m6, m0
psubd m4, m6, m1
packssdw m4, m3
packuswb m2, m4
%if mmsize == 32
vpermq m2, m2, q3120
%endif
mova [r6], m2
ADS_END mmsize, .loop_nz
%endmacro
INIT_MMX mmx2
ADS_XMM
INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
@@ -5476,6 +5747,8 @@ ADS_XMM
INIT_YMM avx2
ADS_XMM
%endif ; HIGH_BIT_DEPTH
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
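(The reference comment is truncated here by the hunk boundary; the gist, as a hedged C sketch: pixel_ads_mvs scans the byte mask emitted by the ads kernels and compacts the surviving candidates into mv indices:

    int nmv = 0;
    for( int i = 0; i < width; i++ )
        if( masks[i] )
            mvs[nmv++] = i;
    return nmv;
)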
@@ -5521,7 +5794,7 @@ ALIGN 16
test r2, r2
%else
mov r3, r2
add r3d, [r6+r1+4]
or r3d, [r6+r1+4]
%endif
jz .loopi0
xor r3d, r3d
common/x86/x86inc.asm
@@ -664,6 +664,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 0
%endmacro
%macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
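(On non-Windows targets the WIN64_* macros are empty; the hunk adds a matching no-op WIN64_RESTORE_XMM_INTERNAL so the rewritten ADS_END can call it unconditionally on every platform.)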
common/x86/x86util.asm
@@ -286,18 +286,51 @@
%if cpuflag(avx2) && %3 == 0
vpbroadcastw %1, %2
%else
PSHUFLW %1, %2, (%3)*q1111
%if mmsize == 16
%define %%s %2
%ifid %2
%define %%s xmm%2
%elif %3 == 0
movd xmm%1, %2
%define %%s xmm%1
%endif
PSHUFLW xmm%1, %%s, (%3)*q1111
%if mmsize >= 32
vpbroadcastq %1, xmm%1
%elif mmsize == 16
punpcklqdq %1, %1
%endif
%endif
%endmacro
%imacro SPLATD 2-3 0
%if mmsize == 16
pshufd %1, %2, (%3)*q1111
%if cpuflag(avx2) && %3 == 0
vpbroadcastd %1, %2
%else
%define %%s %2
%ifid %2
%define %%s xmm%2
%elif %3 == 0
movd xmm%1, %2
%define %%s xmm%1
%endif
%if mmsize == 8 && %3 == 0
%ifidn %1, %%s
punpckldq %1, %1
%else
pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
pshufw %1, %%s, q1010
%endif
%elif mmsize == 8 && %3 == 1
%ifidn %1, %%s
punpckhdq %1, %1
%else
pshufw %1, %%s, q3232
%endif
%else
pshufd xmm%1, %%s, (%3)*q1111
%endif
%if mmsize >= 32
vpbroadcastq %1, xmm%1
%endif
%endif
%endmacro
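(A rough C-intrinsics analogue of the SSE2 paths of SPLATW/SPLATD, to illustrate what the macros emit; function names and the load-from-memory case are illustrative assumptions:

    #include <emmintrin.h>
    #include <stdint.h>

    static __m128i splatw_sse2( const uint16_t *p )
    {
        __m128i v = _mm_cvtsi32_si128( *p );      /* movd            */
        v = _mm_shufflelo_epi16( v, 0 );          /* PSHUFLW x, x, 0 */
        return _mm_unpacklo_epi64( v, v );        /* punpcklqdq x, x */
    }

    static __m128i splatd_sse2( const uint32_t *p )
    {
        __m128i v = _mm_cvtsi32_si128( (int)*p ); /* movd            */
        return _mm_shuffle_epi32( v, 0 );         /* pshufd x, x, 0  */
    }

On AVX2 a single vpbroadcastw/vpbroadcastd replaces the sequence, and the mmsize == 8 branch of SPLATD falls back to punpckldq or pshufw.)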
encoder/cabac.c
@@ -754,7 +754,7 @@ void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat
static ALWAYS_INLINE void cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
@@ -868,7 +868,7 @@ void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_
static ALWAYS_INLINE void cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l );
@@ -876,7 +876,7 @@ static ALWAYS_INLINE void cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb,
}
static ALWAYS_INLINE void cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l );
encoder/rdo.c
@@ -704,7 +704,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
return !!dct[0];
}
#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ )
#if HAVE_MMX && ARCH_X86_64
uint64_t level_state0;
memcpy( &level_state0, cabac_state, sizeof(uint64_t) );
uint16_t level_state1;
tools/checkasm.c
@@ -60,6 +60,9 @@ static int quiet = 0;
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
// RAND_MAX is guaranteed to be at least 32767, to get 30 bits of random data, we'll call rand() twice
#define rand30() (((rand() & 0x7fff) << 15) + (rand() & 0x7fff))
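(The two 15-bit halves occupy disjoint bit ranges, so the '+' behaves like '|'. An equivalent function form, for illustration only:

    #include <stdlib.h>

    static inline int rand30( void )
    {
        /* bits 29..15 from the first draw, bits 14..0 from the second */
        return ((rand() & 0x7fff) << 15) | (rand() & 0x7fff);
    }
)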
typedef struct
{
void *pointer; // just for detecting duplicates
@@ -799,7 +802,7 @@ static int check_pixel( uint32_t cpu_ref, uint32_t cpu_new )
ok = 1; used_asm = 0;
for( int i = 0; i < 32; i++ )
cost_mv[i] = i*10;
cost_mv[i] = rand30() & 0xffff;
for( int i = 0; i < 100 && ok; i++ )
if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
@@ -808,18 +811,36 @@ static int check_pixel( uint32_t cpu_ref, uint32_t cpu_new )
ALIGNED_16( int16_t mvs_a[48] );
ALIGNED_16( int16_t mvs_c[48] );
int mvn_a, mvn_c;
int thresh = rand() & 0x3fff;
set_func_name( "esa_ads" );
for( int j = 0; j < 72; j++ )
sums[j] = rand() & 0x3fff;
for( int j = 0; j < 4; j++ )
dc[j] = rand() & 0x3fff;
int thresh = (rand() % 257) * PIXEL_MAX + (rand30() & 0xffff);
set_func_name( "esa_ads_%s", pixel_names[i&3] );
if( i < 40 )
{
for( int j = 0; j < 72; j++ )
sums[j] = (rand() % 9) * 8 * PIXEL_MAX;
for( int j = 0; j < 4; j++ )
dc[j] = (rand() % 9) * 8 * PIXEL_MAX;
}
else
{
#if BIT_DEPTH + 6 > 15
for( int j = 0; j < 72; j++ )
sums[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1);
for( int j = 0; j < 4; j++ )
dc[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1);
#else
for( int j = 0; j < 72; j++ )
sums[j] = rand() & ((1 << (BIT_DEPTH + 6))-1);
for( int j = 0; j < 4; j++ )
dc[j] = rand() & ((1 << (BIT_DEPTH + 6))-1);
#endif
}
used_asm = 1;
mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
{
ok = 0;
printf( "thresh: %d\n", thresh );
printf( "c%d: ", i&3 );
for( int j = 0; j < mvn_c; j++ )
printf( "%d ", mvs_c[j] );
@@ -1721,7 +1742,7 @@ static int check_mc( uint32_t cpu_ref, uint32_t cpu_new )
x264_emms();
for( int i = 0; i < 10; i++ )
{
float fps_factor = (rand()&65535) / 65535.0f;
float fps_factor = (rand30()&65535) / 65535.0f;
set_func_name( "mbtree_propagate_cost" );
int16_t *dsta = (int16_t*)buf3;
int16_t *dstc = dsta+400;