Commit f8f8d13d authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: NEON asm for 4x16 sad, satd and ssd

pixel_sad_4x16_neon:  33% faster than C
pixel_satd_4x16_neon: 5 times faster than C
pixel_ssd_4x16_neon:  4 times faster than C
parent 35b91f24
......@@ -114,6 +114,7 @@ endfunc
SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 4, 16
SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
......@@ -367,6 +368,7 @@ endfunc
SSD_FUNC 4, 4
SSD_FUNC 4, 8
SSD_FUNC 4, 16
SSD_FUNC 8, 4
SSD_FUNC 8, 8
SSD_FUNC 8, 16
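
The two hunks above only add 4x16 instantiations of the existing SAD_FUNC and SSD_FUNC assembly macros. As a hedged reference (hypothetical helper names, not the x264 source), the scalar C equivalents of what the new kernels compute are:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a 4-wide, 16-tall block. */
    static int sad_4x16_ref( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2 )
    {
        int sum = 0;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 4; x++ )
                sum += abs( pix1[y*stride1 + x] - pix2[y*stride2 + x] );
        return sum;
    }

    /* Sum of squared differences over the same block. */
    static int ssd_4x16_ref( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2 )
    {
        int sum = 0;
        for( int y = 0; y < 16; y++ )
            for( int x = 0; x < 4; x++ )
            {
                int d = pix1[y*stride1 + x] - pix2[y*stride2 + x];
                sum += d*d;
            }
        return sum;
    }

The speedups quoted in the commit message are relative to x264's C implementations of the same operations; these sketches are only meant to make the semantics explicit.
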
......@@ -895,6 +897,61 @@ function x264_satd_16x4_neon
    b       x264_satd_8x4v_8x8h_neon
endfunc
function x264_pixel_satd_4x16_neon, export=1
    mov     x4,  x30                // save return address; the bl below clobbers x30

    // rows 0-7: two 4-pixel rows per 8-byte vector (lane 0 = rows 0-3,
    // lane 1 = rows 4-7); even registers hold pix1 (x0), odd hold pix2 (x2)
    ld1     {v1.s}[0],  [x2], x3
    ld1     {v0.s}[0],  [x0], x1
    ld1     {v3.s}[0],  [x2], x3
    ld1     {v2.s}[0],  [x0], x1
    ld1     {v5.s}[0],  [x2], x3
    ld1     {v4.s}[0],  [x0], x1
    ld1     {v7.s}[0],  [x2], x3
    ld1     {v6.s}[0],  [x0], x1
    ld1     {v1.s}[1],  [x2], x3
    ld1     {v0.s}[1],  [x0], x1
    ld1     {v3.s}[1],  [x2], x3
    ld1     {v2.s}[1],  [x0], x1
    ld1     {v5.s}[1],  [x2], x3
    ld1     {v4.s}[1],  [x0], x1
    ld1     {v7.s}[1],  [x2], x3
    ld1     {v6.s}[1],  [x0], x1

    // widening differences for rows 0-7
    usubl   v16.8h, v0.8b,  v1.8b
    usubl   v17.8h, v2.8b,  v3.8b
    usubl   v18.8h, v4.8b,  v5.8b
    usubl   v19.8h, v6.8b,  v7.8b

    // rows 8-15, same interleaved layout
    ld1     {v1.s}[0],  [x2], x3
    ld1     {v0.s}[0],  [x0], x1
    ld1     {v3.s}[0],  [x2], x3
    ld1     {v2.s}[0],  [x0], x1
    ld1     {v5.s}[0],  [x2], x3
    ld1     {v4.s}[0],  [x0], x1
    ld1     {v7.s}[0],  [x2], x3
    ld1     {v6.s}[0],  [x0], x1
    ld1     {v1.s}[1],  [x2], x3
    ld1     {v0.s}[1],  [x0], x1
    ld1     {v3.s}[1],  [x2], x3
    ld1     {v2.s}[1],  [x0], x1
    ld1     {v5.s}[1],  [x2], x3
    ld1     {v4.s}[1],  [x0], x1
    ld1     {v7.s}[1],  [x2], x3
    ld1     {v6.s}[1],  [x0], x1

    // widening differences for rows 8-15
    usubl   v20.8h, v0.8b,  v1.8b
    usubl   v21.8h, v2.8b,  v3.8b
    usubl   v22.8h, v4.8b,  v5.8b
    usubl   v23.8h, v6.8b,  v7.8b

    // first add/sub stage on the rows 0-7 differences, matching the
    // input layout expected by the shared 8x4v/8x8h SATD helper
    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h

    bl      x264_satd_8x4v_8x8h_neon

    // accumulate the per-lane sums and reduce to a scalar result
    add     v30.8h, v0.8h,  v1.8h
    add     v31.8h, v2.8h,  v3.8h
    add     v0.8h,  v30.8h, v31.8h
    uaddlv  s0,  v0.8h
    mov     w0,  v0.s[0]
    ret     x4
endfunc
function x264_pixel_sa8d_8x8_neon, export=1
    mov     x4,  x30
......
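
SATD is the Hadamard-transformed SAD: x264 builds the SATD of larger blocks from the SATDs of their 4x4 sub-blocks, so a 4x16 SATD is four stacked 4x4 SATDs. The NEON routine above packs two 4-pixel rows into each 8-byte vector so the shared x264_satd_8x4v_8x8h_neon helper can process the narrow block as two 8x8 halves of differences. A hedged scalar sketch (hypothetical names; the halved-sum scaling follows the usual x264 SATD convention, so verify against the C reference in pixel.c):

    #include <stdint.h>
    #include <stdlib.h>

    /* SATD of one 4x4 block: 4x4 Hadamard transform of the difference block,
     * then the sum of absolute coefficients, halved (x264's scaling). */
    static int satd_4x4_ref( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2 )
    {
        int d[4][4], t[4][4], sum = 0;
        for( int i = 0; i < 4; i++ )
            for( int j = 0; j < 4; j++ )
                d[i][j] = pix1[i*stride1 + j] - pix2[i*stride2 + j];
        for( int i = 0; i < 4; i++ )            /* horizontal butterflies */
        {
            int s01 = d[i][0] + d[i][1], d01 = d[i][0] - d[i][1];
            int s23 = d[i][2] + d[i][3], d23 = d[i][2] - d[i][3];
            t[i][0] = s01 + s23; t[i][1] = s01 - s23;
            t[i][2] = d01 + d23; t[i][3] = d01 - d23;
        }
        for( int j = 0; j < 4; j++ )            /* vertical butterflies + abs */
        {
            int s01 = t[0][j] + t[1][j], d01 = t[0][j] - t[1][j];
            int s23 = t[2][j] + t[3][j], d23 = t[2][j] - t[3][j];
            sum += abs( s01 + s23 ) + abs( s01 - s23 )
                 + abs( d01 + d23 ) + abs( d01 - d23 );
        }
        return sum >> 1;
    }

    /* 4x16 SATD as four stacked 4x4 SATDs. */
    static int satd_4x16_ref( const uint8_t *pix1, intptr_t stride1,
                              const uint8_t *pix2, intptr_t stride2 )
    {
        int sum = 0;
        for( int y = 0; y < 16; y += 4 )
            sum += satd_4x4_ref( pix1 + y*stride1, stride1,
                                 pix2 + y*stride2, stride2 );
        return sum;
    }
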
......@@ -33,6 +33,7 @@
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x16_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;\
......
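
The pixels.h hunk adds one more line to the declaration macro so that a 4x16 prototype is emitted for each pixel function family. Assuming an 8-bit build and the usual x264 pixel-comparison signature (two pixel pointers with their strides), the SAD declaration it generates would look roughly like this (illustrative only, not copied from the header):

    #include <stdint.h>

    int x264_pixel_sad_4x16_neon( uint8_t *pix1, intptr_t i_stride1,
                                  uint8_t *pix2, intptr_t i_stride2 );
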
......@@ -1409,13 +1409,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#if ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
    {
        INIT7( sad, _neon );
        INIT8( sad, _neon );
        // AArch64 has no distinct instructions for aligned load/store
        INIT7_NAME( sad_aligned, sad, _neon );
        INIT8_NAME( sad_aligned, sad, _neon );
        INIT7( sad_x3, _neon );
        INIT7( sad_x4, _neon );
        INIT7( ssd, _neon );
        INIT7( satd, _neon );
        INIT8( ssd, _neon );
        INIT8( satd, _neon );
        INIT7( satd_x3, _neon );
        INIT7( satd_x4, _neon );
        INIT4( hadamard_ac, _neon );
......
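
In pixel.c, the sad, sad_aligned, ssd and satd tables switch from INIT7 to INIT8 so the new 4x16 kernels are registered in addition to the seven block sizes covered before; the aligned-SAD table reuses the plain SAD functions because AArch64 has no distinct aligned load/store instructions. A hedged sketch of what the extra macro step amounts to, assuming PIXEL_4x16 is the slot INIT8 covers beyond INIT7 (which the commit implies):

    /* Illustrative only; the real assignments come from the INIT8/INIT8_NAME
     * macros expanding inside x264_pixel_init(). */
    pixf->sad[PIXEL_4x16]         = x264_pixel_sad_4x16_neon;
    pixf->sad_aligned[PIXEL_4x16] = x264_pixel_sad_4x16_neon;
    pixf->ssd[PIXEL_4x16]         = x264_pixel_ssd_4x16_neon;
    pixf->satd[PIXEL_4x16]        = x264_pixel_satd_4x16_neon;
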