Provide arm64 neon optimized functions for sad for high bit depth

Hi, I'm sending a patch below that adds high bit depth support to the sad functions. The benchmark results are included in the commit message. I look forward to your comments and review, and hope it can be merged into the main branch afterwards.
Thanks, Hubert
From fbe5c3e1ac7650d9698abd13e9d3c55a7ccc9f97 Mon Sep 17 00:00:00 2001
From: Hubert Mazur <hum@semihalf.com>
Date: Thu, 6 Oct 2022 11:37:53 +0000
Subject: [PATCH] aarch64: pixel: add 10bits sad functions

Provide routines for sad functions for high bit depth, i.e. 10 bits.
Benchmarks run on AWS Graviton 2 instances.

sad_4x4_c: 586
sad_4x4_neon: 275
sad_4x8_c: 1194
sad_4x8_neon: 362
sad_4x16_c: 2079
sad_4x16_neon: 552
sad_8x4_c: 955
sad_8x4_neon: 209
sad_8x8_c: 1719
sad_8x8_neon: 311
sad_8x16_c: 3533
sad_8x16_neon: 496
sad_16x8_c: 3084
sad_16x8_neon: 629
sad_16x16_c: 6055
sad_16x16_neon: 1111

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
 common/aarch64/pixel-a.S | 105 +++++++++++++++++++++++++++++++++++++++
 common/aarch64/pixel.h   |   6 +--
 common/pixel.c           |   7 +++
 3 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index e3bc12d4..20ee8024 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -40,6 +40,7 @@ const mask_ac_4_8
 .short 0, -1, -1, -1, -1, -1, -1, -1
 endconst
 
+#if BIT_DEPTH == 8
 .macro SAD_START_4
     ld1        {v1.s}[0], [x2], x3
     ld1        {v0.s}[0], [x0], x1
@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
 endfunc
 .endm
 
+#else /* BIT_DEPTH == 8 */
+
+.macro SAD_START_4                         // first two rows of a 4-wide block; initialises v16 and v18
+    lsl         x1,  x1,  #1               // double the x0 stride (samples are 16 bits wide); canonical LSL instead of shifted MOV
+    lsl         x3,  x3,  #1               // double the x2 stride likewise
+    ld1        {v1.d}[0], [x2], x3         // x2 block, row 0: four 16-bit samples into the low half of v1
+    ld1        {v0.d}[0], [x0], x1         // x0 block, row 0
+    ld1        {v1.d}[1], [x2], x3         // x2 block, row 1 into the high half of v1
+    ld1        {v0.d}[1], [x0], x1         // x0 block, row 1
+    uabdl       v16.4s,  v0.4h,  v1.4h     // row 0 absolute differences, widened to 32 bits
+    uabdl2      v18.4s,  v0.8h,  v1.8h     // row 1 absolute differences, widened to 32 bits
+.endm
+
+.macro SAD_4                               // two more rows of a 4-wide block, accumulated into v16 and v18
+    ld1        {v1.d}[0], [x2], x3         // x2 block, next row into the low half of v1 (x3 was already doubled by SAD_START_4)
+    ld1        {v0.d}[0], [x0], x1         // x0 block, next row into the low half of v0
+    ld1        {v1.d}[1], [x2], x3         // x2 block, following row into the high half of v1
+    ld1        {v0.d}[1], [x0], x1         // x0 block, following row into the high half of v0
+    uabal       v16.4s,  v0.4h,  v1.4h     // v16 += absolute differences of the low halves, widened to 32 bits
+    uabal2      v18.4s,  v0.8h,  v1.8h     // v18 += absolute differences of the high halves, widened to 32 bits
+.endm
+
+.macro SAD_START_8                         // first two rows of an 8-wide block; initialises v16-v19
+    lsl         x1,  x1,  #1               // double the x0 stride (samples are 16 bits wide); canonical LSL instead of shifted MOV
+    lsl         x3,  x3,  #1               // double the x2 stride likewise
+    ld1         {v1.8h}, [x2], x3          // x2 block, row 0: eight 16-bit samples
+    ld1         {v0.8h}, [x0], x1          // x0 block, row 0
+    ld1         {v3.8h}, [x2], x3          // x2 block, row 1
+    ld1         {v2.8h}, [x0], x1          // x0 block, row 1
+    uabdl       v16.4s,  v0.4h,  v1.4h     // row 0, samples 0-3: absolute differences widened to 32 bits
+    uabdl2      v17.4s,  v0.8h,  v1.8h     // row 0, samples 4-7
+    uabdl       v18.4s,  v2.4h,  v3.4h     // row 1, samples 0-3
+    uabdl2      v19.4s,  v2.8h,  v3.8h     // row 1, samples 4-7
+.endm
+
+.macro SAD_8                               // two more rows of an 8-wide block, accumulated into v16-v19
+    ld1         {v1.8h}, [x2], x3          // x2 block, next row (x3 was already doubled by SAD_START_8)
+    ld1         {v0.8h}, [x0], x1          // x0 block, next row
+    ld1         {v3.8h}, [x2], x3          // x2 block, following row
+    ld1         {v2.8h}, [x0], x1          // x0 block, following row
+    uabal       v16.4s,  v0.4h,  v1.4h     // v16 += absolute differences, first row, samples 0-3
+    uabal2      v17.4s,  v0.8h,  v1.8h     // v17 += absolute differences, first row, samples 4-7
+    uabal       v18.4s,  v2.4h,  v3.4h     // v18 += absolute differences, second row, samples 0-3
+    uabal2      v19.4s,  v2.8h,  v3.8h     // v19 += absolute differences, second row, samples 4-7
+.endm
+
+.macro SAD_START_16                        // first two rows of a 16-wide block; initialises v16-v23
+    lsl         x1,  x1,  #1               // double the x0 stride (samples are 16 bits wide); canonical LSL instead of shifted MOV
+    lsl         x3,  x3,  #1               // double the x2 stride likewise
+    ld2         {v0.8h, v1.8h}, [x2], x3   // x2 block, row 0: ld2 splits even samples into v0 and odd samples into v1
+    ld2         {v2.8h, v3.8h}, [x0], x1   // x0 block, row 0: split the same way, so lanes still pair up sample for sample
+    ld2         {v4.8h, v5.8h}, [x2], x3   // x2 block, row 1
+    ld2         {v6.8h, v7.8h}, [x0], x1   // x0 block, row 1
+    uabdl       v16.4s,  v0.4h,  v2.4h     // row 0, even samples, low half: absolute differences widened to 32 bits
+    uabdl2      v17.4s,  v0.8h,  v2.8h     // row 0, even samples, high half
+    uabdl       v20.4s,  v1.4h,  v3.4h     // row 0, odd samples, low half
+    uabdl2      v21.4s,  v1.8h,  v3.8h     // row 0, odd samples, high half
+    uabdl       v18.4s,  v4.4h,  v6.4h     // row 1, even samples, low half
+    uabdl2      v19.4s,  v4.8h,  v6.8h     // row 1, even samples, high half
+    uabdl       v22.4s,  v5.4h,  v7.4h     // row 1, odd samples, low half
+    uabdl2      v23.4s,  v5.8h,  v7.8h     // row 1, odd samples, high half
+.endm
+
+.macro SAD_16                              // two more rows of a 16-wide block, accumulated into v16-v23
+    ld2         {v0.8h, v1.8h}, [x2], x3   // x2 block, next row: even samples into v0, odd samples into v1
+    ld2         {v2.8h, v3.8h}, [x0], x1   // x0 block, next row: split the same way as the x2 row
+    ld2         {v4.8h, v5.8h}, [x2], x3   // x2 block, following row
+    ld2         {v6.8h, v7.8h}, [x0], x1   // x0 block, following row
+    uabal       v16.4s,  v0.4h,  v2.4h     // v16 += first row, even samples, low half
+    uabal2      v17.4s,  v0.8h,  v2.8h     // v17 += first row, even samples, high half
+    uabal       v20.4s,  v1.4h,  v3.4h     // v20 += first row, odd samples, low half
+    uabal2      v21.4s,  v1.8h,  v3.8h     // v21 += first row, odd samples, high half
+    uabal       v18.4s,  v4.4h,  v6.4h     // v18 += second row, even samples, low half
+    uabal2      v19.4s,  v4.8h,  v6.8h     // v19 += second row, even samples, high half
+    uabal       v22.4s,  v5.4h,  v7.4h     // v22 += second row, odd samples, low half
+    uabal2      v23.4s,  v5.8h,  v7.8h     // v23 += second row, odd samples, high half
+.endm
+
+.macro SAD_FUNC w, h, name                 // emits one exported SAD routine per width/height pair; result is returned in w0
+function pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w
+
+.rept \h / 2 - 1
+    SAD_\w
+.endr
+.if \w > 8
+    add         v20.4s,  v20.4s,  v21.4s   // 16-wide only: fold the odd-sample accumulators of the first row of each pair
+    add         v16.4s,  v16.4s,  v20.4s
+    add         v22.4s,  v22.4s,  v23.4s   // 16-wide only: fold the odd-sample accumulators of the second row of each pair
+    add         v18.4s,  v18.4s,  v22.4s
+.endif
+.if \w > 4
+    add         v16.4s,  v16.4s,  v17.4s   // 8-wide and up: fold the high-half accumulators
+    add         v18.4s,  v18.4s,  v19.4s
+.endif
+    add         v16.4s,  v16.4s,  v18.4s   // merge the two per-row accumulators
+    addv        s0,  v16.4s                // sum the four 32-bit lanes; exact at any lane magnitude, unlike a 16-bit uaddlv
+    fmov        w0,  s0                    // move the scalar sum to the integer return register
+    ret
+endfunc
+.endm
+
+#endif /* BIT_DEPTH == 8 */
+
 SAD_FUNC  4,  4
 SAD_FUNC  4,  8
 SAD_FUNC  4,  16
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 02c969c1..d1e51269 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -76,11 +76,11 @@
     ret x264_pixel_##name##_4x4_##suffix args;\
 
 #define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) ) /* int-returning prototypes taking two pointer/intptr_t pairs; pixel replaces uint8_t (NOTE(review): presumably the bit-depth-dependent sample type - confirm) */
 
 #define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) /* _x3 and _x4 prototypes: four or five pixel pointers, one intptr_t, one int pointer; pixel replaces uint8_t here as well */
 
 DECL_X1( sad, neon )
 DECL_X4( sad, neon )
diff --git a/common/pixel.c b/common/pixel.c
index 113df307..6080bb5d 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
     }
 #endif // HAVE_MMX
+#if HAVE_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT8( sad, _neon );
+    }
+#endif // HAVE_AARCH64
+
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
-- 
2.37.1
To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information