Provide arm64 neon optimized functions for sad for high bit depth
Hi, below is a patch that adds high bit depth support for the sad functions. Benchmark results are included in the commit message. I'm looking forward to your comments and review, and I hope it can be merged into the main branch afterwards.
Thanks, Hubert
From fbe5c3e1ac7650d9698abd13e9d3c55a7ccc9f97 Mon Sep 17 00:00:00 2001
From: Hubert Mazur <hum@semihalf.com>
Date: Thu, 6 Oct 2022 11:37:53 +0000
Subject: [PATCH] aarch64: pixel: add 10bits sad functions
Provide routines for sad functions for high bit depth, i.e. 10 bits.
Benchmarks run on AWS Graviton 2 instances.
sad_4x4_c: 586
sad_4x4_neon: 275
sad_4x8_c: 1194
sad_4x8_neon: 362
sad_4x16_c: 2079
sad_4x16_neon: 552
sad_8x4_c: 955
sad_8x4_neon: 209
sad_8x8_c: 1719
sad_8x8_neon: 311
sad_8x16_c: 3533
sad_8x16_neon: 496
sad_16x8_c: 3084
sad_16x8_neon: 629
sad_16x16_c: 6055
sad_16x16_neon: 1111
Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
---
common/aarch64/pixel-a.S | 105 +++++++++++++++++++++++++++++++++++++++
common/aarch64/pixel.h | 6 +--
common/pixel.c | 7 +++
3 files changed, 115 insertions(+), 3 deletions(-)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index e3bc12d4..20ee8024 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -40,6 +40,7 @@ const mask_ac_4_8
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
+#if BIT_DEPTH == 8
.macro SAD_START_4
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
endfunc
.endm
+#else /* BIT_DEPTH != 8 */
+
+.macro SAD_START_4 /* rows 0-1 of a 4-wide block: scale both strides, start accumulators v16/v18 */
+    lsl         x1,  x1,  #1               // x1 *= 2; presumably pixel stride -> byte stride (16-bit samples)
+    lsl         x3,  x3,  #1               // x3 *= 2; same scaling for the stride used with x2
+    ld1         {v1.d}[0], [x2], x3        // 4 halfwords from x2 into the low half of v1, then x2 += x3
+    ld1         {v0.d}[0], [x0], x1        // 4 halfwords from x0 into the low half of v0, then x0 += x1
+    ld1         {v1.d}[1], [x2], x3        // next x2 row into the high half of v1
+    ld1         {v0.d}[1], [x0], x1        // next x0 row into the high half of v0
+    uabdl       v16.4s, v0.4h, v1.4h       // v16 = abs diff of the first row, widened to 32-bit lanes
+    uabdl2      v18.4s, v0.8h, v1.8h       // v18 = abs diff of the second row, widened to 32-bit lanes
+.endm
+
+.macro SAD_4 /* two further rows of a 4-wide block, accumulated into v16/v18 */
+    ld1         {v0.d}[0], [x0], x1        // x0 row -> low half of v0, then x0 += x1
+    ld1         {v1.d}[0], [x2], x3        // x2 row -> low half of v1, then x2 += x3
+    ld1         {v0.d}[1], [x0], x1        // following x0 row -> high half of v0
+    ld1         {v1.d}[1], [x2], x3        // following x2 row -> high half of v1
+    uabal2      v18.4s, v1.8h, v0.8h       // v18 += abs diff of the high halves (second row)
+    uabal       v16.4s, v1.4h, v0.4h       // v16 += abs diff of the low halves (first row)
+.endm
+
+.macro SAD_START_8 /* rows 0-1 of an 8-wide block: scale both strides, start accumulators v16-v19 */
+    lsl         x1,  x1,  #1               // x1 *= 2; presumably pixel stride -> byte stride (16-bit samples)
+    lsl         x3,  x3,  #1               // x3 *= 2; same scaling for the stride used with x2
+    ld1         {v1.8h}, [x2], x3          // 8 halfwords from x2, then x2 += x3
+    ld1         {v0.8h}, [x0], x1          // 8 halfwords from x0, then x0 += x1
+    ld1         {v3.8h}, [x2], x3          // next x2 row
+    ld1         {v2.8h}, [x0], x1          // next x0 row
+    uabdl       v16.4s, v0.4h, v1.4h       // first row, elements 0-3, widened to 32-bit lanes
+    uabdl2      v17.4s, v0.8h, v1.8h       // first row, elements 4-7
+    uabdl       v18.4s, v2.4h, v3.4h       // second row, elements 0-3
+    uabdl2      v19.4s, v2.8h, v3.8h       // second row, elements 4-7
+.endm
+
+.macro SAD_8 /* two further rows of an 8-wide block, accumulated into v16-v19 */
+    ld1         {v0.8h}, [x0], x1          // x0 row, then x0 += x1
+    ld1         {v1.8h}, [x2], x3          // x2 row, then x2 += x3
+    ld1         {v2.8h}, [x0], x1          // following x0 row
+    ld1         {v3.8h}, [x2], x3          // following x2 row
+    uabal2      v17.4s, v1.8h, v0.8h       // v17 += abs diff, first row, elements 4-7
+    uabal       v16.4s, v1.4h, v0.4h       // v16 += abs diff, first row, elements 0-3
+    uabal2      v19.4s, v3.8h, v2.8h       // v19 += abs diff, second row, elements 4-7
+    uabal       v18.4s, v3.4h, v2.4h       // v18 += abs diff, second row, elements 0-3
+.endm
+
+.macro SAD_START_16 /* rows 0-1 of a 16-wide block: scale both strides, start accumulators v16-v23 */
+    lsl         x1,  x1,  #1               // x1 *= 2; presumably pixel stride -> byte stride (16-bit samples)
+    lsl         x3,  x3,  #1               // x3 *= 2; same scaling for the stride used with x2
+    ld1         {v0.8h, v1.8h}, [x2], x3   // 16 halfwords from x2 in memory order, then x2 += x3
+    ld1         {v2.8h, v3.8h}, [x0], x1   // 16 halfwords from x0 in memory order, then x0 += x1
+    ld1         {v4.8h, v5.8h}, [x2], x3   // next x2 row
+    ld1         {v6.8h, v7.8h}, [x0], x1   // next x0 row
+    uabdl       v16.4s, v0.4h, v2.4h       // first row, elements 0-3, widened to 32-bit lanes
+    uabdl2      v17.4s, v0.8h, v2.8h       // first row, elements 4-7
+    uabdl       v20.4s, v1.4h, v3.4h       // first row, elements 8-11
+    uabdl2      v21.4s, v1.8h, v3.8h       // first row, elements 12-15
+    uabdl       v18.4s, v4.4h, v6.4h       // second row, elements 0-3
+    uabdl2      v19.4s, v4.8h, v6.8h       // second row, elements 4-7
+    uabdl       v22.4s, v5.4h, v7.4h       // second row, elements 8-11
+    uabdl2      v23.4s, v5.8h, v7.8h       // second row, elements 12-15
+.endm
+
+.macro SAD_16 /* two further rows of a 16-wide block, accumulated into v16-v23 */
+    ld1         {v0.8h, v1.8h}, [x2], x3   // 16 halfwords from x2 in memory order, then x2 += x3
+    ld1         {v2.8h, v3.8h}, [x0], x1   // 16 halfwords from x0 in memory order, then x0 += x1
+    ld1         {v4.8h, v5.8h}, [x2], x3   // next x2 row
+    ld1         {v6.8h, v7.8h}, [x0], x1   // next x0 row
+    uabal       v16.4s, v0.4h, v2.4h       // v16 += abs diff, first row, elements 0-3
+    uabal2      v17.4s, v0.8h, v2.8h       // v17 += abs diff, first row, elements 4-7
+    uabal       v20.4s, v1.4h, v3.4h       // v20 += abs diff, first row, elements 8-11
+    uabal2      v21.4s, v1.8h, v3.8h       // v21 += abs diff, first row, elements 12-15
+    uabal       v18.4s, v4.4h, v6.4h       // v18 += abs diff, second row, elements 0-3
+    uabal2      v19.4s, v4.8h, v6.8h       // v19 += abs diff, second row, elements 4-7
+    uabal       v22.4s, v5.4h, v7.4h       // v22 += abs diff, second row, elements 8-11
+    uabal2      v23.4s, v5.8h, v7.8h       // v23 += abs diff, second row, elements 12-15
+.endm
+
+.macro SAD_FUNC w, h, name /* emits pixel_sad<name>_<w>x<h>_neon; in: x0/x2 pointers, x1/x3 strides; out: w0 */
+function pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w                           // rows 0-1: scales the strides and initialises the accumulators
+
+.rept \h / 2 - 1                           // remaining rows, two per expansion
+    SAD_\w
+.endr
+.if \w > 8                                 // width 16 additionally uses v20-v23
+    add         v20.4s, v20.4s, v21.4s
+    add         v16.4s, v16.4s, v20.4s
+    add         v22.4s, v22.4s, v23.4s
+    add         v18.4s, v18.4s, v22.4s
+.endif
+.if \w > 4                                 // widths 8 and 16 additionally use v17/v19
+    add         v16.4s, v16.4s, v17.4s
+    add         v18.4s, v18.4s, v19.4s
+.endif
+    add         v16.4s, v16.4s, v18.4s     // every partial sum is now folded into v16
+    addv        s0,  v16.4s                // lanes are 32-bit, so reduce as .4s (not as 16-bit halves)
+    fmov        w0,  s0                    // move the scalar sum to the integer return register
+    ret
+endfunc
+.endm
+
+#endif /* BIT_DEPTH == 8 */
+
SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 4, 16
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 02c969c1..d1e51269 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -76,11 +76,11 @@
ret x264_pixel_##name##_4x4_##suffix args;\
#define DECL_X1( name, suffix ) \
- DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+ DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
#define DECL_X4( name, suffix ) \
- DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
- DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+ DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
DECL_X1( sad, neon )
DECL_X4( sad, neon )
diff --git a/common/pixel.c b/common/pixel.c
index 113df307..6080bb5d 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif // HAVE_MMX
+#if HAVE_AARCH64
+ if( cpu&X264_CPU_NEON )
+ {
+ INIT8( sad, _neon );
+ }
+#endif // HAVE_AARCH64
+
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
--
2.37.1