aarch64: pixel: add 10bits sad functions

Provide routines for sad functions for high bit depth, i.e. 10 bits. Benchmarks run on AWS Graviton 2 instances. sad_4x4_c: 583 sad_4x4_neon: 273 sad_4x8_c: 1179 sad_4x8_neon: 366 sad_4x16_c: 2121 sad_4x16_neon: 550 sad_8x4_c: 924 sad_8x4_neon: 213 sad_8x8_c: 1711 sad_8x8_neon: 316 sad_8x16_c: 3505 sad_8x16_neon: 497 sad_16x8_c: 3070 sad_16x8_neon: 635 sad_16x16_c: 6113 sad_16x16_neon: 1118 Signed-off-by: Hubert Mazur <hum@semihalf.com> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>

aarch64: pixel: add 10bits sad functions
Provide routines for sad functions for high bit depth, i.e. 10 bits. Benchmarks run on AWS Graviton 2 instances. sad_4x4_c: 583 sad_4x4_neon: 273 sad_4x8_c: 1179 sad_4x8_neon: 366 sad_4x16_c: 2121 sad_4x16_neon: 550 sad_8x4_c: 924 sad_8x4_neon: 213 sad_8x8_c: 1711 sad_8x8_neon: 316 sad_8x16_c: 3505 sad_8x16_neon: 497 sad_16x8_c: 3070 sad_16x8_neon: 635 sad_16x16_c: 6113 sad_16x16_neon: 1118 Signed-off-by: Hubert Mazur <hum@semihalf.com> Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
416e3eb2 · Hubert Mazur · b093bbe7 · 416e3eb2 · 416e3eb2 · 416e3eb2
Commit 416e3eb2 authored 2 years ago by Hubert Mazur
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -40,6 +40,7 @@ const mask_ac_4_8
 .short 0, -1, -1, -1, -1, -1, -1, -1
 endconst

+#if BIT_DEPTH == 8
 .macro SAD_START_4
    ld1        {v1.s}[0], [x2], x3
    ld1        {v0.s}[0], [x0], x1
@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
 endfunc
 .endm

+#else /* BIT_DEPTH == 8 */
+
+.macro SAD_START_4                       // first two rows of a 4-wide block: scale strides, initialise accumulators v16 and v18
+    lsl        x1, x1, #1                // x1 *= 2; presumably converts a stride in 16-bit pixels to bytes (TODO confirm against callers)
+    lsl        x3, x3, #1                // x3 *= 2, same conversion for the second stride
+    ld1        {v1.d}[0], [x2], x3       // 4 halfwords from [x2] into the low half of v1, then x2 += x3
+    ld1        {v0.d}[0], [x0], x1       // 4 halfwords from [x0] into the low half of v0, then x0 += x1
+    ld1        {v1.d}[1], [x2], x3       // next row from [x2] into the high half of v1
+    ld1        {v0.d}[1], [x0], x1       // next row from [x0] into the high half of v0
+    uabdl       v16.4s,  v0.4h,  v1.4h   // v16 = |v0 - v1| of the low halves (first row), widened to 32-bit lanes
+    uabdl2      v18.4s,  v0.8h,  v1.8h   // v18 = |v0 - v1| of the high halves (second row), widened to 32-bit lanes
+.endm
+
+.macro SAD_4                             // two more rows of a 4-wide block, added onto v16 and v18; uses x1/x3 as they are, without rescaling
+    ld1        {v1.d}[0], [x2], x3       // 4 halfwords from [x2] into the low half of v1, then x2 += x3
+    ld1        {v0.d}[0], [x0], x1       // 4 halfwords from [x0] into the low half of v0, then x0 += x1
+    ld1        {v1.d}[1], [x2], x3       // next row from [x2] into the high half of v1
+    ld1        {v0.d}[1], [x0], x1       // next row from [x0] into the high half of v0
+    uabal       v16.4s,  v0.4h,  v1.4h   // v16 += |v0 - v1| of the low halves, widened to 32-bit lanes
+    uabal2      v18.4s,  v0.8h,  v1.8h   // v18 += |v0 - v1| of the high halves, widened to 32-bit lanes
+.endm
+
+.macro SAD_START_8                       // first two rows of an 8-wide block: scale strides, initialise accumulators v16-v19
+    lsl         x1, x1, #1               // x1 *= 2; presumably converts a stride in 16-bit pixels to bytes (TODO confirm against callers)
+    lsl         x3, x3, #1               // x3 *= 2, same conversion for the second stride
+    ld1         {v1.8h}, [x2], x3        // row A: 8 halfwords from [x2], then x2 += x3
+    ld1         {v0.8h}, [x0], x1        // row A: 8 halfwords from [x0], then x0 += x1
+    ld1         {v3.8h}, [x2], x3        // row B from [x2]
+    ld1         {v2.8h}, [x0], x1        // row B from [x0]
+    uabdl       v16.4s,  v0.4h,  v1.4h   // v16 = |row A difference|, lanes 0-3, widened to 32 bit
+    uabdl2      v17.4s,  v0.8h,  v1.8h   // v17 = |row A difference|, lanes 4-7
+    uabdl       v18.4s,  v2.4h,  v3.4h   // v18 = |row B difference|, lanes 0-3
+    uabdl2      v19.4s,  v2.8h,  v3.8h   // v19 = |row B difference|, lanes 4-7
+.endm
+
+.macro SAD_8                             // two more rows of an 8-wide block, added onto v16-v19; uses x1/x3 as they are, without rescaling
+    ld1         {v1.8h}, [x2], x3        // row A: 8 halfwords from [x2], then x2 += x3
+    ld1         {v0.8h}, [x0], x1        // row A: 8 halfwords from [x0], then x0 += x1
+    ld1         {v3.8h}, [x2], x3        // row B from [x2]
+    ld1         {v2.8h}, [x0], x1        // row B from [x0]
+    uabal       v16.4s,  v0.4h,  v1.4h   // v16 += |row A difference|, lanes 0-3, widened to 32 bit
+    uabal2      v17.4s,  v0.8h,  v1.8h   // v17 += |row A difference|, lanes 4-7
+    uabal       v18.4s,  v2.4h,  v3.4h   // v18 += |row B difference|, lanes 0-3
+    uabal2      v19.4s,  v2.8h,  v3.8h   // v19 += |row B difference|, lanes 4-7
+.endm
+
+.macro SAD_START_16                      // first two rows of a 16-wide block: scale strides, initialise accumulators v16-v23
+    lsl         x1, x1, #1               // x1 *= 2; presumably converts a stride in 16-bit pixels to bytes (TODO confirm against callers)
+    lsl         x3, x3, #1               // x3 *= 2, same conversion for the second stride
+    ld2         {v0.8h, v1.8h}, [x2], x3 // row A from [x2]: ld2 de-interleaves, even halfwords -> v0, odd -> v1; then x2 += x3
+    ld2         {v2.8h, v3.8h}, [x0], x1 // row A from [x0], split the same way, so v0/v2 and v1/v3 hold matching positions
+    ld2         {v4.8h, v5.8h}, [x2], x3 // row B from [x2]: even -> v4, odd -> v5
+    ld2         {v6.8h, v7.8h}, [x0], x1 // row B from [x0]: even -> v6, odd -> v7
+    uabdl       v16.4s,  v0.4h,  v2.4h   // v16 = |row A even difference|, lanes 0-3, widened to 32 bit
+    uabdl2      v17.4s,  v0.8h,  v2.8h   // v17 = |row A even difference|, lanes 4-7
+    uabdl       v20.4s,  v1.4h,  v3.4h   // v20 = |row A odd difference|, lanes 0-3
+    uabdl2      v21.4s,  v1.8h,  v3.8h   // v21 = |row A odd difference|, lanes 4-7
+    uabdl       v18.4s,  v4.4h,  v6.4h   // v18 = |row B even difference|, lanes 0-3
+    uabdl2      v19.4s,  v4.8h,  v6.8h   // v19 = |row B even difference|, lanes 4-7
+    uabdl       v22.4s,  v5.4h,  v7.4h   // v22 = |row B odd difference|, lanes 0-3
+    uabdl2      v23.4s,  v5.8h,  v7.8h   // v23 = |row B odd difference|, lanes 4-7
+.endm
+
+.macro SAD_16                            // two more rows of a 16-wide block, added onto v16-v23; uses x1/x3 as they are, without rescaling
+    ld2         {v0.8h, v1.8h}, [x2], x3 // row A from [x2]: ld2 de-interleaves, even halfwords -> v0, odd -> v1; then x2 += x3
+    ld2         {v2.8h, v3.8h}, [x0], x1 // row A from [x0], split the same way, so v0/v2 and v1/v3 hold matching positions
+    ld2         {v4.8h, v5.8h}, [x2], x3 // row B from [x2]: even -> v4, odd -> v5
+    ld2         {v6.8h, v7.8h}, [x0], x1 // row B from [x0]: even -> v6, odd -> v7
+    uabal       v16.4s,  v0.4h,  v2.4h   // v16 += |row A even difference|, lanes 0-3, widened to 32 bit
+    uabal2      v17.4s,  v0.8h,  v2.8h   // v17 += |row A even difference|, lanes 4-7
+    uabal       v20.4s,  v1.4h,  v3.4h   // v20 += |row A odd difference|, lanes 0-3
+    uabal2      v21.4s,  v1.8h,  v3.8h   // v21 += |row A odd difference|, lanes 4-7
+    uabal       v18.4s,  v4.4h,  v6.4h   // v18 += |row B even difference|, lanes 0-3
+    uabal2      v19.4s,  v4.8h,  v6.8h   // v19 += |row B even difference|, lanes 4-7
+    uabal       v22.4s,  v5.4h,  v7.4h   // v22 += |row B odd difference|, lanes 0-3
+    uabal2      v23.4s,  v5.8h,  v7.8h   // v23 += |row B odd difference|, lanes 4-7
+.endm
+
+.macro SAD_FUNC w, h, name               // emits one exported SAD function for a w x h block; in: x0/x2 block pointers, x1/x3 strides; out: w0 = sum
+function pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w                         // rows 0-1: scales the strides once and initialises the accumulators
+
+.rept \h / 2 - 1
+    SAD_\w                               // each expansion accumulates two further rows
+.endr
+.if \w > 8
+    add         v20.4s,  v20.4s,  v21.4s // width 16 only: fold the odd-lane accumulators v20-v23 ...
+    add         v16.4s,  v16.4s,  v20.4s // ... into v16
+    add         v22.4s,  v22.4s,  v23.4s
+    add         v18.4s,  v18.4s,  v22.4s // ... and into v18
+.endif
+.if \w > 4
+    add         v16.4s,  v16.4s,  v17.4s // width 8 and 16: fold v17 into v16
+    add         v18.4s,  v18.4s,  v19.4s // and v19 into v18
+.endif
+    add         v16.4s,  v16.4s,  v18.4s // v16 = four 32-bit partial sums covering the whole block
+    addv        s0,  v16.4s              // sum the four 32-bit lanes at accumulator width; a halfword-wise uaddlv is only right while every lane < 0x10000
+    fmov        w0,  s0                  // move the scalar result into the integer return register
+    ret
+endfunc
+.endm
+
+#endif /* BIT_DEPTH == 8 */
+
 SAD_FUNC  4,  4
 SAD_FUNC  4,  8
 SAD_FUNC  4,  16

--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -76,11 +76,11 @@
    ret x264_pixel_##name##_4x4_##suffix args;\

 #define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) ) /* int-returning prototypes taking two (pixel *, intptr_t) pairs; pixel is presumably the bit-depth dependent sample type - TODO confirm */

 #define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) /* _x3 variant: four pixel pointers, one intptr_t, one int * */\
+    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) ) /* _x4 variant: five pixel pointers, one intptr_t, one int * */

 DECL_X1( sad, neon )
 DECL_X4( sad, neon )

--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
        pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
    }
 #endif // HAVE_MMX
+#if HAVE_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT8( sad, _neon );
+    }
+#endif // HAVE_AARCH64
+
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
    if( cpu&X264_CPU_MMX )