Skip to content
Snippets Groups Projects
Commit 416e3eb2 authored by Hubert Mazur's avatar Hubert Mazur
Browse files

aarch64: pixel: add 10bits sad functions


Provide routines for sad functions for high bit depth, i.e. 10 bits.
Benchmarks were run on AWS Graviton 2 instances.

sad_4x4_c: 583
sad_4x4_neon: 273
sad_4x8_c: 1179
sad_4x8_neon: 366
sad_4x16_c: 2121
sad_4x16_neon: 550
sad_8x4_c: 924
sad_8x4_neon: 213
sad_8x8_c: 1711
sad_8x8_neon: 316
sad_8x16_c: 3505
sad_8x16_neon: 497
sad_16x8_c: 3070
sad_16x8_neon: 635
sad_16x16_c: 6113
sad_16x16_neon: 1118

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Grzegorz Bernacki <gjb@semihalf.com>
parent b093bbe7
No related branches found
No related tags found
No related merge requests found
......@@ -40,6 +40,7 @@ const mask_ac_4_8
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
#if BIT_DEPTH == 8
.macro SAD_START_4
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
......@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
endfunc
.endm
#else /* BIT_DEPTH == 8 */
// SAD_START_4 (16-bit samples): first step of a 4-sample-wide SAD.
//   In:      x0, x2 = row pointers of the two blocks
//            x1, x3 = their strides, counted in 16-bit samples
//   Out:     v16.4s, v18.4s = absolute differences of the first two rows
//            x0, x2 advanced by two rows; x1, x3 doubled (now byte strides)
//   Scratch: v0, v1
.macro SAD_START_4
    // Turn the sample strides into byte strides for the post-indexed loads.
    lsl         x3,  x3,  #1
    lsl         x1,  x1,  #1
    // Two consecutive 4-sample rows share one vector: row n in d[0], row n+1 in d[1].
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    // |a - b| is symmetric, widened to 32 bits; these writes initialise the accumulators.
    uabdl       v16.4s, v1.4h, v0.4h
    uabdl2      v18.4s, v1.8h, v0.8h
.endm
// SAD_4 (16-bit samples): accumulate two more 4-sample rows.
//   In:      x0, x2 = row pointers; x1, x3 = strides used directly as byte
//            post-increments (SAD_FUNC runs SAD_START_4 first, which doubles them)
//   In/Out:  v16.4s, v18.4s = running sums of absolute differences
//   Scratch: v0, v1
.macro SAD_4
    // Row n goes to the low half, row n+1 to the high half of each vector.
    ld1         {v0.d}[0], [x0], x1
    ld1         {v1.d}[0], [x2], x3
    ld1         {v0.d}[1], [x0], x1
    ld1         {v1.d}[1], [x2], x3
    // Widening absolute-difference accumulate; operand order does not matter.
    uabal       v16.4s, v1.4h, v0.4h
    uabal2      v18.4s, v1.8h, v0.8h
.endm
// SAD_START_8 (16-bit samples): first step of an 8-sample-wide SAD.
//   In:      x0, x2 = row pointers of the two blocks
//            x1, x3 = their strides, counted in 16-bit samples
//   Out:     v16.4s..v19.4s = absolute differences of the first two rows
//            x0, x2 advanced by two rows; x1, x3 doubled (now byte strides)
//   Scratch: v0-v3
.macro SAD_START_8
    // Turn the sample strides into byte strides for the post-indexed loads.
    lsl         x3,  x3,  #1
    lsl         x1,  x1,  #1
    // One full vector per 8-sample row: v0/v2 from x0, v1/v3 from x2.
    ld1         {v0.8h}, [x0], x1
    ld1         {v1.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    // First row initialises v16/v17, second row initialises v18/v19.
    uabdl       v16.4s, v1.4h, v0.4h
    uabdl2      v17.4s, v1.8h, v0.8h
    uabdl       v18.4s, v3.4h, v2.4h
    uabdl2      v19.4s, v3.8h, v2.8h
.endm
// SAD_8 (16-bit samples): accumulate two more 8-sample rows.
//   In:      x0, x2 = row pointers; x1, x3 = strides used directly as byte
//            post-increments (SAD_FUNC runs SAD_START_8 first, which doubles them)
//   In/Out:  v16.4s..v19.4s = running sums of absolute differences
//   Scratch: v0-v3
.macro SAD_8
    // One full vector per 8-sample row: v0/v2 from x0, v1/v3 from x2.
    ld1         {v0.8h}, [x0], x1
    ld1         {v1.8h}, [x2], x3
    ld1         {v2.8h}, [x0], x1
    ld1         {v3.8h}, [x2], x3
    // First row feeds v16/v17, second row feeds v18/v19.
    uabal       v16.4s, v1.4h, v0.4h
    uabal2      v17.4s, v1.8h, v0.8h
    uabal       v18.4s, v3.4h, v2.4h
    uabal2      v19.4s, v3.8h, v2.8h
.endm
// SAD_START_16 (16-bit samples): first step of a 16-sample-wide SAD.
//   In:      x0, x2 = row pointers of the two blocks
//            x1, x3 = their strides, counted in 16-bit samples
//   Out:     v16.4s..v23.4s = absolute differences of the first two rows
//            x0, x2 advanced by two rows; x1, x3 doubled (now byte strides)
//   Scratch: v0-v7
.macro SAD_START_16
    // Turn the sample strides into byte strides for the post-indexed loads.
    lsl         x3,  x3,  #1
    lsl         x1,  x1,  #1
    // ld2 splits each 16-sample row into even and odd samples. Both blocks
    // are split identically, so matching samples still meet in the same lane.
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    // First row: even samples -> v16/v17, odd samples -> v20/v21.
    uabdl       v16.4s, v2.4h, v0.4h
    uabdl2      v17.4s, v2.8h, v0.8h
    uabdl       v20.4s, v3.4h, v1.4h
    uabdl2      v21.4s, v3.8h, v1.8h
    // Second row: even samples -> v18/v19, odd samples -> v22/v23.
    uabdl       v18.4s, v6.4h, v4.4h
    uabdl2      v19.4s, v6.8h, v4.8h
    uabdl       v22.4s, v7.4h, v5.4h
    uabdl2      v23.4s, v7.8h, v5.8h
.endm
// SAD_16 (16-bit samples): accumulate two more 16-sample rows.
//   In:      x0, x2 = row pointers; x1, x3 = strides used directly as byte
//            post-increments (SAD_FUNC runs SAD_START_16 first, which doubles them)
//   In/Out:  v16.4s..v23.4s = running sums of absolute differences
//   Scratch: v0-v7
.macro SAD_16
    // ld2 splits each 16-sample row into even and odd samples. Both blocks
    // are split identically, so matching samples still meet in the same lane.
    ld2         {v2.8h, v3.8h}, [x0], x1
    ld2         {v0.8h, v1.8h}, [x2], x3
    ld2         {v6.8h, v7.8h}, [x0], x1
    ld2         {v4.8h, v5.8h}, [x2], x3
    // First row: even samples -> v16/v17, odd samples -> v20/v21.
    uabal       v16.4s, v2.4h, v0.4h
    uabal2      v17.4s, v2.8h, v0.8h
    uabal       v20.4s, v3.4h, v1.4h
    uabal2      v21.4s, v3.8h, v1.8h
    // Second row: even samples -> v18/v19, odd samples -> v22/v23.
    uabal       v18.4s, v6.4h, v4.4h
    uabal2      v19.4s, v6.8h, v4.8h
    uabal       v22.4s, v7.4h, v5.4h
    uabal2      v23.4s, v7.8h, v5.8h
.endm
// SAD_FUNC w, h, name: emit pixel_sad<name>_<w>x<h>_neon for 16-bit samples.
//   w    = block width in samples; selects the SAD_START_<w> / SAD_<w> macros
//   h    = block height in rows; two rows are consumed per macro step
//   name = optional infix for the symbol name
//
// Generated function:
//   In:      x0, x2 = pointers to the two blocks
//            x1, x3 = their strides, counted in 16-bit samples
//            (the SAD_START_<w> macro doubles them into byte strides)
//   Out:     w0 = sum of absolute differences over the w x h block
//   Clobber: x0-x3, v0-v7, v16-v23 (depending on w)
.macro SAD_FUNC w, h, name
function pixel_sad\name\()_\w\()x\h\()_neon, export=1
    // The first two rows initialise the accumulators.
    SAD_START_\w

    // The remaining h - 2 rows, two at a time.
.rept \h / 2 - 1
    SAD_\w
.endr

    // Fold the accumulators down to v16. Widths above 8 use v16-v23,
    // widths above 4 use v16-v19, width 4 uses v16 and v18 only.
.if \w > 8
    add         v20.4s, v20.4s, v21.4s
    add         v16.4s, v16.4s, v20.4s
    add         v22.4s, v22.4s, v23.4s
    add         v18.4s, v18.4s, v22.4s
.endif
.if \w > 4
    add         v16.4s, v16.4s, v17.4s
    add         v18.4s, v18.4s, v19.4s
.endif
    add         v16.4s, v16.4s, v18.4s

    // Reduce across the four 32-bit lanes. The accumulators are .4s, so the
    // reduction must use that lane width too: summing v16 as eight 16-bit
    // lanes is only correct while every 32-bit lane stays below 65536.
    addv        s0,  v16.4s
    fmov        w0,  s0
    ret
endfunc
.endm
#endif /* BIT_DEPTH == 8 */
// Instantiate the 4-sample-wide SAD functions for heights 4, 8 and 16.
// The third macro argument (name) is omitted, so it expands to nothing.
SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 4, 16
......
......@@ -76,11 +76,11 @@
ret x264_pixel_##name##_4x4_##suffix args;\
#define DECL_X1( name, suffix ) \
DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
#define DECL_X4( name, suffix ) \
DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
DECL_X1( sad, neon )
DECL_X4( sad, neon )
......
......@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif // HAVE_MMX
#if HAVE_AARCH64
if( cpu&X264_CPU_NEON )
{
INIT8( sad, _neon );
}
#endif // HAVE_AARCH64
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment