Commit 99a1ca1f authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: implement x264_pixel_vsad_neon

35 times faster than C.
parent 6c163249
......@@ -148,7 +148,7 @@ SAD_FUNC 16, 16
\first v17.8h, v2.8b, v0.8b
ld1 {v3.8b}, [x3], x5
ld1 {v1.8b}, [x1], x5
\first v18.8h, v3.8b, v0.8b
\first v18.8h, v3.8b, v0.8b
uabal v16.8h, v1.8b, v5.8b
ld1 {v2.8b}, [x2], x5
ld1 {v3.8b}, [x3], x5
......@@ -248,6 +248,30 @@ SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16
// int x264_pixel_vsad_neon( uint8_t *, intptr_t, int )
//
// Sums the absolute differences between vertically adjacent 16-byte rows
// and returns the total.
//
// In:    x0 = byte pointer to the first row; post-incremented by x1 on
//             every load
//        x1 = byte offset added to x0 after each row load
//             (presumably the row stride -- confirm against caller)
//        w2 = count, consumed two at a time
//             (presumably the number of rows -- confirm against caller)
// Out:   w0 = sum over all processed row pairs of |row[i] - row[i+1]|
// Clobb: x0, w2, v0, v1, v5, v6, v7, flags
//
// NOTE(review): partial sums are kept in 16-bit lanes (v6/v7, then v5),
// so very large counts could wrap before the final widening add --
// verify the maximum count callers pass.
function x264_pixel_vsad_neon, export=1
// First row pair is handled outside the loop: uabdl/uabdl2 initialise the
// accumulators (low 8 bytes -> v6, high 8 bytes -> v7) instead of adding.
subs w2, w2, #2
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x0], x1
uabdl v6.8h, v0.8b, v1.8b
uabdl2 v7.8h, v0.16b, v1.16b
// Flags are still those of the subs above (loads and NEON ops leave them
// untouched): count <= 2 means this one difference is all there is.
b.le 2f
1:
// Each iteration consumes two more rows. v0 and v1 alternate as
// "previous" and "current" row so no register copy is needed.
subs w2, w2, #2
ld1 {v0.16b}, [x0], x1
// |previous (v1) - new (v0)|
uabal v6.8h, v1.8b, v0.8b
uabal2 v7.8h, v1.16b, v0.16b
// NOTE(review): this load is issued before the b.lt exit check, so when
// the count is odd one extra row is read from memory but never
// accumulated.
ld1 {v1.16b}, [x0], x1
// Negative count (odd remainder): skip the second difference and finish.
b.lt 2f
// |previous (v0) - new (v1)|
uabal v6.8h, v0.8b, v1.8b
uabal2 v7.8h, v0.16b, v1.16b
// Still using the flags from the subs at the top of the loop.
b.gt 1b
2:
// Merge the two accumulators lane-wise (16-bit), then add across all
// eight lanes with widening to 32 bits and move the result to w0.
add v5.8h, v6.8h, v7.8h
uaddlv s0, v5.8h
fmov w0, s0
ret
endfunc
.macro SSD_START_4
ld1 {v16.s}[0], [x0], x1
ld1 {v17.s}[0], [x2], x3
......
......@@ -48,6 +48,8 @@ DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
......
......@@ -1429,6 +1429,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
pixf->vsad = x264_pixel_vsad_neon;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment