Commit 90f0b5c1, authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: implement x264_pixel_asd8_neon

7 times faster than C.
parent f8f8d13d
......@@ -273,6 +273,32 @@ function x264_pixel_vsad_neon, export=1
ret
endfunc
// x264_pixel_asd8_neon: absolute value of the summed differences between two
// 8-byte-wide blocks, i.e. | sum over all rows and 8 columns of (a - b) |.
//
// In:       x0 = first block,  x1 = byte offset added to x0 after each row load
//           x2 = second block, x3 = byte offset added to x2 after each row load
//           w4 = number of rows
// Out:      w0 = absolute value of the 32-bit total
// Clobbers: x0, x2, w4, v0-v5, v16-v18, condition flags
//
// Two rows are loaded before the loop and two more on every pass, and the
// loop body always runs at least once, so exactly w4 rows are read only when
// w4 is even and >= 4.
// The per-column sums are kept in 16-bit lanes; each row adds a value in
// [-255, 255], so the lanes stay exact for up to 128 rows.
function x264_pixel_asd8_neon, export=1
// Account up front for the two rows loaded ahead of the loop.
sub w4, w4, #2
// Rows 0 and 1 of both blocks; each load post-increments its pointer.
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
ld1 {v2.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
// v16 = running per-column sums, seeded with the widened row-0 difference.
usubl v16.8h, v0.8b, v1.8b
1:
// Two rows are consumed per pass; the flags set here drive the b.gt below
// (none of the loads or vector ops in between touch the flags).
subs w4, w4, #2
ld1 {v4.8b}, [x0], x1
ld1 {v5.8b}, [x2], x3
// v2/v3 still hold the row fetched before the loop or at the end of the
// previous pass; its difference is only folded in now.
usubl v17.8h, v2.8b, v3.8b
usubl v18.8h, v4.8b, v5.8b
add v16.8h, v16.8h, v17.8h
// Fetch the next row into v2/v3 ahead of its use in the following pass
// (or in the tail after the loop).
ld1 {v2.8b}, [x0], x1
ld1 {v3.8b}, [x2], x3
add v16.8h, v16.8h, v18.8h
b.gt 1b
// Tail: fold in the last row that was loaded but not yet subtracted.
usubl v17.8h, v2.8b, v3.8b
add v16.8h, v16.8h, v17.8h
// Signed horizontal add of the eight 16-bit column sums into a 32-bit scalar.
saddlv s0, v16.8h
// Lane 0 of v0 carries the total; take its absolute value and return it.
abs v0.2s, v0.2s
fmov w0, s0
ret
endfunc
.macro SSD_START_4
ld1 {v16.s}[0], [x0], x1
ld1 {v17.s}[0], [x2], x3
......
......@@ -74,4 +74,6 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
int sums[2][4] );
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
/* Absolute value of the summed (first - second) byte differences over an
 * 8-byte-wide block. Arguments, in order: first block, byte offset between
 * its rows, second block, byte offset between its rows, number of rows. */
int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
#endif
......@@ -1430,6 +1430,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
pixf->vsad = x264_pixel_vsad_neon;
pixf->asd8 = x264_pixel_asd8_neon;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment