Commit f13573e4 authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: {plane_copy,memcpy_aligned,memzero_aligned}_neon

2-3 times faster than C.
parent 8d655b63
......@@ -1253,6 +1253,34 @@ load_deinterleave_chroma:
ret
endfunc
// void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
//                            pixel *src, intptr_t i_src, int w, int h )
//
// Copies h rows from src to dst, 16 or 32 bytes at a time.
//
// In:    x0 = dst, x1 = i_dst, x2 = src, x3 = i_src, w4 = w, w5 = h
// Clobb: x0-x5, x8, v0, v1, flags
//
// The per-row length is w rounded up to a multiple of 16, so up to 15 bytes
// past w are read and written on every row.
// NOTE(review): a rounded width of 0 still copies 32 bytes per row and h <= 0
// still copies one row -- presumably callers always pass w > 0 and h > 0;
// confirm against the call sites.
function x264_plane_copy_neon, export=1
    // w arrives as a 32-bit int, so bits 63:32 of x4 are unspecified on entry.
    // Round up using the 32-bit views: the write to w4 zero-extends into x4,
    // which makes the 64-bit stride arithmetic below well defined.
    add         w8,  w4,  #15
    and         w4,  w8,  #0xfffffff0       // w4 = (w + 15) & ~15
    sub         x1,  x1,  x4                // x1 = i_dst - bytes copied per row
    sub         x3,  x3,  x4                // x3 = i_src - bytes copied per row
1:
    mov         w8,  w4                     // w8 = bytes left in this row
16:
    tst         w8,  #16                    // odd number of 16-byte pieces?
    b.eq        32f
    subs        w8,  w8,  #16
    ldr         q0,  [x2],  #16
    str         q0,  [x0],  #16
    b.eq        0f                          // row was exactly 16 bytes
32:
    subs        w8,  w8,  #32
    ldp         q0,  q1,  [x2],  #32
    stp         q0,  q1,  [x0],  #32
    b.gt        32b
0:
    subs        w5,  w5,  #1                // one row finished
    add         x2,  x2,  x3                // step both pointers to the next row
    add         x0,  x0,  x1
    b.gt        1b

    ret
endfunc
function x264_plane_copy_deinterleave_neon, export=1
add w9, w6, #15
and w9, w9, #0xfffffff0
......@@ -1601,3 +1629,41 @@ function x264_mbtree_propagate_list_internal_neon, export=1
b.ge 8b
ret
endfunc
// x264_memcpy_aligned_neon
//
// In:    x0 = destination, x1 = source, x2 = byte count
// Clobb: x0-x2, v0-v3, flags
//
// Peels one 16-byte and one 32-byte piece according to bits 4 and 5 of the
// count, then moves the remainder in 64-byte steps. Both pointers are
// post-incremented, so x0 does not hold the original destination on return.
// NOTE(review): only counts that are multiples of 16 are copied exactly;
// presumably callers guarantee that -- confirm.
function x264_memcpy_aligned_neon, export=1
    tbz         x2,  #4,  2f                // no odd 16-byte piece
    ldr         q0,  [x1],  #16
    sub         x2,  x2,  #16
    str         q0,  [x0],  #16
2:
    tbz         x2,  #5,  3f                // no odd 32-byte piece
    ldp         q0,  q1,  [x1],  #32
    sub         x2,  x2,  #32
    stp         q0,  q1,  [x0],  #32
3:
    cbz         x2,  9f                     // nothing left for the 64-byte loop
4:
    subs        x2,  x2,  #64
    ldp         q2,  q3,  [x1, #32]         // upper half of the 64-byte chunk
    ldp         q0,  q1,  [x1],  #64        // lower half, then advance source
    stp         q2,  q3,  [x0, #32]
    stp         q0,  q1,  [x0],  #64
    b.gt        4b
9:
    ret
endfunc
// x264_memzero_aligned_neon
//
// In:    x0 = destination, x1 = byte count
// Clobb: x0, x1, v0, v1, flags
//
// Stores zeros in 128-byte steps until the count drops to zero or below.
// NOTE(review): the first 128 bytes are written before the count is checked,
// and the count is only consumed in steps of 128 -- presumably callers pass a
// positive multiple of 128; confirm.
function x264_memzero_aligned_neon, export=1
    movi        v0.16b, #0
    movi        v1.16b, #0
128:
    stp         q0,  q1,  [x0, #96]
    stp         q0,  q1,  [x0, #64]
    stp         q0,  q1,  [x0, #32]
    stp         q0,  q1,  [x0],  #128       // lowest 32 bytes, then advance
    subs        x1,  x1,  #128
    b.gt        128b
    ret
endfunc
......@@ -49,6 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
/* NEON plane copy: copies h rows from src (row stride i_src) to dst (row
 * stride i_dst). The assembly implementation rounds the per-row length w up
 * to a multiple of 16 before copying, so up to 15 bytes beyond w are read and
 * written on each row. */
void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
......@@ -304,6 +306,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
pf->plane_copy = x264_plane_copy_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
......@@ -340,5 +343,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
pf->memcpy_aligned = x264_memcpy_aligned_neon;
pf->memzero_aligned = x264_memzero_aligned_neon;
#endif // !HIGH_BIT_DEPTH
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment