Commit b6f189eb, authored by Janne Grunau and committed by Henrik Gramner

aarch64: Add asm for mbtree fixed point conversion

Pack is ~7 times faster and unpack is ~9 times faster than the C versions
built with gcc-5.3, measured on a Cortex-A53:

mbtree_fix8_pack_c: 41534
mbtree_fix8_pack_neon: 5766
mbtree_fix8_unpack_c: 44102
mbtree_fix8_unpack_neon: 4868
parent a5e06b9a
......@@ -1667,3 +1667,60 @@ function x264_memzero_aligned_neon, export=1
b.gt 1b
ret
endfunc
// void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
//
// Converts `count` floats read from src into 16-bit fixed-point values with
// 8 fractional bits, swaps the two bytes of every value and writes them to dst.
// Registers: x0 = dst (post-incremented), x1 = src (post-incremented),
//            w2 = count, w3 = running element counter.
// The values are produced by signed conversions even though the C prototype
// declares dst as uint16_t *.
// The byte swap is unconditional here; presumably the stored format is
// big-endian and this code targets little-endian only -- confirm against the
// C reference implementation.
function x264_mbtree_fix8_pack_neon, export=1
// w3 = count - 8. A negative result means fewer than 8 elements in total,
// so the vector loop is skipped entirely.
subs w3, w2, #8
b.lt 2f
1:
// Vector loop, 8 elements per iteration. The flags written by this subs are
// the ones tested by the b.ge at the bottom of the loop; none of the
// load/convert/narrow/store instructions in between modify NZCV.
subs w3, w3, #8
// Load 8 floats (32 bytes).
ld1 {v0.4s,v1.4s}, [x1], #32
// float -> signed 32-bit fixed-point with 8 fractional bits, rounding toward zero.
fcvtzs v0.4s, v0.4s, #8
fcvtzs v1.4s, v1.4s, #8
// Narrow 32 -> 16 bits with signed saturation; low half first, then high half.
sqxtn v2.4h, v0.4s
sqxtn2 v2.8h, v1.4s
// Swap the two bytes inside each 16-bit lane.
rev16 v3.16b, v2.16b
// Store 8 halfwords (16 bytes).
st1 {v3.8h}, [x0], #16
b.ge 1b
2:
// Undo the bias of 8: w3 becomes the number of leftover elements (0..7 for a
// non-negative count). Nothing left -> return.
adds w3, w3, #8
b.eq 4f
3:
// Scalar tail, one element per iteration; the flags of this subs feed the
// b.gt below.
subs w3, w3, #1
ldr s0, [x1], #4
// float -> signed 32-bit fixed-point with 8 fractional bits, rounding toward zero.
fcvtzs w4, s0, #8
// Swap the bytes within each halfword of w4; only the low halfword is stored.
// NOTE(review): unlike the vector loop there is no saturating narrow here --
// strh keeps just the low 16 bits of the 32-bit result, so out-of-range
// inputs wrap instead of saturating.
rev16 w5, w4
strh w5, [x0], #2
b.gt 3b
4:
ret
endfunc
// void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
//
// Reads `count` 16-bit values from src, swaps the two bytes of each value,
// interprets the result as signed fixed-point with 8 fractional bits and
// writes the corresponding floats to dst.
// Registers: x0 = dst (post-incremented), x1 = src (post-incremented),
//            w2 = count, w3 = running element counter.
// The values are sign-extended before conversion even though the C prototype
// declares src as uint16_t *.
function x264_mbtree_fix8_unpack_neon, export=1
// w3 = count - 8. A negative result means fewer than 8 elements in total,
// so the vector loop is skipped entirely.
subs w3, w2, #8
b.lt 2f
1:
// Vector loop, 8 elements per iteration. The flags written by this subs are
// the ones tested by the b.ge at the bottom of the loop; none of the
// instructions in between modify NZCV.
subs w3, w3, #8
// Load 8 halfwords (16 bytes).
ld1 {v0.8h}, [x1], #16
// Swap the two bytes inside each 16-bit lane.
rev16 v1.16b, v0.16b
// Sign-extend 16 -> 32 bits: low four lanes, then high four lanes.
sxtl v2.4s, v1.4h
sxtl2 v3.4s, v1.8h
// Signed fixed-point with 8 fractional bits -> float (value / 256).
scvtf v4.4s, v2.4s, #8
scvtf v5.4s, v3.4s, #8
// Store 8 floats (32 bytes).
st1 {v4.4s,v5.4s}, [x0], #32
b.ge 1b
2:
// Undo the bias of 8: w3 becomes the number of leftover elements (0..7 for a
// non-negative count). Nothing left -> return.
adds w3, w3, #8
b.eq 4f
3:
// Scalar tail, one element per iteration; the flags of this subs feed the
// b.gt below.
subs w3, w3, #1
// Zero-extending 16-bit load.
ldrh w4, [x1], #2
// Swap the bytes of the halfword, then sign-extend it to 32 bits.
rev16 w5, w4
sxth w6, w5
// Signed fixed-point with 8 fractional bits -> float (value / 256).
scvtf s0, w6, #8
str s0, [x0], #4
b.gt 3b
4:
ret
endfunc
......@@ -100,6 +100,9 @@ void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
/* NEON assembly routine: converts `count` floats from src to byte-swapped
 * 16-bit fixed-point values (8 fractional bits) in dst. */
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
/* NEON assembly routine: inverse of the pack routine -- converts `count`
 * byte-swapped 16-bit fixed-point values (8 fractional bits) from src to
 * floats in dst. */
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
......@@ -262,6 +265,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
pf->memcpy_aligned = x264_memcpy_aligned_neon;
pf->memzero_aligned = x264_memzero_aligned_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment