Commit 14a58532 authored by Janne Grunau, committed by Henrik Gramner
Browse files

arm: Add asm for mbtree fixed point conversion

7-8 times faster on a Cortex-A53 than the C versions built with gcc-5.3.

mbtree_fix8_pack_c: 44114
mbtree_fix8_pack_neon: 5805
mbtree_fix8_unpack_c: 38924
mbtree_fix8_unpack_neon: 4870
parent b6f189eb
......@@ -1880,3 +1880,60 @@ function x264_mbtree_propagate_list_internal_neon
bge 8b
bx lr
endfunc
@ void mbtree_fix8_pack( int16_t *dst, float *src, int count )
@
@ Converts `count` floats to signed 16-bit fixed point with 8 fractional
@ bits and stores each result with its two bytes swapped.
@   r0 = dst (16-bit output), r1 = src (float input), r2 = count
@   r3 = working element counter
@ NOTE(review): the C prototype declares the 16-bit buffer as uint16_t *,
@ while the conversions below are signed.
@ The vector loop uses :128 address qualifiers, so dst and src must be
@ 16-byte aligned whenever count >= 8.
@ NOTE(review): count is assumed non-negative; with a negative count the
@ tail loop body would still execute once - confirm against callers.
function x264_mbtree_fix8_pack_neon, export=1
@ r3 = count - 8; go straight to the tail when fewer than 8 elements.
subs r3, r2, #8
blt 2f
@ Vector loop: 8 elements per iteration.
1:
@ Decrement first: the flags set here are consumed by the bge at the
@ bottom of the loop (the NEON instructions in between leave them alone).
subs r3, r3, #8
@ Load 8 floats and advance src by 32 bytes.
vld1.32 {q0,q1}, [r1,:128]!
@ float -> signed 32-bit fixed point with 8 fractional bits.
vcvt.s32.f32 q0, q0, #8
vcvt.s32.f32 q1, q1, #8
@ Saturating narrow of both halves to signed 16-bit, packed into q2.
vqmovn.s32 d4, q0
vqmovn.s32 d5, q1
@ Swap the two bytes of every 16-bit element.
vrev16.8 q3, q2
@ Store 8 halfwords and advance dst by 16 bytes.
vst1.16 {q3}, [r0,:128]!
bge 1b
2:
@ Undo the bias: r3 = number of leftover elements (0..7); return if none.
adds r3, r3, #8
bxeq lr
@ Scalar tail: one element per iteration, no alignment requirement.
3:
subs r3, r3, #1
@ Load a single float into lane 0 of d0 (aliases s0).
vld1.32 {d0[0]}, [r1]!
vcvt.s32.f32 s0, s0, #8
vrev16.8 d0, d0
@ Store the low halfword of the converted value.
@ NOTE(review): there is no saturating narrow on this path, so a value
@ outside the 16-bit range is truncated rather than clamped, unlike the
@ vector loop above.
vst1.16 {d0[0]}, [r0]!
bgt 3b
bx lr
endfunc
@ void mbtree_fix8_unpack( float *dst, int16_t *src, int count )
@
@ Reads `count` byte-swapped signed 16-bit fixed-point values (8 fractional
@ bits) and writes them out as floats.
@   r0 = dst (float output), r1 = src (16-bit input), r2 = count
@   r3 = working element counter
@ NOTE(review): the C prototype declares the 16-bit buffer as uint16_t *,
@ while the widening below is signed.
@ The vector loop uses :128 address qualifiers, so dst and src must be
@ 16-byte aligned whenever count >= 8.
@ NOTE(review): count is assumed non-negative; with a negative count the
@ tail loop body would still execute once - confirm against callers.
function x264_mbtree_fix8_unpack_neon, export=1
@ r3 = count - 8; go straight to the tail when fewer than 8 elements.
subs r3, r2, #8
blt 2f
@ Vector loop: 8 elements per iteration.
1:
@ Decrement first: the flags set here are consumed by the bge at the
@ bottom of the loop (the NEON instructions in between leave them alone).
subs r3, r3, #8
@ Load 8 halfwords and advance src by 16 bytes.
vld1.16 {q0}, [r1,:128]!
@ Swap the two bytes of every 16-bit element; result lands in q1 (d2:d3).
vrev16.8 q1, q0
@ Sign-extend the low and high four halfwords to 32 bits.
vmovl.s16 q0, d2
vmovl.s16 q1, d3
@ Signed fixed point with 8 fractional bits -> float.
vcvt.f32.s32 q0, q0, #8
vcvt.f32.s32 q1, q1, #8
@ Store 8 floats and advance dst by 32 bytes.
vst1.32 {q0,q1}, [r0,:128]!
bge 1b
2:
@ Undo the bias: r3 = number of leftover elements (0..7); return if none.
adds r3, r3, #8
bxeq lr
@ Scalar tail: one element per iteration, no alignment requirement.
3:
subs r3, r3, #1
@ Load a single halfword into lane 0 of d0.
vld1.16 {d0[0]}, [r1]!
vrev16.8 d0, d0
@ Widen and convert the whole register; only lane 0 is stored below.
vmovl.s16 q0, d0
vcvt.f32.s32 d0, d0, #8
vst1.32 {d0[0]}, [r0]!
bgt 3b
bx lr
endfunc
......@@ -109,6 +109,9 @@ void integral_init8v_neon( uint16_t *, intptr_t );
/* Prototypes for the NEON mbtree routines.
 * x264_mbtree_fix8_pack_neon converts `count` floats from `src` into 16-bit
 * fixed-point values (8 fractional bits) in `dst`;
 * x264_mbtree_fix8_unpack_neon performs the reverse conversion. */
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
......@@ -291,6 +294,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment