Commit 8d655b63 authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: x264_mbtree_propagate_{cost,list}_neon

x264_mbtree_propagate_cost_neon is ~7 times faster.
x264_mbtree_propagate_list_neon is 33% faster.
parent 4d400a6e
@@ -1484,3 +1484,120 @@ function integral_init8v_neon, export=1
2:
ret
endfunc
function x264_mbtree_propagate_cost_neon, export=1
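// Register arguments, following the C prototype added later in this diff
// (parameter names are assumed here; the prototype itself is unnamed):
// x0 = dst, x1 = propagate_in, x2 = intra_costs, x3 = inter_costs,
// x4 = inv_qscales, x5 = &fps_factor, w6 = len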
ld1r {v5.4s}, [x5]
8:
subs w6, w6, #8
ld1 {v1.8h}, [x1], #16
ld1 {v2.8h}, [x2], #16
ld1 {v3.8h}, [x3], #16
ld1 {v4.8h}, [x4], #16
bic v3.8h, #0xc0, lsl #8
umin v3.8h, v2.8h, v3.8h
umull v20.4s, v2.4h, v4.4h // propagate_intra
umull2 v21.4s, v2.8h, v4.8h // propagate_intra
usubl v22.4s, v2.4h, v3.4h // propagate_num
usubl2 v23.4s, v2.8h, v3.8h // propagate_num
uxtl v26.4s, v2.4h // propagate_denom
uxtl2 v27.4s, v2.8h // propagate_denom
uxtl v24.4s, v1.4h
uxtl2 v25.4s, v1.8h
ucvtf v20.4s, v20.4s
ucvtf v21.4s, v21.4s
ucvtf v26.4s, v26.4s
ucvtf v27.4s, v27.4s
ucvtf v22.4s, v22.4s
ucvtf v23.4s, v23.4s
frecpe v28.4s, v26.4s
frecpe v29.4s, v27.4s
ucvtf v24.4s, v24.4s
ucvtf v25.4s, v25.4s
frecps v30.4s, v28.4s, v26.4s
frecps v31.4s, v29.4s, v27.4s
fmla v24.4s, v20.4s, v5.4s // propagate_amount
fmla v25.4s, v21.4s, v5.4s // propagate_amount
fmul v28.4s, v28.4s, v30.4s
fmul v29.4s, v29.4s, v31.4s
fmul v16.4s, v24.4s, v22.4s
fmul v17.4s, v25.4s, v23.4s
fmul v18.4s, v16.4s, v28.4s
fmul v19.4s, v17.4s, v29.4s
fcvtns v20.4s, v18.4s
fcvtns v21.4s, v19.4s
sqxtn v0.4h, v20.4s
sqxtn2 v0.8h, v21.4s
st1 {v0.8h}, [x0], #16
b.ge 8b
ret
endfunc
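For orientation, here is a rough scalar sketch of the per-element computation the NEON loop above vectorizes, reconstructed from the register comments (propagate_intra, propagate_num, propagate_denom, propagate_amount). The frecpe/frecps pair approximates the division, and fcvtns/sqxtn round and saturate the result to int16_t. Parameter names and the 0x3fff cost mask are assumptions carried over from x264's C reference, not part of this diff.

#include <stdint.h>

/* Scalar sketch only (assumed names); not the actual x264 C implementation. */
static void mbtree_propagate_cost_scalar( int16_t *dst, uint16_t *propagate_in,
                                          uint16_t *intra_costs, uint16_t *inter_costs,
                                          uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor;                       /* ld1r {v5.4s}, [x5]             */
    for( int i = 0; i < len; i++ )
    {
        int intra_cost = intra_costs[i];
        int inter_cost = inter_costs[i] & 0x3fff;  /* bic v3.8h, #0xc0, lsl #8       */
        if( inter_cost > intra_cost )              /* umin v3.8h, v2.8h, v3.8h       */
            inter_cost = intra_cost;
        float propagate_intra  = intra_cost * (float)inv_qscales[i];      /* umull   */
        float propagate_amount = propagate_in[i] + propagate_intra * fps; /* fmla    */
        float propagate_num    = intra_cost - inter_cost;                 /* usubl   */
        float propagate_denom  = intra_cost;                              /* uxtl    */
        float v = propagate_amount * propagate_num / propagate_denom;     /* frecpe/frecps + fmul */
        if( v > 32767.0f )                         /* sqxtn saturates to int16_t     */
            v = 32767.0f;
        dst[i] = (int16_t)( v + 0.5f );            /* fcvtns rounds to nearest       */
    }
}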
const pw_0to15, align=5
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
endconst
function x264_mbtree_propagate_list_internal_neon, export=1
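// Register arguments (matching the C declaration of
// x264_mbtree_propagate_list_internal_neon added later in this diff):
// x0 = mvs, x1 = propagate_amount, x2 = lowres_costs, x3 = output (scratch),
// w4 = bipred_weight, w5 = mb_y, w6 = len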
movrel x11, pw_0to15
dup v31.8h, w4 // bipred_weight
movi v30.8h, #0xc0, lsl #8
ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
movi v28.4s, #4//, lsl #16
movi v27.8h, #31
movi v26.8h, #32
dup v24.8h, w5 // mb_y
zip1 v29.8h, v29.8h, v24.8h
8:
subs w6, w6, #8
ld1 {v1.8h}, [x1], #16 // propagate_amount
ld1 {v2.8h}, [x2], #16 // lowres_cost
and v2.16b, v2.16b, v30.16b
cmeq v25.8h, v2.8h, v30.8h
umull v16.4s, v1.4h, v31.4h
umull2 v17.4s, v1.8h, v31.8h
rshrn v16.4h, v16.4s, #6
rshrn2 v16.8h, v17.4s, #6
bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
// propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
ld1 {v4.8h,v5.8h}, [x0], #32
sshr v6.8h, v4.8h, #5
sshr v7.8h, v5.8h, #5
add v6.8h, v6.8h, v29.8h
add v29.8h, v29.8h, v28.8h
add v7.8h, v7.8h, v29.8h
add v29.8h, v29.8h, v28.8h
st1 {v6.8h,v7.8h}, [x3], #32
and v4.16b, v4.16b, v27.16b
and v5.16b, v5.16b, v27.16b
uzp1 v6.8h, v4.8h, v5.8h // x & 31
uzp2 v7.8h, v4.8h, v5.8h // y & 31
sub v4.8h, v26.8h, v6.8h // 32 - (x & 31)
sub v5.8h, v26.8h, v7.8h // 32 - (y & 31)
mul v19.8h, v6.8h, v7.8h // idx3weight = y*x;
mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x);
mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x;
mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ;
umull v6.4s, v19.4h, v25.4h
umull2 v7.4s, v19.8h, v25.8h
umull v4.4s, v18.4h, v25.4h
umull2 v5.4s, v18.8h, v25.8h
umull v2.4s, v17.4h, v25.4h
umull2 v3.4s, v17.8h, v25.8h
umull v0.4s, v16.4h, v25.4h
umull2 v1.4s, v16.8h, v25.8h
rshrn v19.4h, v6.4s, #10
rshrn2 v19.8h, v7.4s, #10
rshrn v18.4h, v4.4s, #10
rshrn2 v18.8h, v5.4s, #10
rshrn v17.4h, v2.4s, #10
rshrn2 v17.8h, v3.4s, #10
rshrn v16.4h, v0.4s, #10
rshrn2 v16.8h, v1.4s, #10
zip1 v0.8h, v16.8h, v17.8h
zip2 v1.8h, v16.8h, v17.8h
zip1 v2.8h, v18.8h, v19.8h
zip2 v3.8h, v18.8h, v19.8h
st1 {v0.8h,v1.8h}, [x3], #32
st1 {v2.8h,v3.8h}, [x3], #32
b.ge 8b
ret
endfunc
@@ -96,6 +96,8 @@ void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
void integral_init8v_neon( uint16_t *, intptr_t );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -201,6 +203,89 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int height, int16_t *buf );
#endif // !HIGH_BIT_DEPTH
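To make the indexing in the wrapper below easier to follow, here is a hypothetical view of one 8-macroblock chunk of the scratch buffer written by x264_mbtree_propagate_list_internal_neon, inferred from the three st1 stores in the assembly above and from the current[0..1], current[16..17] and current[32..33] accesses below. The struct and field names are illustrative only, not real x264 types.

#include <stdint.h>

/* One 8-macroblock chunk of h->scratch_buffer2: 48 int16_t per chunk,
 * which is why the wrapper below advances `current` by 2 per block plus
 * 32 per chunk. */
typedef struct
{
    int16_t mb_xy[8][2];     /* (mb_x, mb_y) per block:            st1 {v6.8h,v7.8h} */
    int16_t weight01[8][2];  /* (idx0weight, idx1weight) per block: st1 {v0.8h,v1.8h} */
    int16_t weight23[8][2];  /* (idx2weight, idx3weight) per block: st1 {v2.8h,v3.8h} */
} mbtree_list_chunk;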
#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
#define CLIP_ADD2(s,x)\
do\
{\
    CLIP_ADD((s)[0], (x)[0]);\
    CLIP_ADD((s)[1], (x)[1]);\
} while(0)

void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
                                               int16_t *propagate_amount,
                                               uint16_t *lowres_costs,
                                               int16_t *output,
                                               int bipred_weight, int mb_y,
                                               int len );

static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
                                             int16_t (*mvs)[2],
                                             int16_t *propagate_amount,
                                             uint16_t *lowres_costs,
                                             int bipred_weight, int mb_y,
                                             int len, int list )
{
    int16_t *current = h->scratch_buffer2;

    x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
                                              lowres_costs, current,
                                              bipred_weight, mb_y, len );

    unsigned stride = h->mb.i_mb_stride;
    unsigned width = h->mb.i_mb_width;
    unsigned height = h->mb.i_mb_height;

    for( unsigned i = 0; i < len; current += 32 )
    {
        int end = X264_MIN( i+8, len );
        for( ; i < end; i++, current += 2 )
        {
            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
                continue;

            unsigned mbx = current[0];
            unsigned mby = current[1];
            unsigned idx0 = mbx + mby * stride;
            unsigned idx2 = idx0 + stride;

            /* Shortcut for the simple/common case of zero MV */
            if( !M32( mvs[i] ) )
            {
                CLIP_ADD( ref_costs[idx0], current[16] );
                continue;
            }

            if( mbx < width-1 && mby < height-1 )
            {
                CLIP_ADD2( ref_costs+idx0, current+16 );
                CLIP_ADD2( ref_costs+idx2, current+32 );
            }
            else
            {
                /* Note: this takes advantage of unsigned representation to
                 * catch negative mbx/mby. */
                if( mby < height )
                {
                    if( mbx < width )
                        CLIP_ADD( ref_costs[idx0+0], current[16] );
                    if( mbx+1 < width )
                        CLIP_ADD( ref_costs[idx0+1], current[17] );
                }
                if( mby+1 < height )
                {
                    if( mbx < width )
                        CLIP_ADD( ref_costs[idx2+0], current[32] );
                    if( mbx+1 < width )
                        CLIP_ADD( ref_costs[idx2+1], current[33] );
                }
            }
        }
    }
}
#undef CLIP_ADD
#undef CLIP_ADD2
void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
@@ -252,5 +337,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = integral_init8h_neon;
pf->integral_init4v = integral_init4v_neon;
pf->integral_init8v = integral_init8v_neon;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
#endif // !HIGH_BIT_DEPTH
}