Commit e2696a60, authored by Martin Storsjö and committed by Henrik Gramner

arm: Implement some neon 8x16c intra predict functions

checkasm timing       Cortex-A7      A8     A9
intra_predict_8x16c_dct_c    862     540    590
intra_predict_8x16c_dct_neon 608     511    657
intra_predict_8x16c_h_c      972     707    719
intra_predict_8x16c_h_neon   722     656    672
intra_predict_8x16c_p_c      10183   9819   8655
intra_predict_8x16c_p_neon   2622    1972   1983
parent 5db8b6b9
......@@ -5,6 +5,7 @@
*
* Authors: David Conrad <lessen42@gmail.com>
* Mans Rullgard <mans@mansr.com>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -48,6 +49,26 @@ p16weight: .short 1,2,3,4,5,6,7,8
.endif
.endm
@ ldcol.16 rd1, rd2, rs, rt, ru
@   Gathers 16 bytes spaced rt bytes apart, starting at rs:
@     rd1 lanes 0-7 <- the first eight bytes  (read through rs)
@     rd2 lanes 0-7 <- the next eight bytes   (read through ru = rs + 8 * rt)
@   The two streams are interleaved lane by lane. ru is overwritten, and
@   both rs and ru end up 8 * rt bytes past where they started.
.macro ldcol.16 rd1, rd2, rs, rt, ru
    add             \ru, \rs, \rt, lsl #3
.irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    vld1.8          {\rd1[\lane]}, [\rs], \rt
    vld1.8          {\rd2[\lane]}, [\ru], \rt
.endr
.endm
.macro add16x8 dq, dl, dh, rl, rh
vaddl.u8 \dq, \rl, \rh
vadd.u16 \dl, \dl, \dh
......@@ -552,6 +573,101 @@ function x264_predict_8x8c_p_neon
endfunc
@ void x264_predict_8x16c_dc_top_neon( uint8_t *src )
@ DC prediction from the top edge only. The 8 bytes located FDEC_STRIDE
@ before src are split into two groups of four; each group is averaged with
@ rounding, and every one of the 16 output rows becomes
@ [avg(0-3) x4, avg(4-7) x4]. Output rows are FDEC_STRIDE bytes apart.
function x264_predict_8x16c_dc_top_neon
    mov             ip,  #FDEC_STRIDE
    sub             r3,  r0,  #FDEC_STRIDE
    vld1.8          {d0}, [r3,:64]              @ the 8 bytes above the block
    vpaddl.u8       d0,  d0                     @ sums of adjacent byte pairs
    vpadd.u16       d0,  d0,  d0                @ lane 0 = sum(0-3), lane 1 = sum(4-7)
    vrshrn.u16      d0,  q0,  #2                @ byte n = (lane n + 2) >> 2
    vdup.8          d1,  d0[1]                  @ d1 = avg(4-7) in every byte
    vdup.8          d0,  d0[0]                  @ d0 = avg(0-3) in every byte
    vtrn.32         d0,  d1                     @ d0 = d1 = [avg(0-3) x4, avg(4-7) x4]

    @ First pass: r0 writes rows 0-3 while r3 writes rows 4-7.
    add             r3,  r0,  ip,  lsl #2
.rept 4
    vst1.8          {d0}, [r0,:64], ip
    vst1.8          {d1}, [r3,:64], ip
.endr
    @ Skip the four rows the other pointer just wrote, then repeat for
    @ rows 8-11 (r0) and 12-15 (r3).
    add             r3,  r3,  ip,  lsl #2
    add             r0,  r0,  ip,  lsl #2
.rept 4
    vst1.8          {d0}, [r0,:64], ip
    vst1.8          {d1}, [r3,:64], ip
.endr
    bx              lr
endfunc
@ void x264_predict_8x16c_h_neon( uint8_t *src )
@ Horizontal prediction: each of the 16 rows (FDEC_STRIDE bytes apart) is
@ filled with 8 copies of the byte stored just before that row, at src[-1].
function x264_predict_8x16c_h_neon
    mov             r2,  #FDEC_STRIDE
    sub             r3,  r0,  #1                @ r3 walks down the left column
    @ Two rows per repetition: both loads are issued before the two stores.
.rept 8
    vld1.8          {d0[]}, [r3], r2            @ broadcast left byte to all lanes
    vld1.8          {d1[]}, [r3], r2
    vst1.64         {d0}, [r0,:64], r2
    vst1.64         {d1}, [r0,:64], r2
.endr
    bx              lr
endfunc
@ void x264_predict_8x16c_p_neon( uint8_t *src )
@ Plane prediction for a block 8 bytes wide and 16 rows tall at src, with
@ rows FDEC_STRIDE bytes apart. In the notation below top[x] is the byte at
@ src[x - FDEC_STRIDE] and left[y] is the byte at src[-1 + y*FDEC_STRIDE].
@   H   = sum_{i=0..3} (i+1) * (top[4+i]  - top[2-i])
@   V   = sum_{i=0..7} (i+1) * (left[8+i] - left[6-i])
@   b   = (17*H + 16) >> 5
@   c   = ( 5*V + 32) >> 6
@   i00 = 16*(top[7] + left[15]) + 16 - 3*b - 7*c
@   src[y][x] = saturate_u8( (i00 + x*b + y*c) >> 5 )
@ NOTE(review): ldcol.8 is defined outside this excerpt. The left[] lane
@ comments assume "ldcol.8 rd, rs, rt" loads one byte from each of 8
@ consecutive rows at rs into lanes 0-7 of rd and leaves rs 8 rows further
@ down - confirm against the macro.
@ Registers written here: r0-r3, q0-q3, q8 (plus whatever ldcol.8 touches).
function x264_predict_8x16c_p_neon
sub r3, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
add r2, r3, #4
sub r3, r3, #1
vld1.32 {d0[0]}, [r3] @ d0 bytes 0-3 = top[-1..2]
vld1.32 {d2[0]}, [r2,:32], r1 @ d2 bytes 0-3 = top[4..7]
ldcol.8 d1, r3, r1 @ d1 = left[-1..6] (see NOTE above)
add r3, r3, r1 @ step over one more row (left[7] is not used)
ldcol.8 d3, r3, r1 @ d3 = left[8..15] (see NOTE above)
vrev64.32 d16, d3 @ swap the 32-bit halves: bytes 0-3 = left[12..15]
vaddl.u8 q8, d2, d16 @ d16 lane 3 (16-bit) = top[7] + left[15]
vrev32.8 d0, d0 @ d0 bytes 0-3 = top[2], top[1], top[0], top[-1]
vsubl.u8 q2, d2, d0 @ d4 lane i = top[4+i] - top[2-i]; d5 is not meaningful
vrev64.8 d1, d1 @ d1 = left[6], left[5], ..., left[-1]
vsubl.u8 q3, d3, d1 @ q3 lane i = left[8+i] - left[6-i]
movrel r3, p16weight
vld1.16 {q0}, [r3,:128] @ q0 = 1,2,3,4,5,6,7,8 (the p16weight table)
vmul.s16 d4, d4, d0 @ weight the 4 horizontal differences by 1..4
vmul.s16 q3, q3, q0 @ weight the 8 vertical differences by 1..8
vpadd.i16 d4, d4, d5
vpadd.i16 d6, d6, d7
vpaddl.s16 d4, d4 @ d4[0] = H (32-bit; lane 1 is not meaningful)
vpaddl.s16 d6, d6
vpadd.s32 d6, d6 @ d6[0] = V (32-bit)
vshl.i32 d5, d4, #4 @ d5[0] = 16*H
vadd.s32 d4, d4, d5 @ d4[0] = 17*H
vshl.i32 d7, d6, #2 @ d7[0] = 4*V
vrshrn.s32 d4, q2, #5 @ d4[0] = b = (17*H + 16) >> 5, narrowed to 16 bits
vadd.s32 d6, d6, d7 @ d6[0] = 5*V
vrshrn.s32 d6, q3, #6 @ d6[0] = c = (5*V + 32) >> 6, narrowed to 16 bits
mov r3, #0 @ zero used below to clear lane 0 of the x weights
vshl.i16 d3, d4, #2 @ d3[0] = 4 * b
vsub.i16 d3, d3, d4 @ d3[0] = 3 * b
vshl.i16 d2, d6, #3 @ d2[0] = 8 * c
vadd.i16 d3, d3, d2 @ d3[0] = 3 * b + 8 * c
vsub.i16 d3, d3, d6 @ d3[0] = 3 * b + 7 * c
vrev64.16 d16, d16 @ move top[7] + left[15] from lane 3 to lane 0
vadd.i16 d16, d16, d0 @ d16[0] = top[7] + left[15] + 1 (d0[0] is the weight 1)
vshl.i16 d2, d16, #4 @ d2[0] = 16*(top[7] + left[15]) + 16 = a + 16
vsub.i16 d2, d2, d3 @ d2[0] = i00 = a + 16 - 3*b - 7*c
vext.16 q0, q0, q0, #7 @ rotate the weights: q0 = 8,1,2,3,4,5,6,7
vmov.16 d0[0], r3 @ q0 = 0,1,2,3,4,5,6,7
vmul.i16 q0, q0, d4[0] @ q0 lane x = x * b
vdup.16 q1, d2[0] @ q1 = i00 in every lane
vdup.16 q3, d6[0] @ q3 = c in every lane
vadd.i16 q1, q1, q0 @ q1 lane x = i00 + x*b (row 0, before the shift)
mov r3, #16 @ row counter
1:
vqshrun.s16 d0, q1, #5 @ 8 output bytes = saturate_u8( q1 >> 5 )
vadd.i16 q1, q1, q3 @ advance to the next row: add c to every lane
vst1.8 {d0}, [r0,:64], r1 @ store the row and step r0 down by FDEC_STRIDE
subs r3, r3, #1
bne 1b
bx lr
endfunc
function x264_predict_16x16_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
......
......@@ -61,6 +61,19 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
#endif // !HIGH_BIT_DEPTH
}
/* Install the NEON 8x16 chroma intra predictors into pf[].
 * pf is left untouched when cpu lacks X264_CPU_NEON or when the build has
 * HIGH_BIT_DEPTH enabled. */
void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* Only DC_TOP, H and P are overridden: the other functions weren't
         * faster than C (gcc 4.7.3) on Cortex A8 and A9. */
        pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_H]      = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_P]      = x264_predict_8x16c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
if (!(cpu&X264_CPU_NEON))
......
......@@ -40,6 +40,10 @@ void x264_predict_8x8c_h_neon( uint8_t *src );
void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );
/* NEON 8x16 chroma intra predictors; each writes the predicted block at src. */
void x264_predict_8x16c_h_neon( uint8_t *src );
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
void x264_predict_8x16c_p_neon( uint8_t *src );
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
......@@ -60,6 +64,7 @@ void x264_predict_16x16_p_neon( uint8_t *src );
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] );
void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
#endif
......@@ -977,6 +977,10 @@ void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
x264_predict_8x16c_init_mmx( cpu, pf );
#endif
#if HAVE_ARMV6
x264_predict_8x16c_init_arm( cpu, pf );
#endif
#if ARCH_AARCH64
x264_predict_8x16c_init_aarch64( cpu, pf );
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment