Commit aec81efd authored by Janne Grunau, committed by Henrik Gramner

aarch64: Optimize various intra_predict asm functions

Make them at least as fast as the compiled C version (tested on
cortex-a53 vs. gcc 4.9.2).

                        C     NEON (before)   NEON (after)
intra_predict_4x4_dc:   260   335             260
intra_predict_4x4_dct:  210   265             200
intra_predict_8x8c_dc:  497   548             493
intra_predict_8x8c_v:   232   309             179 (arm64)
intra_predict_8x16c_dc: 795   830             790
parent b16268ac
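The first hunk below rewrites predict_4x4_dc, which fills the 4x4 block with the rounded average of its four top and four left neighbours. As a point of reference, a minimal C sketch of that computation (assuming x264's usual layout, where src points at the block inside the decoded-frame buffer and FDEC_STRIDE is the row stride; the function name is hypothetical):

    #include <stdint.h>

    /* DC prediction for a 4x4 block: average the 4 pixels above and the
     * 4 pixels to the left, with rounding, then fill the block. */
    static void predict_4x4_dc_sketch( uint8_t *src )
    {
        int sum = 4;  /* rounding term for the >> 3 below */
        for( int i = 0; i < 4; i++ )
            sum += src[i - FDEC_STRIDE]         /* top neighbours  */
                 + src[-1 + i * FDEC_STRIDE];   /* left neighbours */
        uint8_t dc = sum >> 3;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                src[x + y * FDEC_STRIDE] = dc;
    }

The new assembly gathers the four left pixels with scalar ldrb loads and folds them into one register, so only the top row still goes through the vector unit.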
@@ -90,40 +90,37 @@ endfunc

 function x264_predict_4x4_dc_neon, export=1
     sub         x1,  x0,  #FDEC_STRIDE
-    sub         x2,  x0,  #1
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x1]
-    ld1r       {v1.8b},  [x2], x7
-    ld1r       {v2.8b},  [x2], x7
-    ld1r       {v3.8b},  [x2], x7
-    ld1r       {v4.8b},  [x2], x7
-    uaddlp      v0.4h,  v0.8b
-    uaddl       v1.8h,  v1.8b,  v2.8b
-    uaddl       v2.8h,  v3.8b,  v4.8b
-    addp        v0.4h,  v0.4h,  v0.4h
-    add         v1.4h,  v1.4h,  v2.4h
+    ldrb        w4,  [x0, #-1 + 0 * FDEC_STRIDE]
+    ldrb        w5,  [x0, #-1 + 1 * FDEC_STRIDE]
+    ldrb        w6,  [x0, #-1 + 2 * FDEC_STRIDE]
+    ldrb        w7,  [x0, #-1 + 3 * FDEC_STRIDE]
+    add         w4,  w4,  w5
+    ldr         s0,  [x1]
+    add         w6,  w6,  w7
+    uaddlv      h0,  v0.8b
+    add         w4,  w4,  w6
     dup         v0.4h,  v0.h[0]
+    dup         v1.4h,  w4
     add         v0.4h,  v0.4h,  v1.4h
     rshrn       v0.8b,  v0.8h,  #3
-    str         s0,  [x0], #FDEC_STRIDE
-    str         s0,  [x0], #FDEC_STRIDE
-    str         s0,  [x0], #FDEC_STRIDE
     str         s0,  [x0]
+    str         s0,  [x0, #1 * FDEC_STRIDE]
+    str         s0,  [x0, #2 * FDEC_STRIDE]
+    str         s0,  [x0, #3 * FDEC_STRIDE]
     ret
 endfunc

 function x264_predict_4x4_dc_top_neon, export=1
     sub         x1,  x0,  #FDEC_STRIDE
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x1]
-    uaddlp      v0.4h,  v0.8b
-    addp        v0.4h,  v0.4h,  v0.4h
+    ldr         s0,  [x1]
+    uaddlv      h0,  v0.8b
     dup         v0.4h,  v0.h[0]
     rshrn       v0.8b,  v0.8h,  #2
-    str         s0,  [x0], #FDEC_STRIDE
-    str         s0,  [x0], #FDEC_STRIDE
-    str         s0,  [x0], #FDEC_STRIDE
     str         s0,  [x0]
+    str         s0,  [x0, #1 * FDEC_STRIDE]
+    str         s0,  [x0, #2 * FDEC_STRIDE]
+    str         s0,  [x0, #3 * FDEC_STRIDE]
     ret
 endfunc
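Both 4x4 functions drop the old uaddlp + addp reduction pair in favour of a single across-vector uaddlv, and load the row above with a 32-bit ldr so the unused upper lanes are zero. A hedged intrinsics sketch of that reduction trick (vaddlv_u8 is the ACLE counterpart of uaddlv; the helper name is hypothetical):

    #include <arm_neon.h>
    #include <stdint.h>
    #include <string.h>

    /* Sum 4 bytes with one across-vector add: loading them into the low
     * 32 bits of a D register leaves the other 4 bytes zero, so summing
     * all 8 lanes still yields just the 4-pixel sum. */
    static inline unsigned top4_sum( const uint8_t *top )
    {
        uint32_t bits;
        memcpy( &bits, top, 4 );            /* like "ldr s0, [x1]"     */
        uint8x8_t v = vcreate_u8( bits );   /* upper half is zero      */
        return vaddlv_u8( v );              /* like "uaddlv h0, v0.8b" */
    }

The immediate-offset str stores at the end also remove the serial dependency that the old post-incremented [x0], #FDEC_STRIDE form created between consecutive stores.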
@@ -456,30 +453,48 @@ function x264_predict_8x8c_dc_left_neon, export=1
 endfunc

 function x264_predict_8x8c_dc_neon, export=1
-    sub         x2,  x0,  #FDEC_STRIDE
-    sub         x3,  x0,  #1
     mov         x1,  #FDEC_STRIDE
-    ld1        {v2.8b}, [x2]
-    ldcol.8     v3,  x3,  x1
-    transpose   v0.2s,  v1.2s,  v2.2s,  v3.2s
-    uaddlp      v0.4h,  v0.8b            // s0, s2
-    uaddlp      v1.4h,  v1.8b            // s1, s3
-    addp        v0.4h,  v0.4h,  v1.4h    // s0, s2, s1, s3
-    addp        v1.4h,  v0.4h,  v0.4h
-    rshrn       v2.8b,  v0.8h,  #2
+    sub         x2,  x0,  #FDEC_STRIDE
+    ldrb        w10, [x0, #0 * FDEC_STRIDE - 1]
+    ldrb        w11, [x0, #1 * FDEC_STRIDE - 1]
+    ldrb        w12, [x0, #2 * FDEC_STRIDE - 1]
+    ldrb        w13, [x0, #3 * FDEC_STRIDE - 1]
+    add         w10, w10, w11
+    ldrb        w4,  [x0, #4 * FDEC_STRIDE - 1]
+    ldrb        w5,  [x0, #5 * FDEC_STRIDE - 1]
+    add         w12, w12, w13
+    ldrb        w6,  [x0, #6 * FDEC_STRIDE - 1]
+    ldrb        w7,  [x0, #7 * FDEC_STRIDE - 1]
+    add         w4,  w4,  w5
+    add         w6,  w6,  w7
+    add         w10, w10, w12, lsl #16
+    add         w4,  w4,  w6,  lsl #16
+    ld1        {v0.8b}, [x2]
+    add         x10, x10, x4,  lsl #32
+    uaddlp      v0.4h,  v0.8b            // s0, s1
+    mov         v1.d[0], x10             // s2, s3
+    add         v3.4h,  v0.4h,  v1.4h
+    addp        v0.4h,  v0.4h,  v1.4h    // s0, s1, s2, s3
+    addp        v1.4h,  v3.4h,  v3.4h    // s0+s2, s1+s3, s0+s2, s1+s3
+    uzp2        v0.4h,  v0.4h,  v0.4h    // s1, s3, s1, s3
+    uzp1        v1.2d,  v1.2d,  v1.2d
+    uzp1        v0.2d,  v0.2d,  v0.2d
     rshrn       v3.8b,  v1.8h,  #3
-    dup         v5.8b,  v2.b[2]          // dc1
-    dup         v6.8b,  v3.b[1]          // dc2
-    dup         v4.8b,  v3.b[0]          // dc0
-    dup         v7.8b,  v2.b[3]          // dc3
-    trn1        v0.2s,  v4.2s,  v5.2s
-    trn1        v1.2s,  v7.2s,  v6.2s
+    rshrn       v2.8b,  v0.8h,  #2
+    uzp1        v0.8b,  v3.8b,  v2.8b
+    uzp2        v1.8b,  v2.8b,  v3.8b
 pred8x8c_dc_end:
-    add         x2,  x0,  x1,  lsl #2
-.rept 4
+    add         x2,  x0,  #2 * FDEC_STRIDE
+    add         x4,  x0,  #4 * FDEC_STRIDE
+    add         x5,  x0,  #6 * FDEC_STRIDE
     st1        {v0.8b}, [x0], x1
-    st1        {v1.8b}, [x2], x1
-.endr
+    st1        {v0.8b}, [x2], x1
+    st1        {v0.8b}, [x0]
+    st1        {v0.8b}, [x2]
+    st1        {v1.8b}, [x4], x1
+    st1        {v1.8b}, [x5], x1
+    st1        {v1.8b}, [x4]
+    st1        {v1.8b}, [x5]
     ret
 endfunc
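For reference, 8x8 chroma DC prediction derives four per-quadrant DC values from the partial sums named in the comments above (s0/s1 are the sums of the left and right halves of the row above, s2/s3 the sums of the upper and lower halves of the left column). A sketch of the values, following the H.264 rule that each 4x4 quadrant averages whichever neighbour run it borders:

    /* s0,s1 = sums of top[0..3], top[4..7];
     * s2,s3 = sums of left[0..3], left[4..7] (per the NEON comments). */
    int dc_tl = ( s0 + s2 + 4 ) >> 3;  /* top-left: top and left runs      */
    int dc_tr = ( s1 + 2 ) >> 2;       /* top-right: its top pixels only   */
    int dc_bl = ( s3 + 2 ) >> 2;       /* bottom-left: its left pixels only */
    int dc_br = ( s1 + s3 + 4 ) >> 3;  /* bottom-right: both partial sums  */

The paired rshrn #2 / #3 above produces exactly these (s+2)>>2 and (s+4)>>3 roundings in bulk, and the uzp1/uzp2 shuffles assemble them into the two 8-byte row patterns: v0 for the top four rows, v1 for the bottom four.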
@@ -495,12 +510,10 @@ function x264_predict_8x8c_h_neon, export=1
     ret
 endfunc

-function x264_predict_8x8c_v_neon, export=1
-    sub         x0,  x0,  #FDEC_STRIDE
-    mov         x7,  #FDEC_STRIDE
-    ld1        {v0.8b},  [x0], x7
-.rept 8
-    st1        {v0.8b},  [x0], x7
+function x264_predict_8x8c_v_aarch64, export=1
+    ldr         x1,  [x0, #-FDEC_STRIDE]
+.irp c, 0,1,2,3,4,5,6,7
+    str         x1,  [x0, #\c * FDEC_STRIDE]
 .endr
     ret
 endfunc
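The new vertical predictor needs no NEON at all, which is why it gets an _aarch64 suffix and, in predict.c below, is installed under the ARMv8 baseline flag rather than the NEON one: the 8 bytes above the block fit in a single general-purpose register. A C sketch of the same idea (hypothetical helper name):

    #include <stdint.h>
    #include <string.h>

    /* Vertical prediction: replicate the 8-byte row above the block into
     * all 8 rows; one 64-bit load plus eight 64-bit stores. */
    static void predict_8x8c_v_sketch( uint8_t *src )
    {
        uint64_t top;
        memcpy( &top, src - FDEC_STRIDE, 8 );
        for( int y = 0; y < 8; y++ )
            memcpy( src + y * FDEC_STRIDE, &top, 8 );
    }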
@@ -661,20 +674,20 @@ function x264_predict_8x16c_p_neon, export=1
 endfunc

 function x264_predict_8x16c_dc_neon, export=1
-    sub         x3,  x0,  #FDEC_STRIDE
     mov         x1,  #FDEC_STRIDE
-    ld1        {v6.8b}, [x3]
+    sub         x10, x0,  #FDEC_STRIDE
     loadsum4    w2, w3, w4, w5, x0, 0
+    ld1        {v6.8b}, [x10]
+    loadsum4    w6, w7, w8, w9, x0, 4
     uaddlp      v6.4h,  v6.8b
     dup         v22.8h, w2               // s2
-    loadsum4    w6, w7, w8, w9, x0, 4
-    addp        v6.4h,  v6.4h,  v6.4h    // s0, s1
     dup         v23.8h, w6               // s3
     loadsum4    w2, w3, w4, w5, x0, 8
-    dup         v20.8h, v6.h[0]          // s0
-    dup         v24.8h, w2               // s4
+    addp        v6.4h,  v6.4h,  v6.4h    // s0, s1
     loadsum4    w6, w7, w8, w9, x0, 12
+    dup         v20.8h, v6.h[0]          // s0
     dup         v21.8h, v6.h[1]          // s1
+    dup         v24.8h, w2               // s4
     dup         v25.8h, w6               // s5

     ext         v16.16b, v20.16b, v21.16b, #8
@@ -692,10 +705,15 @@ function x264_predict_8x16c_dc_neon, export=1
     rshrn       v1.8b,  v1.8h,  #3
     rshrn       v2.8b,  v2.8h,  #3
     rshrn       v3.8b,  v3.8h,  #3
-.irp idx, 0, 1, 2, 3
+    add         x11, x0, #4 * FDEC_STRIDE
+    add         x12, x0, #8 * FDEC_STRIDE
+    add         x13, x0, #12 * FDEC_STRIDE
 .rept 4
-    st1        {v\idx\().8b}, [x0], x1
-.endr
+    st1        {v0.8b}, [x0],  x1
+    st1        {v1.8b}, [x11], x1
+    st1        {v2.8b}, [x12], x1
+    st1        {v3.8b}, [x13], x1
 .endr
     ret
 endfunc
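Rather than expanding .irp idx over the four result registers and streaming all 16 rows through one pointer, the new store sequence keeps four row-group pointers live and interleaves them, so consecutive stores no longer wait on the same post-incremented address register. The equivalent pointer walk in C, as a sketch (dc_row[] standing in for the fill patterns held in v0-v3):

    /* dc_row[i]: the 8-byte fill pattern for rows 4*i .. 4*i+3. */
    uint8_t *p0 = src,
            *p1 = src +  4 * FDEC_STRIDE,
            *p2 = src +  8 * FDEC_STRIDE,
            *p3 = src + 12 * FDEC_STRIDE;
    for( int i = 0; i < 4; i++ )
    {
        memcpy( p0, dc_row[0], 8 ); p0 += FDEC_STRIDE;
        memcpy( p1, dc_row[1], 8 ); p1 += FDEC_STRIDE;
        memcpy( p2, dc_row[2], 8 ); p2 += FDEC_STRIDE;
        memcpy( p3, dc_row[3], 8 ); p3 += FDEC_STRIDE;
    }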
...
@@ -72,15 +72,18 @@ void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )

 void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
 {
+#if !HIGH_BIT_DEPTH
+    if (cpu&X264_CPU_ARMV8) {
+        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64;
+    }
+
     if (!(cpu&X264_CPU_NEON))
         return;

-#if !HIGH_BIT_DEPTH
     pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
     pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
     pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
-    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
     pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
 #endif // !HIGH_BIT_DEPTH
 }
...
@@ -29,10 +29,12 @@

 void x264_predict_4x4_h_aarch64( uint8_t *src );
 void x264_predict_4x4_v_aarch64( uint8_t *src );
+void x264_predict_8x8c_v_aarch64( uint8_t *src );

 // for the merged 4x4 intra sad/satd which expects unified suffix
 #define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
 #define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
+#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64

 void x264_predict_4x4_dc_neon( uint8_t *src );
 void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
@@ -40,7 +42,6 @@ void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8c_dc_neon( uint8_t *src );
 void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x8c_v_neon( uint8_t *src );
 void x264_predict_8x16c_v_neon( uint8_t *src );
 void x264_predict_8x16c_h_neon( uint8_t *src );
 void x264_predict_8x16c_dc_neon( uint8_t *src );
...