Commit f2e439d1, authored by Janne Grunau and committed by Anton Mitrofanov

aarch64: NEON asm for intra chroma deblocking

deblock_h_chroma_420_intra, deblock_h_chroma_422_intra and
x264_deblock_h_chroma_intra_mbaff_neon are each ~3 times faster than C.
deblock_chroma_intra[1] is ~4 times faster than C.
parent ce6c94c0
......@@ -275,6 +275,173 @@ function x264_deblock_h_chroma_neon, export=1
ret
endfunc
// Shared prologue of the intra chroma deblocking functions.
//   In:       w2 = alpha, w3 = beta
//   Out:      v30.16b = alpha in every byte lane
//             v31.16b = beta  in every byte lane
//   Clobbers: w4
// When alpha and beta are both zero the macro executes `ret`, i.e. it
// returns from the function that expanded it before any pixel is touched.
// cbnz tests (alpha | beta) without writing NZCV; nothing after the macro
// reads the flags, so the separate cmp is not needed.
.macro h264_loop_filter_start_intra
    orr             w4,  w2,  w3                // w4 = alpha | beta
    cbnz            w4,  1f                     // at least one threshold set
    ret
1:
    dup             v30.16b, w2                 // alpha
    dup             v31.16b, w3                 // beta
.endm
// Intra chroma edge filter on byte lanes.
//   In:       v16 = p0, v17 = q0, v18 = p1, v19 = q1
//             v30 = alpha splat, v31 = beta splat
//   Out:      v16 = p0', v17 = q0' in lanes that pass the threshold test,
//             unchanged elsewhere
//   Clobbers: v4, v6, v20, v22, v24-v28 (plus v5, v7, v21, v23 for width=16)
//   width:    16 filters all byte lanes; any other value computes the new
//             pixels for the low 8 lanes only.
//
//   p0' = (2*p1 + p0 + q1 + 2) >> 2
//   q0' = (2*q1 + q0 + p1 + 2) >> 2
.macro h264_loop_filter_chroma_intra, width=16
    // v26 = lanes where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
    uabd            v26.16b, v16.16b, v17.16b   // |p0 - q0|
    uabd            v27.16b, v18.16b, v16.16b   // |p1 - p0|
    uabd            v28.16b, v19.16b, v17.16b   // |q1 - q0|
    cmhi            v26.16b, v30.16b, v26.16b   // alpha > |p0 - q0|
    cmhi            v27.16b, v31.16b, v27.16b   // beta  > |p1 - p0|
    cmhi            v28.16b, v31.16b, v28.16b   // beta  > |q1 - q0|
    and             v26.16b, v26.16b, v27.16b
    and             v26.16b, v26.16b, v28.16b

    // Low 8 lanes, widened to 16 bit.
    ushll           v4.8h,   v18.8b,  #1        // 2 * p1
    ushll           v6.8h,   v19.8b,  #1        // 2 * q1
    uaddl           v20.8h,  v16.8b,  v19.8b    // p0 + q1
    uaddl           v22.8h,  v17.8b,  v18.8b    // q0 + p1
    add             v20.8h,  v20.8h,  v4.8h     // p0 + q1 + 2*p1
    add             v22.8h,  v22.8h,  v6.8h     // q0 + p1 + 2*q1
    uqrshrn         v24.8b,  v20.8h,  #2        // rounding >> 2, narrow
    uqrshrn         v25.8b,  v22.8h,  #2
.ifc \width, 16
    // High 8 lanes; the narrowing fills the upper halves of v24/v25.
    ushll2          v5.8h,   v18.16b, #1
    ushll2          v7.8h,   v19.16b, #1
    uaddl2          v21.8h,  v16.16b, v19.16b
    uaddl2          v23.8h,  v17.16b, v18.16b
    add             v21.8h,  v21.8h,  v5.8h
    add             v23.8h,  v23.8h,  v7.8h
    uqrshrn2        v24.16b, v21.8h,  #2
    uqrshrn2        v25.16b, v23.8h,  #2
.endif

    // Take the filtered value only where the mask is set.
    bit             v16.16b, v24.16b, v26.16b
    bit             v17.16b, v25.16b, v26.16b
.endm
// void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride,
//                                        int alpha, int beta )
//   In:  x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// Reads 16 bytes from each of the rows pix-2*stride .. pix+stride
// (p1, p0, q0, q1), filters them and writes p0/q0 back to the rows at
// pix-stride and pix.
function x264_deblock_v_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    // Walk the four rows with a separate cursor so x0 stays on the q0 row.
    sub             x4,  x0,  x1, lsl #1
    ld1             {v18.16b}, [x4], x1         // p1
    ld1             {v16.16b}, [x4], x1         // p0
    ld1             {v17.16b}, [x4], x1         // q0
    ld1             {v19.16b}, [x4]             // q1

    h264_loop_filter_chroma_intra

    sub             x4,  x0,  x1                // row of p0
    st1             {v16.16b}, [x4]             // p0'
    st1             {v17.16b}, [x0]             // q0'

    ret
endfunc
// void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride,
//                                              int alpha, int beta )
//   In:  x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// Filters a vertical edge over 4 rows.  Each row contributes the 8 bytes
// starting at pix-4, handled as four 16-bit units (p1, p0, q0, q1);
// presumably each unit is one interleaved chroma sample pair - confirm
// against the caller.
function x264_deblock_h_chroma_intra_mbaff_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  #2                // store cursor: pix - 2 (p0)
    sub             x4,  x0,  #2                // load cursor:  pix - 4 (p1)
    ld1             {v18.8b}, [x4], x1          // row 0
    ld1             {v16.8b}, [x4], x1          // row 1
    ld1             {v17.8b}, [x4], x1          // row 2
    ld1             {v19.8b}, [x4], x1          // row 3

    // Rows -> columns, so that v18/v16/v17/v19 become p1/p0/q0/q1 as the
    // filter macro expects (transpose4x4.h is defined elsewhere in the file).
    transpose4x4.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra, width=8

    // Row n gets back its p0'/q0' pair: 16-bit lane n of v16 and v17.
.irp lane, 0, 1, 2, 3
    st2             {v16.h,v17.h}[\lane], [x0], x1
.endr

    ret
endfunc
// void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride,
//                                        int alpha, int beta )
//   In:  x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// Filters a vertical edge over 8 rows.  Each row contributes the 8 bytes
// starting at pix-4, handled as four 16-bit units (p1, p0, q0, q1);
// presumably each unit is one interleaved chroma sample pair - confirm
// against the caller.
function x264_deblock_h_chroma_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  #2                // store cursor: pix - 2 (p0)
    sub             x4,  x0,  #2                // load cursor:  pix - 4 (p1)

    // Rows 0-3 fill the low 64 bits of v18/v16/v17/v19, rows 4-7 the high.
.irp half, 0, 1
    ld1             {v18.d}[\half], [x4], x1
    ld1             {v16.d}[\half], [x4], x1
    ld1             {v17.d}[\half], [x4], x1
    ld1             {v19.d}[\half], [x4], x1
.endr

    // Rows -> columns, so that v18/v16/v17/v19 become p1/p0/q0/q1 as the
    // filter macro expects (transpose4x8.h is defined elsewhere in the file).
    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    // Row n gets back its p0'/q0' pair: 16-bit lane n of v16 and v17.
.irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    st2             {v16.h,v17.h}[\lane], [x0], x1
.endr

    ret
endfunc
// void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride,
//                                            int alpha, int beta )
//   In:  x0 = pix, x1 = stride, w2 = alpha, w3 = beta
// Filters a vertical edge over 16 rows as two consecutive 8-row passes.
// Each row contributes the 8 bytes starting at pix-4, handled as four
// 16-bit units (p1, p0, q0, q1).  Both cursors simply keep advancing by
// stride, so the second pass continues at row 8.
function x264_deblock_h_chroma_422_intra_neon, export=1
    h264_loop_filter_start_intra

    sub             x0,  x0,  #2                // store cursor: pix - 2 (p0)
    sub             x4,  x0,  #2                // load cursor:  pix - 4 (p1)

.rept 2
    // Rows 0-3 of this pass fill the low 64 bits, rows 4-7 the high.
.irp half, 0, 1
    ld1             {v18.d}[\half], [x4], x1
    ld1             {v16.d}[\half], [x4], x1
    ld1             {v17.d}[\half], [x4], x1
    ld1             {v19.d}[\half], [x4], x1
.endr

    // Rows -> columns, so that v18/v16/v17/v19 become p1/p0/q0/q1 as the
    // filter macro expects (transpose4x8.h is defined elsewhere in the file).
    transpose4x8.h  v18, v16, v17, v19, v26, v27, v28, v29

    h264_loop_filter_chroma_intra

    // Row n gets back its p0'/q0' pair: 16-bit lane n of v16 and v17.
.irp lane, 0, 1, 2, 3, 4, 5, 6, 7
    st2             {v16.h,v17.h}[\lane], [x0], x1
.endr
.endr

    ret
endfunc
//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
// int8_t ref[2][X264_SCAN8_LUMA_SIZE],
......
......@@ -737,6 +737,12 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#if ARCH_AARCH64
/* Intra chroma deblocking filters implemented in NEON assembly.
 * They are declared only when building for AArch64.
 * All four share the same signature: pix is the pixel pointer at the edge,
 * stride the row pitch, alpha/beta the filter thresholds. */
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#endif
#endif
void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
......@@ -845,6 +851,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
#if ARCH_AARCH64
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
#endif
pf->deblock_strength = x264_deblock_strength_neon;
}
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment