Commit 44cb1dcd authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: x264_deblock_h_chroma_mbaff_neon

deblock_chroma_420_mbaff_neon  2 times faster
parent f2e439d1
......@@ -275,6 +275,60 @@ function x264_deblock_h_chroma_neon, export=1
ret
endfunc
// h264_loop_filter_chroma8: filter the p0/q0 samples in 8 byte lanes.
//
// Inputs:   v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes of each),
//           w2  = alpha, w3 = beta,
//           v24 = clip limits (tc) in its low 4 bytes; each one is expanded
//                 below to cover a pair of adjacent byte lanes.
//           NOTE(review): v24 is presumably loaded from tc0 by
//           h264_loop_filter_start, which is not visible here -- confirm.
// Outputs:  v16 = filtered p0, v17 = filtered q0.
// Clobbers: v4, v22, v24, v25, v26, v28, v30.
//
// Two independent chains are interleaved instruction by instruction:
//   delta (v4):   clamp((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
//   mask  (v26):  abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta
// Lanes whose mask is false get delta = 0 and are therefore left unchanged.
.macro h264_loop_filter_chroma8
dup v22.8b, w2 // broadcast alpha to every byte lane
uxtl v24.8h, v24.8b // widen tc bytes to halfwords (high byte = 0)
uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
uxtl v4.8h, v17.8b // v4 = q0 widened to 16 bit
uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
usubw v4.8h, v4.8h, v16.8b // v4 = q0 - p0
sli v24.8h, v24.8h, #8 // copy low byte into high byte: each tc now fills a byte pair
shl v4.8h, v4.8h, #2 // v4 = (q0 - p0) << 2
uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
uaddw v4.8h, v4.8h, v18.8b // v4 += p1
cmhi v26.8b, v22.8b, v26.8b // abs(p0 - q0) < alpha (unsigned compare)
usubw v4.8h, v4.8h, v19.8b // v4 -= q1
dup v22.8b, w3 // v22 reused: broadcast beta
rshrn v4.8b, v4.8h, #3 // delta = (v4 + 4) >> 3, narrowed back to bytes
cmhi v28.8b, v22.8b, v28.8b // abs(p1 - p0) < beta
cmhi v30.8b, v22.8b, v30.8b // abs(q1 - q0) < beta
smin v4.8b, v4.8b, v24.8b // delta = min(delta, tc) (signed)
neg v25.8b, v24.8b // v25 = -tc
and v26.8b, v26.8b, v28.8b // mask = alpha test & first beta test
smax v4.8b, v4.8b, v25.8b // delta = max(delta, -tc) (signed)
and v26.8b, v26.8b, v30.8b // mask &= second beta test
uxtl v22.8h, v17.8b // v22 reused: q0 widened to 16 bit
and v4.8b, v4.8b, v26.8b // zero delta in lanes that failed a test
uxtl v28.8h, v16.8b // v28 reused: p0 widened to 16 bit
saddw v28.8h, v28.8h, v4.8b // p0 + delta (delta sign-extended)
ssubw v22.8h, v22.8h, v4.8b // q0 - delta
sqxtun v16.8b, v28.8h // saturate to 0..255 -> new p0
sqxtun v17.8b, v22.8h // saturate to 0..255 -> new q0
.endm
// void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride,
//                                        int alpha, int beta, int8_t *tc0 )
// In (AAPCS64): x0 = pix, x1 = stride, w2 = alpha, w3 = beta, x4 = tc0.
//
// Processes 4 rows: loads 8 bytes per row starting at pix - 4, filters them
// with h264_loop_filter_chroma8, and writes 4 bytes per row back at pix - 2.
// NOTE(review): h264_loop_filter_start is defined outside this view; it
// presumably reads tc0 through x4 into v24 (and may return early) before x4
// is reused below as the load pointer -- confirm.
function x264_deblock_h_chroma_mbaff_neon, export=1
h264_loop_filter_start
sub x4, x0, #4 // x4 = load pointer, 4 bytes before pix
sub x0, x0, #2 // x0 = store pointer, 2 bytes before pix
ld1 {v18.8b}, [x4], x1 // row 0: 8 bytes, then advance by stride
ld1 {v16.8b}, [x4], x1 // row 1
ld1 {v17.8b}, [x4], x1 // row 2
ld1 {v19.8b}, [x4] // row 3 (no post-increment needed)
// NOTE(review): transpose4x4.h is defined elsewhere; presumably it transposes
// the 4x4 matrix of 16-bit elements so that v18/v16/v17/v19 hold p1/p0/q0/q1
// for all four rows, using v28-v31 as scratch -- confirm.
transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31
h264_loop_filter_chroma8
// Each st2 writes halfword lane n of v16 (p0) followed by lane n of v17 (q0),
// i.e. 4 bytes per row, which undoes the transpose for the two filtered columns.
st2 {v16.h,v17.h}[0], [x0], x1
st2 {v16.h,v17.h}[1], [x0], x1
st2 {v16.h,v17.h}[2], [x0], x1
st2 {v16.h,v17.h}[3], [x0]
ret
endfunc
.macro h264_loop_filter_start_intra
orr w4, w2, w3
cmp w4, #0
......
......@@ -738,6 +738,7 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#if ARCH_AARCH64
/* Chroma deblocking routines declared only for AArch64 builds; x264_deblock_init
 * installs them into the x264_deblock_function_t table. Only the non-intra
 * variant takes a tc0 array; the intra variants take alpha and beta alone. */
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
......@@ -852,6 +853,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
#if ARCH_AARCH64
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment