Commit dd766674 authored by Janne Grunau's avatar Janne Grunau Committed by Anton Mitrofanov

aarch64: NEON asm for intra luma deblocking

deblock_luma_intra[0]_neon is 2 times fastes,
deblock_luma_intra[1]_neon is ~4 times faster.
parent 0122fd23
......@@ -180,6 +180,202 @@ function x264_deblock_h_luma_neon, export=1
ret
endfunc
.macro h264_loop_filter_start_intra
orr w4, w2, w3
cmp w4, #0
b.ne 1f
ret
1:
dup v30.16b, w2 // alpha
dup v31.16b, w3 // beta
.endm
.macro h264_loop_filter_luma_intra
uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
cmhi v19.16b, v30.16b, v16.16b // < alpha
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
movi v29.16b, #2
ushr v30.16b, v30.16b, #2 // alpha >> 2
add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
and v19.16b, v19.16b, v17.16b
and v19.16b, v19.16b, v18.16b
shrn v20.8b, v19.8h, #4
mov x4, v20.d[0]
cbz x4, 9f
ushll v20.8h, v6.8b, #1
ushll v22.8h, v1.8b, #1
ushll2 v21.8h, v6.16b, #1
ushll2 v23.8h, v1.16b, #1
uaddw v20.8h, v20.8h, v7.8b
uaddw v22.8h, v22.8h, v0.8b
uaddw2 v21.8h, v21.8h, v7.16b
uaddw2 v23.8h, v23.8h, v0.16b
uaddw v20.8h, v20.8h, v1.8b
uaddw v22.8h, v22.8h, v6.8b
uaddw2 v21.8h, v21.8h, v1.16b
uaddw2 v23.8h, v23.8h, v6.16b
rshrn v24.8b, v20.8h, #2 // p0'_1
rshrn v25.8b, v22.8h, #2 // q0'_1
rshrn2 v24.16b, v21.8h, #2 // p0'_1
rshrn2 v25.16b, v23.8h, #2 // q0'_1
uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
cmhi v17.16b, v31.16b, v17.16b // < beta
cmhi v18.16b, v31.16b, v18.16b // < beta
and v17.16b, v16.16b, v17.16b // if_2 && if_3
and v18.16b, v16.16b, v18.16b // if_2 && if_4
not v30.16b, v17.16b
not v31.16b, v18.16b
and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
//calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
uaddl v26.8h, v5.8b, v7.8b
uaddl2 v27.8h, v5.16b, v7.16b
uaddw v26.8h, v26.8h, v0.8b
uaddw2 v27.8h, v27.8h, v0.16b
add v20.8h, v20.8h, v26.8h
add v21.8h, v21.8h, v27.8h
uaddw v20.8h, v20.8h, v0.8b
uaddw2 v21.8h, v21.8h, v0.16b
rshrn v20.8b, v20.8h, #3 // p0'_2
rshrn2 v20.16b, v21.8h, #3 // p0'_2
uaddw v26.8h, v26.8h, v6.8b
uaddw2 v27.8h, v27.8h, v6.16b
rshrn v21.8b, v26.8h, #2 // p1'_2
rshrn2 v21.16b, v27.8h, #2 // p1'_2
uaddl v28.8h, v4.8b, v5.8b
uaddl2 v29.8h, v4.16b, v5.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v19.8b, v28.8h, #3 // p2'_2
rshrn2 v19.16b, v29.8h, #3 // p2'_2
//calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
uaddl v26.8h, v2.8b, v0.8b
uaddl2 v27.8h, v2.16b, v0.16b
uaddw v26.8h, v26.8h, v7.8b
uaddw2 v27.8h, v27.8h, v7.16b
add v22.8h, v22.8h, v26.8h
add v23.8h, v23.8h, v27.8h
uaddw v22.8h, v22.8h, v7.8b
uaddw2 v23.8h, v23.8h, v7.16b
rshrn v22.8b, v22.8h, #3 // q0'_2
rshrn2 v22.16b, v23.8h, #3 // q0'_2
uaddw v26.8h, v26.8h, v1.8b
uaddw2 v27.8h, v27.8h, v1.16b
rshrn v23.8b, v26.8h, #2 // q1'_2
rshrn2 v23.16b, v27.8h, #2 // q1'_2
uaddl v28.8h, v2.8b, v3.8b
uaddl2 v29.8h, v2.16b, v3.16b
shl v28.8h, v28.8h, #1
shl v29.8h, v29.8h, #1
add v28.8h, v28.8h, v26.8h
add v29.8h, v29.8h, v27.8h
rshrn v26.8b, v28.8h, #3 // q2'_2
rshrn2 v26.16b, v29.8h, #3 // q2'_2
bit v7.16b, v24.16b, v30.16b // p0'_1
bit v0.16b, v25.16b, v31.16b // q0'_1
bit v7.16b, v20.16b, v17.16b // p0'_2
bit v6.16b, v21.16b, v17.16b // p1'_2
bit v5.16b, v19.16b, v17.16b // p2'_2
bit v0.16b, v22.16b, v18.16b // q0'_2
bit v1.16b, v23.16b, v18.16b // q1'_2
bit v2.16b, v26.16b, v18.16b // q2'_2
.endm
function x264_deblock_v_luma_intra_neon, export=1
h264_loop_filter_start_intra
ld1 {v0.16b}, [x0], x1 // q0
ld1 {v1.16b}, [x0], x1 // q1
ld1 {v2.16b}, [x0], x1 // q2
ld1 {v3.16b}, [x0], x1 // q3
sub x0, x0, x1, lsl #3
ld1 {v4.16b}, [x0], x1 // p3
ld1 {v5.16b}, [x0], x1 // p2
ld1 {v6.16b}, [x0], x1 // p1
ld1 {v7.16b}, [x0] // p0
h264_loop_filter_luma_intra
sub x0, x0, x1, lsl #1
st1 {v5.16b}, [x0], x1 // p2
st1 {v6.16b}, [x0], x1 // p1
st1 {v7.16b}, [x0], x1 // p0
st1 {v0.16b}, [x0], x1 // q0
st1 {v1.16b}, [x0], x1 // q1
st1 {v2.16b}, [x0] // q2
9:
ret
endfunc
function x264_deblock_h_luma_intra_neon, export=1
h264_loop_filter_start_intra
sub x0, x0, #4
ld1 {v4.8b}, [x0], x1
ld1 {v5.8b}, [x0], x1
ld1 {v6.8b}, [x0], x1
ld1 {v7.8b}, [x0], x1
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x0], x1
ld1 {v2.8b}, [x0], x1
ld1 {v3.8b}, [x0], x1
ld1 {v4.d}[1], [x0], x1
ld1 {v5.d}[1], [x0], x1
ld1 {v6.d}[1], [x0], x1
ld1 {v7.d}[1], [x0], x1
ld1 {v0.d}[1], [x0], x1
ld1 {v1.d}[1], [x0], x1
ld1 {v2.d}[1], [x0], x1
ld1 {v3.d}[1], [x0], x1
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
h264_loop_filter_luma_intra
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
sub x0, x0, x1, lsl #4
st1 {v4.8b}, [x0], x1
st1 {v5.8b}, [x0], x1
st1 {v6.8b}, [x0], x1
st1 {v7.8b}, [x0], x1
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
st1 {v2.8b}, [x0], x1
st1 {v3.8b}, [x0], x1
st1 {v4.d}[1], [x0], x1
st1 {v5.d}[1], [x0], x1
st1 {v6.d}[1], [x0], x1
st1 {v7.d}[1], [x0], x1
st1 {v0.d}[1], [x0], x1
st1 {v1.d}[1], [x0], x1
st1 {v2.d}[1], [x0], x1
st1 {v3.d}[1], [x0], x1
9:
ret
endfunc
.macro h264_loop_filter_chroma
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
......@@ -342,16 +538,6 @@ function x264_deblock_h_chroma_mbaff_neon, export=1
ret
endfunc
.macro h264_loop_filter_start_intra
orr w4, w2, w3
cmp w4, #0
b.ne 1f
ret
1:
dup v30.16b, w2 // alpha
dup v31.16b, w3 // beta
.endm
.macro h264_loop_filter_chroma_intra, width=16
uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
......
......@@ -744,6 +744,8 @@ void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#endif
#endif
......@@ -860,6 +862,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
#endif
pf->deblock_strength = x264_deblock_strength_neon;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment