Commit 4e8ac132, authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: x264_coeff_level_run{4,8,15,16}

All functions ~33% faster.
parent dd766674
......@@ -497,3 +497,80 @@ function x264_coeff_last64_neon, export=1
sub w0, w3, w2
ret
endfunc
// coeff_level_run_start size
// Register setup that must precede the coeff_level_run macro.
// In:  x1 = output structure pointer (second C argument of the callers)
// Out: x6 = x1 + 8 rounded up to a multiple of 16; coeff_level_run stores
//           the 16-bit coefficients through this pointer
//      w7 = 0          number of coefficients stored so far
//      w8 = 0          accumulated position bitmask
//      w9 = 1          constant that gets shifted to form one mask bit
//      w4 = size - 1   highest coefficient index of the block
// Uses only non-flag-setting instructions, so the condition flags are untouched.
.macro coeff_level_run_start size
add x6, x1, #23 // first half of (x1 + 8 + 15) & ~15: coefficient store pointer (the mask word itself is written via x1)
mov w7, #0 // coefficient count
mov w8, #0 // mask accumulator
mov w9, #1 // source of the single mask bit
and x6, x6, #~15 // second half of the round-up, placed apart from the add it depends on
mov w4, #\size - 1 // index that corresponds to the top element of the mask in x2
.endm
// coeff_level_run shift
// Collects the nonzero coefficients of a block from the highest index down.
// Each coefficient owns (1 << shift) bits of x2; the coefficient with the
// highest index sits in the most significant bits.
// In:  x0 = coefficient array, 16-bit elements
//      x1 = output structure pointer
//      x2 = nonzero mask as described above
//      w4, x6, w7, w8, w9 = values set up by coeff_level_run_start
// Out: w0 = number of coefficients stored
//      32-bit word at entry x1 + 0 = index of the highest nonzero coefficient
//      32-bit word at entry x1 + 4 = bitmask, bit i set for each stored index i
//      halfwords from x6 upwards   = the coefficients, highest index first
// Modifies x0-x8, x10 and the condition flags; x1 ends up advanced by 4.
// NOTE(review): with x2 == 0 the first subs yields a negative w4 for the
// size and shift pairs used in this file, and the following ldrh indexes with
// the zero-extended x4. Callers presumably guarantee at least one nonzero
// coefficient - confirm.
.macro coeff_level_run shift
clz x3, x2 // leading zero bits = zero coefficients above the last nonzero one, scaled by the element width
subs w4, w4, w3, lsr #\shift // w4 = index of the last nonzero coefficient; these flags are consumed by b.le below
str w4, [x1], #4 // store that index, then step x1 to the mask word
1:
ldrh w5, [x0, x4, lsl #1] // w5 = coefficient at index w4
strh w5, [x6], #2 // append it to the output list
add w7, w7, #1 // one more coefficient stored
lsl w10, w9, w4 // w10 = 1 << index
orr w8, w8, w10 // record the position in the mask
b.le 2f // flags come from the latest subs (nothing in between sets them): index <= 0 means nothing is left below
add w3, w3, #1 << \shift // shift distance = leading zeros + the element just handled
sub w4, w4, #1 // step the index past the element just handled
and x3, x3, #~((1 << \shift) - 1) // round down to whole elements; clz need not be a multiple of the element width (run4 passes raw coefficients in x2)
lsl x2, x2, x3 // discard the handled element and the zero elements above it
clz x3, x2 // zero elements now leading the remainder
subs w4, w4, w3, lsr #\shift // skip them; the result is negative when no nonzero coefficient remains
b.ge 1b // another nonzero coefficient at index w4
2:
str w8, [x1] // store the position bitmask (x1 was advanced by 4 above)
mov w0, w7 // return the number of coefficients stored
.endm
// int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * )
// In:  x0 = pointer to 4 coefficients, x1 = output structure
// Out: w0 = number of coefficients stored by coeff_level_run
// No NEON: the four 16-bit coefficients are loaded as one 64-bit word and used
// directly as the nonzero mask, 16 bits per coefficient (shift = 4).
// NOTE(review): treating the top 16 bits of x2 as index 3 relies on a
// little-endian load - confirm big-endian is not a supported target.
function x264_coeff_level_run4_aarch64, export=1
ldr x2, [x0] // x2 = all four coefficients; any set bit marks its coefficient as nonzero
coeff_level_run_start 4
coeff_level_run 4
ret
endfunc
// X264_COEFF_LEVEL_RUN size
// Emits x264_coeff_level_run<size>_neon.
// In:  x0 = pointer to the 16-bit coefficients, x1 = output structure
// Out: w0 = number of coefficients stored by coeff_level_run
// The function builds a 64-bit nonzero mask in x2 with NEON and then runs the
// shared scalar loop. Sizes below 15 load 8 coefficients and keep 8 mask bits
// per coefficient (shiftw = 3); the other sizes load 16 coefficients and pack
// the mask to 4 bits per coefficient (shiftw = 2).
// Uses v0 (and v1 for the 16-coefficient path) in addition to the registers
// touched by coeff_level_run_start and coeff_level_run.
.macro X264_COEFF_LEVEL_RUN size
function x264_coeff_level_run\size\()_neon, export=1
.if \size == 15
sub x0, x0, #2 // start the 16-element load one element early so index 14 lands in the top lane
// NOTE(review): this reads the halfword just before the block - presumably always readable for 15-coefficient blocks; confirm against callers
.endif
.if \size < 15
.equ shiftw, 3
ld1 {v0.8h}, [x0] // 8 coefficients
uqxtn v0.8b, v0.8h // saturating narrow to bytes: a nonzero halfword stays nonzero
cmtst v0.8b, v0.8b, v0.8b // each byte becomes 0xff if nonzero, 0x00 otherwise
.else
.equ shiftw, 2
ld1 {v0.8h,v1.8h}, [x0] // 16 halfwords
uqxtn v0.8b, v0.8h // saturating narrow of the first 8 into the low half
uqxtn2 v0.16b, v1.8h // and of the last 8 into the high half
cmtst v0.16b, v0.16b, v0.16b // each byte becomes 0xff if nonzero, 0x00 otherwise
shrn v0.8b, v0.8h, #4 // squeeze every pair of mask bytes into one byte: 4 mask bits per coefficient
.endif
fmov x2, d0 // move the 64-bit mask to the general register used by coeff_level_run
.if \size == 15
add x0, x0, #2 // undo the bias so the loads in coeff_level_run index from the real block start
.endif
coeff_level_run_start \size
coeff_level_run shiftw
ret
endfunc
.endm
// Emit x264_coeff_level_run8_neon, x264_coeff_level_run15_neon and
// x264_coeff_level_run16_neon.
X264_COEFF_LEVEL_RUN 8
X264_COEFF_LEVEL_RUN 15
X264_COEFF_LEVEL_RUN 16
......@@ -49,4 +49,8 @@ int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
/* Run/level extraction (AArch64 assembly). The second argument receives the
 * index of the highest nonzero coefficient, a bitmask with one bit per
 * nonzero position, and the nonzero coefficients in descending index order.
 * The return value is the number of coefficients stored. */
int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
#endif
......@@ -754,9 +754,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->coeff_last4 = x264_coeff_last4_aarch64;
pf->coeff_last8 = x264_coeff_last8_aarch64;
pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
}
if( cpu&X264_CPU_NEON )
{
pf->coeff_level_run8 = x264_coeff_level_run8_neon;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment