Commit 40d5db34 authored by Janne Grunau, committed by Anton Mitrofanov

aarch64: NEON asm for decimate_score

decimate_score15 and 16 are 60% faster, decimate_score64 is 4 times
faster than C.
parent 45e1ebf8
......@@ -4,6 +4,7 @@
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -300,6 +301,118 @@ dequant_4x4_dc_rshift:
ret
endfunc
// int x264_decimate_score15_neon( int16_t *dct )
// int x264_decimate_score16_neon( int16_t *dct )
//
// In:     x0 = dct; 16 int16_t coefficients are loaded from it
// Out:    w0 = 9 if any loaded coefficient has a magnitude above 1,
//              otherwise the sum of x264_decimate_table4[run] taken over
//              every non-zero coefficient, where run is the number of zero
//              coefficients directly preceding it (0 if all are zero)
// Writes: x0-x3, x5-x7, v0-v3
//
// The 15 variant still loads and range-checks all 16 coefficients; it only
// shifts the non-zero mask right by one bit so that run lengths are counted
// starting at coefficient 1.
// NOTE(review): this assumes coefficient 0 is zero for the 15 variant,
// since 3 bits of its mask nibble survive the shift - confirm against callers.
.macro decimate_score_1x size
function x264_decimate_score\size\()_neon, export=1
    ld1         {v0.8h,v1.8h}, [x0]
    movrel      x5,  X(x264_decimate_table4)
    movi        v3.16b, #0x01
    // Saturating narrow to bytes: keeps both "is zero" and "magnitude > 1".
    sqxtn       v0.8b,  v0.8h
    sqxtn2      v0.16b, v1.8h
    abs         v2.16b, v0.16b
    cmeq        v1.16b, v0.16b, #0          // 0xff where coefficient == 0
    cmhi        v2.16b, v2.16b, v3.16b      // 0xff where magnitude > 1 (unsigned)
    // Compress each byte mask to one nibble: 16 lanes -> 64 bits,
    // coefficient i ends up in bits 4*i .. 4*i+3.
    shrn        v1.8b,  v1.8h,  #4
    shrn        v2.8b,  v2.8h,  #4
    fmov        x2,  d2
    fmov        x1,  d1
    cbnz        x2,  9f                     // some magnitude > 1
    mvn         x1,  x1                     // nibble 0xf where coefficient != 0
    mov         w0,  #0
    cbz         x1,  0f                     // all coefficients are zero
.ifc \size, 15
    lsr         x1,  x1,  #1
.endif
    rbit        x1,  x1                     // lowest coefficient is now at the top
1:
    clz         x3,  x1                     // bit distance to next non-zero coefficient
    lsr         x6,  x3,  #2                // 4 bits per coefficient -> run length
    lsl         x1,  x1,  x3                // x1 != 0 here, so bit 63 is set afterwards
    ldrb        w7,  [x5, x6]
    lsl         x1,  x1,  #4                // drop the nibble just scored
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
.endm
decimate_score_1x 15
decimate_score_1x 16
// Per-lane bit weights, one set of 8 for each half of a 16-byte vector.
// x264_decimate_score64_neon ANDs these with byte-wide compare masks and
// then folds the lanes with pairwise adds to build a 64-bit bitmap.
const mask64, align=6
    .byte 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<1, 1<<0
    .byte 1<<7, 1<<6, 1<<5, 1<<4, 1<<3, 1<<2, 1<<1, 1<<0
endconst
// int x264_decimate_score64_neon( int16_t *dct )
//
// In:     x0 = dct; 64 int16_t coefficients are loaded from it
// Out:    w0 = 9 if any coefficient has a magnitude above 1, otherwise the
//              sum of x264_decimate_table8[run] taken over every non-zero
//              coefficient, where run is the number of zero coefficients
//              directly preceding it (0 if all coefficients are zero)
// Writes: x0-x3, x5-x7, v0-v7, v16-v19, v30, v31
function x264_decimate_score64_neon, export=1
    ld1         {v0.8h,v1.8h}, [x0], #32
    ld1         {v2.8h,v3.8h}, [x0], #32
    ld1         {v4.8h,v5.8h}, [x0], #32
    ld1         {v6.8h,v7.8h}, [x0]
    movrel      x6,  mask64
    movi        v31.16b, #0x01
    // Saturating narrow to bytes (keeps "is zero" and "magnitude > 1").
    // The halves of each pair are swapped on purpose: together with the
    // mask64 weights and the addp operand order below, this places
    // coefficient i at bit 63-i of the final bitmap.
    sqxtn       v16.8b,  v1.8h
    sqxtn2      v16.16b, v0.8h
    sqxtn       v17.8b,  v3.8h
    sqxtn2      v17.16b, v2.8h
    sqxtn       v18.8b,  v5.8h
    sqxtn2      v18.16b, v4.8h
    sqxtn       v19.8b,  v7.8h
    sqxtn2      v19.16b, v6.8h
    abs         v4.16b,  v16.16b
    abs         v5.16b,  v17.16b
    abs         v6.16b,  v18.16b
    abs         v7.16b,  v19.16b
    ld1         {v30.16b}, [x6]
    cmeq        v0.16b,  v16.16b, #0        // 0xff where coefficient == 0
    cmeq        v1.16b,  v17.16b, #0
    cmeq        v2.16b,  v18.16b, #0
    cmeq        v3.16b,  v19.16b, #0
    umax        v4.16b,  v4.16b,  v5.16b    // lane-wise max magnitude, reduced below
    umax        v6.16b,  v6.16b,  v7.16b
    and         v0.16b,  v0.16b,  v30.16b   // keep one distinct bit per lane
    and         v1.16b,  v1.16b,  v30.16b
    and         v2.16b,  v2.16b,  v30.16b
    and         v3.16b,  v3.16b,  v30.16b
    umax        v4.16b,  v4.16b,  v6.16b
    // Three pairwise-add rounds fold 64 single-bit bytes into 8 bytes;
    // the bits are disjoint, so each add acts as an OR.
    addp        v0.16b,  v1.16b,  v0.16b
    addp        v2.16b,  v3.16b,  v2.16b
    cmhi        v4.16b,  v4.16b,  v31.16b   // 0xff where magnitude > 1 (unsigned)
    addp        v0.16b,  v2.16b,  v0.16b
    shrn        v4.8b,   v4.8h,   #4
    addp        v0.16b,  v0.16b,  v0.16b
    fmov        x2,  d4
    fmov        x1,  d0
    cbnz        x2,  9f                     // some magnitude > 1
    mvn         x1,  x1                     // bit set where coefficient != 0
    mov         w0,  #0
    cbz         x1,  0f                     // all coefficients are zero
    movrel      x5,  X(x264_decimate_table8)
1:
    clz         x3,  x1                     // run of zeros before next non-zero
    lsl         x1,  x1,  x3                // x1 != 0 here, so bit 63 is set afterwards
    ldrb        w7,  [x5, x3]
    lsl         x1,  x1,  #1                // drop the coefficient just scored
    add         w0,  w0,  w7
    cbnz        x1,  1b
    ret
9:
    mov         w0,  #9
0:
    ret
endfunc
// int coeff_last( int16_t *l )
function x264_coeff_last4_aarch64, export=1
ldr x2, [x0]
......
......@@ -4,6 +4,7 @@
* Copyright (C) 2005-2014 x264 project
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......@@ -38,6 +39,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* NEON decimate_score entry points (15, 16 and 64 coefficient variants).
 * Each takes a pointer to int16_t coefficients and returns an int score. */
int x264_decimate_score15_neon( int16_t * );
int x264_decimate_score16_neon( int16_t * );
int x264_decimate_score64_neon( int16_t * );
int x264_coeff_last4_aarch64( int16_t * );
int x264_coeff_last8_aarch64( int16_t * );
int x264_coeff_last15_neon( int16_t * );
......
......@@ -714,7 +714,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC ) {
if( cpu&X264_CPU_ALTIVEC )
{
pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
pf->quant_4x4 = x264_quant_4x4_altivec;
......@@ -754,6 +755,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last4 = x264_coeff_last4_aarch64;
pf->coeff_last8 = x264_coeff_last8_aarch64;
}
if( cpu&X264_CPU_NEON )
{
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
}
#endif
#endif // HIGH_BIT_DEPTH
pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] =
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment