Commit 5c13589b authored by Martin Storsjö's avatar Martin Storsjö Committed by Henrik Gramner

arm: Implement x264_decimate_score15/16/64_neon

checkasm timing       Cortex-A7      A8     A9
decimate_score15_c           764     736    535
decimate_score15_neon        487     494    453
decimate_score16_c           782     727    553
decimate_score16_neon        487     494    521
decimate_score64_c           2361    2597   2011
decimate_score64_neon        1017    802    785
parent 3902ae02
......@@ -5,6 +5,7 @@
*
* Authors: David Conrad <lessen42@gmail.com>
* Janne Grunau <janne-x264@jannau.net>
* Martin Storsjo <martin@martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
......
......@@ -32,6 +32,14 @@ pmovmskb_byte:
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
// Per-lane bit selectors. They are ANDed with 0x00/0xff byte compare
// results so that a following chain of pairwise byte adds packs every
// lane into its own bit position.
//
// mask_2bit: two bits per lane, pattern repeats every 4 bytes, so four
// adjacent lanes fold into one byte (lane 0 in the two lowest bits).
mask_2bit:
.byte 3,12,48,192,3,12,48,192
.byte 3,12,48,192,3,12,48,192
// mask_1bit: one bit per lane, pattern repeats every 8 bytes, so eight
// adjacent lanes fold into one byte (lane 0 in the most significant bit).
mask_1bit:
.byte 128,64,32,16,8,4,2,1
.byte 128,64,32,16,8,4,2,1
.text
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
......@@ -308,6 +316,136 @@ dequant_4x4_dc_rshift:
bx lr
endfunc
// decimate_score_1x size
//
// Expands to x264_decimate_score<size>_neon, declared on the C side as
//   int x264_decimate_score15_neon( int16_t * )
//   int x264_decimate_score16_neon( int16_t * )
//
// In:    r0 = pointer to 16 int16 coefficients (loaded with a :128
//             alignment hint)
// Out:   r0 = 9 when any of the 16 loaded coefficients has |c| > 1,
//             0 when all 16 are zero,
//             otherwise the sum of x264_decimate_table4[run] over the
//             nonzero coefficients, where run is the number of zero
//             coefficients directly preceding each of them
// Clobb: r1-r3, r12, q0-q3, q8, flags
//
// With size == 15 the first loaded coefficient is excluded from the run
// scan, but it still takes part in the |c| > 1 test.
// NOTE(review): with size == 15, if the first coefficient is the only
// nonzero one, r1 is 0 when the loop is entered, clz yields 32 and the
// table is read at index 16. Presumably callers always pass a block whose
// first coefficient is zero - confirm against the callers.
.macro decimate_score_1x size
function x264_decimate_score\size\()_neon
// q0-q1 = the 16 input coefficients
vld1.16 {q0, q1}, [r0, :128]
movrel r3, mask_2bit
// q3 = 1 in every byte lane, the threshold for the magnitude test
vmov.s8 q3, #0x01
// Saturating narrow to int8: q0 = all 16 coefficients, one per byte lane
vqmovn.s16 d0, q0
vqmovn.s16 d1, q1
// q2 = saturated |c|
vqabs.s8 q2, q0
vld1.8 {q8}, [r3, :128]
// q1 = 0xff in every lane whose coefficient is zero
vceq.s8 q1, q0, #0
// q2 = 0xff in every lane with |c| > 1
vcgt.s8 q2, q2, q3
// Keep a distinct bit pair per lane of the zero mask
vand.u8 q1, q1, q8
// Magnitude mask: 16 bytes -> 8 bytes (one nibble per lane)
vshrn.u16 d4, q2, #4
// Zero mask: 16 bytes -> 8 -> 4, magnitude mask: 8 bytes -> 4
vpadd.u8 d2, d2, d3
vpadd.u8 d4, d4, d4
vpadd.u8 d2, d2, d2
// r2 is nonzero iff some coefficient has |c| > 1
vmov.32 r2, d4[0]
// r1 = zero mask, coefficient i occupying bits 2i and 2i+1
vmov.32 r1, d2[0]
cmp r2, #0
beq 0f
// A coefficient with magnitude above 1 was found
mov r0, #9
bx lr
0:
// Invert: set bit pairs now mark nonzero coefficients, Z set if none.
// The mov below leaves the flags alone, so bxeq still sees that Z.
mvns r1, r1
mov r0, #0
bxeq lr
.ifc \size, 15
// Drop the bit pair belonging to the first loaded coefficient
lsr r1, r1, #2
.endif
// Reverse the bits so the lowest-index coefficient sits at the top for clz
rbit r1, r1
movrel r3, X(x264_decimate_table4)
1:
// r2 = 2 * (number of zero coefficients before the next nonzero one)
clz r2, r1
lsl r1, r1, r2
// r12 = run length, used as the table index
lsr r12, r2, #1
ldrb r2, [r3, r12]
// Consume the nonzero coefficient; Z is set once none remain.
// The add does not update flags, so bne tests the result of this shift.
lsls r1, r1, #2
add r0, r0, r2
bne 1b
bx lr
endfunc
.endm
// Instantiate the 15- and 16-coefficient variants
decimate_score_1x 15
decimate_score_1x 16
// int x264_decimate_score64_neon( int16_t * )
//
// In:    r0 = pointer to 64 int16 coefficients (loads use a :128
//             alignment hint; r0 is advanced while loading)
// Out:   r0 = 9 when any coefficient has |c| > 1,
//             0 when all 64 are zero,
//             otherwise the sum of x264_decimate_table8[run] over the
//             nonzero coefficients, where run is the number of zero
//             coefficients directly preceding each of them
// Clobb: r1-r3, r12, q2, q3, q8-q15, flags. lr is used as a scratch
//        counter; the return address is kept on the stack and every exit
//        returns by popping it into pc.
function x264_decimate_score64_neon
// lr doubles as a bit counter below, so park the return address
push {lr}
// q8-q15 = the 64 input coefficients
vld1.16 {q8, q9}, [r0, :128]!
vld1.16 {q10, q11}, [r0, :128]!
vld1.16 {q12, q13}, [r0, :128]!
vld1.16 {q14, q15}, [r0, :128]
movrel r3, mask_1bit
// q3 = 1 in every byte lane, the threshold for the magnitude test
vmov.s8 q3, #0x01
// Saturating narrow to int8 into q8-q11. The two halves of each q
// register end up swapped (d17 = coefficients 0-7, d16 = 8-15, and so
// on); the operand order of the vpadd tree below compensates for this.
vqmovn.s16 d17, q8
vqmovn.s16 d16, q9
vqmovn.s16 d19, q10
vqmovn.s16 d18, q11
vqmovn.s16 d21, q12
vqmovn.s16 d20, q13
vqmovn.s16 d23, q14
vqmovn.s16 d22, q15
// q12-q15 = saturated |c| of the narrowed coefficients
vqabs.s8 q12, q8
vqabs.s8 q13, q9
vqabs.s8 q14, q10
vqabs.s8 q15, q11
vld1.8 {q2}, [r3, :128]
// q8-q11 = 0xff in every lane whose coefficient is zero
vceq.s8 q8, q8, #0
vceq.s8 q9, q9, #0
vceq.s8 q10, q10, #0
vceq.s8 q11, q11, #0
// Lane-wise maximum of |c| over the four registers, collected in q12
vmax.s8 q12, q12, q13
vmax.s8 q14, q14, q15
// Keep one distinct bit per lane of the zero masks
vand.u8 q8, q8, q2
vand.u8 q9, q9, q2
vand.u8 q10, q10, q2
vand.u8 q11, q11, q2
vmax.s8 q12, q12, q14
// Zero masks: 64 bytes -> 32 bytes (interleaved with the magnitude test)
vpadd.u8 d18, d18, d19
vpadd.u8 d19, d16, d17
// q12 = 0xff in every lane where some |c| > 1
vcgt.s8 q12, q12, q3
vpadd.u8 d22, d22, d23
vpadd.u8 d23, d20, d21
// Magnitude mask: 16 bytes -> 8 bytes
vshrn.u16 d24, q12, #4
// Zero masks: 32 bytes -> 16 -> 8, i.e. one bit per coefficient
vpadd.u8 d16, d22, d23
vpadd.u8 d17, d18, d19
vpadd.u8 d24, d24, d24
vpadd.u8 d16, d16, d17
// r2 is nonzero iff some coefficient has |c| > 1
vmov.32 r2, d24[0]
// r1 = zero mask of coefficients 0-31, r12 = zero mask of 32-63; in both
// the lowest-index coefficient is in bit 31
vmov r12, r1, d16
cmp r2, #0
beq 0f
// A coefficient with magnitude above 1 was found
mov r0, #9
pop {pc}
0:
// Invert both masks: set bits now mark nonzero coefficients. Z reflects
// r1 only and is consumed by the beq further down; mvn and mov do not
// touch the flags, and movrel is presumably flag-neutral too (it is
// defined outside this view - confirm).
mvns r1, r1
mvn r12, r12
mov r0, #0
// lr = number of bits of r1 not consumed yet
mov lr, #32
movrel r3, X(x264_decimate_table8)
// No nonzero coefficient among 0-31: go straight to the second half
beq 2f
1:
// r2 = number of zero coefficients before the next nonzero one
clz r2, r1
lsl r1, r1, r2
sub lr, lr, r2
ldrb r2, [r3, r2]
// Consume the nonzero coefficient; Z is set once r1 is exhausted.
// The sub and add do not update flags, so bne tests this shift.
lsls r1, r1, #1
sub lr, lr, #1
add r0, r0, r2
bne 1b
2:
// Here lr = number of zero coefficients at the end of the first half
cmp r12, #0
popeq {pc}
// First nonzero coefficient of the second half: its run is the leading
// zeros of r12 plus the zeros carried over from the first half in lr
clz r2, r12
lsl r1, r12, r2
add r2, r2, lr
ldrb r2, [r3, r2]
lsls r1, r1, #1
add r0, r0, r2
popeq {pc}
3:
// Remaining coefficients of the second half, same scheme as loop 1
// without the lr bookkeeping
clz r2, r1
lsl r1, r1, r2
ldrb r2, [r3, r2]
lsls r1, r1, #1
add r0, r0, r2
bne 3b
pop {pc}
endfunc
// int coeff_last( int16_t *l )
function x264_coeff_last4_arm
......
......@@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
int x264_decimate_score15_neon( int16_t * );
int x264_decimate_score16_neon( int16_t * );
int x264_decimate_score64_neon( int16_t * );
int x264_coeff_last4_arm( int16_t * );
int x264_coeff_last8_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
......
......@@ -751,6 +751,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
pf->denoise_dct = x264_denoise_dct_neon;
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
}
#endif
#if ARCH_AARCH64
......@@ -765,9 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run8 = x264_coeff_level_run8_neon;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
}
#endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment