 @*****************************************************************************
 @ neon_s32_s16.S : ARM NEONv1 fi32 to s16n audio sample conversion
 @*****************************************************************************
 @ Copyright (C) 2009 Rémi Denis-Courmont
 @
 @ This program is free software; you can redistribute it and/or modify
 @ it under the terms of the GNU Lesser General Public License as published by
 @ the Free Software Foundation; either version 2.1 of the License, or
 @ (at your option) any later version.
 @
 @ This program is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 @ GNU Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public License
 @ along with this program; if not, write to the Free Software Foundation,
 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 @*****************************************************************************

	@ Allow NEON instructions and place the code in the text section.
	.fpu neon
	.text

	@ Symbolic register names shared by both conversion routines:
	@   r0  destination pointer, advanced as 16-bit samples are stored
	@   r1  source pointer, advanced as 32-bit samples are loaded
	@   r2  number of samples still to convert
	@   r3  scratch register holding one sample in the scalar loop
	@   ip  rounding constant (4096) used by the scalar loop
	@ NOTE(review): r0-r2 presumably carry the C arguments (out, in, count)
	@ under the ARM procedure call standard -- confirm against the caller.
	@ These are preprocessor macros, so keep comments off the define lines.
#define	OUT	r0
#define	IN	r1
#define	N	r2
#define	BUF	r3
#define HALF	ip

	.align
	.global s32_s16_neon
	.type	s32_s16_neon, %function
	@ Converts fixed-point 32-bits to signed 16-bits
	@ Input and output must be on 128-bits boundary
	@
	@ Every sample is shifted right by 13 bits with rounding and saturated
	@ to the signed 16-bit range (vqrshrn.s32 with an immediate of 13).
	@
	@ In:       r0 = output pointer, r1 = input pointer, r2 = sample count
	@ Out:      nothing is returned in a register by the code shown here
	@ Modifies: r0, r1, r2, q8-q13, flags, plus whatever the scalar tail
	@           routine modifies; nothing is saved or restored
	@
	@ Structure: the main loop is software pipelined.  Samples are handled
	@ in groups of 8 (two q registers in, one q register out) and up to
	@ three groups are in flight before results are stored.  Each early
	@ exit (labels 4, 5, 6) stores exactly the groups that were converted
	@ but not yet written.  Label 7 then handles one group of 4 samples,
	@ and execution falls through into the scalar routine for the rest.
	@
	@ NOTE(review): the count is tested with blt, a signed comparison, so a
	@ count with bit 31 set would be treated as less than 8 -- confirm the
	@ caller never passes such a value.
s32_s16_neon:
	pld		[IN]			@ start fetching the first input line
	@ NOTE(review): no branch visible in this file targets label 2.
2:
	cmp		N,	#8
	blt		s32_s16_neon_unaligned	@ fewer than 8 samples: scalar path only
	vld1.s32	{q8-q9},	[IN,:128]!	@ preload group A (8 samples)

3:	@ Main loop
	@ On entry here q8-q9 always hold 8 freshly loaded, unconverted samples.
	pld		[IN, #64]		@ prefetch 64 bytes past the read pointer
	sub		N,	#8		@ account for group A
	@ d16/d17 are the two halves of q8, so group A is narrowed into q8.
	vqrshrn.s32	d16,	q8,	#13
	vqrshrn.s32	d17,	q9,	#13
	cmp		N,	#8
	blt		4f			@ no full group left: flush A
	vld1.s32	{q10-q11},	[IN,:128]!	@ load group B
	sub		N,	#8
	@ d18/d19 are the two halves of q9, free again now that A is narrowed.
	vqrshrn.s32	d18,	q10,	#13
	vqrshrn.s32	d19,	q11,	#13
	cmp		N,	#8
	blt		5f			@ no full group left: flush A and B
	vld1.s32	{q12-q13},	[IN,:128]!	@ load group C
	sub		N,	#8
	@ d20/d21 are the two halves of q10, free again now that B is narrowed.
	vqrshrn.s32	d20,	q12,	#13
	vqrshrn.s32	d21,	q13,	#13
	vst1.s16	{d16-d19},	[OUT,:128]!	@ store A and B (16 samples)
	cmp		N,	#8
	blt		6f			@ no full group left: flush C
	vld1.s32	{q8-q9},	[IN,:128]!	@ preload next group A ...
	vst1.s16	{d20-d21},	[OUT,:128]!	@ ... then store C
	b		3b
4:	@ Only group A is pending
	vst1.s16	{d16-d17},	[OUT,:128]!
	b		7f
5:	@ Groups A and B are pending
	vst1.s16	{d16-d19},	[OUT,:128]!
	b		7f
6:	@ Only group C is pending (A and B were stored already)
	vst1.s16	{d20-d21},	[OUT,:128]!
7:	@ Fewer than 8 samples remain; handle one group of 4 if possible
	cmp		N,	#4
	blt		s32_s16_neon_unaligned
	vld1.s32	{q8},		[IN,:128]!
	sub		N,	#4
	vqrshrn.s32	d16,	q8,	#13
	vst1.s16	{d16},		[OUT,:64]!	@ 4 samples = 8 bytes, 64-bit aligned store

	@ Fall through for last 0-3 samples

	.global	s32_s16_neon_unaligned
	.type	s32_s16_neon_unaligned, %function
	@ Converts fixed-point 32-bits to signed 16-bits, one sample at a time
	@ Input must be on 32-bits boundary, output on 16-bits
	@
	@ In:       r0 = output pointer, r1 = input pointer, r2 = sample count
	@ Modifies: r0, r1, r2, r3, ip, flags (including the sticky Q flag
	@           whenever qadd or ssat saturates)
	@
	@ Each sample gets 4096 (half of 1 << 13) added with saturation, is
	@ shifted right arithmetically by 13 bits and saturated to 16 bits.
	@ Also entered by fall-through from the routine placed just before it.
s32_s16_neon_unaligned:
	mov		HALF,	#4096		@ rounding bias, 1 << 12
.Lscalar_next:
	cmp		N,	#0
	bxeq		lr			@ all samples done: return

	ldr		BUF,	[IN],	#4	@ fetch one sample, step the read pointer
	sub		N,	#1
	qadd		BUF,	HALF,	BUF	@ saturating add, cannot wrap around
	ssat		BUF,	#16,	BUF, asr #13
	strh		BUF,	[OUT],	#2	@ store the result, step the write pointer
	b		.Lscalar_next