 @*****************************************************************************
 @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
 @*****************************************************************************
 @ Copyright (C) 2011 Sébastien Toque
 @                    Rémi Denis-Courmont
 @
 @ This program is free software; you can redistribute it and/or modify it
 @ under the terms of the GNU Lesser General Public License as published by
 @ the Free Software Foundation; either version 2.1 of the License, or
 @ (at your option) any later version.
 @
 @ This program is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 @ GNU Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public License
 @ along with this program; if not, write to the Free Software Foundation,
 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 @****************************************************************************/

/* Project-local helpers. NOTE(review): the `function` macro and
 * HAVE_AS_FPU_DIRECTIVE used in this file are presumably provided by
 * asm.S or the build system -- neither definition is visible here. */
#include "asm.S"

	.syntax unified
/* Only emit .fpu when the build says the assembler accepts it. */
#if HAVE_AS_FPU_DIRECTIVE
	.fpu	neon
#endif
	.text

/*
 * ARM core registers.
 * r2/r3 are used as they arrive (width, height).  r0 and lr are reloaded
 * from the two words r0 points to, and r4/r6/r7/r8 from the four words r1
 * points to; r1 and r5 are then reused as the second-row pointers.
 */
#define O1	r0
#define O2	r1
#define WIDTH	r2
#define HEIGHT	r3
#define Y1	r4
#define Y2	r5
#define U	r6
#define V	r7
#define YPITCH	r8
#define OPAD	r10
#define YPAD	r11
#define COUNT	ip
#define OPITCH	lr

/*
 * NEON constants, written once before the loops and only read afterwards.
 * coef* hold one 8-bit multiplier per lane; Rc/Gc/Bc hold one 16-bit
 * offset per lane (d6-d11, filled from the `coefficients` table).
 */
#define coefY	D0
#define coefRV	D1
#define coefGU	D2
#define coefGV	D3
#define coefBU	D4
#define Rc	Q3
#define Gc	Q4
#define Bc	Q5

/*
 * Per-iteration inputs.
 * u/v are the two halves of Q12 and y1/y2 the two halves of Q9, so they
 * are overwritten as soon as red16_2 / red16_1 are produced.
 */
#define u	D24
#define v	D25
#define y1	D18
#define y2	D19

/* Chroma contribution per channel; kept live across both rows. */
#define chro_r	Q6
#define chro_g	Q7
#define chro_b	Q8

/*
 * 16-bit intermediates.
 * Deliberate overlaps (instruction order in the function relies on them):
 *   lumi2     and green16_1 are both Q10
 *   lumi1     is Q15 = blue2:alpha2
 *   red16_1   is Q9  = y1:y2
 *   red16_2   is Q12 = u:v = red1:green1
 *   green16_2 is Q13 = blue1:alpha1
 *   blue16_2  is Q14 = red2:green2
 */
#define lumi1	Q15
#define lumi2	Q10
#define red16_1		Q9
#define green16_1	Q10
#define blue16_1	Q11
#define red16_2		Q12
#define green16_2	Q13
#define blue16_2	Q14

/*
 * 8-bit results, laid out as two runs of four consecutive D registers
 * (D24-D27 and D28-D31) so that each run can be the register list of
 * one vst4.  See the overlap table above: alpha1 and alpha2 are
 * clobbered whenever green16_2 or lumi1 is written.
 */
#define red1	D24
#define green1	D25
#define blue1	D26
#define alpha1	D27
#define red2	D28
#define green2	D29
#define blue2	D30
#define alpha2	D31

/*
 * 16-bit per-channel offsets, loaded in this order into Rc, Gc, Bc by
 * i420_rgb_neon.  With the 8-bit multipliers used there (Y 74, V->R 115,
 * U->G 14, V->G 34, U->B 135) the values work out to:
 *   R: -(16*74) - 128*115      + 32 = -15872
 *   G: -(16*74) + 128*(14+34)  + 32 =   4992
 *   B: -(16*74) - 128*135      + 32 = -18432
 * NOTE(review): the 16/128 terms look like the usual luma/chroma offsets
 * and the +32 like a half-LSB bias for the later shift by 6 -- confirm.
 */
coefficients:
    .short  -15872
    .short    4992
    .short  -18432

	/* The table is 6 bytes; pad so the code that follows is 4-byte aligned. */
	.align 2
/*
 * i420_rgb_neon: convert planar Y/U/V with 2x2-subsampled chroma into
 * packed 4-byte pixels stored as R, G, B, A (in increasing address order),
 * 16 columns x 2 rows per inner-loop iteration.
 *
 * In:   r0 -> two words:  output pointer, output pitch in bytes
 *       r1 -> four words: Y pointer, U pointer, V pointer, Y pitch in bytes
 *       r2 =  width in pixels
 *       r3 =  height in rows
 * Out:  no register result; pixels are written through the output pointer.
 * Saves and restores r4-r8, r10, r11 and q4-q7; returns by popping pc.
 *
 * What the code requires of its caller:
 *  - The :128 qualifiers on the Y loads and output stores, and :64 on the
 *    U/V loads, assert 16-byte / 8-byte aligned addresses on every row.
 *  - The width is rounded up to a multiple of 16, so each input and output
 *    row must have room for the rounded width.
 *  - Rows are processed in pairs; an odd height still reads and writes a
 *    full second row on the last pass.
 *  - U and V are stepped by half of the Y row padding, i.e. chroma rows are
 *    taken to be YPITCH/2 bytes apart.
 *
 * The A byte is written as 255 for every pixel.
 */
function i420_rgb_neon
	push		{r4-r8,r10-r11,lr}
	vpush		{q4-q7}

	/* load arguments (r0 is both the base and the first register loaded) */
	ldmia		r0,	{O1, OPITCH}
	ldmia		r1,	{Y1, U, V, YPITCH}

	/* round the width up to a multiple of 16 (OPAD is only scratch here) */
	ands		OPAD, WIDTH, #15
	sub			WIDTH, WIDTH, OPAD
	addne		WIDTH, WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8		coefY, #74
	vmov.u8		coefRV, #115
	vmov.u8		coefGU, #14
	vmov.u8		coefGV, #34
	vmov.u8		coefBU, #135
	/* replicate each 16-bit table entry across a whole Q register */
	adr			OPAD, coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!		/* Rc */
	vld1.s16	{d8[], d9[]}, [OPAD]!		/* Gc */
	vld1.s16	{d10[], d11[]}, [OPAD]!	/* Bc */
	/* No alpha setup here: alpha1/alpha2 overlap Q13/Q15, which are
	 * overwritten inside the loop, so they are set right before each store. */

	/* init padding: bytes from the end of the rounded row to the next row.
	 * The cmp sets the flags tested on the first pass through loop_row;
	 * the two sub instructions do not touch them. */
	cmp			HEIGHT,	#0
	sub			OPAD,	OPITCH,	WIDTH, lsl #2
	sub			YPAD,	YPITCH,	WIDTH

loop_row:
	/* Flags on entry come from HEIGHT (cmp above, or subs at the bottom).
	 * If rows remain, movs reloads the column counter and re-sets the flags
	 * from WIDTH, so a zero width also takes the exit path below. */
	movsgt	COUNT,	WIDTH
	add		O2,	O1,	OPITCH
	add		Y2,	Y1,	YPITCH
	/* exit if all rows have been processed */
	vpople	{q4-q7}
	pople	{r4-r8,r10-r11,pc}

loop_col:

	/* Common U & V: 8 samples each, shared by 16 columns of both rows */

	vld1.u8	{u}, [U,:64]!
	vld1.u8	{v}, [V,:64]!

	/* Y Top Row: de-interleaved, y1 = even columns, y2 = odd columns */
	vld2.u8	{y1,y2}, [Y1,:128]!

	/* Q14/Q11/Q13 are scratch here; Q13 and Q14 are reused below */
	vmull.u8	Q14, v, coefRV
	vmull.u8	Q11, u, coefGU
	vmull.u8	Q13, u, coefBU
	vmlal.u8	Q11, v, coefGV

	vmull.u8	lumi2, y2, coefY
	vmull.u8	lumi1, y1, coefY
	vadd.s16	chro_r, Rc, Q14
	vadd.s16	chro_b, Bc, Q13
	vsub.s16	chro_g, Gc, Q11

	pld	[U]
	pld	[V]

	/* chrominance + luminance
	 * (the lumi2 sums come first: green16_1 shares Q10 with lumi2) */
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64)
	 * (blue2 first: red2/green2 are the two halves of blue16_2) */
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y1]

	/* Y Bottom Row */
	vld2.u8	{y1,y2}, [Y2,:128]!

	/* Both alpha lanes must be set: alpha1 was clobbered by the Q13
	 * scratch/green16_2 writes and alpha2 by lumi1 (Q15).  Q13 and Q15
	 * have been fully consumed by this point. */
	vmov.u8	alpha1, #255
	vmov.u8	alpha2, #255
	/* re-interleave even/odd columns: *1 = columns 0-7, *2 = columns 8-15 */
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	vmull.u8	lumi2, y2, coefY
	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!

	/* chrominance + luminance (chro_* are reused from the top row) */
	vmull.u8	lumi1, y1, coefY
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y2]

	/* Same as for the top row: green16_2 and lumi1 were rewritten above,
	 * so both alpha lanes have to be refreshed before the store. */
	vmov.u8	alpha1, #255
	vmov.u8	alpha2, #255
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!

	/* next columns (x16) */
	subs	COUNT,	COUNT,	#16
	bgt		loop_col

	/* next rows (x2): the flags from subs are tested at loop_row; the adds
	 * below leave them untouched.  O2/Y2 already point past the second row's
	 * pixels, so adding the padding lands on the row after it. */
	subs	HEIGHT,	#2
	add		O1,	O2,	OPAD
	add		Y1,	Y2,	YPAD
	add		U,	U,	YPAD,	lsr #1
	add		V,	V,	YPAD,	lsr #1
	b		loop_row