 @*****************************************************************************
 @ i420_rgb.S : ARM NEONv1 I420 to RGB chroma conversion
 @*****************************************************************************
 @ Copyright (C) 2011 Sébastien Toque
 @                    Rémi Denis-Courmont
 @
 @ This program is free software; you can redistribute it and/or modify it
 @ under the terms of the GNU Lesser General Public License as published by
 @ the Free Software Foundation; either version 2.1 of the License, or
 @ (at your option) any later version.
 @
 @ This program is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 @ GNU Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public License
 @ along with this program; if not, write to the Free Software Foundation,
 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 @****************************************************************************/

/*
 * Assembler setup: unified ARM/Thumb syntax, NEON instruction set enabled,
 * and everything that follows is emitted into the text section.
 */
	.syntax unified
	.fpu neon
	.text

/*
 * ARM core register aliases used by i420_rgb_neon.
 *
 * r0-r3 carry the incoming arguments; r0 is reloaded with the output
 * pointer once the argument block it points to has been read.
 * r9 is deliberately left unused.
 */
#define O1	r0	/* output pointer, top row of the current row pair */
#define O2	r1	/* output pointer, bottom row (O1 + OPITCH) */
#define WIDTH	r2	/* pixels per row, rounded up to a multiple of 16 */
#define HEIGHT	r3	/* rows still to convert, decremented by 2 */
#define Y1	r4	/* luma pointer, top row */
#define Y2	r5	/* luma pointer, bottom row (Y1 + YPITCH) */
#define U	r6	/* U plane pointer */
#define V	r7	/* V plane pointer */
#define YPITCH	r8	/* luma pitch in bytes */
#define OPAD	r10	/* OPITCH - 4 * WIDTH; also scratch during setup */
#define YPAD	r11	/* YPITCH - WIDTH */
#define COUNT	ip	/* pixels left in the current row pair */
#define OPITCH	lr	/* output pitch in bytes */

/*
 * NEON register aliases.
 *
 * Each Qn register is the pair D(2n), D(2n+1), so several aliases below
 * share storage on purpose; the instruction order in the conversion loop
 * relies on a value being fully consumed before its alias is written:
 *
 *   Q9  (red16_1)   = D18,D19 = y1, y2
 *   Q10             = lumi2 and green16_1
 *   Q12 (red16_2)   = D24,D25 = u / red1, v / green1
 *   Q13 (green16_2) = D26,D27 = blue1, alpha1
 *   Q14 (blue16_2)  = D28,D29 = red2, green2
 *   Q15 (lumi1)     = D30,D31 = blue2, alpha2
 */

/* 8-bit multiplier constants, one byte per lane */
#define coefY	D0
#define coefRV	D1
#define coefGU	D2
#define coefGV	D3
#define coefBU	D4
/* 16-bit per-channel bias constants (Q4 and Q5 are callee-saved) */
#define Rc	Q3
#define Gc	Q4
#define Bc	Q5

/* 8 chroma samples each, shared by 16 pixels on two rows */
#define u	D24
#define v	D25
/* de-interleaved luma: y1 = even pixels, y2 = odd pixels */
#define y1	D18
#define y2	D19

/* per-channel chroma contribution, kept across both rows (Q6/Q7 callee-saved) */
#define chro_r	Q6
#define chro_g	Q7
#define chro_b	Q8
/* scaled luma for even (1) and odd (2) pixels */
#define lumi1	Q15
#define lumi2	Q10
/* 16-bit channel sums before narrowing */
#define red16_1		Q9
#define green16_1	Q10
#define blue16_1	Q11
#define red16_2		Q12
#define green16_2	Q13
#define blue16_2	Q14

/* 8-bit channels laid out as two consecutive 4-register groups for vst4 */
#define red1	D24
#define green1	D25
#define blue1	D26
#define alpha1	D27
#define red2	D28
#define green2	D29
#define blue2	D30
#define alpha2	D31

/*
 * Signed 16-bit bias added to each channel's chroma term before the luma
 * term is added.  i420_rgb_neon broadcasts the three values, in this order,
 * into Q3, Q4 and Q5 (Rc, Gc, Bc).
 *
 * With the multipliers used by that routine (Y 74, RV 115, GU 14, GV 34,
 * BU 135, all scaled by 64) the values satisfy:
 *   Rc = 32 - 16*74 - 128*115
 *   Gc = 32 - 16*74 + 128*(14 + 34)
 *   Bc = 32 - 16*74 - 128*135
 * NOTE(review): presumably this folds the Y-16 and U/V-128 offsets of
 * limited-range YUV into one constant per channel - confirm.
 */
coefficients:
    .short  -15872	/* Rc */
    .short    4992	/* Gc */
    .short  -18432	/* Bc */

/*
 * i420_rgb_neon - convert planar I420 to packed 32-bit pixels stored in
 * memory as R, G, B, A bytes (A is always 255).
 *
 * In:
 *   r0 = pointer to two words:  { output pointer, output pitch in bytes }
 *   r1 = pointer to four words: { Y pointer, U pointer, V pointer, Y pitch }
 *   r2 = width in pixels; rounded up to a multiple of 16 internally, so the
 *        buffers must have room for the rounded width
 *   r3 = height in rows; rows are processed in pairs
 *
 * Requirements visible from the addressing used below:
 *   - Y and output row pointers must be 16-byte aligned, U and V row
 *     pointers 8-byte aligned (":128" / ":64" alignment qualifiers).
 *   - U and V rows are taken to be YPITCH / 2 bytes apart.
 *
 * Preserves r4-r8, r10, r11 and q4-q7 (saved and restored on the stack).
 * Returns immediately when height <= 0 or width <= 0.
 *
 * Each inner iteration converts 16 pixels on two rows from 8 U and 8 V
 * samples.  Many NEON aliases overlap (Qn = D2n, D2n+1), so the order of
 * the instructions below matters: a value must be consumed before the
 * register it shares is overwritten.
 */
	.align 2
	.global i420_rgb_neon
	.type	i420_rgb_neon, %function
i420_rgb_neon:
	push		{r4-r8,r10-r11,lr}
	vpush		{q4-q7}

	/* load arguments (r0 is overwritten by the output pointer here) */
	ldmia		r0,	{O1, OPITCH}
	ldmia		r1,	{Y1, U, V, YPITCH}

	/* round the width up to a multiple of 16 (OPAD is scratch here) */
	ands		OPAD, WIDTH, #15
	sub			WIDTH, WIDTH, OPAD
	addne		WIDTH, WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8		coefY, #74
	vmov.u8		coefRV, #115
	vmov.u8		coefGU, #14
	vmov.u8		coefGV, #34
	vmov.u8		coefBU, #135
	/* broadcast the three 16-bit biases into Rc (Q3), Gc (Q4), Bc (Q5) */
	adr			OPAD, coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!
	vld1.s16	{d8[], d9[]}, [OPAD]!
	vld1.s16	{d10[], d11[]}, [OPAD]!

	/* init padding; the cmp result is consumed at the top of loop_row */
	cmp			HEIGHT,	#0
	sub			OPAD,	OPITCH,	WIDTH, lsl #2
	sub			YPAD,	YPITCH,	WIDTH

loop_row:
	/*
	 * Flags come from "cmp HEIGHT, #0" or "subs HEIGHT, #2".  When rows
	 * remain, movs reloads the column counter and re-sets N/Z from WIDTH,
	 * so the conditional pops below also fire for a width <= 0.
	 */
	movsgt	COUNT,	WIDTH
	add		O2,	O1,	OPITCH
	add		Y2,	Y1,	YPITCH
	/* exit if all rows have been processed */
	vpople	{q4-q7}
	pople	{r4-r8,r10-r11,pc}

loop_col:

	/* Common U & V */
	vld1.u8	{u}, [U,:64]!
	vld1.u8	{v}, [V,:64]!

	/* Y Top Row: y1 = even pixels, y2 = odd pixels */
	vld2.u8	{y1,y2}, [Y1,:128]!

	/* chroma products; Q14, Q11 and Q13 are temporaries here */
	vmull.u8	Q14, v, coefRV
	vmull.u8	Q11, u, coefGU
	vmull.u8	Q13, u, coefBU
	vmlal.u8	Q11, v, coefGV

	vmull.u8	lumi2, y2, coefY
	vmull.u8	lumi1, y1, coefY
	vadd.s16	chro_r, Rc, Q14
	vadd.s16	chro_b, Bc, Q13
	vsub.s16	chro_g, Gc, Q11

	pld	[U]
	pld	[V]

	/* chrominance + luminance */
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64); blue2 first, red2/green2 overwrite blue16_2 */
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y1]

	/* Y Bottom Row */
	vld2.u8	{y1,y2}, [Y2,:128]!

	/*
	 * Both alpha lanes must be reloaded before every store pair:
	 * alpha1 (D27) is the upper half of green16_2 (Q13) and alpha2 (D31)
	 * is the upper half of lumi1 (Q15), so each was clobbered above.
	 */
	vmov.u8	alpha1, #255
	vmov.u8	alpha2, #255
	/* merge even/odd pixels back into pixel order */
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	vmull.u8	lumi2, y2, coefY
	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!

	/* chrominance + luminance (lumi1 is written only after the stores) */
	vmull.u8	lumi1, y1, coefY
	vqadd.s16	red16_2, lumi2, chro_r
	vqadd.s16	green16_2, lumi2, chro_g
	vqadd.s16	blue16_2, lumi2, chro_b
	vqadd.s16	red16_1, lumi1, chro_r
	vqadd.s16	green16_1, lumi1, chro_g
	vqadd.s16	blue16_1, lumi1, chro_b

	/* clamp (divide by 64) */
	vqrshrun.s16	blue2, blue16_2, #6
	vqrshrun.s16	red2, red16_2, #6
	vqrshrun.s16	green2, green16_2, #6
	vqrshrun.s16	red1, red16_1, #6
	vqrshrun.s16	green1, green16_1, #6
	vqrshrun.s16	blue1, blue16_1, #6

	pld	[Y2]

	/* same clobbering as for the top row: reload both alpha lanes */
	vmov.u8	alpha1, #255
	vmov.u8	alpha2, #255
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!

	/* next columns (x16) */
	subs	COUNT,	COUNT,	#16
	bgt		loop_col

	/* next rows (x2); subs sets the flags tested at loop_row */
	subs	HEIGHT,	#2
	add		O1,	O2,	OPAD
	add		Y1,	Y2,	YPAD
	/* chroma advanced WIDTH/2 bytes, so its row padding is YPAD/2 */
	add		U,	U,	YPAD,	lsr #1
	add		V,	V,	YPAD,	lsr #1
	b		loop_row