 @*****************************************************************************
 @ nv12_rgb.S : ARM NEONv1 NV12 to RGB chroma conversion
 @*****************************************************************************
 @ Copyright (C) 2011 Sébastien Toque
 @                    Rémi Denis-Courmont
 @
 @ This program is free software; you can redistribute it and/or modify it
 @ under the terms of the GNU Lesser General Public License as published by
 @ the Free Software Foundation; either version 2.1 of the License, or
 @ (at your option) any later version.
 @
 @ This program is distributed in the hope that it will be useful,
 @ but WITHOUT ANY WARRANTY; without even the implied warranty of
 @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 @ GNU Lesser General Public License for more details.
 @
 @ You should have received a copy of the GNU Lesser General Public License
 @ along with this program; if not, write to the Free Software Foundation,
 @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 @****************************************************************************/

/* NOTE(review): asm.S presumably supplies the `function` macro and the
 * HAVE_AS_FPU_DIRECTIVE setting used in this file - confirm in asm.S. */
#include "asm.S"

	/* One mnemonic syntax for ARM and Thumb; the conditional forms used
	 * later (movsgt, vpople, pople) are written in this syntax. */
	.syntax unified
#if HAVE_AS_FPU_DIRECTIVE
	/* Let the assembler accept NEON instructions when it knows .fpu. */
	.fpu	neon
#endif
	.text

/*
 * ARM core register roles, as used by nv12_rgb_neon.
 *
 *   O1 / O2   write pointers for the top / bottom output row of a row pair
 *   WIDTH     pixels per row (rounded up to a multiple of 16 by the code)
 *   HEIGHT    rows left to process
 *   Y1 / Y2   read pointers for the top / bottom luma row of a row pair
 *   U         read pointer for the interleaved chroma bytes
 *   V         receives the third word of the r1 argument block; it is not
 *             referenced again by the code in this file
 *   YPITCH    byte distance between two luma rows
 *   OPAD      scratch during setup, then bytes to skip at the end of an
 *             output row
 *   YPAD      bytes to skip at the end of a luma row
 *   COUNT     pixels left in the current row pair
 *   OPITCH    byte distance between two output rows
 *
 * r9 is deliberately left untouched.
 */
#define O1	r0
#define O2	r1
#define WIDTH	r2
#define HEIGHT	r3
#define Y1	r4
#define Y2	r5
#define U	r6
#define V	r7
#define YPITCH	r8
#define OPAD	r10
#define YPAD	r11
#define COUNT	ip
#define OPITCH	lr

/*
 * NEON register roles.
 *
 * coefY..coefBU are 8-bit multipliers, set once before the loops.
 * Rc/Gc/Bc are 16-bit per-channel bias terms (Q3 = d6:d7, Q4 = d8:d9,
 * Q5 = d10:d11), also set once.
 */
#define coefY	D0
#define coefRV	D1
#define coefGU	D2
#define coefGV	D3
#define coefBU	D4
#define Rc	Q3
#define Gc	Q4
#define Bc	Q5

/*
 * Input samples. These D registers are reused below for packed output
 * bytes (u/red1 = D24, v/green1 = D25, y1/red2 = D28, y2/green2 = D29),
 * so each input must be consumed before the matching output is written.
 */
#define u	D24
#define v	D25
#define y1	D28
#define y2	D29

/*
 * 16-bit intermediates. lumi is Q15 = D30:D31, which overlaps blue2 and
 * alpha2 below: computing lumi destroys alpha2.
 */
#define chro_r	Q6
#define chro_g	Q7
#define chro_b	Q8
#define red		Q9
#define green	Q10
#define blue	Q11
#define lumi	Q15

/*
 * Packed 8-bit output channels. D24-D27 and D28-D31 are consecutive so
 * that each group can be stored with a single four-register vst4.
 */
#define red1	D24
#define green1	D25
#define blue1	D26
#define alpha1	D27
#define red2	D28
#define green2	D29
#define blue2	D30
#define alpha2	D31

/*
 * Signed 16-bit bias terms read by nv12_rgb_neon, in load order:
 * first Rc (red), then Gc (green), then Bc (blue). Each one is
 * replicated across a whole Q register and combined with the scaled
 * chroma products before luma is added.
 */
coefficients:
    .short  -15872
    .short    4992
    .short  -18432

	/* The table is 6 bytes long; realign before the code that follows
	 * (on ARM, .align takes a power of two, so this is 4 bytes). */
	.align 2
/*
 * nv12_rgb_neon
 *
 * Converts luma plus interleaved chroma into 4-byte pixels stored in
 * memory as R, G, B, A (A is always 255).
 *
 * In:
 *   r0  pointer to two words:  { output address, output pitch }
 *   r1  pointer to four words: { luma address, chroma address,
 *                                a third word (loaded, not used here),
 *                                luma pitch }
 *   r2  width in pixels
 *   r3  height in rows
 *
 * Work unit: one inner iteration handles 16 pixels on two rows. It reads
 * 16 chroma bytes (8 pairs), 16 luma bytes per row, and writes 64 bytes
 * per row. Every chroma pair is applied to two horizontally adjacent
 * pixels on both rows.
 *
 * Things the code relies on:
 *   - Width is rounded UP to a multiple of 16 and rows are consumed two
 *     at a time. NOTE(review): buffers therefore presumably need to be
 *     padded for non-multiple-of-16 widths and odd heights - confirm
 *     with callers.
 *   - All NEON loads/stores carry a :128 qualifier, i.e. they tell the
 *     CPU the address is 16-byte aligned.
 *   - The chroma pointer is advanced with the luma row padding, so the
 *     chroma rows are treated as having the same pitch as luma rows.
 *
 * Preserved: r4-r8, r10, r11 and q4-q7 are saved on entry and restored
 * on exit; the function returns by popping the saved lr into pc.
 */
function nv12_rgb_neon
	push		{r4-r8,r10-r11,lr}
	vpush		{q4-q7}

	/* load arguments (r0 is both the base and the first destination) */
	ldmia		r0,	{O1, OPITCH}
	ldmia		r1,	{Y1, U, V, YPITCH}

	/* round the width to be a multiple of 16:
	 * ands sets Z when there is no remainder, sub leaves the flags
	 * alone, so addne only adds 16 when a remainder was stripped. */
	ands		OPAD, WIDTH, #15
	sub			WIDTH, WIDTH, OPAD
	addne		WIDTH, WIDTH, #16

	/* init constants (scale value by 64) */
	vmov.u8		coefY, #74
	vmov.u8		coefRV, #115
	vmov.u8		coefGU, #14
	vmov.u8		coefGV, #34
	vmov.u8		coefBU, #135
	/* Each vld1 reads one 16-bit value, replicates it over both D
	 * halves of a Q register and steps the pointer to the next value:
	 * d6:d7 = Rc, d8:d9 = Gc, d10:d11 = Bc. */
	adr			OPAD, coefficients
	vld1.s16	{d6[], d7[]}, [OPAD]!
	vld1.s16	{d8[], d9[]}, [OPAD]!
	vld1.s16	{d10[], d11[]}, [OPAD]!
	/* alpha1 (D27) is never overwritten below, so set it only once */
	vmov.u8		alpha1, #255

	/* init padding: bytes left in a row after WIDTH pixels
	 * (4 output bytes per pixel, 1 luma byte per pixel).
	 * The cmp result survives the two subs (no S suffix) and is the
	 * condition tested on the first pass through loop_row. */
	cmp			HEIGHT,	#0
	sub			OPAD,	OPITCH,	WIDTH, lsl #2
	sub			YPAD,	YPITCH,	WIDTH

loop_row:
	/* Flags here come from "cmp HEIGHT, #0" (first pass) or from
	 * "subs HEIGHT, #2" (later passes). Only when rows remain does
	 * movsgt reload COUNT and re-test on WIDTH; otherwise the "le"
	 * state is kept. The adds below do not touch the flags. */
	movsgt	COUNT,	WIDTH
	add		O2,	O1,	OPITCH
	add		Y2,	Y1,	YPITCH
	/* exit if all rows have been processed (or the width is not positive) */
	vpople	{q4-q7}
	pople	{r4-r8,r10-r11,pc}

loop_col:

	/* Common U & V: de-interleave 8 byte pairs, first byte of each
	 * pair into u, second into v */

	vld2.u8	{u,v}, [U,:128]!

	vmull.u8	chro_r, v, coefRV
	vmull.u8	chro_g, u, coefGU
	vmlal.u8	chro_g, v, coefGV
	vmull.u8	chro_b, u, coefBU

	/* fold in the per-channel bias; green subtracts its chroma term */
	vadd.s16	chro_r, Rc, chro_r
	vsub.s16	chro_g, Gc, chro_g
	vadd.s16	chro_b, Bc, chro_b

	pld	[U]

	/* Y Top Row: y1 = even pixels, y2 = odd pixels of the 16 loaded */
	vld2.u8	{y1,y2}, [Y1,:128]!

	/* y1 : chrominance + luminance, then clamp (divide by 64) */
	vmull.u8	lumi, y1, coefY
	vqadd.s16	red, lumi, chro_r
	vqadd.s16	green, lumi, chro_g
	vqadd.s16	blue, lumi, chro_b
	vqrshrun.s16	red1, red, #6
	vqrshrun.s16	green1, green, #6
	vqrshrun.s16	blue1, blue, #6

	/* y2 : chrominance + luminance, then clamp (divide by 64).
	 * y2 is read by vmull before green2 (same register) is written. */
	vmull.u8	lumi, y2, coefY
	vqadd.s16	red, lumi, chro_r
	vqadd.s16	green, lumi, chro_g
	vqadd.s16	blue, lumi, chro_b
	vqrshrun.s16	red2, red, #6
	vqrshrun.s16	green2, green, #6
	vqrshrun.s16	blue2, blue, #6

	pld	[Y1]

	/* alpha2 shares its register with the upper half of lumi, so it
	 * has to be set again after every luma multiply */
	vmov.u8	alpha2, #255
	/* merge even/odd pixels back into order: after the zips the "1"
	 * registers hold pixels 0-7 and the "2" registers pixels 8-15 */
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	/* interleaved store: R,G,B,A bytes for 8 pixels per instruction */
	vst4.u8		{red1,green1,blue1,alpha1}, [O1,:128]!
	vst4.u8		{red2,green2,blue2,alpha2}, [O1,:128]!

	/* Y Bottom Row: same chroma terms, next luma row */
	vld2.u8	{y1,y2}, [Y2,:128]!

	/* y1 : chrominance + luminance, then clamp (divide by 64) */
	vmull.u8	lumi, y1, coefY
	vqadd.s16	red, lumi, chro_r
	vqadd.s16	green, lumi, chro_g
	vqadd.s16	blue, lumi, chro_b
	vqrshrun.s16	red1, red, #6
	vqrshrun.s16	green1, green, #6
	vqrshrun.s16	blue1, blue, #6

	/* y2 : chrominance + luminance, then clamp (divide by 64) */
	vmull.u8	lumi, y2, coefY
	vqadd.s16	red, lumi, chro_r
	vqadd.s16	green, lumi, chro_g
	vqadd.s16	blue, lumi, chro_b
	vqrshrun.s16	red2, red, #6
	vqrshrun.s16	green2, green, #6
	vqrshrun.s16	blue2, blue, #6

	pld	[Y2]

	/* alpha2 was clobbered by lumi again */
	vmov.u8	alpha2, #255
	vzip.u8	red1, red2
	vzip.u8	green1, green2
	vzip.u8	blue1, blue2

	vst4.u8		{red1,green1,blue1,alpha1}, [O2,:128]!
	vst4.u8		{red2,green2,blue2,alpha2}, [O2,:128]!

	/* next columns (x16) */
	subs	COUNT,	COUNT,	#16
	bgt		loop_col

	/* next rows (x2): subs leaves the flags that loop_row tests.
	 * O2/Y2 sit at the end of the bottom row, so adding the padding
	 * lands on the start of the row after it. */
	subs	HEIGHT,	#2
	add		O1,	O2,	OPAD
	add		Y1,	Y2,	YPAD
	add		U,	U,	YPAD
	b		loop_row