diff --git a/modules/arm_neon/Modules.am b/modules/arm_neon/Modules.am index 31064859d49575f5d5f2c85914ef6ca463c8e752..decb3b8189f0eb6b622c4d9f4b8aaf11fed3cf33 100644 --- a/modules/arm_neon/Modules.am +++ b/modules/arm_neon/Modules.am @@ -21,6 +21,7 @@ libvolume_neon_plugin_la_LIBADD = $(AM_LIBADD) libyuv_rgb_neon_plugin_la_SOURCES = \ i420_rgb.S \ + i420_rv16.S \ nv21_rgb.S \ nv12_rgb.S \ yuv_rgb.c diff --git a/modules/arm_neon/chroma_neon.h b/modules/arm_neon/chroma_neon.h index 708d1218ee7e6db22ac84da8251f2f072a0cf82d..865315a7d1a0a261ab911b919fc3db3876b7b1a4 100644 --- a/modules/arm_neon/chroma_neon.h +++ b/modules/arm_neon/chroma_neon.h @@ -72,6 +72,11 @@ void i420_rgb_neon (struct yuv_pack *const out, const struct yuv_planes *const in, int width, int height) asm("i420_rgb_neon"); +/* I420 to RV16 conversion (the i420_rv16_neon symbol is defined in i420_rv16.S). */ +void i420_rv16_neon (struct yuv_pack *const out, + const struct yuv_planes *const in, + int width, int height) asm("i420_rv16_neon"); + /* NV21 to RGBA conversion. */ void nv21_rgb_neon (struct yuv_pack *const out, const struct yuv_planes *const in, diff --git a/modules/arm_neon/i420_rv16.S b/modules/arm_neon/i420_rv16.S new file mode 100644 index 0000000000000000000000000000000000000000..cd6d2696c596747144f9b0b99f43a83f4532cefd --- /dev/null +++ b/modules/arm_neon/i420_rv16.S @@ -0,0 +1,227 @@ + @***************************************************************************** + @ i420_rv16.S : ARM NEONv1 I420 to RV16 chroma conversion + @***************************************************************************** + @ Copyright (C) 2011 Sébastien Toque + @ Rémi Denis-Courmont + @ + @ This program is free software; you can redistribute it and/or modify it + @ under the terms of the GNU Lesser General Public License as published by + @ the Free Software Foundation; either version 2.1 of the License, or + @ (at your option) any later version. 
+ @ + @ This program is distributed in the hope that it will be useful, + @ but WITHOUT ANY WARRANTY; without even the implied warranty of + @ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + @ GNU Lesser General Public License for more details. + @ + @ You should have received a copy of the GNU Lesser General Public License + @ along with this program; if not, write to the Free Software Foundation, + @ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. + @****************************************************************************/ + + .syntax unified + .fpu neon + .text + +/* ARM */ +#define O1 r0 /* output pointer, top row of the current row pair */ +#define O2 r1 /* output pointer, bottom row of the current row pair */ +#define WIDTH r2 +#define HEIGHT r3 +#define Y1 r4 /* luma pointer, top row */ +#define Y2 r5 /* luma pointer, bottom row */ +#define U r6 +#define V r7 +#define YPITCH r8 +#define OPAD r10 /* output bytes skipped after each row */ +#define YPAD r11 /* luma bytes skipped after each row */ +#define COUNT ip /* columns left in the current row pair */ +#define OPITCH lr + +/* NEON */ +#define coefY D0 +#define coefRV D1 +#define coefGU D2 +#define coefGV D3 +#define coefBU D4 +#define Rc Q3 /* Rc, Gc, Bc: 16-bit constants loaded from the coefficients table */ +#define Gc Q4 +#define Bc Q5 + +#define u D24 +#define v D25 +#define y1 D18 +#define y2 D19 + +#define chro_r Q6 /* chro_r, chro_g, chro_b: computed once per column group, used for both rows */ +#define chro_g Q7 +#define chro_b Q8 +#define lumi1 Q15 +#define lumi2 Q10 /* same register as green16_1 */ +#define red16_1 Q9 +#define green16_1 Q10 +#define blue16_1 Q11 +#define red16_2 Q12 +#define green16_2 Q13 +#define blue16_2 Q14 + +#define red1 D25 +#define green1 D26 +#define blue1 D27 +#define red2 D29 +#define green2 D30 +#define blue2 D31 + +#define out1l D24 +#define out1h D25 /* same register as red1 */ +#define out2l D28 +#define out2h D29 /* same register as red2 */ + +coefficients: /* three 16-bit constants, broadcast into Rc, Gc and Bc in that order */ + .short -15872 + .short 4992 + .short -18432 + + .align 2 + .global i420_rv16_neon + .type i420_rv16_neon, %function +i420_rv16_neon: /* r0 = out, r1 = in, r2 = width, r3 = height, following the prototype in chroma_neon.h and the register aliases above */ + push {r4-r8,r10-r11,lr} /* restored by the conditional pop at loop_row */ + vpush {q4-q7} /* restored by the conditional vpop at loop_row */ + + /* load arguments */ + ldmia r0, {O1, OPITCH} /* out: pixel pointer, then pitch */ + ldmia r1, {Y1, U, V, YPITCH} /* in: Y, U and V pointers, then luma pitch */ + + /* round the width up to a multiple of 16 (OPAD is only a scratch register here). NOTE(review): rows are then processed up to the rounded width, so the picture pitches presumably leave room for up to 15 extra pixels - confirm against callers */ + ands OPAD, WIDTH, #15 + sub WIDTH, WIDTH, OPAD + addne WIDTH, WIDTH, #16 + + /* init constants (scale value by 64) */ + vmov.u8 coefY, #74 + vmov.u8 coefRV, #115 + vmov.u8 coefGU, #14 + vmov.u8 
coefGV, #34 + vmov.u8 coefBU, #135 + adr OPAD, coefficients + vld1.s16 {d6[], d7[]}, [OPAD]! /* every lane of Rc = -15872 */ + vld1.s16 {d8[], d9[]}, [OPAD]! /* every lane of Gc = 4992 */ + vld1.s16 {d10[], d11[]}, [OPAD]! /* every lane of Bc = -18432 */ + + /* init padding */ + cmp HEIGHT, #0 /* flags are consumed by movsgt at loop_row; the two sub instructions below leave them alone */ + sub OPAD, OPITCH, WIDTH, lsl #1 /* two output bytes per pixel */ + sub YPAD, YPITCH, WIDTH + +loop_row: + movsgt COUNT, WIDTH /* rows left: reload the column counter; the flags are then updated from WIDTH */ + add O2, O1, OPITCH + add Y2, Y1, YPITCH + /* exit if all rows have been processed */ + vpople {q4-q7} + pople {r4-r8,r10-r11,pc} + +loop_col: + + /* Common U & V */ + + vld1.u8 {u}, [U,:64]! /* 8 U samples; :64 is an alignment qualifier on the address */ + vld1.u8 {v}, [V,:64]! /* 8 V samples */ + + /* Y Top Row */ + vld2.u8 {y1,y2}, [Y1,:128]! /* 16 luma bytes, de-interleaved: even columns in y1, odd columns in y2 */ + + vmull.u8 Q14, v, coefRV + vmull.u8 Q11, u, coefGU + vmull.u8 Q13, u, coefBU + vmlal.u8 Q11, v, coefGV /* Q11 = u * coefGU + v * coefGV */ + + vmull.u8 lumi2, y2, coefY + vmull.u8 lumi1, y1, coefY + vadd.s16 chro_r, Rc, Q14 + vadd.s16 chro_b, Bc, Q13 + vsub.s16 chro_g, Gc, Q11 /* the green chroma term is subtracted, the red and blue ones are added */ + + pld [U] + pld [V] + + /* chrominance + luminance */ + vqadd.s16 red16_2, lumi2, chro_r + vqadd.s16 green16_2, lumi2, chro_g + vqadd.s16 blue16_2, lumi2, chro_b + vqadd.s16 red16_1, lumi1, chro_r + vqadd.s16 green16_1, lumi1, chro_g /* overwrites lumi2, which is no longer needed at this point */ + vqadd.s16 blue16_1, lumi1, chro_b + + /* clamp (divide by 64) */ + vqrshrun.s16 green2, green16_2, #6 + vqrshrun.s16 blue2, blue16_2, #6 + vqrshrun.s16 red2, red16_2, #6 + vqrshrun.s16 green1, green16_1, #6 + vqrshrun.s16 red1, red16_1, #6 + vqrshrun.s16 blue1, blue16_1, #6 + + pld [Y1] + + /* pack into RGB565: high byte = R[7:3] G[7:5] (out1h and out2h already hold red), low byte = G[4:2] B[7:3] */ + vshl.u8 out2l, green2, #3 // low 2a + vsri.u8 out2h, green2, #5 // high 2 + vshl.u8 out1l, green1, #3 // low 1a + vsri.u8 out1h, green1, #5 // high 1 + vsri.u8 out2l, blue2, #3 // low 2b + vsri.u8 out1l, blue1, #3 // low 1b + + /* Y Bottom Row */ + vld2.u8 {y1,y2}, [Y2,:128]! + + /* Top Row output (interleaved with the luma multiplies of the bottom row) */ + vzip.u8 out1h, out2h /* merge the even-column and odd-column halves back into column order */ + vmull.u8 lumi2, y2, coefY + vzip.u8 out1l, out2l + vmull.u8 lumi1, y1, coefY /* lumi1 is Q15, i.e. the registers of green2 and blue2, which were consumed by the packing above */ + vst2.u8 {out1l, out1h}, [O1,:128]! /* vst2 interleaves low and high bytes, low byte first */ + vst2.u8 {out2l, out2h}, [O1,:128]! 
+ + /* chrominance + luminance */ + vqadd.s16 green16_2, lumi2, chro_g + vqadd.s16 red16_2, lumi2, chro_r + vqadd.s16 blue16_2, lumi2, chro_b + vqadd.s16 red16_1, lumi1, chro_r + vqadd.s16 green16_1, lumi1, chro_g + vqadd.s16 blue16_1, lumi1, chro_b + + /* clamp (divide by 64) */ + vqrshrun.s16 green2, green16_2, #6 + vqrshrun.s16 blue2, blue16_2, #6 + vqrshrun.s16 red2, red16_2, #6 + vqrshrun.s16 green1, green16_1, #6 + vqrshrun.s16 red1, red16_1, #6 + vqrshrun.s16 blue1, blue16_1, #6 + + pld [Y1] /* NOTE(review): this is the bottom-row section and Y1 was already prefetched above; Y2 may have been intended - confirm */ + + /* pack into RGB565 (same byte layout as for the top row) */ + vshl.u8 out2l, green2, #3 // low 2a + vsri.u8 out2h, green2, #5 // high 2 + vshl.u8 out1l, green1, #3 // low 1a + vsri.u8 out1h, green1, #5 // high 1 + vsri.u8 out2l, blue2, #3 // low 2b + vsri.u8 out1l, blue1, #3 // low 1b + + vzip.u8 out1h, out2h + vzip.u8 out1l, out2l + vst2.u8 {out1l, out1h}, [O2,:128]! + vst2.u8 {out2l, out2h}, [O2,:128]! + + /* next columns (x16) */ + subs COUNT, COUNT, #16 + bgt loop_col + + /* next rows (x2) */ + subs HEIGHT, #2 /* sets the flags tested at loop_row; the add instructions below leave them alone */ + add O1, O2, OPAD /* O2 is at the end of the bottom row, so this is the start of the next row pair */ + add Y1, Y2, YPAD + add U, U, YPAD, lsr #1 /* NOTE(review): chroma advances by half of YPAD, which presumably assumes a chroma pitch of YPITCH / 2; no chroma pitch is loaded - confirm */ + add V, V, YPAD, lsr #1 + b loop_row diff --git a/modules/arm_neon/yuv_rgb.c b/modules/arm_neon/yuv_rgb.c index 0fb29a2081cc9b0048fef69eaeb49c8faeddf0f1..d28a27ef1e5ceb44bab6257188bc286ba7e836af 100644 --- a/modules/arm_neon/yuv_rgb.c +++ b/modules/arm_neon/yuv_rgb.c @@ -95,6 +95,14 @@ static void I420_RGBA (filter_t *filter, picture_t *src, picture_t *dst) struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH }; i420_rgb_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); } + +static void I420_RV16 (filter_t *filter, picture_t *src, picture_t *dst) +{ /* Describe the destination plane and the three source planes, then run i420_rv16_neon over the input width and height. */ + struct yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch }; + struct yuv_planes in = { src->Y_PIXELS, src->U_PIXELS, src->V_PIXELS, src->Y_PITCH }; + i420_rv16_neon (&out, &in, filter->fmt_in.video.i_width, filter->fmt_in.video.i_height); +} + static void YV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst) { struct 
yuv_pack out = { dst->p->p_pixels, dst->p->i_pitch }; @@ -117,6 +125,7 @@ static void NV12_RGBA (filter_t *filter, picture_t *src, picture_t *dst) } VIDEO_FILTER_WRAPPER (I420_RGBA) +VIDEO_FILTER_WRAPPER (I420_RV16) /* presumably defines the I420_RV16_Filter symbol that Open installs; the macro is not visible here */ VIDEO_FILTER_WRAPPER (YV12_RGBA) VIDEO_FILTER_WRAPPER (NV21_RGBA) VIDEO_FILTER_WRAPPER (NV12_RGBA) @@ -135,6 +144,17 @@ static int Open (vlc_object_t *obj) switch (filter->fmt_out.video.i_chroma) { + case VLC_CODEC_RGB16: /* NOTE(review): no colour mask test is visible for this case, unlike the RGB32 case below - confirm that the 16-bit output layout always matches what i420_rv16_neon writes */ + switch (filter->fmt_in.video.i_chroma) + { + case VLC_CODEC_I420: + filter->pf_video_filter = I420_RV16_Filter; + break; + default: + return VLC_EGENERIC; + } + break; + case VLC_CODEC_RGB32: if( filter->fmt_out.video.i_rmask != 0x000000ff || filter->fmt_out.video.i_gmask != 0x0000ff00