/*****************************************************************************
 * unicode.c: Unicode <-> locale functions
 *****************************************************************************
 * Copyright (C) 2005-2006 VLC authors and VideoLAN
 * Copyright © 2005-2010 Rémi Denis-Courmont
 *
 * Authors: Rémi Denis-Courmont <rem # videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

31
#include <vlc_common.h>
32 33

#include "libvlc.h"
34
#include <vlc_charset.h>
35

36
#include <assert.h>
37

38
#include <stdio.h>
39
#include <stdarg.h>
40
#include <stdlib.h>
41
#include <sys/types.h>
42
#if defined(_WIN32)
Pierre Ynard's avatar
Pierre Ynard committed
43
#  include <io.h>
44
#endif
45
#include <errno.h>
46
#include <wctype.h>
47

48 49 50 51
/**
 * Formats an UTF-8 string as vfprintf(), then prints it, with
 * appropriate conversion to local encoding.
 *
 * On Windows, the formatted text is converted to wide characters and written
 * with WriteConsoleW() when the stream is an interactive console, or with
 * fputws() otherwise. On other platforms this is a plain vfprintf().
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 * @param ap arguments for the format string
 *
 * @return as vfprintf(): the length in bytes of the formatted UTF-8 text on
 * success, or a negative value on error.
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
#ifndef _WIN32
    return vfprintf (stream, fmt, ap);
#else
    char *str;
    int res = vasprintf (&str, fmt, ap);
    if (unlikely(res == -1))
        return -1;

    /* Both output paths need wide characters, so convert only once. */
    wchar_t *wide = ToWide (str);
    free (str);
    if (unlikely(wide == NULL))
        return -1;

    BOOL written = FALSE;
#if !VLC_WINSTORE_APP
    /* Writing to the console is a lot of fun on Microsoft Windows.
     * If you use the standard I/O functions, you must use the OEM code page,
     * which is different from the usual ANSI code page. Or maybe not, if the
     * user called "chcp". Anyway, we prefer Unicode. */
    int fd = _fileno (stream);
    if (likely(fd != -1) && _isatty (fd))
    {
        HANDLE h = (HANDLE)((uintptr_t)_get_osfhandle (fd));
        DWORD count;
        /* WriteConsoleW() takes a number of characters (wchar_t units),
         * not a size in bytes. */
        written = WriteConsoleW (h, wide, (DWORD)wcslen (wide), &count, NULL);
    }
#endif
    /* Not a console, or the console write failed: use the wide stdio path.
     * fputws() only reports success or failure, so the byte count obtained
     * from vasprintf() is kept as the return value. */
    if (!written && fputws (wide, stream) < 0)
        res = -1;

    free (wide);
    return res;
#endif
}

98 99 100 101
/**
 * Formats an UTF-8 string as fprintf(), then prints it, with
 * appropriate conversion to local encoding.
 *
 * This is the variadic front-end of utf8_vfprintf().
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 *
 * @return whatever utf8_vfprintf() returns for the same arguments.
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list args;

    va_start( args, fmt );
    const int written = utf8_vfprintf( stream, fmt, args );
    va_end( args );

    return written;
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
113
/**
 * Decodes the first code point of a nul-terminated UTF-8 string.
 *
 * Stray continuation bytes, truncated sequences, overlong encodings,
 * UTF-16 surrogates and values beyond U+10FFFF are all rejected.
 *
 * @param str nul-terminated UTF-8 string (must not be NULL)
 * @param pwc where to store the decoded code point; left untouched on error
 *
 * @return the number of bytes occupied by the code point (1 to 4),
 * 0 if the string is empty (the stored code point is then 0),
 * or (size_t)-1 if the byte sequence is not valid UTF-8.
 */
size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    assert (str != NULL);

    const uint8_t *bytes = (const uint8_t *)str;
    const uint8_t lead = bytes[0];

    if (lead < 0x80)
    {   /* 7-bit ASCII short cut; the nul terminator has length zero */
        *pwc = lead;
        return lead != '\0';
    }

    /* 0x80-0xBF: stray continuation byte
     * 0xC0-0xC1: overlong encoding of an ASCII character
     * 0xF5-0xFF: would encode a value beyond the Unicode range */
    if (lead < 0xC2 || lead > 0xF4)
        return (size_t)-1;

    size_t charlen;   /* total length of the sequence in bytes */
    uint32_t cp;      /* code point being assembled */
    uint32_t lowest;  /* smallest value that legitimately needs charlen bytes */

    if (lead < 0xE0)
    {
        charlen = 2;
        cp = lead & 0x1F;
        lowest = 0x80;
    }
    else if (lead < 0xF0)
    {
        charlen = 3;
        cp = lead & 0x0F;
        lowest = 0x800;
    }
    else
    {
        charlen = 4;
        cp = lead & 0x07;
        lowest = 0x10000;
    }

    /* Stop at the first byte that is not a continuation byte: this also
     * catches the nul terminator, so we never read past the string end. */
    for (size_t i = 1; i < charlen; i++)
    {
        const uint8_t c = bytes[i];

        if ((c & 0xC0) != 0x80)
            return (size_t)-1;
        cp = (cp << 6) | (c & 0x3F);
    }

    if (cp < lowest) /* overlong encoding */
        return (size_t)-1;
    if (cp >= 0xD800 && cp < 0xE000) /* UTF-16 surrogate */
        return (size_t)-1;
    if (cp >= 0x110000) /* beyond Unicode range */
        return (size_t)-1;

    *pwc = cp;
    return charlen;
}
186

187 188 189 190 191 192 193 194
/**
 * Look for an UTF-8 string within another one in a case-insensitive fashion.
 * Beware that this is quite slow. Contrary to strcasestr(), this function
 * works regardless of the system character encoding, and handles multibyte
 * code points correctly.
 *
 * Code points are compared after folding each of them with towlower().
 *
 * @param haystack string to look into
 * @param needle string to look for
 *
 * @return a pointer to the first occurrence of the needle within the haystack,
 * or NULL if no occurrence was found. NULL is also returned if the needle is
 * not valid UTF-8, or if the search reaches an invalid sequence in the
 * haystack before finding a match.
 */
char *vlc_strcasestr (const char *haystack, const char *needle)
{
    for (;;)
    {
        const char *h = haystack, *n = needle;
        ssize_t s;

        /* Try to match the whole needle at the current haystack position */
        for (;;)
        {
            uint32_t cph, cpn;

            s = vlc_towc (n, &cpn);
            if (s == 0) /* end of needle: everything matched */
                return (char *)haystack;
            if (unlikely(s < 0)) /* invalid needle */
                return NULL;
            n += s;

            s = vlc_towc (h, &cph);
            if (s <= 0 || towlower (cph) != towlower (cpn))
                break;
            h += s;
        }

        /* Move on to the next code point of the haystack. Bail out BEFORE
         * advancing on end of string or invalid sequence, so the pointer is
         * never moved by a negative or null amount. */
        s = vlc_towc (haystack, &(uint32_t) { 0 });
        if (s <= 0)
            return NULL;
        haystack += s;
    }
}
230

231 232 233 234 235 236
/**
 * Converts a string from the given character encoding to utf-8.
 *
 * @return a nul-terminated utf-8 string, or null in case of error.
 * The result must be freed using free().
 */
237
char *FromCharset(const char *charset, const void *data, size_t data_size)
238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
{
    vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
    if (handle == (vlc_iconv_t)(-1))
        return NULL;

    char *out = NULL;
    for(unsigned mul = 4; mul < 8; mul++ )
    {
        size_t in_size = data_size;
        const char *in = data;
        size_t out_max = mul * data_size;
        char *tmp = out = malloc (1 + out_max);
        if (!out)
            break;

        if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
            *tmp = '\0';
            break;
        }
        free(out);
        out = NULL;

        if (errno != E2BIG)
            break;
    }
    vlc_iconv_close(handle);
    return out;
}

267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
/**
 * Converts a nul-terminated UTF-8 string to a given character encoding.
 * @param charset iconv name of the character set
 * @param in nul-terminated UTF-8 string
 * @param outsize pointer to hold the byte size of result
 *
 * @return A pointer to the result, which must be released using free().
 * The UTF-8 nul terminator is included in the conversion if the target
 * character encoding supports it. However it is not included in the returned
 * byte size.
 * In case of error, NULL is returned and the byte size is undefined.
 */
void *ToCharset(const char *charset, const char *in, size_t *outsize)
{
    vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
    if (hd == (vlc_iconv_t)(-1))
        return NULL;

    const size_t inlen = strlen (in);
    void *res = NULL;

    /* Retry with a larger buffer for as long as it proves too small */
    for (unsigned mul = 4; mul < 16; mul++)
    {
        /* Give up if mul * (inlen + 1) would wrap around */
        if (unlikely(inlen >= SIZE_MAX / mul))
            break;

        size_t outlen = mul * (inlen + 1);
        res = malloc (outlen);
        if (unlikely(res == NULL))
            break;

        const char *inp = in;
        char *outp = res;
        size_t inb = inlen;
        /* Keep mul bytes in reserve for the converted nul terminator */
        size_t outb = outlen - mul;

        if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
        {
            *outsize = outlen - mul - outb;
            outb += mul;
            inb = 1; /* append nul terminator if possible */
            if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
                break;
            if (errno == EILSEQ) /* cannot translate nul terminator!? */
                break;
        }

        /* Latch the error code now: free() may clobber errno */
        const int err = errno;

        free (res);
        res = NULL;
        if (err != E2BIG) /* conversion failure */
            break;
    }
    vlc_iconv_close (hd);
    return res;
}