/*****************************************************************************
 * unicode.c: Unicode <-> locale functions
 *****************************************************************************
 * Copyright (C) 2005-2006 the VideoLAN team
 * Copyright © 2005-2010 Rémi Denis-Courmont
 *
 * Authors: Rémi Denis-Courmont <rem # videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
27
28
29
30
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

31
#include <vlc_common.h>
32
33

#include "libvlc.h"
zorglub's avatar
zorglub committed
34
#include <vlc_charset.h>
35

36
#include <assert.h>
37

38
#include <stdio.h>
39
#include <stdarg.h>
40
#include <stdlib.h>
41
#include <sys/types.h>
42
43
#ifdef UNDER_CE
#  include <tchar.h>
Pierre Ynard's avatar
Pierre Ynard committed
44
45
#elif defined(WIN32)
#  include <io.h>
46
#endif
Laurent Aimar's avatar
Laurent Aimar committed
47
#include <errno.h>
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
48
#include <wctype.h>
49

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
50
51
52
53
/**
 * Releases (if needed) a localized or uniformized string.
 *
 * When ASSUME_UTF8 is defined, FromLocale() and ToLocale() hand their
 * argument back unchanged, so there is nothing to release. Otherwise the
 * string is released with free().
 *
 * @param str non-NULL return value from FromLocale() or ToLocale().
 */
void LocaleFree (const char *str)
{
#ifdef ASSUME_UTF8
    (void) str; /* nothing to release; only silences the unused warning */
#else
    free ((char *)str); /* the cast merely drops const for free() */
#endif
}

63

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
64
/**
 * Converts a string from the system locale character encoding to UTF-8.
 *
 * @param locale nul-terminated string to convert; NULL is tolerated
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error (or if
 * locale was NULL).
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *FromLocale (const char *locale)
{
#ifdef ASSUME_UTF8
    /* No conversion: the input pointer itself is returned. */
    return (char *)locale;
#else
    if (locale == NULL)
        return NULL;

    /* The empty charset name is what this file uses for the locale. */
    return FromCharset ("", locale, strlen(locale));
#endif
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
82
83
84
85
86
87
88
89
90
/**
 * Converts a string from the system locale character encoding to UTF-8,
 * the result is always allocated on the heap.
 *
 * @param locale nul-terminated string to convert; NULL is tolerated
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error (or if
 * locale was NULL).
 * The result must be freed using free() - as with the strdup() function.
 */
char *FromLocaleDup (const char *locale)
{
    /* strdup() and strlen() must not be handed a NULL pointer. */
    if (locale == NULL)
        return NULL;

#ifdef ASSUME_UTF8
    return strdup (locale);
#else
    return FromCharset ("", locale, strlen(locale));
#endif
}


Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
101
/**
 * ToLocale: converts an UTF-8 string to local system encoding.
 *
 * @param utf8 nul-terminated string to be converted; NULL is tolerated
 *
 * @return a nul-terminated string, or NULL in case of error (or if utf8
 * was NULL).
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *ToLocale (const char *utf8)
{
#ifdef ASSUME_UTF8
    /* No conversion: the input pointer itself is returned. */
    return (char *)utf8;
#else
    if (utf8 == NULL)
        return NULL;

    size_t outsize; /* required by ToCharset(), value not needed here */
    return ToCharset ("", utf8, &outsize);
#endif
}

120

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
121
122
123
124
125
126
127
128
129
/**
 * Converts a string from UTF-8 to the system locale character encoding,
 * the result is always allocated on the heap.
 *
 * @param utf8 nul-terminated string to convert; NULL is tolerated
 *
 * @return a nul-terminated string, or NULL in case of error (or if utf8
 * was NULL).
 * The result must be freed using free() - as with the strdup() function.
 */
char *ToLocaleDup (const char *utf8)
{
    /* strdup() and ToCharset() (which calls strlen()) need a real string. */
    if (utf8 == NULL)
        return NULL;

#ifdef ASSUME_UTF8
    return strdup (utf8);
#else
    size_t outsize; /* required by ToCharset(), value not needed here */
    return ToCharset ("", utf8, &outsize);
#endif
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
140
141
142
143
/**
 * Formats an UTF-8 string as vfprintf(), then print it, with
 * appropriate conversion to local encoding.
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 * @param ap arguments for the format string
 *
 * @return the length in bytes of the formatted UTF-8 string on success
 * (the vfprintf() result when ASSUME_UTF8 is defined), or -1 if formatting,
 * conversion or output failed.
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
#ifdef ASSUME_UTF8
    return vfprintf (stream, fmt, ap);
#else
    char *str;
    int res;

# if defined( WIN32 ) && !defined( UNDER_CE )
    /* Writing to the console is a lot of fun on Microsoft Windows.
     * If you use the standard I/O functions, you must use the OEM code page,
     * which is different from the usual ANSI code page. Or maybe not, if the
     * user called "chcp". Anyway, we prefer Unicode. */
    int fd = _fileno (stream);
    if (likely(fd != -1) && _isatty (fd))
    {
        res = vasprintf (&str, fmt, ap);
        if (unlikely(res == -1))
            return -1;

        /* UTF-16 never needs more code units than UTF-8 needs bytes, so
         * res + 1 wide characters (terminator included) always suffice. */
        const int wmax = res + 1;
        wchar_t *wide = malloc (wmax * sizeof (*wide));
        if (likely(wide != NULL))
        {
            /* The output size is a count of wide characters, not bytes. */
            int wlen = MultiByteToWideChar (CP_UTF8, 0, str, res + 1,
                                            wide, wmax);
            if (wlen > 0)
            {
                HANDLE h = (HANDLE)(intptr_t)_get_osfhandle (fd);
                DWORD out;

                /* wlen counts the converted nul terminator: do not print it */
                if (!WriteConsoleW (h, wide, wlen - 1, &out, NULL))
                    res = -1;
            }
            else
                res = -1;
            free (wide);
        }
        else
            res = -1;
        free (str);
        return res;
    }
# endif

    res = vasprintf (&str, fmt, ap);
    if (unlikely(res == -1))
        return -1;

    char *ansi = ToLocaleDup (str);
    free (str);

    if (ansi == NULL)
        return -1;

    /* Report output errors, as vfprintf() would. */
    if (fputs (ansi, stream) == EOF)
        res = -1;
    free (ansi);
    return res;
#endif
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
202
203
204
205
/**
 * Formats an UTF-8 string as fprintf(), then print it, with
 * appropriate conversion to local encoding.
 *
 * Variadic front-end for utf8_vfprintf().
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 *
 * @return whatever utf8_vfprintf() returns.
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list ap;
    int res;

    va_start( ap, fmt );
    res = utf8_vfprintf( stream, fmt, ap );
    va_end( ap );
    return res;
}

217

218
219
220
221
222
223
224
225
/**
 * Converts the first character from a UTF-8 sequence into a code point.
 *
 * @param str an UTF-8 bytes sequence
 * @param pwc address where the decoded code point is stored; it is written
 * whenever the return value is not (size_t)-1 (including 0 for an empty
 * string)
 *
 * @return 0 if str points to an empty string, i.e. the first character is NUL;
 * number of bytes that the first character occupies (from 1 to 4) otherwise;
 * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
 */
size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    const uint8_t *ptr = (const uint8_t *)str;
    uint8_t c;
    uint32_t cp;

    assert (str != NULL);

    c = *ptr;
    if (unlikely(c > 0xF4)) // no valid sequence starts with such a byte
        return -1;

    /* Number of leading 1 bits in the first byte: 0 for ASCII, 1 for a
     * continuation byte, otherwise the total length of the sequence. */
    int charlen = clz8 (c ^ 0xFF);
    switch (charlen)
    {
        case 0: // 7-bit ASCII character -> short cut
            *pwc = c;
            return c != '\0';

        case 1: // continuation byte -> error
            return -1;

        case 2:
            if (unlikely(c < 0xC2)) // ASCII overlong
                return -1;
            cp = (c & 0x1F) << 6;
            break;

        case 3:
            cp = (c & 0x0F) << 12;
            break;

        case 4:
            cp = (c & 0x07) << 16;
            break;

        default:
            assert (0);
            return -1; // keeps cp from being used uninitialized with NDEBUG
    }

    /* Unrolled continuation bytes decoding */
    switch (charlen)
    {
        case 4:
            c = *++ptr;
            if (unlikely((c >> 6) != 2)) // not a continuation byte
                return -1;
            cp |= (c & 0x3f) << 12;

            if (unlikely(cp >= 0x110000)) // beyond Unicode range
                return -1;
            /* fall through */
        case 3:
            c = *++ptr;
            if (unlikely((c >> 6) != 2)) // not a continuation byte
                return -1;
            cp |= (c & 0x3f) << 6;

            /* Surrogates span U+D800..U+DFFF; the low 6 bits are not merged
             * yet, which does not matter for these 64-aligned bounds. */
            if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
                return -1;
            if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
                return -1;
            /* fall through */
        case 2:
            c = *++ptr;
            if (unlikely((c >> 6) != 2)) // not a continuation byte
                return -1;
            cp |= (c & 0x3f);
            break;
    }

    *pwc = cp;
    return charlen;
}
299

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
/**
 * Look for an UTF-8 string within another one in a case-insensitive fashion.
 * Beware that this is quite slow. Contrary to strcasestr(), this function
 * works regardless of the system character encoding, and handles multibyte
 * code points correctly.
 *
 * @param haystack string to look into
 * @param needle string to look for
 * @return a pointer to the first occurrence of the needle within the haystack,
 * or NULL if no occurrence was found. NULL is also returned if an invalid
 * UTF-8 sequence is met in the needle, or in the haystack before any match.
 */
char *vlc_strcasestr (const char *haystack, const char *needle)
{
    ssize_t s;

    do
    {
        const char *h = haystack, *n = needle;

        /* Compare needle against haystack, one code point at a time. */
        for (;;)
        {
            uint32_t cph, cpn;

            s = vlc_towc (n, &cpn);
            if (s == 0) /* whole needle matched */
                return (char *)haystack;
            if (unlikely(s < 0))
                return NULL;
            n += s;

            s = vlc_towc (h, &cph);
            if (s <= 0 || towlower (cph) != towlower (cpn))
                break;
            h += s;
        }

        /* Move on to the next code point of the haystack. An invalid
         * sequence yields -1: adding that to the pointer would step
         * backwards, so give up instead. */
        s = vlc_towc (haystack, &(uint32_t) { 0 });
        if (unlikely(s < 0))
            return NULL;
        haystack += s;
    }
    while (s != 0);

    return NULL;
}
343

344
/**
 * Replaces invalid/overlong UTF-8 sequences with question marks.
 * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
 * so we don't try that, even though it would be less disruptive.
 *
 * The string is modified in place: each byte that does not start a valid
 * sequence is overwritten with '?'.
 *
 * @param str nul-terminated string to sanitize
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
char *EnsureUTF8( char *str )
{
    char *ret = str;
    size_t n;
    uint32_t cp;

    while ((n = vlc_towc (str, &cp)) != 0)
    {
        if (likely(n != (size_t)-1))
            str += n;
        else
        {
            /* Patch the offending byte and resume right after it. */
            *str++ = '?';
            ret = NULL;
        }
    }
    return ret;
}


/**
 * Checks whether a string is a valid UTF-8 byte sequence.
 *
 * @param str nul-terminated string to be checked
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
const char *IsUTF8( const char *str )
{
    /* Walk with a separate cursor so the original pointer can be returned,
     * as documented, rather than a pointer to the terminator. */
    const char *p = str;
    size_t n;
    uint32_t cp;

    while ((n = vlc_towc (p, &cp)) != 0)
    {
        if (unlikely(n == (size_t)-1))
            return NULL;
        p += n;
    }
    return str;
}
Laurent Aimar's avatar
Laurent Aimar committed
388
389
390
391
392
393
394

/**
 * Converts a string from the given character encoding to utf-8.
 *
 * @return a nul-terminated utf-8 string, or null in case of error.
 * The result must be freed using free().
 */
395
char *FromCharset(const char *charset, const void *data, size_t data_size)
Laurent Aimar's avatar
Laurent Aimar committed
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
{
    vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
    if (handle == (vlc_iconv_t)(-1))
        return NULL;

    char *out = NULL;
    for(unsigned mul = 4; mul < 8; mul++ )
    {
        size_t in_size = data_size;
        const char *in = data;
        size_t out_max = mul * data_size;
        char *tmp = out = malloc (1 + out_max);
        if (!out)
            break;

        if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
            *tmp = '\0';
            break;
        }
        free(out);
        out = NULL;

        if (errno != E2BIG)
            break;
    }
    vlc_iconv_close(handle);
    return out;
}

425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
/**
 * Converts a nul-terminated UTF-8 string to a given character encoding.
 * @param charset iconv name of the character set
 * @param in nul-terminated UTF-8 string
 * @param outsize pointer to hold the byte size of result
 *
 * @return A pointer to the result, which must be released using free().
 * The UTF-8 nul terminator is included in the conversion if the target
 * character encoding supports it. However it is not included in the returned
 * byte size.
 * In case of error, NULL is returned and the byte size is undefined.
 */
void *ToCharset(const char *charset, const char *in, size_t *outsize)
{
    vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
    if (hd == (vlc_iconv_t)(-1))
        return NULL;

    const size_t inlen = strlen (in);
    void *res = NULL;

    /* Try with growing output buffers while the conversion reports E2BIG. */
    for (unsigned mul = 4; mul < 16; mul++)
    {
        size_t outlen = mul * (inlen + 1);
        res = malloc (outlen);
        if (unlikely(res == NULL))
            break;

        const char *inp = in;
        char *outp = res;
        size_t inb = inlen;
        /* Hold back mul bytes of the buffer for the converted terminator. */
        size_t outb = outlen - mul;

        if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
        {
            *outsize = outlen - mul - outb;
            outb += mul;
            inb = 1; /* append nul terminator if possible */
            if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
                break;
            if (errno == EILSEQ) /* cannot translate nul terminator!? */
                break;
        }

        /* Save errno first: free() is allowed to change it. */
        const int err = errno;
        free (res);
        res = NULL;
        if (err != E2BIG) /* conversion failure */
            break;
    }
    vlc_iconv_close (hd);
    return res;
}