/*****************************************************************************
 * unicode.c: Unicode <-> locale functions
 *****************************************************************************
 * Copyright (C) 2005-2006 the VideoLAN team
 * Copyright © 2005-2010 Rémi Denis-Courmont
 *
 * Authors: Rémi Denis-Courmont <rem # videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
27
28
29
30
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

31
#include <vlc_common.h>
zorglub's avatar
zorglub committed
32
#include <vlc_charset.h>
33

34
#include <assert.h>
35

36
#include <stdio.h>
37
#include <stdarg.h>
38
#include <stdlib.h>
39
#include <sys/types.h>
40
41
42
#ifdef UNDER_CE
#  include <tchar.h>
#endif
Laurent Aimar's avatar
Laurent Aimar committed
43
#include <errno.h>
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
44
#include <wctype.h>
45

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
46
47
#if defined (ASSUME_UTF8)
/* Cool */
48

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
49
50
#elif defined (WIN32) || defined (UNDER_CE)
# define USE_MB2MB 1
51
# include <io.h>
52

53
static char *locale_dup (const char *string, bool from)
54
{
55
56
57
58
59
60
61
    char *out;
    int len;

    len = 1 + MultiByteToWideChar (from ? CP_ACP : CP_UTF8,
                                   0, string, -1, NULL, 0);
    wchar_t *wide = malloc (len * sizeof (wchar_t));
    if (wide == NULL)
62
63
        return NULL;

64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
    MultiByteToWideChar (from ? CP_ACP : CP_UTF8, 0, string, -1, wide, len);
    len = 1 + WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1,
                                   NULL, 0, NULL, NULL);
    out = malloc (len);
    if (out != NULL)
        WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1, out, len,
                             NULL, NULL);
    free (wide);
    return out;
}

#elif defined (HAVE_ICONV)
# define USE_ICONV 1

static char *locale_dup (const char *string, bool from)
{
80
81
    vlc_iconv_t hd = vlc_iconv_open (from ? "UTF-8" : "",
                                     from ? "" : "UTF-8");
82
    if (hd == (vlc_iconv_t)(-1))
83
        return NULL; /* Uho! */
84
85
86
87
88
89
90
91
92
93
94
95

    const char *iptr = string;
    size_t inb = strlen (string);
    size_t outb = inb * 6 + 1;
    char output[outb], *optr = output;

    while (vlc_iconv (hd, &iptr, &inb, &optr, &outb) == (size_t)(-1))
    {
        *optr++ = '?';
        outb--;
        iptr++;
        inb--;
96
        vlc_iconv (hd, NULL, NULL, NULL, NULL); /* reset */
97
98
    }
    *optr = '\0';
99
    vlc_iconv_close (hd);
100
101
102
103
104
105
106
107
108

    assert (inb == 0);
    assert (*iptr == '\0');
    assert (*optr == '\0');
    assert (strlen (output) == (size_t)(optr - output));
    return strdup (output);
}

#else
109
# error No UTF8 charset conversion implemented on this platform!
110
#endif
111

112

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
113
114
115
116
/**
 * Releases (if needed) a localized or uniformized string.
 * When ASSUME_UTF8 is defined this is a no-op; otherwise the string is
 * handed to free().
 *
 * @param str non-NULL return value from FromLocale() or ToLocale().
 */
void LocaleFree (const char *str)
{
#ifndef ASSUME_UTF8
    /* free() wants a non-const pointer */
    char *owned = (char *)str;

    free (owned);
#else
    (void) str;
#endif
}

126

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
127
/**
 * Converts a string from the system locale character encoding to UTF-8.
 *
 * @param locale nul-terminated string to convert (may be NULL)
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error or if
 * locale was NULL. When ASSUME_UTF8 is defined, the input pointer itself is
 * returned. To avoid memory leak, you have to pass the result to
 * LocaleFree() when it is no longer needed.
 */
char *FromLocale (const char *locale)
{
#ifndef ASSUME_UTF8
    if (locale == NULL)
        return NULL;

    return locale_dup (locale, true);
#else
    return (char *)locale;
#endif
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
145
146
147
148
149
150
151
152
153
/**
 * Converts a string from the system locale character encoding to UTF-8;
 * the result is always allocated on the heap.
 *
 * @param locale nul-terminated string to convert
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error.
 * The result must be freed using free() - as with the strdup() function.
 */
char *FromLocaleDup (const char *locale)
{
#ifndef ASSUME_UTF8
    char *utf8 = locale_dup (locale, true);

    return utf8;
#else
    return strdup (locale);
#endif
}


Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
164
/**
 * Converts a UTF-8 string to the local system character encoding.
 *
 * @param utf8 nul-terminated string to be converted (may be NULL)
 *
 * @return a nul-terminated string, or NULL in case of error or if utf8 was
 * NULL. When ASSUME_UTF8 is defined, the input pointer itself is returned.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *ToLocale (const char *utf8)
{
#ifndef ASSUME_UTF8
    if (utf8 == NULL)
        return NULL;

    return locale_dup (utf8, false);
#else
    return (char *)utf8;
#endif
}

182

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
183
184
185
186
187
188
189
190
191
/**
 * Converts a string from UTF-8 to the system locale character encoding;
 * the result is always allocated on the heap.
 *
 * @param utf8 nul-terminated string to convert
 *
 * @return a nul-terminated string, or NULL in case of error.
 * The result must be freed using free() - as with the strdup() function.
 */
char *ToLocaleDup (const char *utf8)
{
#ifndef ASSUME_UTF8
    char *local = locale_dup (utf8, false);

    return local;
#else
    return strdup (utf8);
#endif
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
201
202
203
204
/**
 * Formats an UTF-8 string as vfprintf(), then prints it, with
 * appropriate conversion to local encoding.
 *
 * @param stream output stream
 * @param fmt printf-style format string (UTF-8)
 * @param ap arguments for the format string
 *
 * @return the length in bytes of the formatted UTF-8 string on success
 * (the value of vfprintf() when ASSUME_UTF8 is defined), or -1 if
 * formatting, memory allocation, character set conversion or writing to
 * the stream failed.
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
#ifdef ASSUME_UTF8
    return vfprintf (stream, fmt, ap);
#else
    char *str;
    int res;

# if defined( WIN32 ) && !defined( UNDER_CE )
    /* Writing to the console is a lot of fun on Microsoft Windows.
     * If you use the standard I/O functions, you must use the OEM code page,
     * which is different from the usual ANSI code page. Or maybe not, if the
     * user called "chcp". Anyway, we prefer Unicode. */
    int fd = _fileno (stream);
    if (likely(fd != -1) && _isatty (fd))
    {
        res = vasprintf (&str, fmt, ap);
        if (unlikely(res == -1))
            return -1;

        /* One UTF-8 byte never yields more than one UTF-16 unit, so
         * res + 1 wide characters (terminator included) always suffice.
         * The capacity handed to MultiByteToWideChar() is counted in wide
         * characters, not in bytes. */
        const size_t wmax = (size_t)res + 1;
        wchar_t *wide = malloc (wmax * sizeof (*wide));
        if (likely(wide != NULL))
        {
            int wlen = MultiByteToWideChar (CP_UTF8, 0, str, res + 1,
                                            wide, (int)wmax);
            if (wlen > 0)
            {
                HANDLE h = (HANDLE)(intptr_t)_get_osfhandle (fd);
                DWORD out;

                /* wlen counts the terminator, which must not be printed */
                WriteConsoleW (h, wide, wlen - 1, &out, NULL);
            }
            else
                res = -1;
            free (wide);
        }
        else
            res = -1;
        free (str);
        return res;
    }
# endif

    res = vasprintf (&str, fmt, ap);
    if (unlikely(res == -1))
        return -1;

    char *ansi = ToLocaleDup (str);
    free (str);
    if (unlikely(ansi == NULL))
        return -1; /* conversion failed: fputs(NULL) would crash */

    if (fputs (ansi, stream) == EOF)
        res = -1;
    free (ansi);
    return res;
#endif
}

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
261
262
263
264
/**
 * Formats an UTF-8 string as fprintf(), then prints it, with
 * appropriate conversion to local encoding.
 *
 * This is the variadic front-end of utf8_vfprintf() and returns whatever
 * that function returns.
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list args;

    va_start( args, fmt );
    const int written = utf8_vfprintf( stream, fmt, args );
    va_end( args );

    return written;
}

276

277
278
279
280
281
282
283
284
285
/**
 * Converts the first character from a UTF-8 sequence into a code point.
 *
 * @param str an UTF-8 bytes sequence (must not be NULL)
 * @param pwc address where the decoded code point is stored; it is written
 * when the return value is 0 (set to 0) or a valid length, and left
 * untouched when the sequence is invalid
 * @return 0 if str points to an empty string, i.e. the first character is NUL;
 * number of bytes that the first character occupies (from 1 to 4) otherwise;
 * (size_t)-1 if the byte sequence was not a valid UTF-8 sequence.
 */
static size_t vlc_towc (const char *str, uint32_t *restrict pwc)
{
    uint8_t *ptr = (uint8_t *)str;
    assert (str != NULL);

    uint8_t c = ptr[0];

    if (unlikely(c == '\0'))
    {
        *pwc = 0;
        return 0;
    }

    /* Lead bytes above 0xF4 would encode code points beyond U+10FFFF */
    if (unlikely(c > 0xF4))
        return -1;

    /* The number of leading 1 bits in the first byte gives the length */
    int charlen = clz8 (c ^ 0xFF);
    switch (charlen)
    {
        case 0: // 7-bit ASCII character -> OK
            *pwc = c;
            return 1;

        case 1: // continuation byte -> error
            return -1;
    }

    assert (charlen >= 2 && charlen <= 4);

    /* Strip the length marker bits from the lead byte */
    uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
    for (int i = 1; i < charlen; i++)
    {
        assert (cp < (1 << 26));
        c = ptr[i];

        /* This also catches a premature NUL terminator */
        if (unlikely((c >> 6) != 2)) // not a continuation byte
            return -1;

        cp = (cp << 6) | (ptr[i] & 0x3f);
    }

    switch (charlen)
    {
        case 4:
            if (unlikely(cp > 0x10FFFF)) // beyond Unicode
                return -1;
            /* fall through */
        case 3:
            if (unlikely(cp >= 0xD800 && cp < 0xE000)) // UTF-16 surrogate
                return -1;
            /* fall through */
        case 2:
            if (unlikely(cp < 128)) // ASCII overlong
                return -1;
            /* Smallest code point needing charlen bytes:
             * 3 bytes -> 1 << 11 (U+0800), 4 bytes -> 1 << 16 (U+10000).
             * The 2-byte case is fully covered by the ASCII test above. */
            if (unlikely(cp < (1u << (5 * charlen - 4)))) // non-ASCII overlong
                return -1;
    }
    *pwc = cp;
    return charlen;
}
343

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/**
 * Looks for an UTF-8 string within another one in a case-insensitive fashion.
 * Beware that this is quite slow. Contrary to strcasestr(), this function
 * works regardless of the system character encoding, and handles multibyte
 * code points correctly.
 *
 * @param haystack string to look into
 * @param needle string to look for
 * @return a pointer to the first occurrence of the needle within the haystack,
 * or NULL if no occurrence was found. An empty needle matches at the start
 * of the haystack. NULL is also returned when an invalid UTF-8 sequence is
 * hit in the needle, or at a position of the haystack where the search
 * would have to resume.
 */
char *vlc_strcasestr (const char *haystack, const char *needle)
{
    ssize_t s;

    do
    {
        const char *h = haystack, *n = needle;

        for (;;)
        {
            uint32_t cph, cpn;

            s = vlc_towc (n, &cpn);
            if (s == 0) /* whole needle matched */
                return (char *)haystack;
            if (unlikely(s < 0))
                return NULL;
            n += s;

            s = vlc_towc (h, &cph);
            /* cph is only read when decoding succeeded (s > 0) */
            if (s <= 0 || towlower (cph) != towlower (cpn))
                break;
            h += s;
        }

        /* Move on to the next character of the haystack */
        s = vlc_towc (haystack, &(uint32_t) { 0 });
        if (unlikely(s < 0))
            return NULL; /* never step backwards on invalid UTF-8 */
        haystack += s;
    }
    while (s != 0);

    return NULL;
}
387

388
/**
 * Replaces invalid/overlong UTF-8 sequences with question marks.
 * The string is modified in place, one offending byte at a time.
 * Note that it is not possible to convert from Latin-1 to UTF-8 on the fly,
 * so we don't try that, even though it would be less disruptive.
 *
 * @param str nul-terminated string to sanitize
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
char *EnsureUTF8( char *str )
{
    char *result = str;
    char *cursor = str;

    for (;;)
    {
        uint32_t codepoint;
        size_t len = vlc_towc (cursor, &codepoint);

        if (len == 0) /* reached the terminator */
            break;

        if (unlikely(len == (size_t)-1))
        {   /* overwrite the bad byte and remember the string was dirty */
            *cursor++ = '?';
            result = NULL;
        }
        else
            cursor += len;
    }
    return result;
}


/**
 * Checks whether a string is a valid UTF-8 byte sequence.
 *
 * @param str nul-terminated string to be checked
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
const char *IsUTF8( const char *str )
{
    /* Walk a separate cursor so that the documented return value, the
     * start of the string, is still available at the end. */
    const char *ptr = str;
    size_t n;
    uint32_t cp;

    while ((n = vlc_towc (ptr, &cp)) != 0)
    {
        if (unlikely(n == (size_t)-1))
            return NULL;
        ptr += n;
    }
    return str;
}
Laurent Aimar's avatar
Laurent Aimar committed
432
433
434
435
436
437
438

/**
 * Converts a string from the given character encoding to utf-8.
 *
 * @return a nul-terminated utf-8 string, or null in case of error.
 * The result must be freed using free().
 */
439
char *FromCharset(const char *charset, const void *data, size_t data_size)
Laurent Aimar's avatar
Laurent Aimar committed
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
{
    vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
    if (handle == (vlc_iconv_t)(-1))
        return NULL;

    char *out = NULL;
    for(unsigned mul = 4; mul < 8; mul++ )
    {
        size_t in_size = data_size;
        const char *in = data;
        size_t out_max = mul * data_size;
        char *tmp = out = malloc (1 + out_max);
        if (!out)
            break;

        if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
            *tmp = '\0';
            break;
        }
        free(out);
        out = NULL;

        if (errno != E2BIG)
            break;
    }
    vlc_iconv_close(handle);
    return out;
}

469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
/**
 * Converts a nul-terminated UTF-8 string to a given character encoding.
 * @param charset iconv name of the character set
 * @param in nul-terminated UTF-8 string
 * @param outsize pointer to hold the byte size of result
 *
 * @return A pointer to the result, which must be released using free().
 * The UTF-8 nul terminator is included in the conversion if the target
 * character encoding supports it. However it is not included in the returned
 * byte size.
 * In case of error, NULL is returned and the byte size is undefined.
 */
void *ToCharset(const char *charset, const char *in, size_t *outsize)
{
    vlc_iconv_t hd = vlc_iconv_open (charset, "UTF-8");
    if (hd == (vlc_iconv_t)(-1))
        return NULL;

    const size_t inlen = strlen (in);
    void *res = NULL;

    for (unsigned mul = 4; mul < 16; mul++)
    {
        size_t outlen = mul * (inlen + 1);
        res = malloc (outlen);
        if (unlikely(res == NULL))
            break;

        const char *inp = in;
        char *outp = res;
        /* Convert the payload only: the terminator is handled separately
         * below so that it is not counted in *outsize. */
        size_t inb = inlen;
        size_t outb = outlen;

        if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
        {
            *outsize = outlen - outb;
            inb = 1; /* append nul terminator if possible */
            if (vlc_iconv (hd, &inp, &inb, &outp, &outb) != (size_t)(-1))
                break;
            if (errno == EILSEQ) /* cannot translate nul terminator!? */
                break;
        }

        /* Save errno first: free() is allowed to modify it */
        const int err = errno;
        free (res);
        res = NULL; /* never return a dangling pointer */
        if (err != E2BIG) /* conversion failure */
            break;

        /* Restart from the initial shift state before retrying */
        vlc_iconv (hd, NULL, NULL, NULL, NULL);
    }
    vlc_iconv_close (hd);
    return res;
}