/*****************************************************************************
 * unicode.c: Unicode <-> locale functions
 *****************************************************************************
 * Copyright (C) 2005-2006 the VideoLAN team
 * Copyright © 2005-2008 Rémi Denis-Courmont
 *
 * Authors: Rémi Denis-Courmont <rem # videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>
#include <vlc_charset.h>

#include <assert.h>

#include <stdio.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#ifdef UNDER_CE
#  include <tchar.h>
#endif
#include <errno.h>

#if defined (ASSUME_UTF8)
/* The locale encoding is taken to be UTF-8: no conversion helper needed. */

#elif defined (WIN32) || defined (UNDER_CE)
# define USE_MB2MB 1

/**
 * Duplicates a string while converting between the ANSI code page and UTF-8,
 * going through a temporary wide-character copy.
 *
 * @param string nul-terminated string to convert (must not be NULL)
 * @param from true to convert from the ANSI code page to UTF-8,
 *             false to convert from UTF-8 to the ANSI code page
 *
 * @return a heap-allocated nul-terminated string (release with free()),
 *         or NULL if a conversion step or an allocation failed.
 */
static char *locale_dup (const char *string, bool from)
{
    const UINT cp_in = from ? CP_ACP : CP_UTF8;
    const UINT cp_out = from ? CP_UTF8 : CP_ACP;

    /* With a length of -1 the returned count includes the terminator;
     * zero means the conversion failed. */
    int len = MultiByteToWideChar (cp_in, 0, string, -1, NULL, 0);
    if (len <= 0)
        return NULL;

    wchar_t *wide = malloc ((size_t)len * sizeof (*wide));
    if (wide == NULL)
        return NULL;

    char *out = NULL;

    if (MultiByteToWideChar (cp_in, 0, string, -1, wide, len) > 0)
    {
        len = WideCharToMultiByte (cp_out, 0, wide, -1, NULL, 0, NULL, NULL);
        if (len > 0)
        {
            out = malloc ((size_t)len);
            if (out != NULL
             && WideCharToMultiByte (cp_out, 0, wide, -1, out, len,
                                     NULL, NULL) == 0)
            {   /* Never hand back a buffer that was not filled in */
                free (out);
                out = NULL;
            }
        }
    }
    free (wide);
    return out;
}

#elif defined (HAVE_ICONV)
# define USE_ICONV 1

/**
 * Duplicates a string while converting between the locale character
 * encoding (named "" when opening the converter) and UTF-8.
 * Input bytes that the converter rejects are replaced by a question mark.
 *
 * @param string nul-terminated string to convert (must not be NULL)
 * @param from true to convert from the locale encoding to UTF-8,
 *             false to convert from UTF-8 to the locale encoding
 *
 * @return a heap-allocated nul-terminated string (release with free()),
 *         or NULL on error.
 */
static char *locale_dup (const char *string, bool from)
{
    vlc_iconv_t hd = vlc_iconv_open (from ? "UTF-8" : "",
                                     from ? "" : "UTF-8");
    if (hd == (vlc_iconv_t)(-1))
        return NULL; /* Uho! */

    const char *iptr = string;
    size_t inb = strlen (string);

    /* Up to 6 output bytes are budgeted per input byte, plus a terminator.
     * The buffer lives on the heap: a variable-length array of that size
     * could exhaust the stack on long inputs. */
    if (inb > (SIZE_MAX - 1) / 6)
    {
        vlc_iconv_close (hd);
        return NULL;
    }

    size_t outb = inb * 6;
    char *output = malloc (outb + 1);
    if (output == NULL)
    {
        vlc_iconv_close (hd);
        return NULL;
    }
    char *optr = output;

    while (vlc_iconv (hd, &iptr, &inb, &optr, &outb) == (size_t)(-1))
    {
        if (inb == 0 || outb == 0)
        {   /* No byte left to skip or no room left: cannot make progress */
            vlc_iconv_close (hd);
            free (output);
            return NULL;
        }

        /* Substitute the offending byte and carry on with the next one */
        *optr++ = '?';
        outb--;
        iptr++;
        inb--;
        vlc_iconv (hd, NULL, NULL, NULL, NULL); /* reset */
    }
    *optr = '\0';
    vlc_iconv_close (hd);

    assert (inb == 0);
    assert (*iptr == '\0');
    assert (*optr == '\0');
    assert (strlen (output) == (size_t)(optr - output));

    /* Give back the unused tail; keep the large buffer if shrinking fails */
    char *shrunk = realloc (output, (size_t)(optr - output) + 1);
    return (shrunk != NULL) ? shrunk : output;
}

#else
# error No UTF8 charset conversion implemented on this platform!
#endif

/**
 * Releases (if needed) a localized or uniformized string.
 *
 * When ASSUME_UTF8 is defined this is a no-op; otherwise the string is
 * released with free().
 *
 * @param str non-NULL return value from FromLocale() or ToLocale().
 */
void LocaleFree (const char *str)
{
#ifdef ASSUME_UTF8
    (void) str;
#else
    free ((char *)str);
#endif
}

/**
 * Converts a string from the system locale character encoding to UTF-8.
 *
 * When ASSUME_UTF8 is defined, the input pointer itself is returned and
 * nothing is allocated.
 *
 * @param locale nul-terminated string to convert (NULL is passed through)
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *FromLocale (const char *locale)
{
#ifdef ASSUME_UTF8
    return (char *)locale;
#else
    return locale ? locale_dup (locale, true) : NULL;
#endif
}

/**
 * Converts a string from the system locale character encoding to UTF-8;
 * the result is always allocated on the heap.
 *
 * @param locale nul-terminated string to convert; NULL yields NULL,
 *               as with FromLocale()
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error.
 * The result must be freed using free() - as with the strdup() function.
 */
char *FromLocaleDup (const char *locale)
{
    if (locale == NULL)
        return NULL;

#ifdef ASSUME_UTF8
    return strdup (locale);
#else
    return locale_dup (locale, true);
#endif
}

/**
 * ToLocale: converts an UTF-8 string to local system encoding.
 *
 * When ASSUME_UTF8 is defined, the input pointer itself is returned and
 * nothing is allocated.
 *
 * @param utf8 nul-terminated string to be converted (NULL is passed through)
 *
 * @return a nul-terminated string, or NULL in case of error.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
char *ToLocale (const char *utf8)
{
#ifdef ASSUME_UTF8
    return (char *)utf8;
#else
    return utf8 ? locale_dup (utf8, false) : NULL;
#endif
}

/**
 * Converts a string from UTF-8 to the system locale character encoding;
 * the result is always allocated on the heap.
 *
 * @param utf8 nul-terminated string to convert; NULL yields NULL,
 *             as with ToLocale()
 *
 * @return a nul-terminated string, or NULL in case of error.
 * The result must be freed using free() - as with the strdup() function.
 */
char *ToLocaleDup (const char *utf8)
{
    if (utf8 == NULL)
        return NULL;

#ifdef ASSUME_UTF8
    return strdup (utf8);
#else
    return locale_dup (utf8, false);
#endif
}

/**
 * Formats an UTF-8 string as vasprintf(), then converts the result to the
 * local encoding (the conversion is skipped when ASSUME_UTF8 is defined).
 * Nothing is printed by this function.
 *
 * @param str where to store the heap-allocated result (release with free())
 * @param fmt printf()-style format string
 * @param ap arguments for the format string
 *
 * @return the value returned by vasprintf() (length of the UTF-8 text),
 *         or -1 if formatting or the encoding conversion failed.
 */
static int utf8_vasprintf( char **str, const char *fmt, va_list ap )
{
    char *utf8;
    int res = vasprintf( &utf8, fmt, ap );
    if( res == -1 )
        return -1;

#ifdef ASSUME_UTF8
    *str = utf8;
#else
    *str = ToLocaleDup( utf8 );
    free( utf8 );
    /* A failed conversion must not be reported as success: the caller
     * would otherwise use a NULL string. */
    if( *str == NULL )
        return -1;
#endif
    return res;
}

/**
 * Formats an UTF-8 string as vfprintf(), then print it, with
 * appropriate conversion to local encoding.
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8)
 * @param ap arguments for the format string
 *
 * @return the length reported by the formatting step, or -1 on error
 *         (formatting, conversion, allocation or write failure).
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
    char *str = NULL;
    int res;

#if defined( WIN32 ) && !defined( UNDER_CE )
    /* Writing to the console is a lot of fun on Microsoft Windows.
     * If you use the standard I/O functions, you must use the OEM code page,
     * which is different from the usual ANSI code page. Or maybe not, if the
     * user called "chcp". Anyway, we prefer Unicode. */
    int fd = _fileno (stream);
    if (likely(fd != -1) && _isatty (fd))
    {
        res = vasprintf (&str, fmt, ap);
        if (unlikely(res == -1))
            return -1;

        /* One wide character per input byte (terminator included) is the
         * capacity handed to the converter; the count passed must be in
         * wide characters, not in bytes. */
        const int wcount = res + 1;
        wchar_t *wide = malloc ((size_t)wcount * sizeof (*wide));
        if (likely(wide != NULL))
        {
            int wlen = MultiByteToWideChar (CP_UTF8, 0, str, res + 1,
                                            wide, wcount);
            if (wlen > 0)
            {
                HANDLE h = (HANDLE)(intptr_t)_get_osfhandle (fd);
                DWORD out;

                /* wlen counts the terminator, which is not written */
                WriteConsoleW (h, wide, wlen - 1, &out, NULL);
            }
            else
                res = -1;
            free (wide);
        }
        else
            res = -1;
        free (str);
        return res;
    }
#endif

    res = utf8_vasprintf (&str, fmt, ap);
    if (unlikely(res == -1))
        return -1;
    if (unlikely(str == NULL)) /* conversion failed: nothing to print */
        return -1;

    if (fputs( str, stream ) == EOF)
        res = -1;
    free( str );
    return res;
}

/**
 * Formats an UTF-8 string as fprintf(), then print it, with
 * appropriate conversion to local encoding.
 *
 * @param stream output stream
 * @param fmt printf()-style format string (UTF-8), followed by its arguments
 *
 * @return whatever utf8_vfprintf() returns for the same arguments.
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list ap;
    int res;

    va_start( ap, fmt );
    res = utf8_vfprintf( stream, fmt, ap );
    va_end( ap );
    return res;
}

/**
 * Validates a nul-terminated byte string as UTF-8, optionally patching it.
 *
 * A sequence is rejected when it starts with a continuation byte or a lead
 * byte above 0xF4, is truncated, is overlong, encodes a UTF-16 surrogate
 * (U+D800..U+DFFF) or encodes a value beyond U+10FFFF.
 *
 * @param str nul-terminated string to check (must not be NULL)
 * @param rep replacement byte: if 0, the string is never modified and the
 *            scan stops at the first invalid sequence; otherwise the first
 *            byte of each invalid sequence is overwritten with rep and the
 *            scan resumes at the following byte
 *
 * @return str if the whole string was valid UTF-8, NULL otherwise.
 */
static char *CheckUTF8( char *str, char rep )
{
    uint8_t *ptr = (uint8_t *)str;
    assert (str != NULL);

    for (;;)
    {
        uint8_t c = ptr[0];
        int charlen;
        uint32_t cp;     /* decoded code point */
        uint32_t min_cp; /* smallest code point needing charlen bytes */

        if (c == '\0')
            break;

        if (c < 0x80) // 7-bit ASCII character -> OK
        {
            ptr++;
            continue;
        }

        if (c < 0xC0) // continuation byte -> error
            goto error;

        if (c < 0xE0)
        {
            charlen = 2;
            cp = c & 0x1F;
            min_cp = 0x80;
        }
        else if (c < 0xF0)
        {
            charlen = 3;
            cp = c & 0x0F;
            min_cp = 0x800;
        }
        else if (c <= 0xF4)
        {
            charlen = 4;
            cp = c & 0x07;
            min_cp = 0x10000;
        }
        else // lead byte that can only encode values beyond Unicode
            goto error;

        for (int i = 1; i < charlen; i++)
        {
            c = ptr[i];

            /* The terminator is not a continuation byte either, so a
             * truncated sequence is caught here without reading past it. */
            if ((c >> 6) != 2) // not a continuation byte
                goto error;

            cp = (cp << 6) | (c & 0x3f);
        }

        if (cp < min_cp) // overlong
            goto error;
        if (cp >= 0xD800 && cp < 0xE000) // UTF-16 surrogate
            goto error;
        if (cp > 0x10FFFF) // beyond Unicode
            goto error;

        ptr += charlen;
        continue;

    error:
        if (rep == 0)
            return NULL;
        *ptr++ = rep;
        str = NULL;
    }

    return str;
}

/**
 * Sanitizes a string in place: invalid or overlong UTF-8 sequences are
 * overwritten with question marks.
 * No attempt is made to reinterpret the input as Latin-1 and convert it
 * on the fly, even though that would be less disruptive.
 *
 * @param str nul-terminated string to sanitize
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
char *EnsureUTF8( char *str )
{
    char *const verdict = CheckUTF8( str, '?' );

    return verdict;
}

/**
 * Checks whether a string is a valid UTF-8 byte sequence.
 * The string is not modified: the check runs with no replacement byte.
 *
 * @param str nul-terminated string to be checked
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
const char *IsUTF8( const char *str )
{
    return CheckUTF8( (char *)str, 0 );
}

/**
 * Converts a string from the given character encoding to utf-8.
 *
 * @return a nul-terminated utf-8 string, or null in case of error.
 * The result must be freed using free().
 */
386
char *FromCharset(const char *charset, const void *data, size_t data_size)
Laurent Aimar's avatar
Laurent Aimar committed
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
{
    vlc_iconv_t handle = vlc_iconv_open ("UTF-8", charset);
    if (handle == (vlc_iconv_t)(-1))
        return NULL;

    char *out = NULL;
    for(unsigned mul = 4; mul < 8; mul++ )
    {
        size_t in_size = data_size;
        const char *in = data;
        size_t out_max = mul * data_size;
        char *tmp = out = malloc (1 + out_max);
        if (!out)
            break;

        if (vlc_iconv (handle, &in, &in_size, &tmp, &out_max) != (size_t)(-1)) {
            *tmp = '\0';
            break;
        }
        free(out);
        out = NULL;

        if (errno != E2BIG)
            break;
    }
    vlc_iconv_close(handle);
    return out;
}