/*****************************************************************************
 * unicode.c: Unicode <-> locale functions
 *****************************************************************************
 * Copyright (C) 2005-2006 the VideoLAN team
 * Copyright © 2005-2006 Rémi Denis-Courmont
 * $Id$
 *
 * Authors: Rémi Denis-Courmont <rem # videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include <vlc/vlc.h>
zorglub's avatar
zorglub committed
29
#include <vlc_charset.h>
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
30
#include "libvlc.h" /* utf8_mkdir */
31

32
#include <assert.h>
33

34
#include <stdio.h>
35
#include <stdarg.h>
36
#include <stdlib.h>
37
#include <errno.h>
38
#include <sys/types.h>
39
40
41
42
43
44
#ifdef HAVE_DIRENT_H
#  include <dirent.h>
#endif
#ifdef UNDER_CE
#  include <tchar.h>
#endif
45
46
47
#ifdef HAVE_SYS_STAT_H
# include <sys/stat.h>
#endif
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
48
49
50
51
52
#ifdef HAVE_FCNTL_H
# include <fcntl.h>
#endif
#ifdef WIN32
# include <io.h>
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
53
54
#else
# include <unistd.h>
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
55
#endif
56

57
58
59
#ifndef HAVE_LSTAT
# define lstat( a, b ) stat(a, b)
#endif
60

61
62
63
64
65
#ifdef __APPLE__
/* Define this if the OS always use UTF-8 internally */
# define ASSUME_UTF8 1
#endif

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
66
67
68
69
70
71
72
73
#if defined (ASSUME_UTF8)
/* Cool */
#elif defined (WIN32) || defined (UNDER_CE)
# define USE_MB2MB 1
#elif defined (HAVE_ICONV)
# define USE_ICONV 1
#else
# error No UTF8 charset conversion implemented on this platform!
74
75
#endif

76
#if defined (USE_ICONV)
77
static char charset[sizeof ("CSISO11SWEDISHFORNAMES//translit")] = "";
78

79
static void find_charset_once (void)
80
81
{
    char *psz_charset;
82
83
84
85
86
    if (vlc_current_charset (&psz_charset)
     || (psz_charset == NULL)
     || ((size_t)snprintf (charset, sizeof (charset), "%s//translit",
                           psz_charset) >= sizeof (charset)))
        strcpy (charset, "UTF-8");
87

88
    free (psz_charset);
89
90
}

91
static int find_charset (void)
92
{
93
94
    static pthread_once_t once = PTHREAD_ONCE_INIT;
    pthread_once (&once, find_charset_once);
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
95
    return !strcmp (charset, "UTF-8");
96
}
97
98
#endif

99

100
/**
 * Converts a string between UTF-8 and the local character encoding.
 *
 * @param string nul-terminated string to convert (NULL is accepted)
 * @param from VLC_TRUE to convert from the local encoding to UTF-8,
 *             VLC_FALSE to convert from UTF-8 to the local encoding
 *
 * @return NULL if string is NULL, if the Win32 conversion fails or if memory
 * cannot be allocated. In the iconv build with a UTF-8 charset, and in builds
 * without any conversion backend, the input pointer itself is returned.
 * Otherwise the result is a heap-allocated converted copy.
 */
static char *locale_fast (const char *string, vlc_bool_t from)
{
    /* Must come first: every branch below dereferences or measures string,
     * and returning here also avoids leaking an iconv handle. */
    if (string == NULL)
        return NULL;

#if defined (USE_ICONV)
    if (find_charset ())
        return (char *)string;

    vlc_iconv_t hd = vlc_iconv_open (from ? "UTF-8" : charset,
                                     from ? charset : "UTF-8");
    if (hd == (vlc_iconv_t)(-1))
        return strdup (string); /* Uho! */

    const char *iptr = string;
    size_t inb = strlen (string);
    /* Output buffer is sized at six bytes per input byte, plus the nul */
    size_t outb = inb * 6 + 1;
    char output[outb], *optr = output;

    while (vlc_iconv (hd, &iptr, &inb, &optr, &outb) == (size_t)(-1))
    {
        /* Replace the byte that could not be converted, skip it, and reset
         * the conversion state before resuming. */
        *optr++ = '?';
        outb--;
        iptr++;
        inb--;
        vlc_iconv (hd, NULL, NULL, NULL, NULL);
    }
    *optr = '\0';
    vlc_iconv_close (hd);

    assert (inb == 0);
    assert (*iptr == '\0');
    assert (*optr == '\0');
    assert (strlen (output) == (size_t)(optr - output));
    return strdup (output);
#elif defined (USE_MB2MB)
    char *out;
    int len;

    /* First pass: ask for the required size. 0 means the call failed. */
    len = MultiByteToWideChar (from ? CP_ACP : CP_UTF8,
                               0, string, -1, NULL, 0);
    if (len == 0)
        return NULL;
    len++;

    wchar_t wide[len];

    if (MultiByteToWideChar (from ? CP_ACP : CP_UTF8, 0, string, -1,
                             wide, len) == 0)
        return NULL; /* wide[] would be left uninitialised */

    len = WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1,
                               NULL, 0, NULL, NULL);
    if (len == 0)
        return NULL;
    len++;

    out = malloc (len);
    if (out == NULL)
        return NULL;

    if (WideCharToMultiByte (from ? CP_UTF8 : CP_ACP, 0, wide, -1,
                             out, len, NULL, NULL) == 0)
    {
        free (out); /* never hand back an unterminated buffer */
        return NULL;
    }
    return out;
#else
    (void)from;
    return (char *)string;
#endif
}


160
/**
 * Same conversion as locale_fast(), but the result is always a separate
 * heap-allocated string, even when no conversion is needed.
 *
 * @param string nul-terminated string to convert (NULL is accepted)
 * @param from VLC_TRUE to convert from the local encoding to UTF-8,
 *             VLC_FALSE to convert from UTF-8 to the local encoding
 *
 * @return a newly allocated string, or NULL if string is NULL or on error.
 */
static inline char *locale_dup (const char *string, vlc_bool_t from)
{
    /* strdup() must never be handed a NULL pointer */
    if (string == NULL)
        return NULL;

#if defined (USE_ICONV)
    if (find_charset ())
        return strdup (string);
    return locale_fast (string, from);
#elif defined (USE_MB2MB)
    return locale_fast (string, from);
#else
    (void)from;
    return strdup (string);
#endif
}
172
173
174
175
176


/**
 * LocaleFree: releases a string returned by FromLocale() or ToLocale().
 *
 * The pointer is passed to free() in the Win32 build, and in the iconv build
 * when the charset is not UTF-8. In all other configurations nothing is done.
 *
 * @param str string to release
 */
void LocaleFree (const char *str)
{
#if defined (USE_ICONV)
    if (find_charset ())
        return;

    free ((char *)str);
#elif defined (USE_MB2MB)
    free ((char *)str);
#else
    (void)str;
#endif
}

184

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
185
/**
186
 * FromLocale: converts a locale string to UTF-8
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
187
188
189
190
191
192
193
 *
 * @param locale nul-terminated string to be converted
 *
 * @return a nul-terminated UTF-8 string, or NULL in case of error.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
194
char *FromLocale (const char *locale)
195
{
196
    return locale_fast (locale, VLC_TRUE);
197
198
}

199
char *FromLocaleDup (const char *locale)
200
{
201
    return locale_dup (locale, VLC_TRUE);
202
203
204
}


Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
205
206
207
208
209
210
211
212
213
/**
 * ToLocale: converts a UTF-8 string to local system encoding.
 *
 * @param utf8 nul-terminated string to be converted
 *
 * @return a nul-terminated string, or NULL in case of error.
 * To avoid memory leak, you have to pass the result to LocaleFree()
 * when it is no longer needed.
 */
214
char *ToLocale (const char *utf8)
215
{
216
    return locale_fast (utf8, VLC_FALSE);
217
218
}

219

220
static char *ToLocaleDup (const char *utf8)
221
{
222
    return locale_dup (utf8, VLC_FALSE);
223
224
}

225

226
/**
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
227
 * utf8_open: open() wrapper for UTF-8 filenames
228
 */
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
229
int utf8_open (const char *filename, int flags, mode_t mode)
230
{
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
231
#if defined (WIN32) || defined (UNDER_CE)
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
232
    if (GetVersion() < 0x80000000)
233
    {
dionoea's avatar
dionoea committed
234
        /* for Windows NT and above */
235
236
        wchar_t wpath[MAX_PATH + 1];

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
237
        if (!MultiByteToWideChar (CP_UTF8, 0, filename, -1, wpath, MAX_PATH))
238
239
        {
            errno = ENOENT;
240
            return -1;
241
242
243
        }
        wpath[MAX_PATH] = L'\0';

dionoea's avatar
dionoea committed
244
        /*
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
245
246
         * open() cannot open files with non-“ANSI” characters on Windows.
         * We use _wopen() instead. Same thing for mkdir() and stat().
dionoea's avatar
dionoea committed
247
         */
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
248
        return _wopen (wpath, flags, mode);
249
    }
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
250
#endif
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
251
    const char *local_name = ToLocale (filename);
252

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
253
    if (local_name == NULL)
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
254
255
    {
        errno = ENOENT;
256
        return -1;
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
    }

    int fd = open (local_name, flags, mode);
    LocaleFree (local_name);
    return fd;
}

/**
 * utf8_fopen: fopen() wrapper for UTF-8 filenames
 *
 * The mode string is translated into open() flags, the file is opened with
 * utf8_open() (creation mode 0666) and the descriptor is then wrapped with
 * fdopen() using the original mode string.
 *
 * @param filename UTF-8 name of the file to open
 * @param mode fopen()-style mode string; 'r', 'w', 'a' and '+' are
 *        recognised, plus 't' where O_TEXT exists and 'b' where O_BINARY
 *        exists. Other characters are ignored here.
 *
 * @return the stream, or NULL on error (errno is preserved across the
 * internal cleanup).
 */
FILE *utf8_fopen (const char *filename, const char *mode)
{
    int rwflags = 0, oflags = 0;
    vlc_bool_t append = VLC_FALSE;

    for (const char *ptr = mode; *ptr; ptr++)
    {
        switch (*ptr)
        {
            case 'r':
                rwflags = O_RDONLY;
                break;

            case 'a':
                rwflags = O_WRONLY;
                oflags |= O_CREAT;
#ifdef O_APPEND
                /* Every write must land at end-of-file, not only the
                 * first one after the initial seek. */
                oflags |= O_APPEND;
#endif
                append = VLC_TRUE;
                break;

            case 'w':
                rwflags = O_WRONLY;
                oflags |= O_CREAT | O_TRUNC;
                break;

            case '+':
                rwflags = O_RDWR;
                break;

#ifdef O_TEXT
            case 't':
# ifdef O_BINARY
                oflags &= ~O_BINARY;
# endif
                oflags |= O_TEXT;
                break;
#endif

#ifdef O_BINARY
            case 'b':
# ifdef O_TEXT
                oflags &= ~O_TEXT;
# endif
                oflags |= O_BINARY;
                break;
#endif

            default:
                /* Unknown characters are left for fdopen() to judge */
                break;
        }
    }

    int fd = utf8_open (filename, rwflags | oflags, 0666);
    if (fd == -1)
        return NULL;

    if (append && (lseek (fd, 0, SEEK_END) == -1))
    {
        int saved_errno = errno;
        close (fd);
        errno = saved_errno;
        return NULL;
    }

    FILE *stream = fdopen (fd, mode);
    if (stream == NULL)
    {
        /* Do not let close() hide the reason why fdopen() failed */
        int saved_errno = errno;
        close (fd);
        errno = saved_errno;
    }

    return stream;
}
319

320
/**
321
 * utf8_mkdir: Calls mkdir() after conversion of file name to OS locale
322
323
324
325
326
 *
 * @param dirname a UTF-8 string with the name of the directory that you
 *        want to create.
 * @return A 0 return value indicates success. A -1 return value indicates an
 *        error, and an error code is stored in errno
327
 */
328
329
int utf8_mkdir( const char *dirname )
{
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
330
#if defined (UNDER_CE) || defined (WIN32)
331
332
    wchar_t wname[MAX_PATH + 1];
    char mod[MAX_PATH + 1];
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
    int i;

    /* Convert '/' into '\' */
    for( i = 0; *dirname; i++ )
    {
        if( i == MAX_PATH )
            return -1; /* overflow */

        if( *dirname == '/' )
            mod[i] = '\\';
        else
            mod[i] = *dirname;
        dirname++;

    }
    mod[i] = 0;

    if( MultiByteToWideChar( CP_UTF8, 0, mod, -1, wname, MAX_PATH ) == 0 )
    {
        errno = ENOENT;
        return -1;
    }
355
    wname[MAX_PATH] = L'\0';
356
357
358
359
360

    if( CreateDirectoryW( wname, NULL ) == 0 )
    {
        if( GetLastError( ) == ERROR_ALREADY_EXISTS )
            errno = EEXIST;
Christophe Mutricy's avatar
Christophe Mutricy committed
361
362
        else
            errno = ENOENT;
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
        return -1;
    }
    return 0;
#else
    char *locname = ToLocale( dirname );
    int res;

    if( locname == NULL )
    {
        errno = ENOENT;
        return -1;
    }
    res = mkdir( locname, 0755 );

    LocaleFree( locname );
    return res;
#endif
}

382
383
384
385
386
/**
 * utf8_opendir: wrapper that converts dirname to the locale in use by the OS
 *
 * @param dirname UTF-8 representation of the directory name
 *
387
388
 * @return a pointer to the DIR struct. Release with closedir().
 */
389
DIR *utf8_opendir( const char *dirname )
390
{
391
392
393
394
395
396
397
398
399
#ifdef WIN32
    wchar_t wname[MAX_PATH + 1];

    if (MultiByteToWideChar (CP_UTF8, 0, dirname, -1, wname, MAX_PATH))
    {
        wname[MAX_PATH] = L'\0';
        return (DIR *)vlc_wopendir (wname);
    }
#else
400
401
402
403
    const char *local_name = ToLocale( dirname );

    if( local_name != NULL )
    {
404
        DIR *dir = opendir( local_name );
405
406
407
        LocaleFree( local_name );
        return dir;
    }
408
409
410
#endif

    errno = ENOENT;
411
412
413
    return NULL;
}

414
415
416
417
418
419
/**
 * utf8_readdir: a readdir wrapper that returns the name of the next entry
 *     in the directory as a UTF-8 string.
 *
 * @param dir The directory that is being read
 *
 * @return a UTF-8 string of the directory entry, or NULL when the underlying
 * read returns no entry. Use free() to free this memory.
 */
char *utf8_readdir( DIR *dir )
{
#ifdef WIN32
    struct _wdirent *wentry = vlc_wreaddir (dir);

    return (wentry != NULL) ? FromWide (wentry->d_name) : NULL;
#else
    struct dirent *entry = readdir( (DIR *)dir );

    return ( entry != NULL ) ? vlc_fix_readdir( entry->d_name ) : NULL;
#endif
}

441
442
443
444
445
446
/**
 * Default filter for utf8_loaddir(): accepts every entry.
 *
 * @param str entry name (ignored)
 *
 * @return always 1.
 */
static int dummy_select( const char *str )
{
    const int accepted = 1;

    (void)str; /* the name plays no part in the decision */
    return accepted;
}

447
int utf8_loaddir( DIR *dir, char ***namelist,
448
449
450
451
452
453
454
455
456
457
458
                  int (*select)( const char * ),
                  int (*compar)( const char **, const char ** ) )
{
    if( select == NULL )
        select = dummy_select;

    if( dir == NULL )
        return -1;
    else
    {
        char **tab = NULL;
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
459
        char *entry;
460
461
        unsigned num = 0;

462
463
        rewinddir( dir );

464
465
466
467
        while( ( entry = utf8_readdir( dir ) ) != NULL )
        {
            char **newtab;

468
            if( !select( entry ) )
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
469
            {
470
                free( entry );
471
                continue;
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
472
            }
473
474
475

            newtab = realloc( tab, sizeof( char * ) * (num + 1) );
            if( newtab == NULL )
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
476
            {
477
                free( entry );
478
                goto error;
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
479
            }
480
            tab = newtab;
481
            tab[num++] = entry;
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
        }

        if( compar != NULL )
            qsort( tab, num, sizeof( tab[0] ),
                   (int (*)( const void *, const void *))compar );

        *namelist = tab;
        return num;

    error:{
        unsigned i;

        for( i = 0; i < num; i++ )
            free( tab[i] );
        if( tab != NULL )
            free( tab );
498
        }
499
    }
500
    return -1;
501
502
}

503
504
505
506
507
508
509
510
511
512
513
514
515
516
/**
 * utf8_scandir: opens a directory given by its UTF-8 name, loads its entries
 * with utf8_loaddir() and closes it again.
 *
 * @param dirname UTF-8 name of the directory
 * @param namelist passed through to utf8_loaddir()
 * @param select passed through to utf8_loaddir()
 * @param compar passed through to utf8_loaddir()
 *
 * @return the value returned by utf8_loaddir(), or -1 if the directory
 * could not be opened.
 */
int utf8_scandir( const char *dirname, char ***namelist,
                  int (*select)( const char * ),
                  int (*compar)( const char **, const char ** ) )
{
    DIR *dir = utf8_opendir (dirname);

    if (dir == NULL)
        return -1;

    int count = utf8_loaddir (dir, namelist, select, compar);

    closedir (dir);
    return count;
}
517

518
static int utf8_statEx( const char *filename, struct stat *buf,
519
520
                        vlc_bool_t deref )
{
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
521
#if defined (WIN32) || defined (UNDER_CE)
522
523
524
    /* retrieve Windows OS version */
    if( GetVersion() < 0x80000000 )
    {
dionoea's avatar
dionoea committed
525
        /* for Windows NT and above */
526
527
528
529
530
531
532
533
534
        wchar_t wpath[MAX_PATH + 1];

        if( !MultiByteToWideChar( CP_UTF8, 0, filename, -1, wpath, MAX_PATH ) )
        {
            errno = ENOENT;
            return -1;
        }
        wpath[MAX_PATH] = L'\0';

535
        return _wstati64( wpath, buf );
536
    }
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
537
538
539
#endif
#ifdef HAVE_SYS_STAT_H
    const char *local_name = ToLocale( filename );
540

Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
541
542
    if( local_name != NULL )
    {
543
544
        int res = deref ? stat( local_name, buf )
                       : lstat( local_name, buf );
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
545
546
        LocaleFree( local_name );
        return res;
547
    }
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
548
    errno = ENOENT;
549
#endif
Rémi Denis-Courmont's avatar
Rémi Denis-Courmont committed
550
    return -1;
551
552
553
}


554
int utf8_stat( const char *filename, struct stat *buf)
555
556
557
558
{
    return utf8_statEx( filename, buf, VLC_TRUE );
}

559
int utf8_lstat( const char *filename, struct stat *buf)
560
561
562
563
{
    return utf8_statEx( filename, buf, VLC_FALSE );
}

564
/**
 * utf8_*printf: *printf with conversion from UTF-8 to local encoding
 *
 * utf8_vasprintf formats into a temporary UTF-8 string with vasprintf() and
 * stores its conversion to the local encoding in *str.
 *
 * @param str receives the converted, heap-allocated string on success
 * @param fmt printf-style format string
 * @param ap arguments for the format string
 *
 * @return the value returned by vasprintf() (length of the UTF-8 text, not
 * of the converted text), or -1 if formatting or conversion failed.
 */
static int utf8_vasprintf( char **str, const char *fmt, va_list ap )
{
    char *utf8;
    int res = vasprintf( &utf8, fmt, ap );
    if( res == -1 )
        return -1;

    *str = ToLocaleDup( utf8 );
    free( utf8 );

    /* A failed conversion must not be reported as success: callers use
     * *str without further checks. */
    if( *str == NULL )
        return -1;

    return res;
}

579
/**
 * utf8_vfprintf: vfprintf() with conversion from UTF-8 to local encoding.
 *
 * @param stream stream to write to
 * @param fmt printf-style format string (UTF-8)
 * @param ap arguments for the format string
 *
 * @return the value returned by utf8_vasprintf(), or -1 if formatting,
 * conversion or writing failed.
 */
int utf8_vfprintf( FILE *stream, const char *fmt, va_list ap )
{
    char *str = NULL;
    int res = utf8_vasprintf( &str, fmt, ap );

    /* str is also checked so that fputs() never receives a NULL pointer */
    if( res == -1 || str == NULL )
        return -1;

    if( fputs( str, stream ) == EOF )
        res = -1; /* report write errors instead of a bogus length */

    free( str );
    return res;
}

/**
 * utf8_fprintf: fprintf() with conversion from UTF-8 to local encoding.
 *
 * @param stream stream to write to
 * @param fmt printf-style format string (UTF-8), followed by its arguments
 *
 * @return the value returned by utf8_vfprintf().
 */
int utf8_fprintf( FILE *stream, const char *fmt, ... )
{
    va_list args;

    va_start( args, fmt );
    int written = utf8_vfprintf( stream, fmt, args );
    va_end( args );

    return written;
}

602
603

/**
 * CheckUTF8: checks a nul-terminated byte string for valid UTF-8 and
 * optionally repairs it in place.
 *
 * A sequence is rejected when its first byte is a continuation byte or
 * 0xFE/0xFF, when a continuation byte is missing, or when the sequence is
 * overlong (the code point would fit in a shorter sequence). Sequences of
 * up to 6 bytes are accepted; code point ranges are not checked otherwise.
 *
 * @param str nul-terminated string to check (must not be NULL)
 * @param rep replacement byte written over each offending byte, or 0 to
 *        stop at the first error without modifying the string
 *
 * @return str if the whole string was valid, NULL otherwise.
 */
static char *CheckUTF8( char *str, char rep )
{
    uint8_t *ptr = (uint8_t *)str;
    assert (str != NULL);

    for (;;)
    {
        uint8_t c = ptr[0];
        int charlen = -1;

        if (c == '\0')
            break;

        /* charlen = number of leading 1 bits before the first 0 bit:
         * 0 for ASCII, 1 for a continuation byte, 2..6 for a lead byte,
         * and -1 (unchanged) for 0xFE and 0xFF. */
        for (int i = 0; i < 7; i++)
            if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1))
            {
                charlen = i;
                break;
            }

        switch (charlen)
        {
            case 0: // 7-bit ASCII character -> OK
                ptr++;
                continue;

            case -1: // 1111111x -> error
            case 1: // continuation byte -> error
                goto error;

            default:
                break;
        }

        assert (charlen >= 2);

        /* Payload bits of the lead byte (everything after the length
         * prefix and its terminating 0 bit) */
        uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen));
        for (int i = 1; i < charlen; i++)
        {
            assert (cp < (1 << 26));
            c = ptr[i];

            if ((c == '\0') // unexpected end of string
             || ((c >> 6) != 2)) // not a continuation byte
                goto error;

            cp = (cp << 6) | (ptr[i] & 0x3f);
        }

        if (cp < 128) // overlong (special case for ASCII)
            goto error;
        /* A sequence of N >= 3 bytes is overlong when the code point fits in
         * N-1 bytes, i.e. is below 2^(5N-4): 0x800, 0x10000, 0x200000,
         * 0x4000000. For N == 2 the ASCII test above is the real bound. */
        if (cp < (1u << (5 * charlen - 4))) // overlong
            goto error;

        ptr += charlen;
        continue;

    error:
        if (rep == 0)
            return NULL;
        /* Replace one byte and resume at the next one; the final result
         * stays NULL from now on. */
        *ptr++ = rep;
        str = NULL;
    }

    return str;
}
666

667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
/**
 * EnsureUTF8: makes a string valid UTF-8 by overwriting, in place, the bytes
 * of invalid or overlong sequences with question marks.
 * No attempt is made to reinterpret the input as Latin-1 and convert it:
 * that cannot be done on the fly, even though it would be less disruptive.
 *
 * @param str nul-terminated string to check and repair
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
char *EnsureUTF8( char *str )
{
    const char replacement = '?';

    return CheckUTF8( str, replacement );
}


/**
 * IsUTF8: checks whether a string is a valid UTF-8 byte sequence.
 * The check is done by CheckUTF8() with a replacement byte of 0.
 *
 * @param str nul-terminated string to be checked
 *
 * @return str if it was valid UTF-8, NULL if not.
 */
const char *IsUTF8( const char *str )
{
    char *checked = CheckUTF8( (char *)str, 0 );

    return checked;
}