/*****************************************************************************
 * wincp.c: Guessing "local" ANSI code page on Microsoft Windows®
 *****************************************************************************
 *
 * Copyright © 2006-2007 Rémi Denis-Courmont
 * $Id$
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*** We need your help to complete this file! Look for the FIXME comments. ***/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <vlc_common.h>

#ifndef WIN32
# include <locale.h>
#else
# include <windows.h>
#endif

#ifdef __APPLE__
#   include <string.h>
#endif

#include <vlc_charset.h>


#ifndef WIN32 /* should work on Win32, but useless */
/**
 * Looks up the first two bytes of a locale name in a table of two-letter
 * codes stored back to back (e.g. "de" "en" "fr" -> "deenfr").
 * The comparison is byte-wise, hence case-sensitive.
 *
 * @param tab nul-terminated concatenation of two-letter codes
 * @param locale string of which at least two bytes must be readable
 * @return 0 if one of the codes matches, 1 otherwise (strcmp()-like)
 */
static inline int locale_match (const char *tab, const char *locale)
{
    const char *entry = tab;

    while (*entry != '\0')
    {
        if (!memcmp (entry, locale, 2))
            return 0; /* found */
        entry += 2; /* next two-letter code */
    }
    return 1; /* table exhausted without a match */
}


/**
 * Guesses a legacy (non-Unicode) character encoding from a locale name.
 *
 * Only the first two characters of the locale (the ISO 639-1 language code)
 * are examined, except for Chinese where the two-letter territory that
 * follows the separator selects between the Big5 and GB families.
 *
 * @param locale locale name such as "fr_FR.UTF-8"; may be NULL
 * @return a fallback characters encoding to be used, given a locale.
 *         The result is always a string literal (never NULL, never freed).
 */
static const char *FindFallbackEncoding (const char *locale)
{
    /* No usable language code (NULL, "", "C", ...) or the POSIX locale */
    if ((locale == NULL) || (strlen (locale) < 2)
     || !strcasecmp (locale, "POSIX"))
        return "CP1252"; /* Yeah, this is totally western-biased */


    /*** The ISO-8859 series (anything but Asia) ***/
    // Latin-1 Western-European languages (ISO-8859-1)
    static const char western[] =
        "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
        "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
        "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
        "eo" "mt" "cy";
    if (!locale_match (western, locale))
        return "CP1252"; // Compatible Microsoft superset

    // Latin-2 Slavic languages (ISO-8859-2)
    static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
    if (!locale_match (slavic, locale))
        return "CP1250"; // CP1250 is more common, but incompatible

    // Latin-3 Southern European languages (ISO-8859-3)
    // "eo" and "mt" -> Latin-1 instead, I presume(?).
    // "tr" -> ISO-8859-9 instead

    // Latin-4 North-European languages (ISO-8859-4)
    // -> Latin-1 instead

    /* Cyrillic alphabet languages (ISO-8859-5) */
    static const char cyrillic[] = "be" "bg" "mk" "ru" "sr" "mn";
    // FIXME: cyrillic only true for mn in Mongolia
    if (!locale_match (cyrillic, locale))
        return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)

    /* Arabic (ISO-8859-6) */
    static const char arabic[] = "ar" "fa";
    if (!locale_match (arabic, locale))
        // FIXME: someone check if we should return CP1256 or ISO-8859-6
        return "CP1256"; // CP1256 is(?) more common, but incompatible(?)

    /* Greek (ISO-8859-7) */
    if (!locale_match ("el", locale))
        // FIXME: someone check if we should return CP1253 or ISO-8859-7
        return "CP1253"; // CP1253 is(?) more common and less incompatible

    /* Hebrew (ISO-8859-8) */
    if (!locale_match ("he" "iw" "yi", locale))
        return "ISO-8859-8"; // CP1255 is reportedly screwed up

    /* Latin-5 Turkish (ISO-8859-9) */
    if (!locale_match ("tr" "ku", locale))
        return "CP1254"; // Compatible Microsoft superset

    /* Latin-6 “North-European” languages (ISO-8859-10) */
    /* It is so much north European that glibc only uses that for Luganda
     * which is spoken in Uganda... unless someone complains, I'm not
     * using this one; let's fallback to CP1252 here. */

    // ISO-8859-11 does arguably not exist. Thai is handled below.

    // ISO-8859-12 really doesn't exist.

    // Latin-7 Baltic languages (ISO-8859-13)
    if (!locale_match ("lt" "lv" "mi", locale))
        // FIXME: mi (Maori, New Zealand) is not Baltic; presumably listed
        // because this charset has the macron vowels - to be confirmed.
        return "CP1257"; // Compatible Microsoft superset

    // Latin-8 Celtic languages (ISO-8859-14)
    // "cy" -> use Latin-1 instead (most likely English or French)

    // Latin-9 (ISO-8859-15) -> see Latin-1

    // Latin-10 (ISO-8859-16) does not seem to be used

    /*** KOI series ***/
    // For Russian, we use CP1251
    if (!locale_match ("uk", locale))
        return "KOI8-U";

    if (!locale_match ("tg", locale))
        return "KOI8-T";

    /*** Asia ***/
    // Japanese: "ja" is the ISO 639-1 language code. "jp" (the country
    // code, which this function historically tested) is still accepted.
    if (!locale_match ("ja" "jp", locale))
        return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP

    // Korean
    if (!locale_match ("ko", locale))
        return "CP949"; // Microsoft non-standard superset of EUC-KR

    // Thai
    static const char thai[] = "th" "km" "lo";
    // FIXME: afaik, Khmer and Lao are/were not in Windows and are close
    // to Thai
    if (!locale_match (thai, locale))
        return "TIS-620";

    // Vietnamese (FIXME: more infos needed): "vi" is the ISO 639-1 code.
    // The historical misspelling "vt" is still accepted.
    if (!locale_match ("vi" "vt", locale))
        /* VISCII is probably a bad idea as it is not extended ASCII */
        /* glibc has TCVN5712-1 */
        return "CP1258";

    /* Kazakh (FIXME: more infos needed) */
    if (!locale_match ("kk", locale))
        return "PT154";

    // Chinese. The politically incompatible character sets.
    if (!locale_match ("zh", locale))
    {
        /* Skip the language code and its separator to reach the territory.
         * POSIX locale names use '_' (zh_TW); '-' (zh-TW) is accepted too.
         * The length check guarantees two readable bytes after the skip. */
        if ((strlen (locale) >= 5)
         && ((locale[2] == '_') || (locale[2] == '-')))
            locale += 3;

        // Hong Kong
        if (!locale_match ("HK", locale))
            return "BIG5-HKSCS"; /* FIXME: use something else? */

        // Taiwan island
        if (!locale_match ("TW", locale))
            return "BIG5";

        // People's Republic of China and Singapore
        /*
         * GB18030 can represent any Unicode code point
         * (like UTF-8), while remaining compatible with GBK
         * FIXME: is it compatible with GB2312? if not, should we
         * use GB2312 instead?
         */
        return "GB18030";
    }

    /* Unknown language: stay on the safe side */
    return "ASCII";
}
#endif

/**
 * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
 * text files according to the system's locale settings. It is only a best
 * guess.
 *
 * On Windows, the name is derived from the active ANSI code page. Elsewhere,
 * it is derived from the first non-empty variable among LC_ALL, LC_CTYPE
 * and LANG.
 *
 * @return an encoding name in static storage; the caller must neither
 *         modify nor free it.
 */
const char *GetFallbackEncoding( void )
{
#ifndef WIN32
    const char *psz_lang;

    /* Same precedence as the C library: LC_ALL, then LC_CTYPE, then LANG */
    psz_lang = getenv ("LC_ALL");
    if ((psz_lang == NULL) || !*psz_lang)
    {
        psz_lang = getenv ("LC_CTYPE");
        if ((psz_lang == NULL) || !*psz_lang)
            psz_lang = getenv ("LANG");
    }

    /* psz_lang may still be NULL here; FindFallbackEncoding() copes */
    return FindFallbackEncoding (psz_lang);
#else
    /* Computed once, on first call. NOTE: this lazy initialisation is not
     * synchronized between threads. */
    static char buf[16] = "";

    if (buf[0] == 0)
    {
        /* GetACP() returns an unsigned value; keep it unsigned so that it
         * matches the %u conversion below. */
        unsigned cp = GetACP ();

        switch (cp)
        {
            case 1255: // Hebrew, CP1255 screws up somewhat
                strcpy (buf, "ISO-8859-8");
                break;
            default:
                snprintf (buf, sizeof (buf), "CP%u", cp);
                break;
        }
    }
    return buf;
#endif
}