Changeset 9709cd067802994f8d40040f31c8527687c7c479

Show
Ignore:
Timestamp:
03/13/07 22:37:55 (1 year ago)
Author:
Rémi Denis-Courmont <rem@videolan.org>
git-committer:
Rémi Denis-Courmont <rem@videolan.org> 1173821875 +0000
git-parent:

[cf9e0c81162d09b7ee079d512436715fd243ffad]

git-author:
Rémi Denis-Courmont <rem@videolan.org> 1173821875 +0000
Message:

- Fix POSIX locale handling (well you'd better not fall in that case though)
- Split Windows CP "guessing" from real system charset determination

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • src/Makefile.am

    r9ddd87d r9709cd0  
    280280    text/strings.c \ 
    281281    text/unicode.c \ 
     282    text/wincp.c \ 
    282283    text/iso_lang.c \ 
    283284    text/iso-639_def.h \ 
  • src/text/charset.c

    r1e4db90 r9709cd0  
    371371 
    372372 
    /**
     * Looks a two-letter code up in a table of concatenated two-letter codes.
     *
     * @param tab nul-terminated string made of consecutive two-character codes
     *            (its length must be even)
     * @param locale string of which the first two bytes are compared; the
     *               caller must guarantee that at least two bytes are readable
     * @return 0 if the first two bytes of locale match one of the codes,
     *         1 otherwise (note the strcmp()-like inverted convention)
     */
    static inline int locale_match (const char *tab, const char *locale)
    {
        for (; *tab; tab += 2)
            if (memcmp (tab, locale, 2) == 0)
                return 0;
        return 1;
    }


    /**
     * @param locale locale name starting with a two-letter ISO 639-1 language
     *               code, optionally followed by "_" and a two-letter territory
     *               (e.g. "fr", "zh_TW", "ru_RU.KOI8-R"); may be NULL
     * @return a fallback characters encoding to be used, given a locale.
     *         "ASCII" is returned if the locale is NULL, shorter than two
     *         characters (e.g. "C"), or not recognized. The returned string is
     *         a constant and must not be freed.
     */
    static const char *FindFallbackEncoding (const char *locale)
    {
        if ((locale == NULL) || (strlen (locale) < 2))
            return "ASCII";


        /*** The ISO-8859 series (anything but Asia) ***/
        // Latin-1 Western-European languages (ISO-8859-1)
        static const char western[] =
            "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
            "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
            "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
            "eo" "mt" "cy";
        if (!locale_match (western, locale))
            return "CP1252"; // Compatible Microsoft superset

        // Latin-2 Slavic languages (ISO-8859-2)
        static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
        if (!locale_match (slavic, locale))
            return "CP1250"; // CP1250 is more common, but incompatible

        // Latin-3 Southern European languages (ISO-8859-3)
        // "eo" and "mt" -> Latin-1 instead, I presume(?).
        // "tr" -> ISO-8859-9 instead

        // Latin-4 North-European languages (ISO-8859-4)
        // -> Latin-1 instead

        /* Cyrillic alphabet languages (ISO-8859-5) */
        static const char cyrillic[] = "be" "bg" "mk" "ru" "sr";
        if (!locale_match (cyrillic, locale))
            return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)

        /* Arabic (ISO-8859-6) */
        if (!locale_match ("ar", locale))
            // FIXME: someone check if we should return CP1256 or ISO-8859-6
            return "CP1256"; // CP1256 is(?) more common, but incompatible(?)

        /* Greek (ISO-8859-7) */
        if (!locale_match ("el", locale))
            // FIXME: someone check if we should return CP1253 or ISO-8859-7
            return "CP1253"; // CP1253 is(?) more common and less incompatible

        /* Hebrew (ISO-8859-8) */
        if (!locale_match ("he" "iw" "yi", locale))
            return "CP1255"; // Compatible Microsoft superset

        /* Latin-5 Turkish (ISO-8859-9) */
        if (!locale_match ("tr" "ku", locale))
            return "CP1254"; // Compatible Microsoft superset

        /* Latin-6 "North-European" languages (ISO-8859-10) */
        /* It is so much north European that glibc only uses that for Luganda
         * which is spoken in Uganda... unless someone complains, I'm not
         * using this one; let's fallback to CP1252 here. */

        // ISO-8859-11 does arguably not exist. Thai is handled below.

        // ISO-8859-12 really doesn't exist.

        // Latin-7 Baltic languages (ISO-8859-13)
        if (!locale_match ("lt" "lv" "mi", locale))
            // FIXME: mi = New Zealand, doesn't sound baltic!
            return "CP1257"; // Compatible Microsoft superset

        // Latin-8 Celtic languages (ISO-8859-14)
        // "cy" -> use Latin-1 instead (most likely English or French)

        // Latin-9 (ISO-8859-15) -> see Latin-1

        // Latin-10 (ISO-8859-16) does not seem to be used

        /*** KOI series ***/
        // For Russian, we use CP1251
        if (!locale_match ("uk", locale))
            return "KOI8-U";

        if (!locale_match ("tg", locale))
            return "KOI8-T";

        /*** Asia ***/
        // Japanese (the ISO 639-1 language code is "ja"; "jp" is the country)
        if (!locale_match ("ja", locale))
            return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP

        // Korean
        if (!locale_match ("ko", locale))
            return "EUC-KR";

        // Thai
        if (!locale_match ("th", locale))
            return "TIS-620";

        // Vietnamese (FIXME: more infos needed)
        // The ISO 639-1 language code is "vi".
        if (!locale_match ("vi", locale))
            /* VISCII is probably a bad idea as it is not extended ASCII */
            /* glibc has TCVN5712-1 */
            return "CP1258";

        /* Kazakh (FIXME: more infos needed) */
        if (!locale_match ("kk", locale))
            return "PT154";

        // Chinese. The politically incompatible character sets.
        if (!locale_match ("zh", locale))
        {
            // Skip "zh_" so that the two-letter territory code gets matched
            // below. Without a territory, locale still points to "zh", which
            // matches neither "HK" nor "TW".
            if ((strlen (locale) >= 5) && (locale[2] == '_'))
                locale += 3;

            // Hong Kong
            if (!locale_match ("HK", locale))
                return "BIG5-HKSCS"; /* FIXME: use something else? */

            // Taiwan island
            if (!locale_match ("TW", locale))
                return "BIG5";

            // People's Republic of China and Singapore
            /*
             * GB18030 can represent any Unicode code point
             * (like UTF-8), while remaining compatible with GBK
             * FIXME: is it compatible with GB2312? if not, should we
             * use GB2312 instead?
             */
            return "GB18030";
        }

        return "ASCII";
    }
    514  
    /**
     * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
     * text files according to the system's locale settings. It is only a best
     * guess.
     *
     * On non-Windows systems, the locale name is taken from setlocale() when
     * available, otherwise from the first non-empty environment variable among
     * LC_ALL, LC_CTYPE and LANG, and is then mapped by FindFallbackEncoding().
     * On Windows, the result is "CP" followed by the ANSI code page number.
     *
     * @return a nul-terminated encoding name; the caller must not free it.
     */
    const char *GetFallbackEncoding( void )
    {
    #ifndef WIN32
        const char *psz_lang = NULL;

        /* Some systems (like Darwin, SunOS 4 or DJGPP) have only the C locale.
         * Therefore we don't use setlocale here; it would return "C". */
    #  if defined (HAVE_SETLOCALE) && !defined ( __APPLE__)
        psz_lang = setlocale( LC_ALL, NULL );
    #  endif
        if( psz_lang == NULL || psz_lang[0] == '\0' )
        {
            psz_lang = getenv( "LC_ALL" );
            /* Test the first character, not the pointer itself: an empty
             * LC_ALL must fall through to LC_CTYPE and LANG. */
            if( psz_lang == NULL || psz_lang[0] == '\0' )
            {
                psz_lang = getenv( "LC_CTYPE" );
                if( psz_lang == NULL || psz_lang[0] == '\0')
                    psz_lang = getenv( "LANG" );
            }
        }

        /* psz_lang may still be NULL here; FindFallbackEncoding() is given
         * whatever was found. */
        return FindFallbackEncoding( psz_lang );
    #else
        /*
         * This should be thread-safe given GetACP() should always return
         * the same result.
         */
        static char buf[2 + 10 + 1] = "";

        /* Format the code page name only once; later calls reuse the buffer. */
        if( buf[0] == 0 )
            snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
        return buf;
    #endif
    }
    554  
    555373/** 
    556374 * There are two decimal separators in the computer world-wide locales: