/**
 * Tests whether a locale name starts with one of the two-letter codes
 * listed in a table.
 *
 * @param tab    NUL-terminated concatenation of two-character codes
 *               (e.g. "he" "iw" "yi"); it is walked two bytes at a time,
 *               so its length must be even
 * @param locale locale name; its first two bytes are read, so the caller
 *               must guarantee that it is at least two characters long
 * @return 0 if the first two bytes of the locale equal one of the codes,
 *         1 otherwise (beware: 0 means "match", as with memcmp())
 */
static inline int locale_match (const char *tab, const char *locale)
{
    for (;*tab; tab += 2)
        if (memcmp (tab, locale, 2) == 0)
            return 0;
    return 1;
}


/**
 * Picks a legacy (non-Unicode) character encoding from a locale name.
 *
 * Only the leading two-letter language code is looked at, except for
 * Chinese where the territory code following "zh_" or "zh-" is used to
 * tell the Hong Kong and Taiwan character sets from the default one.
 *
 * @param locale locale name such as "fr_FR.UTF-8"; may be NULL
 * @return a pointer to a constant string naming the fallback character
 *         encoding; "ASCII" if the locale is NULL, shorter than two
 *         characters, or its language is not listed here.
 */
static const char *FindFallbackEncoding (const char *locale)
{
    if ((locale == NULL) || (strlen (locale) < 2))
        return "ASCII";


    /*** The ISO-8859 series (anything but Asia) ***/
    // Latin-1 Western-European languages (ISO-8859-1)
    static const char western[] =
        "aa" "af" "an" "br" "ca" "da" "de" "en" "es" "et" "eu" "fi" "fo" "fr"
        "ga" "gd" "gl" "gv" "id" "is" "it" "kl" "kw" "mg" "ms" "nb" "nl" "nn"
        "no" "oc" "om" "pt" "so" "sq" "st" "sv" "tl" "uz" "wa" "xh" "zu"
        "eo" "mt" "cy";
    if (!locale_match (western, locale))
        return "CP1252"; // Compatible Microsoft superset

    // Latin-2 Slavic languages (ISO-8859-2)
    static const char slavic[] = "bs" "cs" "hr" "hu" "pl" "ro" "sk" "sl";
    if (!locale_match (slavic, locale))
        return "CP1250"; // CP1250 is more common, but incompatible

    // Latin-3 Southern European languages (ISO-8859-3)
    // "eo" and "mt" -> Latin-1 instead, I presume(?).
    // "tr" -> ISO-8859-9 instead

    // Latin-4 North-European languages (ISO-8859-4)
    // -> Latin-1 instead

    /* Cyrillic alphabet languages (ISO-8859-5) */
    static const char cyrillic[] = "be" "bg" "mk" "ru" "sr";
    if (!locale_match (cyrillic, locale))
        return "CP1251"; // KOI8, ISO-8859-5 and CP1251 are incompatible(?)

    /* Arabic (ISO-8859-6) */
    if (!locale_match ("ar", locale))
        // FIXME: someone check if we should return CP1256 or ISO-8859-6
        return "CP1256"; // CP1256 is(?) more common, but incompatible(?)

    /* Greek (ISO-8859-7) */
    if (!locale_match ("el", locale))
        // FIXME: someone check if we should return CP1253 or ISO-8859-7
        return "CP1253"; // CP1253 is(?) more common and less incompatible

    /* Hebrew (ISO-8859-8) */
    if (!locale_match ("he" "iw" "yi", locale))
        return "CP1255"; // Compatible Microsoft superset

    /* Latin-5 Turkish (ISO-8859-9) */
    if (!locale_match ("tr" "ku", locale))
        return "CP1254"; // Compatible Microsoft superset

    /* Latin-6 "North-European" languages (ISO-8859-10) */
    /* It is so much north European that glibc only uses that for Luganda
     * which is spoken in Uganda... unless someone complains, I'm not
     * using this one; let's fallback to CP1252 here. */

    // ISO-8859-11 does arguably not exist. Thai is handled below.

    // ISO-8859-12 really doesn't exist.

    // Latin-7 Baltic languages (ISO-8859-13)
    if (!locale_match ("lt" "lv" "mi", locale))
        // FIXME: mi = New Zealand, doesn't sound baltic!
        return "CP1257"; // Compatible Microsoft superset

    // Latin-8 Celtic languages (ISO-8859-14)
    // "cy" -> use Latin-1 instead (most likely English or French)

    // Latin-9 (ISO-8859-15) -> see Latin-1

    // Latin-10 (ISO-8859-16) does not seem to be used

    /*** KOI series ***/
    // For Russian, we use CP1251
    if (!locale_match ("uk", locale))
        return "KOI8-U";

    if (!locale_match ("tg", locale))
        return "KOI8-T";

    /*** Asia ***/
    // Japanese: the ISO 639-1 language code is "ja" ("jp" is the country)
    if (!locale_match ("ja", locale))
        return "SHIFT-JIS"; // Shift-JIS is way more common than EUC-JP

    // Korean
    if (!locale_match ("ko", locale))
        return "EUC-KR";

    // Thai
    if (!locale_match ("th", locale))
        return "TIS-620";

    // Vietnamese: the ISO 639-1 language code is "vi"
    // (FIXME: more infos needed)
    if (!locale_match ("vi", locale))
        /* VISCII is probably a bad idea as it is not extended ASCII */
        /* glibc has TCVN5712-1 */
        return "CP1258";

    /* Kazakh (FIXME: more infos needed) */
    if (!locale_match ("kk", locale))
        return "PT154";

    // Chinese. The politically incompatible character sets.
    if (!locale_match ("zh", locale))
    {
        /* Step over the language code and its separator ("zh_" or "zh-")
         * so that the two-letter territory code is what gets matched
         * below. The length check guarantees two readable bytes after
         * the separator. */
        if ((strlen (locale) >= 5)
         && ((locale[2] == '_') || (locale[2] == '-')))
            locale += 3;

        // Hong Kong
        if (!locale_match ("HK", locale))
            return "BIG5-HKSCS"; /* FIXME: use something else? */

        // Taiwan island
        if (!locale_match ("TW", locale))
            return "BIG5";

        // People's Republic of China and Singapore
        /*
         * GB18030 can represent any Unicode code point
         * (like UTF-8), while remaining compatible with GBK
         * FIXME: is it compatible with GB2312? if not, should we
         * use GB2312 instead?
         */
        return "GB18030";
    }

    return "ASCII";
}
|---|
| 514 | | |
|---|
/**
 * GetFallbackEncoding() suggests an encoding to be used for non UTF-8
 * text files according to the system's locale settings. It is only a best
 * guess.
 *
 * On non-Windows builds the locale name is taken from setlocale() when
 * available, otherwise from the LC_ALL, LC_CTYPE and LANG environment
 * variables (first non-empty one wins), and mapped to an encoding by
 * FindFallbackEncoding(). On Windows the active ANSI code page is
 * formatted as "CP<number>".
 *
 * @return a pointer to a character set name held in static storage;
 *         the caller must neither modify nor free it.
 */
const char *GetFallbackEncoding( void )
{
#ifndef WIN32
    const char *psz_lang = NULL;

    /* Some systems (like Darwin, SunOS 4 or DJGPP) have only the C locale.
     * Therefore we don't use setlocale here; it would return "C". */
    /* NOTE(review): with LC_ALL, setlocale() may return a composite name
     * when categories differ; LC_CTYPE might be the better query - confirm. */
# if defined (HAVE_SETLOCALE) && !defined ( __APPLE__)
    psz_lang = setlocale( LC_ALL, NULL );
# endif
    if( psz_lang == NULL || psz_lang[0] == '\0' )
    {
        psz_lang = getenv( "LC_ALL" );
        /* Test the first character, not the pointer: an LC_ALL that is
         * set but empty must fall through to LC_CTYPE and LANG. */
        if( psz_lang == NULL || psz_lang[0] == '\0' )
        {
            psz_lang = getenv( "LC_CTYPE" );
            if( psz_lang == NULL || psz_lang[0] == '\0')
                psz_lang = getenv( "LANG" );
        }
    }

    /* FindFallbackEncoding() copes with a NULL or too short locale name. */
    return FindFallbackEncoding( psz_lang );
#else
    /*
     * This should be thread-safe given GetACP() should always return
     * the same result.
     */
    /* "CP" + up to 10 decimal digits of an unsigned value + terminator */
    static char buf[2 + 10 + 1] = "";

    /* Format the code page name only once; later calls reuse the buffer. */
    if( buf[0] == 0 )
        snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
    return buf;
#endif
}
|---|
| 554 | | |
|---|