| 61 | | |
|---|
/*
 * One entry of a charset alias table.
 *
 * psz_alias is the platform-specific spelling of a charset name and psz_name
 * is the name it is translated to. Both members are only ever pointed at
 * string literals, hence the const qualification. The alias tables in this
 * file are terminated by an entry whose psz_alias is NULL.
 */
typedef struct VLCCharsetAlias
{
    const char *psz_alias;
    const char *psz_name;
} VLCCharsetAlias;
|---|
| 66 | | |
|---|
/*
 * libcharset loads all of its aliases from an external text file, which is
 * a strange and slow solution; we use array(s) compiled into the source
 * instead. With a "good" libc -- for example on Linux -- this is not needed.
 *
 * Please put only exotic aliases into this function. The libc 'iconv' knows
 * a lot of basic aliases (check that first with iconv -l).
 *
 */
|---|
#if (defined OS2 || !defined(HAVE_LANGINFO_CODESET)) && !defined WIN32
/*
 * Guesses a legacy character set from a locale / language string.
 *
 * l must be a non-NULL, NUL-terminated string. The table below is scanned in
 * order and the charset of the first code found anywhere inside l is
 * returned, so more specific codes ("zh_TW") must stay ahead of more general
 * ones ("zh"). If nothing matches, "ISO-8859-1" is returned. The result is
 * always a string literal and must not be freed.
 *
 * NOTE(review): matching is by substring (strstr), not by language prefix,
 * so unrelated text can hit an entry -- e.g. "de_DE@euro" contains "ro" and
 * yields "ISO-8859-2". This historical behaviour is kept as is.
 */
static const char* vlc_encoding_from_language( const char *l )
{
    /* language (and perhaps country) code -> charset, in priority order */
    static const struct
    {
        const char *psz_code;
        const char *psz_charset;
    } map[] =
    {
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" },   /* no MIME charset */
        { "zh",    "GB2312" },
        { "th",    "TIS-620" },
        { "ja",    "EUC-JP" },
        { "ko",    "EUC-KR" },
        { "ru",    "KOI8-R" },
        { "uk",    "KOI8-U" },
        { "pl",    "ISO-8859-2" },
        { "hr",    "ISO-8859-2" },
        { "hu",    "ISO-8859-2" },
        { "cs",    "ISO-8859-2" },
        { "sk",    "ISO-8859-2" },
        { "sl",    "ISO-8859-2" },
        { "eo",    "ISO-8859-3" },
        { "mt",    "ISO-8859-3" },
        { "lt",    "ISO-8859-4" },
        { "la",    "ISO-8859-4" },
        /* "uk" used to be listed here too, but it is already mapped to
         * KOI8-U above, so that test could never succeed. */
        { "bg",    "ISO-8859-5" },
        { "be",    "ISO-8859-5" },
        { "mk",    "ISO-8859-5" },
        { "ar",    "ISO-8859-6" },
        { "el",    "ISO-8859-7" },
        { "he",    "ISO-8859-8" },
        { "iw",    "ISO-8859-8" },
        { "tr",    "ISO-8859-9" },
        /* A second "th" -> "ISO-8859-11" test used to follow; it was
         * unreachable because "th" already returns TIS-620 above. */
        { "lv",    "ISO-8859-13" },
        { "cy",    "ISO-8859-14" },
        { "et",    "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro",    "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",    "UTF-8" },
        { "vi",    "UTF-8" },
    };

    for( size_t i = 0; i < sizeof( map ) / sizeof( map[0] ); i++ )
    {
        if( strstr( l, map[i].psz_code ) != NULL )
            return map[i].psz_charset;
    }

    /* We don't know. Fall back to the default. */
    return "ISO-8859-1";
}
#endif
|---|
| 109 | | |
|---|
| 110 | | static const char* vlc_charset_aliases( const char *psz_name ) |
|---|
| 111 | | { |
|---|
| 112 | | VLCCharsetAlias *a; |
|---|
| 113 | | |
|---|
| 114 | | #if defined WIN32 |
|---|
| 115 | | VLCCharsetAlias aliases[] = |
|---|
| 116 | | { |
|---|
| 117 | | { "CP936", "GBK" }, |
|---|
| 118 | | { "CP1361", "JOHAB" }, |
|---|
| 119 | | { "CP20127", "ASCII" }, |
|---|
| 120 | | { "CP20866", "KOI8-R" }, |
|---|
| 121 | | { "CP21866", "KOI8-RU" }, |
|---|
| 122 | | { "CP28591", "ISO-8859-1" }, |
|---|
| 123 | | { "CP28592", "ISO-8859-2" }, |
|---|
| 124 | | { "CP28593", "ISO-8859-3" }, |
|---|
| 125 | | { "CP28594", "ISO-8859-4" }, |
|---|
| 126 | | { "CP28595", "ISO-8859-5" }, |
|---|
| 127 | | { "CP28596", "ISO-8859-6" }, |
|---|
| 128 | | { "CP28597", "ISO-8859-7" }, |
|---|
| 129 | | { "CP28598", "ISO-8859-8" }, |
|---|
| 130 | | { "CP28599", "ISO-8859-9" }, |
|---|
| 131 | | { "CP28605", "ISO-8859-15" }, |
|---|
| 132 | | { NULL, NULL } |
|---|
| 133 | | }; |
|---|
| 134 | | #elif defined (SYS_AIX) |
|---|
| 135 | | VLCCharsetAlias aliases[] = |
|---|
| 136 | | { |
|---|
| 137 | | { "IBM-850", "CP850" }, |
|---|
| 138 | | { "IBM-856", "CP856" }, |
|---|
| 139 | | { "IBM-921", "ISO-8859-13" }, |
|---|
| 140 | | { "IBM-922", "CP922" }, |
|---|
| 141 | | { "IBM-932", "CP932" }, |
|---|
| 142 | | { "IBM-943", "CP943" }, |
|---|
| 143 | | { "IBM-1046", "CP1046" }, |
|---|
| 144 | | { "IBM-1124", "CP1124" }, |
|---|
| 145 | | { "IBM-1129", "CP1129" }, |
|---|
| 146 | | { "IBM-1252", "CP1252" }, |
|---|
| 147 | | { "IBM-EUCCN", "GB2312" }, |
|---|
| 148 | | { "IBM-EUCJP", "EUC-JP" }, |
|---|
| 149 | | { "IBM-EUCKR", "EUC-KR" }, |
|---|
| 150 | | { "IBM-EUCTW", "EUC-TW" }, |
|---|
| 151 | | { NULL, NULL } |
|---|
| 152 | | }; |
|---|
| 153 | | #elif defined (SYS_HPUX) |
|---|
| 154 | | VLCCharsetAlias aliases[] = |
|---|
| 155 | | { |
|---|
| 156 | | { "ROMAN8", "HP-ROMAN8" }, |
|---|
| 157 | | { "ARABIC8", "HP-ARABIC8" }, |
|---|
| 158 | | { "GREEK8", "HP-GREEK8" }, |
|---|
| 159 | | { "HEBREW8", "HP-HEBREW8" }, |
|---|
| 160 | | { "TURKISH8", "HP-TURKISH8" }, |
|---|
| 161 | | { "KANA8", "HP-KANA8" }, |
|---|
| 162 | | { "HP15CN", "GB2312" }, |
|---|
| 163 | | { NULL, NULL } |
|---|
| 164 | | }; |
|---|
| 165 | | #elif defined (SYS_IRIX) |
|---|
| 166 | | VLCCharsetAlias aliases[] = |
|---|
| 167 | | { |
|---|
| 168 | | { "EUCCN", "GB2312" }, |
|---|
| 169 | | { NULL, NULL } |
|---|
| 170 | | }; |
|---|
| 171 | | #elif defined (SYS_OSF) |
|---|
| 172 | | VLCCharsetAlias aliases[] = |
|---|
| 173 | | { |
|---|
| 174 | | { "KSC5601", "CP949" }, |
|---|
| 175 | | { "SDECKANJI", "EUC-JP" }, |
|---|
| 176 | | { "TACTIS", "TIS-620" }, |
|---|
| 177 | | { NULL, NULL } |
|---|
| 178 | | }; |
|---|
| 179 | | #elif defined (SYS_SOLARIS) |
|---|
| 180 | | VLCCharsetAlias aliases[] = |
|---|
| 181 | | { |
|---|
| 182 | | { "646", "ASCII" }, |
|---|
| 183 | | { "CNS11643", "EUC-TW" }, |
|---|
| 184 | | { "5601", "EUC-KR" }, |
|---|
| 185 | | { "JOHAP92", "JOHAB" }, |
|---|
| 186 | | { "PCK", "SHIFT_JIS" }, |
|---|
| 187 | | { "2533", "TIS-620" }, |
|---|
| 188 | | { NULL, NULL } |
|---|
| 189 | | }; |
|---|
| 190 | | #elif defined (SYS_BSD) |
|---|
| 191 | | VLCCharsetAlias aliases[] = |
|---|
| 192 | | { |
|---|
| 193 | | { "646", " ASCII" }, |
|---|
| 194 | | { "EUCCN", "GB2312" }, |
|---|
| 195 | | { NULL, NULL } |
|---|
| 196 | | }; |
|---|
| 197 | | #else |
|---|
| 198 | | VLCCharsetAlias aliases[] = {{NULL, NULL}}; |
|---|
| 199 | | #endif |
|---|
| 200 | | |
|---|
| 201 | | for (a = aliases; a->psz_alias; a++) |
|---|
| 202 | | if (strcasecmp (a->psz_alias, psz_name) == 0) |
|---|
| 203 | | return a->psz_name; |
|---|
| 204 | | |
|---|
| 205 | | /* we return original name beacuse iconv() probably will know |
|---|
| 206 | | * something better about name if we don't know it :-) */ |
|---|
| 207 | | return psz_name; |
|---|
| 208 | | } |
|---|
| 209 | | |
|---|
/*
 * Returns charset from "language_COUNTRY.charset@modifier" string.
 *
 * psz_locale  locale name to analyse; may be NULL, in which case an empty
 *             string is stored so that the caller can apply its own fallback.
 * psz_charset output buffer of at least 2 + 10 + 1 bytes (the size the
 *             callers in this file allocate); receives a NUL-terminated
 *             charset name.
 *
 * If the locale carries a non-empty ".charset" part that fits the output
 * buffer, that part (without any "@modifier" trailer) is copied. Otherwise
 * the charset is guessed from the language via vlc_encoding_from_language().
 */
#if (defined OS2 || !defined(HAVE_LANGINFO_CODESET)) && !defined WIN32
static void vlc_encoding_from_locale( const char *psz_locale, char *psz_charset )
{
    /* Size of the output buffer the callers provide. */
    enum { CHARSET_BUF_SIZE = 2 + 10 + 1 };
    const char *psz_dot;

    if( psz_locale == NULL )
    {
        psz_charset[0] = '\0';
        return;
    }

    psz_dot = strchr( psz_locale, '.' );
    if( psz_dot != NULL )
    {
        const char *psz_modifier;
        size_t i_len;

        psz_dot++;

        /* Look for the possible @... trailer and leave it out, if any. */
        psz_modifier = strchr( psz_dot, '@' );
        if( psz_modifier != NULL )
            i_len = (size_t)( psz_modifier - psz_dot );
        else
            i_len = strlen( psz_dot );

        /* Only accept a charset that is non-empty and fits the buffer; the
         * unbounded strcpy() that used to handle the no-modifier case could
         * overflow it with a long locale name from the environment. */
        if( 0 < i_len && i_len < CHARSET_BUF_SIZE )
        {
            memcpy( psz_charset, psz_dot, i_len );
            psz_charset[i_len] = '\0';
            return;
        }
    }

    /* Try language mapping. Every name vlc_encoding_from_language() can
     * return is at most 11 characters long, so it fits the buffer. */
    strcpy( psz_charset, vlc_encoding_from_language( psz_locale ) );
}
#endif
|---|
| 242 | | |
|---|
/*
 * Determines the character set of the current locale.
 *
 * psz_charset  if not NULL, *psz_charset receives a strdup()'ed copy of the
 *              charset name. The caller owns that copy; it is NULL if
 *              strdup() failed.
 *
 * Returns true if the charset is UTF-8 (spelled "UTF8" or "UTF-8", in any
 * case), false otherwise.
 *
 * The name is obtained in a platform-specific way, passed through
 * vlc_charset_aliases(), and never left empty: as a last resort the CHARSET
 * environment variable and then "ISO-8859-1" are used.
 */
bool vlc_current_charset( char **psz_charset )
{
    const char *psz_codeset;

#if !(defined WIN32 || defined OS2 || defined __APPLE__)

# ifdef HAVE_LANGINFO_CODESET
    /* Most systems support nl_langinfo( CODESET ) nowadays. */
    psz_codeset = nl_langinfo( CODESET );
    if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
        psz_codeset = "ASCII";
# else
    /* On old systems which lack it, use setlocale or getenv. */
    const char *psz_locale = NULL;
    char buf[2 + 10 + 1];

    /* But most old systems don't have a complete set of locales. Some
     * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
     * use setlocale here; it would return "C" when it doesn't support the
     * locale name the user has set. Darwin's setlocale is broken. */
#  if defined (HAVE_SETLOCALE) && !defined (__APPLE__)
    psz_locale = setlocale( LC_ALL, NULL );
#  endif
    if( psz_locale == NULL || psz_locale[0] == '\0' )
    {
        psz_locale = getenv( "LC_ALL" );
        if( psz_locale == NULL || psz_locale[0] == '\0' )
        {
            psz_locale = getenv( "LC_CTYPE" );
            if( psz_locale == NULL || psz_locale[0] == '\0' )
                psz_locale = getenv( "LANG" );
        }
    }

    /* On some old systems, one used to set locale = "iso8859_1". On others,
     * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
    if( psz_locale != NULL && psz_locale[0] != '\0' )
    {
        vlc_encoding_from_locale( (char *)psz_locale, buf );
        psz_codeset = buf;
    }
    else
        /* No locale information at all (this used to pass NULL on and
         * crash); let the common fallback below decide. */
        psz_codeset = "";
# endif /* HAVE_LANGINFO_CODESET */

#elif defined __APPLE__

    /* Darwin is always using UTF-8 internally. */
    psz_codeset = "UTF-8";

#elif defined WIN32

    char buf[2 + 10 + 1];

    /* Woe32 has a function returning the locale's codepage as a number. */
    snprintf( buf, sizeof( buf ), "CP%u", GetACP() );
    psz_codeset = buf;

#elif defined OS2

    const char *psz_locale;
    char buf[2 + 10 + 1];
    ULONG cp[3];
    ULONG cplen;

    /* Allow user to override the codeset, as set in the operating system,
     * with standard language environment variables. */
    psz_locale = getenv( "LC_ALL" );
    if( psz_locale == NULL || psz_locale[0] == '\0' )
    {
        psz_locale = getenv( "LC_CTYPE" );
        if( psz_locale == NULL || psz_locale[0] == '\0' )
            psz_locale = getenv( "LANG" );
    }
    if( psz_locale != NULL && psz_locale[0] != '\0' )
    {
        vlc_encoding_from_locale( (char *)psz_locale, buf );
        psz_codeset = buf;
    }
    else
    {
        /* OS/2 has a function returning the locale's codepage as a number. */
        if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
            psz_codeset = "";
        else
        {
            /* ULONG is wider than what %u expects; print it as unsigned long. */
            snprintf( buf, sizeof( buf ), "CP%lu", (unsigned long)cp[0] );
            psz_codeset = buf;
        }
    }
#endif
    if( psz_codeset == NULL )
        /* The canonical name cannot be determined. */
        psz_codeset = "";
    else
        psz_codeset = vlc_charset_aliases( psz_codeset );

    /* Don't return an empty string. GNU libc and GNU libiconv interpret
     * the empty string as denoting "the locale's character encoding",
     * thus GNU libiconv would call this function a second time. */
    if( psz_codeset[0] == '\0' )
    {
        /* Last possibility is the 'CHARSET' environment variable; an empty
         * value is treated like an unset one. */
        psz_codeset = getenv( "CHARSET" );
        if( psz_codeset == NULL || psz_codeset[0] == '\0' )
            psz_codeset = "ISO-8859-1";
    }

    if( psz_charset )
        *psz_charset = strdup( psz_codeset );

    return !strcasecmp( psz_codeset, "UTF8" )
        || !strcasecmp( psz_codeset, "UTF-8" );
}
|---|
| 351 | | |
|---|