bug-libunistring
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug-libunistring] Fix libunistring in MS-Windows locales


From: Eli Zaretskii
Subject: [bug-libunistring] Fix libunistring in MS-Windows locales
Date: Wed, 02 Jul 2014 19:53:56 +0300

Libunistring on MS-Windows is currently less useful than it could be.
In a nutshell, it supports only the default system locale; setting the
locale to anything else using 'setlocale' will cause many libunistring
functions to fail.

The first problem is that libunistring needs to know the codeset of the
locale, to use it when it converts strings to and from Unicode using
iconv.  However, on Windows the function locale_charset always returns
the default ANSI codepage, using the GetACP API.  That API always
returns the same codepage regardless of locale changes.  The result is
that when libunistring is passed a string in any encoding incompatible
with the default system codepage, any calls to libiconv will fail with
EILSEQ.  And since most libunistring functions call libiconv, they all
fail.

The second problem that gets in the way is the one in gl_locale_name.
On MS-Windows, this function returns only the default locale (by
calling GetThreadLocale, which is again oblivious to any locale
changes made by 'setlocale').  So again, only the default locale can
be supported correctly.  For example, functions that need to apply
language-specific rules, such as those for Turkish, will work incorrectly
because the language they get from gl_locale_name is not Turkish, even
though 'setlocale' was previously called to set the Turkish locale.

To fix these problems, I propose the following changes.  They have
been extensively tested using Guile's test suite, in particular the
i18n.test there.

I hope these changes will be accepted, because as I said, without them
libunistring is much less useful on MS-Windows than it could be.

Thanks.

--- lib/localcharset.c~0        2010-01-01 13:02:02 +0200
+++ lib/localcharset.c  2014-06-12 18:23:39 +0300
@@ -35,6 +35,7 @@
 
 #if defined _WIN32 || defined __WIN32__
 # define WIN32_NATIVE
+# include <locale.h>
 #endif
 
 #if defined __EMX__
@@ -459,14 +460,34 @@ locale_charset (void)
 
   static char buf[2 + 10 + 1];
 
-  /* Woe32 has a function returning the locale's codepage as a number:
-     GetACP().
-     When the output goes to a console window, it needs to be provided in
-     GetOEMCP() encoding if the console is using a raster font, or in
-     GetConsoleOutputCP() encoding if it is using a TrueType font.
-     But in GUI programs and for output sent to files and pipes, GetACP()
-     encoding is the best bet.  */
-  sprintf (buf, "CP%u", GetACP ());
+  /* The Windows API has a function returning the locale's codepage as
+     a number, but the value doesn't change according to what the
+     'setlocale' call specified.  So we use it as a last resort, in
+     case the string returned by 'setlocale' doesn't specify the
+     codepage.  */
+  char *current_locale = setlocale (LC_ALL, NULL);
+  char *pdot;
+
+  /* If they set different locales for different categories,
+     'setlocale' will return a semi-colon separated list of locale
+     values.  To make sure we use the correct one, we choose LC_CTYPE.  */
+  if (strchr (current_locale, ';'))
+    current_locale = setlocale (LC_CTYPE, NULL);
+
+  pdot = strrchr (current_locale, '.');
+  if (pdot)
+    sprintf (buf, "CP%s", pdot + 1);
+  else
+    {
+      /* Woe32 has a function returning the system locale's default
+        codepage as a number: GetACP().  When the output goes to a
+        console window, it needs to be provided in GetOEMCP()
+        encoding if the console is using a raster font, or in
+        GetConsoleOutputCP() encoding if it is using a TrueType font.
+        But in GUI programs and for output sent to files and pipes,
+        GetACP() encoding is the best bet.  */
+      sprintf (buf, "CP%u", GetACP ());
+    }
   codeset = buf;
 
 #elif defined OS2


--- lib/localename.c~0  2010-04-03 12:43:57 +0300
+++ lib/localename.c    2014-06-15 19:03:07 +0300
@@ -62,6 +62,7 @@
 #if defined WIN32_NATIVE || defined __CYGWIN__ /* WIN32 or Cygwin */
 # define WIN32_LEAN_AND_MEAN
 # include <windows.h>
+# include <winnls.h>
 /* List of language codes, sorted by value:
    0x01 LANG_ARABIC
    0x02 LANG_BULGARIAN
@@ -1126,6 +1127,9 @@
 # ifndef LOCALE_SNAME
 # define LOCALE_SNAME 0x5c
 # endif
+# ifndef LOCALE_NAME_MAX_LENGTH
+# define LOCALE_NAME_MAX_LENGTH 85
+# endif
 #endif
 
 
@@ -2504,6 +2508,61 @@ gl_locale_name_from_win32_LCID (LCID lci
   return gl_locale_name_from_win32_LANGID (langid);
 }
 
+#ifdef WIN32_NATIVE
+
+/* Search state shared by get_lcid and its callback enum_locales_fn.  */
+static LCID found_lcid;
+static char lname[LC_MAX * (LOCALE_NAME_MAX_LENGTH + 1) + 1];
+
+static BOOL CALLBACK
+enum_locales_fn (LPSTR locale_num_str)
+{
+  char locval[2 * LOCALE_NAME_MAX_LENGTH + 1 + 1];
+  LCID try_lcid = strtoul (locale_num_str, NULL, 16);
+  /* LOCALE_NUM_STR is a hex LCID; build its "Language_Country" name.  */
+  if (GetLocaleInfoA (try_lcid, LOCALE_SENGLANGUAGE,
+                      locval, LOCALE_NAME_MAX_LENGTH))
+    {
+      strcat (locval, "_");
+      if (GetLocaleInfoA (try_lcid, LOCALE_SENGCOUNTRY,
+                          locval + strlen (locval), LOCALE_NAME_MAX_LENGTH))
+        {
+          size_t locval_len = strlen (locval);
+          /* LNAME matches if it is LOCVAL, optionally followed by ".xxx".  */
+          if (strncmp (locval, lname, locval_len) == 0
+              && (lname[locval_len] == '.'
+                  || lname[locval_len] == '\0'))
+            {
+              found_lcid = try_lcid;
+              return FALSE;     /* Stop enumerating.  */
+            }
+        }
+    }
+  return TRUE;                  /* Keep enumerating.  */
+}
+
+static LCID
+get_lcid (const char *locale_name)
+{
+  /* Return the LCID for LOCALE_NAME, or 0; the last success is cached.  */
+  static LCID last_lcid;
+  static char last_locale[1000];
+
+  if (last_lcid > 0 && strcmp (locale_name, last_locale) == 0)
+    return last_lcid;
+  strncpy (lname, locale_name, sizeof (lname) - 1);
+  lname[sizeof (lname) - 1] = '\0';
+  found_lcid = 0;
+  EnumSystemLocalesA (enum_locales_fn, LCID_SUPPORTED);
+  if (found_lcid > 0 && strlen (locale_name) < sizeof (last_locale))
+    {
+      last_lcid = found_lcid;
+      strcpy (last_locale, locale_name);
+    }
+  return found_lcid;
+}
+
+#endif
 #endif
 
 
@@ -2761,6 +2820,26 @@ gl_locale_name_thread (int category, con
   const char *name = gl_locale_name_thread_unsafe (category, categoryname);
   if (name != NULL)
     return struniq (name);
+#elif defined WIN32_NATIVE
+  if (LC_MIN <= category && category <= LC_MAX)
+    {
+      char *locname = setlocale (category, NULL);
+
+      /* If CATEGORY is LC_ALL, the result might be a semi-colon
+        separated list of locales.  We need only one, so we take the
+        one corresponding to LC_CTYPE, as the most important for
+        character translations.  */
+      if (strchr (locname, ';'))
+       locname = setlocale (LC_CTYPE, NULL);
+
+    /* Convert locale name to LCID.  We don't want to use
+       LocaleNameToLCID because (a) it is only available since Vista,
+       and (b) it doesn't accept locale names returned by 'setlocale'.  */
+    LCID lcid = get_lcid (locname);
+
+    if (lcid > 0)
+      return gl_locale_name_from_win32_LCID (lcid);
+  }
 #endif
   return NULL;
 }



reply via email to

[Prev in Thread] Current Thread [Next in Thread]