Re: From wchar_t to char32_t

bug-gnulib
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: From wchar_t to char32_t

From:	Bruno Haible
Subject:	Re: From wchar_t to char32_t
Date:	Sat, 24 Jun 2023 18:47:23 +0200
I wrote:
> In Gnulib, the following areas will need migration:
> 
> * lib/mbchar.h
>   lib/mbiter.h
>   lib/mbuiter.h
>   Draft patch attached.

It seems to work fine, so I pushed this:


2023-06-24  Bruno Haible  <bruno@clisp.org>

        mbchar, mbiter, mbuiter: Overcome wchar_t limitations.
        * lib/mbchar.h: Include <uchar.h> instead of <wchar.h>, <wctype.h>.
        (struct mbchar): Use char32_t instead of wchar_t.
        (mb_casecmp, mb_caseequal): Use c32tolower instead of towlower.
        (mb_isalnum): Use c32isalnum instead of iswalnum.
        (mb_isalpha): Use c32isalpha instead of iswalpha.
        (mb_isblank): Use c32isblank instead of iswblank.
        (mb_iscntrl): Use c32iscntrl instead of iswcntrl.
        (mb_isdigit): Use c32isdigit instead of iswdigit.
        (mb_isgraph): Use c32isgraph instead of iswgraph.
        (mb_islower): Use c32islower instead of iswlower.
        (mb_isprint): Use c32isprint instead of iswprint.
        (mb_ispunct): Use c32ispunct instead of iswpunct.
        (mb_isspace): Use c32isspace instead of iswspace.
        (mb_isupper): Use c32isupper instead of iswupper.
        (mb_isxdigit): Use c32isxdigit instead of iswxdigit.
        (mb_width_aux): Use c32width, c32iscntrl instead of wcwidth, iswcntrl.
        * lib/mbiter.h: Include <uchar.h> instead of <wchar.h>.
        (mbiter_multi_next): Use mbrtoc32 instead of mbrtowc.
        * lib/mbuiter.h: Include <uchar.h> instead of <wchar.h>.
        (mbuiter_multi_next): Use mbrtoc32 instead of mbrtowc.
        * lib/mbfile.h (mbfile_multi_getc): Use mbrtoc32 instead of mbrtowc.
        * lib/mbscasestr.c (knuth_morris_pratt_multibyte, mbscasestr): Use
        c32tolower instead of towlower.
        * lib/exclude.c (string_hasher_ci): Use char32_t, c32tolower instead of
        wchar_t, towlower.
        * modules/mbchar (Depends-on): Add uchar, c32isalnum, c32isalpha,
        c32isblank, c32iscntrl, c32isdigit, c32isgraph, c32islower, c32isprint,
        c32ispunct, c32isspace, c32isupper, c32isxdigit, c32tolower, c32width.
        Remove wchar, wctype-h, iswblank, iswdigit, iswxdigit, wcwidth.
        * modules/mbiter (Depends-on): Add mbrtoc32, uchar. Remove mbrtowc,
        wchar.
        * modules/mbuiter (Depends-on): Likewise.
        * modules/mbscasestr (Depends-on): Add c32tolower.
        * modules/exclude (Depends-on): Add c32tolower.
        * doc/strings.texi (C strings): Fix typo.

diff --git a/doc/strings.texi b/doc/strings.texi
index 612d4fb083..7a074e4040 100644
--- a/doc/strings.texi
+++ b/doc/strings.texi
@@ -45,7 +45,7 @@
 functions, standardized by ISO C and POSIX, that assume this
 representation of strings.
 
-An @emph{character encoding}, or @emph{encoding} for short, describes
+A @emph{character encoding}, or @emph{encoding} for short, describes
 how the elements of a character set are represented as a sequence of
 bytes.  For example, in the @code{ASCII} encoding, the UNDERSCORE
 character is represented by a single byte, with value 0x5F.  As another
diff --git a/lib/exclude.c b/lib/exclude.c
index 7bd0ec8c71..af204cd300 100644
--- a/lib/exclude.c
+++ b/lib/exclude.c
@@ -209,10 +209,10 @@ string_hasher_ci (void const *data, size_t n_buckets)
   for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter))
     {
       mbchar_t m = mbui_cur (iter);
-      wchar_t wc;
+      char32_t wc;
 
       if (m.wc_valid)
-        wc = towlower (m.wc);
+        wc = c32tolower (m.wc);
       else
         wc = *m.ptr;
 
diff --git a/lib/mbchar.h b/lib/mbchar.h
index a2ff1d8b21..c183772cc6 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -17,10 +17,10 @@
 /* Written by Bruno Haible <bruno@clisp.org>.  */
 
 /* A multibyte character is a short subsequence of a char* string,
-   representing a single wide character.
+   representing a single 32-bit wide character.
 
-   We use multibyte characters instead of wide characters because of
-   the following goals:
+   We use multibyte characters instead of 32-bit wide characters because
+   of the following goals:
    1) correct multibyte handling, i.e. operate according to the LC_CTYPE
       locale,
    2) ease of maintenance, i.e. the maintainer needs not know all details
@@ -28,8 +28,7 @@
    3) don't fail grossly if the input is not in the encoding set by the
       locale, because often different encodings are in use in the same
       countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
-   4) fast in the case of ASCII characters,
-   5) portability, i.e. don't make unportable assumptions about wchar_t.
+   4) fast in the case of ASCII characters.
 
    Multibyte characters are only accessed through the mb* macros.
 
@@ -150,8 +149,7 @@
 #endif
 
 #include <string.h>
-#include <wchar.h>
-#include <wctype.h>
+#include <uchar.h>
 
 _GL_INLINE_HEADER_BEGIN
 #ifndef MBCHAR_INLINE
@@ -164,8 +162,8 @@ struct mbchar
 {
   const char *ptr;      /* pointer to current character */
   size_t bytes;         /* number of bytes of current character, > 0 */
-  bool wc_valid;        /* true if wc is a valid wide character */
-  wchar_t wc;           /* if wc_valid: the current character */
+  bool wc_valid;        /* true if wc is a valid 32-bit wide character */
+  char32_t wc;          /* if wc_valid: the current character */
   char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */
 };
 
@@ -184,7 +182,7 @@ typedef struct mbchar mbchar_t;
 #define mb_cmp(mbc1, mbc2) \
   ((mbc1).wc_valid                                                      \
    ? ((mbc2).wc_valid                                                   \
-      ? (int) (mbc1).wc - (int) (mbc2).wc                               \
+      ? _GL_CMP ((mbc1).wc, (mbc2).wc)                                  \
       : -1)                                                             \
    : ((mbc2).wc_valid                                                   \
       ? 1                                                               \
@@ -196,7 +194,7 @@ typedef struct mbchar mbchar_t;
 #define mb_casecmp(mbc1, mbc2) \
   ((mbc1).wc_valid                                                      \
    ? ((mbc2).wc_valid                                                   \
-      ? (int) towlower ((mbc1).wc) - (int) towlower ((mbc2).wc)         \
+      ? _GL_CMP (c32tolower ((mbc1).wc), c32tolower ((mbc2).wc))        \
       : -1)                                                             \
    : ((mbc2).wc_valid                                                   \
       ? 1                                                               \
@@ -212,25 +210,25 @@ typedef struct mbchar mbchar_t;
      && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
 #define mb_caseequal(mbc1, mbc2) \
   ((mbc1).wc_valid && (mbc2).wc_valid                                   \
-   ? towlower ((mbc1).wc) == towlower ((mbc2).wc)                       \
+   ? c32tolower ((mbc1).wc) == c32tolower ((mbc2).wc)                   \
    : (mbc1).bytes == (mbc2).bytes                                       \
      && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
 
 /* <ctype.h>, <wctype.h> classification.  */
 #define mb_isascii(mbc) \
   ((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127)
-#define mb_isalnum(mbc) ((mbc).wc_valid && iswalnum ((mbc).wc))
-#define mb_isalpha(mbc) ((mbc).wc_valid && iswalpha ((mbc).wc))
-#define mb_isblank(mbc) ((mbc).wc_valid && iswblank ((mbc).wc))
-#define mb_iscntrl(mbc) ((mbc).wc_valid && iswcntrl ((mbc).wc))
-#define mb_isdigit(mbc) ((mbc).wc_valid && iswdigit ((mbc).wc))
-#define mb_isgraph(mbc) ((mbc).wc_valid && iswgraph ((mbc).wc))
-#define mb_islower(mbc) ((mbc).wc_valid && iswlower ((mbc).wc))
-#define mb_isprint(mbc) ((mbc).wc_valid && iswprint ((mbc).wc))
-#define mb_ispunct(mbc) ((mbc).wc_valid && iswpunct ((mbc).wc))
-#define mb_isspace(mbc) ((mbc).wc_valid && iswspace ((mbc).wc))
-#define mb_isupper(mbc) ((mbc).wc_valid && iswupper ((mbc).wc))
-#define mb_isxdigit(mbc) ((mbc).wc_valid && iswxdigit ((mbc).wc))
+#define mb_isalnum(mbc) ((mbc).wc_valid && c32isalnum ((mbc).wc))
+#define mb_isalpha(mbc) ((mbc).wc_valid && c32isalpha ((mbc).wc))
+#define mb_isblank(mbc) ((mbc).wc_valid && c32isblank ((mbc).wc))
+#define mb_iscntrl(mbc) ((mbc).wc_valid && c32iscntrl ((mbc).wc))
+#define mb_isdigit(mbc) ((mbc).wc_valid && c32isdigit ((mbc).wc))
+#define mb_isgraph(mbc) ((mbc).wc_valid && c32isgraph ((mbc).wc))
+#define mb_islower(mbc) ((mbc).wc_valid && c32islower ((mbc).wc))
+#define mb_isprint(mbc) ((mbc).wc_valid && c32isprint ((mbc).wc))
+#define mb_ispunct(mbc) ((mbc).wc_valid && c32ispunct ((mbc).wc))
+#define mb_isspace(mbc) ((mbc).wc_valid && c32isspace ((mbc).wc))
+#define mb_isupper(mbc) ((mbc).wc_valid && c32isupper ((mbc).wc))
+#define mb_isxdigit(mbc) ((mbc).wc_valid && c32isxdigit ((mbc).wc))
 
 /* Extra <wchar.h> function.  */
 
@@ -238,12 +236,12 @@ typedef struct mbchar mbchar_t;
 #define MB_UNPRINTABLE_WIDTH 1
 
 MBCHAR_INLINE int
-mb_width_aux (wint_t wc)
+mb_width_aux (char32_t wc)
 {
-  int w = wcwidth (wc);
+  int w = c32width (wc);
   /* For unprintable characters, arbitrarily return 0 for control characters
      and MB_UNPRINTABLE_WIDTH otherwise.  */
-  return (w >= 0 ? w : iswcntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
+  return (w >= 0 ? w : c32iscntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
 }
 
 #define mb_width(mbc) \
diff --git a/lib/mbfile.h b/lib/mbfile.h
index 3482f394b9..7c6d70fcae 100644
--- a/lib/mbfile.h
+++ b/lib/mbfile.h
@@ -110,7 +110,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
     {
       /* These characters are part of the basic character set.  ISO C 99
          guarantees that their wide character code is identical to their
-         char code.  */
+         char code.  The 32-bit wide character code is the same as well.  */
       mbc->wc = mbc->buf[0] = mbf->buf[0];
       mbc->wc_valid = true;
       mbc->ptr = &mbc->buf[0];
@@ -136,7 +136,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
          behaviour will clobber it.  */
       mbstate_t backup_state = mbf->state;
 
-      bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
+      bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
 
       if (bytes == (size_t) -1)
         {
@@ -178,7 +178,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
         {
           if (bytes == 0)
             {
-              /* A null wide character was encountered.  */
+              /* A null 32-bit wide character was encountered.  */
               bytes = 1;
               assert (mbf->buf[0] == '\0');
               assert (mbc->wc == 0);
diff --git a/lib/mbiter.h b/lib/mbiter.h
index 7b41870b55..93bad990a1 100644
--- a/lib/mbiter.h
+++ b/lib/mbiter.h
@@ -90,7 +90,7 @@
 #include <assert.h>
 #include <stddef.h>
 #include <string.h>
-#include <wchar.h>
+#include <uchar.h>
 
 #include "mbchar.h"
 
@@ -106,11 +106,11 @@ struct mbiter_multi
   mbstate_t state;      /* if in_shift: current shift state */
   bool next_done;       /* true if mbi_avail has already filled the following */
   struct mbchar cur;    /* the current character:
-        const char *cur.ptr             pointer to current character
+        const char *cur.ptr          pointer to current character
         The following are only valid after mbi_avail.
-        size_t cur.bytes                number of bytes of current character
-        bool cur.wc_valid               true if wc is a valid wide character
-        wchar_t cur.wc                  if wc_valid: the current character
+        size_t cur.bytes             number of bytes of current character
+        bool cur.wc_valid            true if wc is a valid 32-bit wide character
+        char32_t cur.wc              if wc_valid: the current character
         */
 };
 
@@ -136,8 +136,8 @@ mbiter_multi_next (struct mbiter_multi *iter)
       assert (mbsinit (&iter->state));
       iter->in_shift = true;
     with_shift:
-      iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
-                                 iter->limit - iter->cur.ptr, &iter->state);
+      iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr,
+                                  iter->limit - iter->cur.ptr, &iter->state);
       if (iter->cur.bytes == (size_t) -1)
         {
           /* An invalid multibyte sequence was encountered.  */
diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c
index d92b847ba7..0753aeb864 100644
--- a/lib/mbscasestr.c
+++ b/lib/mbscasestr.c
@@ -64,7 +64,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
       {
         mb_copy (&needle_mbchars[j], &mbui_cur (iter));
         if (needle_mbchars[j].wc_valid)
-          needle_mbchars[j].wc = towlower (needle_mbchars[j].wc);
+          needle_mbchars[j].wc = c32tolower (needle_mbchars[j].wc);
       }
   }
 
@@ -152,7 +152,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
 
         mb_copy (&c, &mbui_cur (phaystack));
         if (c.wc_valid)
-          c.wc = towlower (c.wc);
+          c.wc = c32tolower (c.wc);
         if (mb_equal (needle_mbchars[j], c))
           {
             j++;
@@ -237,7 +237,7 @@ mbscasestr (const char *haystack, const char *needle)
 
           mb_copy (&b, &mbui_cur (iter_needle));
           if (b.wc_valid)
-            b.wc = towlower (b.wc);
+            b.wc = c32tolower (b.wc);
 
           mbui_init (iter_haystack, haystack);
           for (;; mbui_advance (iter_haystack))
@@ -279,7 +279,7 @@ mbscasestr (const char *haystack, const char *needle)
               comparison_count++;
               mb_copy (&c, &mbui_cur (iter_haystack));
               if (c.wc_valid)
-                c.wc = towlower (c.wc);
+                c.wc = c32tolower (c.wc);
               if (mb_equal (c, b))
                 /* The first character matches.  */
                 {
diff --git a/lib/mbuiter.h b/lib/mbuiter.h
index 7a619f19e1..02e3190f1c 100644
--- a/lib/mbuiter.h
+++ b/lib/mbuiter.h
@@ -98,7 +98,7 @@
 #include <stddef.h>
 #include <stdlib.h>
 #include <string.h>
-#include <wchar.h>
+#include <uchar.h>
 
 #include "mbchar.h"
 #include "strnlen1.h"
@@ -114,11 +114,11 @@ struct mbuiter_multi
   mbstate_t state;      /* if in_shift: current shift state */
   bool next_done;       /* true if mbui_avail has already filled the following */
   struct mbchar cur;    /* the current character:
-        const char *cur.ptr             pointer to current character
+        const char *cur.ptr          pointer to current character
         The following are only valid after mbui_avail.
-        size_t cur.bytes                number of bytes of current character
-        bool cur.wc_valid               true if wc is a valid wide character
-        wchar_t cur.wc                  if wc_valid: the current character
+        size_t cur.bytes             number of bytes of current character
+        bool cur.wc_valid            true if wc is a valid 32-bit wide character
+        char32_t cur.wc              if wc_valid: the current character
         */
 };
 
@@ -144,9 +144,9 @@ mbuiter_multi_next (struct mbuiter_multi *iter)
       assert (mbsinit (&iter->state));
       iter->in_shift = true;
     with_shift:
-      iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
-                                 strnlen1 (iter->cur.ptr, MB_CUR_MAX),
-                                 &iter->state);
+      iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr,
+                                  strnlen1 (iter->cur.ptr, MB_CUR_MAX),
+                                  &iter->state);
       if (iter->cur.bytes == (size_t) -1)
         {
           /* An invalid multibyte sequence was encountered.  */
diff --git a/modules/exclude b/modules/exclude
index 841dd826cd..93bfdaf4cf 100644
--- a/modules/exclude
+++ b/modules/exclude
@@ -7,6 +7,7 @@ lib/exclude.c
 
 Depends-on:
 assert-h
+c32tolower
 filename
 fnmatch
 fopen-gnu
diff --git a/modules/mbchar b/modules/mbchar
index b1fa0fa4ac..51a1c8e1b9 100644
--- a/modules/mbchar
+++ b/modules/mbchar
@@ -10,12 +10,21 @@ Depends-on:
 extensions
 extern-inline
 stdbool
-wchar
-wctype-h
-iswblank
-iswdigit
-iswxdigit
-wcwidth
+uchar
+c32isalnum
+c32isalpha
+c32isblank
+c32iscntrl
+c32isdigit
+c32isgraph
+c32islower
+c32isprint
+c32ispunct
+c32isspace
+c32isupper
+c32isxdigit
+c32tolower
+c32width
 memcmp
 
 configure.ac:
diff --git a/modules/mbiter b/modules/mbiter
index 42305d62cd..082afd42f2 100644
--- a/modules/mbiter
+++ b/modules/mbiter
@@ -10,9 +10,9 @@ m4/mbrtowc.m4
 Depends-on:
 extern-inline
 mbchar
-mbrtowc
+mbrtoc32
 mbsinit
-wchar
+uchar
 stdbool
 
 configure.ac:
diff --git a/modules/mbscasestr b/modules/mbscasestr
index 2892c2fc2b..672cac8960 100644
--- a/modules/mbscasestr
+++ b/modules/mbscasestr
@@ -11,6 +11,7 @@ stdbool
 string
 mbslen
 malloca
+c32tolower
 strnlen
 
 configure.ac:
diff --git a/modules/mbuiter b/modules/mbuiter
index b9e41031d5..63a11ff2f5 100644
--- a/modules/mbuiter
+++ b/modules/mbuiter
@@ -10,9 +10,9 @@ m4/mbrtowc.m4
 Depends-on:
 extern-inline
 mbchar
-mbrtowc
+mbrtoc32
 mbsinit
-wchar
+uchar
 stdbool
 strnlen1
[Prev in Thread]
Current Thread
[Next in Thread]
From wchar_t to char32_t, Bruno Haible, 2023/06/19
- Re: From wchar_t to char32_t, Bruno Haible <=
  - Re: From wchar_t to char32_t, Bruno Haible, 2023/06/25
  - Re: From wchar_t to char32_t, Bruno Haible, 2023/06/30
Prev by Date: Re: INSTALL updates
Next by Date: Re: INSTALL updates
Previous by thread: From wchar_t to char32_t
Next by thread: Re: From wchar_t to char32_t
Index(es):
- Date
- Thread