[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: From wchar_t to char32_t
From: Bruno Haible
Subject: Re: From wchar_t to char32_t
Date: Sat, 24 Jun 2023 18:47:23 +0200
I wrote:
> In Gnulib, the following areas will need migration:
>
> * lib/mbchar.h
> lib/mbiter.h
> lib/mbuiter.h
> Draft patch attached.
It seems to work fine, so I pushed this:
2023-06-24 Bruno Haible <bruno@clisp.org>
mbchar, mbiter, mbuiter: Overcome wchar_t limitations.
* lib/mbchar.h: Include <uchar.h> instead of <wchar.h>, <wctype.h>.
(struct mbchar): Use char32_t instead of wchar_t.
(mb_casecmp, mb_caseequal): Use c32tolower instead of towlower.
(mb_isalnum): Use c32isalnum instead of iswalnum.
(mb_isalpha): Use c32isalpha instead of iswalpha.
(mb_isblank): Use c32isblank instead of iswblank.
(mb_iscntrl): Use c32iscntrl instead of iswcntrl.
(mb_isdigit): Use c32isdigit instead of iswdigit.
(mb_isgraph): Use c32isgraph instead of iswgraph.
(mb_islower): Use c32islower instead of iswlower.
(mb_isprint): Use c32isprint instead of iswprint.
(mb_ispunct): Use c32ispunct instead of iswpunct.
(mb_isspace): Use c32isspace instead of iswspace.
(mb_isupper): Use c32isupper instead of iswupper.
(mb_isxdigit): Use c32isxdigit instead of iswxdigit.
(mb_width_aux): Use c32width, c32iscntrl instead of wcwidth, iswcntrl.
* lib/mbiter.h: Include <uchar.h> instead of <wchar.h>.
(mbiter_multi_next): Use mbrtoc32 instead of mbrtowc.
* lib/mbuiter.h: Include <uchar.h> instead of <wchar.h>.
(mbuiter_multi_next): Use mbrtoc32 instead of mbrtowc.
* lib/mbfile.h (mbfile_multi_getc): Use mbrtoc32 instead of mbrtowc.
* lib/mbscasestr.c (knuth_morris_pratt_multibyte, mbscasestr): Use
c32tolower instead of towlower.
* lib/exclude.c (string_hasher_ci): Use char32_t, c32tolower instead of
wchar_t, towlower.
* modules/mbchar (Depends-on): Add uchar, c32isalnum, c32isalpha,
c32isblank, c32iscntrl, c32isdigit, c32isgraph, c32islower, c32isprint,
c32ispunct, c32isspace, c32isupper, c32isxdigit, c32tolower, c32width.
Remove wchar, wctype-h, iswblank, iswdigit, iswxdigit, wcwidth.
* modules/mbiter (Depends-on): Add mbrtoc32, uchar. Remove mbrtowc,
wchar.
* modules/mbuiter (Depends-on): Likewise.
* modules/mbscasestr (Depends-on): Add c32tolower.
* modules/exclude (Depends-on): Add c32tolower.
* doc/strings.texi (C strings): Fix typo.
diff --git a/doc/strings.texi b/doc/strings.texi
index 612d4fb083..7a074e4040 100644
--- a/doc/strings.texi
+++ b/doc/strings.texi
@@ -45,7 +45,7 @@
functions, standardized by ISO C and POSIX, that assume this
representation of strings.
-An @emph{character encoding}, or @emph{encoding} for short, describes
+A @emph{character encoding}, or @emph{encoding} for short, describes
how the elements of a character set are represented as a sequence of
bytes. For example, in the @code{ASCII} encoding, the UNDERSCORE
character is represented by a single byte, with value 0x5F. As another
diff --git a/lib/exclude.c b/lib/exclude.c
index 7bd0ec8c71..af204cd300 100644
--- a/lib/exclude.c
+++ b/lib/exclude.c
@@ -209,10 +209,10 @@ string_hasher_ci (void const *data, size_t n_buckets)
for (mbui_init (iter, p); mbui_avail (iter); mbui_advance (iter))
{
mbchar_t m = mbui_cur (iter);
- wchar_t wc;
+ char32_t wc;
if (m.wc_valid)
- wc = towlower (m.wc);
+ wc = c32tolower (m.wc);
else
wc = *m.ptr;
diff --git a/lib/mbchar.h b/lib/mbchar.h
index a2ff1d8b21..c183772cc6 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -17,10 +17,10 @@
/* Written by Bruno Haible <bruno@clisp.org>. */
/* A multibyte character is a short subsequence of a char* string,
- representing a single wide character.
+ representing a single 32-bit wide character.
- We use multibyte characters instead of wide characters because of
- the following goals:
+ We use multibyte characters instead of 32-bit wide characters because
+ of the following goals:
1) correct multibyte handling, i.e. operate according to the LC_CTYPE
locale,
2) ease of maintenance, i.e. the maintainer needs not know all details
@@ -28,8 +28,7 @@
3) don't fail grossly if the input is not in the encoding set by the
locale, because often different encodings are in use in the same
countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
- 4) fast in the case of ASCII characters,
- 5) portability, i.e. don't make unportable assumptions about wchar_t.
+ 4) fast in the case of ASCII characters.
Multibyte characters are only accessed through the mb* macros.
@@ -150,8 +149,7 @@
#endif
#include <string.h>
-#include <wchar.h>
-#include <wctype.h>
+#include <uchar.h>
_GL_INLINE_HEADER_BEGIN
#ifndef MBCHAR_INLINE
@@ -164,8 +162,8 @@ struct mbchar
{
const char *ptr; /* pointer to current character */
size_t bytes; /* number of bytes of current character, > 0 */
- bool wc_valid; /* true if wc is a valid wide character */
- wchar_t wc; /* if wc_valid: the current character */
+ bool wc_valid; /* true if wc is a valid 32-bit wide character */
+ char32_t wc; /* if wc_valid: the current character */
char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */
};
@@ -184,7 +182,7 @@ typedef struct mbchar mbchar_t;
#define mb_cmp(mbc1, mbc2) \
((mbc1).wc_valid \
? ((mbc2).wc_valid \
- ? (int) (mbc1).wc - (int) (mbc2).wc \
+ ? _GL_CMP ((mbc1).wc, (mbc2).wc) \
: -1) \
: ((mbc2).wc_valid \
? 1 \
@@ -196,7 +194,7 @@ typedef struct mbchar mbchar_t;
#define mb_casecmp(mbc1, mbc2) \
((mbc1).wc_valid \
? ((mbc2).wc_valid \
- ? (int) towlower ((mbc1).wc) - (int) towlower ((mbc2).wc) \
+ ? _GL_CMP (c32tolower ((mbc1).wc), c32tolower ((mbc2).wc)) \
: -1) \
: ((mbc2).wc_valid \
? 1 \
@@ -212,25 +210,25 @@ typedef struct mbchar mbchar_t;
&& memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
#define mb_caseequal(mbc1, mbc2) \
((mbc1).wc_valid && (mbc2).wc_valid \
- ? towlower ((mbc1).wc) == towlower ((mbc2).wc) \
+ ? c32tolower ((mbc1).wc) == c32tolower ((mbc2).wc) \
: (mbc1).bytes == (mbc2).bytes \
&& memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
/* <ctype.h>, <wctype.h> classification. */
#define mb_isascii(mbc) \
((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127)
-#define mb_isalnum(mbc) ((mbc).wc_valid && iswalnum ((mbc).wc))
-#define mb_isalpha(mbc) ((mbc).wc_valid && iswalpha ((mbc).wc))
-#define mb_isblank(mbc) ((mbc).wc_valid && iswblank ((mbc).wc))
-#define mb_iscntrl(mbc) ((mbc).wc_valid && iswcntrl ((mbc).wc))
-#define mb_isdigit(mbc) ((mbc).wc_valid && iswdigit ((mbc).wc))
-#define mb_isgraph(mbc) ((mbc).wc_valid && iswgraph ((mbc).wc))
-#define mb_islower(mbc) ((mbc).wc_valid && iswlower ((mbc).wc))
-#define mb_isprint(mbc) ((mbc).wc_valid && iswprint ((mbc).wc))
-#define mb_ispunct(mbc) ((mbc).wc_valid && iswpunct ((mbc).wc))
-#define mb_isspace(mbc) ((mbc).wc_valid && iswspace ((mbc).wc))
-#define mb_isupper(mbc) ((mbc).wc_valid && iswupper ((mbc).wc))
-#define mb_isxdigit(mbc) ((mbc).wc_valid && iswxdigit ((mbc).wc))
+#define mb_isalnum(mbc) ((mbc).wc_valid && c32isalnum ((mbc).wc))
+#define mb_isalpha(mbc) ((mbc).wc_valid && c32isalpha ((mbc).wc))
+#define mb_isblank(mbc) ((mbc).wc_valid && c32isblank ((mbc).wc))
+#define mb_iscntrl(mbc) ((mbc).wc_valid && c32iscntrl ((mbc).wc))
+#define mb_isdigit(mbc) ((mbc).wc_valid && c32isdigit ((mbc).wc))
+#define mb_isgraph(mbc) ((mbc).wc_valid && c32isgraph ((mbc).wc))
+#define mb_islower(mbc) ((mbc).wc_valid && c32islower ((mbc).wc))
+#define mb_isprint(mbc) ((mbc).wc_valid && c32isprint ((mbc).wc))
+#define mb_ispunct(mbc) ((mbc).wc_valid && c32ispunct ((mbc).wc))
+#define mb_isspace(mbc) ((mbc).wc_valid && c32isspace ((mbc).wc))
+#define mb_isupper(mbc) ((mbc).wc_valid && c32isupper ((mbc).wc))
+#define mb_isxdigit(mbc) ((mbc).wc_valid && c32isxdigit ((mbc).wc))
/* Extra <wchar.h> function. */
@@ -238,12 +236,12 @@ typedef struct mbchar mbchar_t;
#define MB_UNPRINTABLE_WIDTH 1
MBCHAR_INLINE int
-mb_width_aux (wint_t wc)
+mb_width_aux (char32_t wc)
{
- int w = wcwidth (wc);
+ int w = c32width (wc);
/* For unprintable characters, arbitrarily return 0 for control characters
and MB_UNPRINTABLE_WIDTH otherwise. */
- return (w >= 0 ? w : iswcntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
+ return (w >= 0 ? w : c32iscntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
}
#define mb_width(mbc) \
diff --git a/lib/mbfile.h b/lib/mbfile.h
index 3482f394b9..7c6d70fcae 100644
--- a/lib/mbfile.h
+++ b/lib/mbfile.h
@@ -110,7 +110,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
{
/* These characters are part of the basic character set. ISO C 99
guarantees that their wide character code is identical to their
- char code. */
+ char code. The 32-bit wide character code is the same as well. */
mbc->wc = mbc->buf[0] = mbf->buf[0];
mbc->wc_valid = true;
mbc->ptr = &mbc->buf[0];
@@ -136,7 +136,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
behaviour will clobber it. */
mbstate_t backup_state = mbf->state;
- bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
+ bytes = mbrtoc32 (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state);
if (bytes == (size_t) -1)
{
@@ -178,7 +178,7 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
{
if (bytes == 0)
{
- /* A null wide character was encountered. */
+ /* A null 32-bit wide character was encountered. */
bytes = 1;
assert (mbf->buf[0] == '\0');
assert (mbc->wc == 0);
diff --git a/lib/mbiter.h b/lib/mbiter.h
index 7b41870b55..93bad990a1 100644
--- a/lib/mbiter.h
+++ b/lib/mbiter.h
@@ -90,7 +90,7 @@
#include <assert.h>
#include <stddef.h>
#include <string.h>
-#include <wchar.h>
+#include <uchar.h>
#include "mbchar.h"
@@ -106,11 +106,11 @@ struct mbiter_multi
mbstate_t state; /* if in_shift: current shift state */
bool next_done; /* true if mbi_avail has already filled the following */
struct mbchar cur; /* the current character:
- const char *cur.ptr pointer to current character
+ const char *cur.ptr pointer to current character
The following are only valid after mbi_avail.
- size_t cur.bytes number of bytes of current character
- bool cur.wc_valid true if wc is a valid wide character
- wchar_t cur.wc if wc_valid: the current character
+ size_t cur.bytes number of bytes of current character
+ bool cur.wc_valid true if wc is a valid 32-bit wide character
+ char32_t cur.wc if wc_valid: the current character
*/
};
@@ -136,8 +136,8 @@ mbiter_multi_next (struct mbiter_multi *iter)
assert (mbsinit (&iter->state));
iter->in_shift = true;
with_shift:
- iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
- iter->limit - iter->cur.ptr, &iter->state);
+ iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr,
+ iter->limit - iter->cur.ptr, &iter->state);
if (iter->cur.bytes == (size_t) -1)
{
/* An invalid multibyte sequence was encountered. */
diff --git a/lib/mbscasestr.c b/lib/mbscasestr.c
index d92b847ba7..0753aeb864 100644
--- a/lib/mbscasestr.c
+++ b/lib/mbscasestr.c
@@ -64,7 +64,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
{
mb_copy (&needle_mbchars[j], &mbui_cur (iter));
if (needle_mbchars[j].wc_valid)
- needle_mbchars[j].wc = towlower (needle_mbchars[j].wc);
+ needle_mbchars[j].wc = c32tolower (needle_mbchars[j].wc);
}
}
@@ -152,7 +152,7 @@ knuth_morris_pratt_multibyte (const char *haystack, const char *needle,
mb_copy (&c, &mbui_cur (phaystack));
if (c.wc_valid)
- c.wc = towlower (c.wc);
+ c.wc = c32tolower (c.wc);
if (mb_equal (needle_mbchars[j], c))
{
j++;
@@ -237,7 +237,7 @@ mbscasestr (const char *haystack, const char *needle)
mb_copy (&b, &mbui_cur (iter_needle));
if (b.wc_valid)
- b.wc = towlower (b.wc);
+ b.wc = c32tolower (b.wc);
mbui_init (iter_haystack, haystack);
for (;; mbui_advance (iter_haystack))
@@ -279,7 +279,7 @@ mbscasestr (const char *haystack, const char *needle)
comparison_count++;
mb_copy (&c, &mbui_cur (iter_haystack));
if (c.wc_valid)
- c.wc = towlower (c.wc);
+ c.wc = c32tolower (c.wc);
if (mb_equal (c, b))
/* The first character matches. */
{
diff --git a/lib/mbuiter.h b/lib/mbuiter.h
index 7a619f19e1..02e3190f1c 100644
--- a/lib/mbuiter.h
+++ b/lib/mbuiter.h
@@ -98,7 +98,7 @@
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
-#include <wchar.h>
+#include <uchar.h>
#include "mbchar.h"
#include "strnlen1.h"
@@ -114,11 +114,11 @@ struct mbuiter_multi
mbstate_t state; /* if in_shift: current shift state */
bool next_done; /* true if mbui_avail has already filled the following */
struct mbchar cur; /* the current character:
- const char *cur.ptr pointer to current character
+ const char *cur.ptr pointer to current character
The following are only valid after mbui_avail.
- size_t cur.bytes number of bytes of current character
- bool cur.wc_valid true if wc is a valid wide character
- wchar_t cur.wc if wc_valid: the current character
+ size_t cur.bytes number of bytes of current character
+ bool cur.wc_valid true if wc is a valid 32-bit wide character
+ char32_t cur.wc if wc_valid: the current character
*/
};
@@ -144,9 +144,9 @@ mbuiter_multi_next (struct mbuiter_multi *iter)
assert (mbsinit (&iter->state));
iter->in_shift = true;
with_shift:
- iter->cur.bytes = mbrtowc (&iter->cur.wc, iter->cur.ptr,
- strnlen1 (iter->cur.ptr, MB_CUR_MAX),
- &iter->state);
+ iter->cur.bytes = mbrtoc32 (&iter->cur.wc, iter->cur.ptr,
+ strnlen1 (iter->cur.ptr, MB_CUR_MAX),
+ &iter->state);
if (iter->cur.bytes == (size_t) -1)
{
/* An invalid multibyte sequence was encountered. */
diff --git a/modules/exclude b/modules/exclude
index 841dd826cd..93bfdaf4cf 100644
--- a/modules/exclude
+++ b/modules/exclude
@@ -7,6 +7,7 @@ lib/exclude.c
Depends-on:
assert-h
+c32tolower
filename
fnmatch
fopen-gnu
diff --git a/modules/mbchar b/modules/mbchar
index b1fa0fa4ac..51a1c8e1b9 100644
--- a/modules/mbchar
+++ b/modules/mbchar
@@ -10,12 +10,21 @@ Depends-on:
extensions
extern-inline
stdbool
-wchar
-wctype-h
-iswblank
-iswdigit
-iswxdigit
-wcwidth
+uchar
+c32isalnum
+c32isalpha
+c32isblank
+c32iscntrl
+c32isdigit
+c32isgraph
+c32islower
+c32isprint
+c32ispunct
+c32isspace
+c32isupper
+c32isxdigit
+c32tolower
+c32width
memcmp
configure.ac:
diff --git a/modules/mbiter b/modules/mbiter
index 42305d62cd..082afd42f2 100644
--- a/modules/mbiter
+++ b/modules/mbiter
@@ -10,9 +10,9 @@ m4/mbrtowc.m4
Depends-on:
extern-inline
mbchar
-mbrtowc
+mbrtoc32
mbsinit
-wchar
+uchar
stdbool
configure.ac:
diff --git a/modules/mbscasestr b/modules/mbscasestr
index 2892c2fc2b..672cac8960 100644
--- a/modules/mbscasestr
+++ b/modules/mbscasestr
@@ -11,6 +11,7 @@ stdbool
string
mbslen
malloca
+c32tolower
strnlen
configure.ac:
diff --git a/modules/mbuiter b/modules/mbuiter
index b9e41031d5..63a11ff2f5 100644
--- a/modules/mbuiter
+++ b/modules/mbuiter
@@ -10,9 +10,9 @@ m4/mbrtowc.m4
Depends-on:
extern-inline
mbchar
-mbrtowc
+mbrtoc32
mbsinit
-wchar
+uchar
stdbool
strnlen1