bug-gnu-utils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: gawk: Wrong behavior in binary mode


From: Aharon Robbins
Subject: Re: gawk: Wrong behavior in binary mode
Date: Thu, 11 Dec 2008 21:41:19 +0200

Greetings. Re this:

> Date: Mon, 8 Dec 2008 23:27:51 -0200
> From: "Carlos G." <address@hidden>
> To: address@hidden
> Subject: gawk: Wrong behavior in binary mode
> Content-Type: multipart/alternative; 
>       boundary="----=_Part_38195_7884193.1228786071849"
>
> Hi... I think this is a bug.
> When working with gawk in binary mode, the length() and index() built-ins
> fail with character codes greater than 127(0x7f). For example:
>
> $ printf "\x80\x81\x82\x83" | gawk 'BEGIN { BINMODE=1 } { print length }'
> 0
>
> But it should print 4.
> I've found that this happens because gawk assumes multi-byte in those cases.
> The following patch worked fine for me:

Hi. As I mentioned in my earlier mail, it's a locale issue, not a BINMODE
issue.  Attached is a patch.  This will be showing up in the stable CVS
tree shortly.

Thank you for the bug report!

Arnold
------------------------------------------------------
Thu Dec 11 21:23:50 2008  Arnold D. Robbins  <address@hidden>

        * builtin.c (do_length): If the wide string has zero length
        but the bytes are more than zero, use the number of bytes.
        (do_index): Similarly, fall back to the raw bytes if they
        don't make a valid wide-character string.
        Bug reported by "Carlos G." <address@hidden>

        (do_substr): If defaulting to length of rest of the string,
        do it based on the wide char string if it's valid.

Index: builtin.c
===================================================================
RCS file: /d/mongo/cvsrep/gawk-stable/builtin.c,v
retrieving revision 1.27
diff -u -r1.27 builtin.c
--- builtin.c   30 Jul 2008 20:17:09 -0000      1.27
+++ builtin.c   11 Dec 2008 19:30:40 -0000
@@ -300,6 +300,7 @@
        register const char *p1, *p2;
        register size_t l1, l2;
        long ret;
+       int do_single_byte = FALSE;
 
        s1 = tree_eval(tree->lnode);
        s2 = tree_eval(tree->rnode->lnode);
@@ -327,18 +328,28 @@
                goto out;
        }
 
+#ifdef MBS_SUPPORT
+       if (gawk_mb_cur_max > 1) {
+               s1 = force_wstring(s1);
+               s2 = force_wstring(s2);
+               /*
+                * If we don't have valid wide character strings, use
+                * the real bytes.
+                */
+               do_single_byte = ((s1->wstlen == 0 && s1->stlen > 0) 
+                                       || (s2->wstlen == 0 && s2->stlen > 0));
+       }
+#endif
+
        /* IGNORECASE will already be false if posix */
        if (IGNORECASE) {
                while (l1 > 0) {
                        if (l2 > l1)
                                break;
 #ifdef MBS_SUPPORT
-                       if (gawk_mb_cur_max > 1) {
+                       if (! do_single_byte && gawk_mb_cur_max > 1) {
                                const wchar_t *pos;
 
-                               s1 = force_wstring(s1);
-                               s2 = force_wstring(s2);
-
                                pos = wcasestrstr(s1->wstptr, s1->wstlen, s2->wstptr, s2->wstlen);
                                if (pos == NULL)
                                        ret = 0;
@@ -372,12 +383,9 @@
                                break;
                        }
 #ifdef MBS_SUPPORT
-                       if (gawk_mb_cur_max > 1) {
+                       if (! do_single_byte && gawk_mb_cur_max > 1) {
                                const wchar_t *pos;
 
-                               s1 = force_wstring(s1);
-                               s2 = force_wstring(s2);
-
                                pos = wstrstr(s1->wstptr, s1->wstlen, s2->wstptr, s2->wstlen);
                                if (pos == NULL)
                                        ret = 0;
@@ -468,6 +476,12 @@
                if (gawk_mb_cur_max > 1) {
                        tmp = force_wstring(tmp);
                        len = tmp->wstlen;
+                       /*
+                        * If the bytes don't make a valid wide character
+                        * string, fall back to the bytes themselves.
+                        */
+                       if (len == 0 && tmp->stlen > 0)
+                               len = tmp->stlen;
                } else
 #endif
                        len = tmp->stlen;
@@ -1439,7 +1453,14 @@
 
        if (tree->rnode->rnode == NULL) {       /* third arg. missing */
                /* use remainder of string */
-               length = t1->stlen - indx;
+               length = t1->stlen - indx;      /* default to bytes */
+#ifdef MBS_SUPPORT
+               if (gawk_mb_cur_max > 1) {
+                       t1 = force_wstring(t1);
+                       if (t1->wstlen > 0)     /* use length of wide char string if we have one */
+                               length = t1->wstlen - indx;
+               }
+#endif
                d_length = length;      /* set here in case used in diagnostics, below */
        } else {
                t3 = tree_eval(tree->rnode->rnode->lnode);




reply via email to

[Prev in Thread] Current Thread [Next in Thread]