bug-gnu-utils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

multibyte awareness patch for gawk 3.1.4


From: Aharon Robbins
Subject: multibyte awareness patch for gawk 3.1.4
Date: Thu, 13 Jan 2005 15:22:06 +0200

        awk 'BEGIN { print "Hello, world!\n" }'
                -- Aho, Kernighan, and Weinberger
                   (Members of the all-star band, The Dudes from Bell Labs)

Greetings all.  Below is the first public release of the multibyte awareness
patch for gawk.  With this patch, index(), length(), substr() and match()
all work in terms of "characters" instead of "bytes".  

Preliminary testing by a few intrepid explorers seems to indicate that things
work as they're supposed to.  This patch passes `make test' in the en_US.utf8
locale, with the expected failure of the gsubtst5 test.

Please try this out, particularly in multibyte locales.

As noted in the version string, this is an UNOFFICIAL patch; you should not
drop it into a production version without testing.  Please send me any problem
reports.  I will also be more than happy to get success reports in different
locales.

Thanks,

Arnold Robbins
Not a member of any band, all-star or not
------------------------------------------------------------------------------
diff -ur gawk-3.1.4/awk.h gawk-mb/awk.h
--- gawk-3.1.4/awk.h    2004-07-26 17:11:05.000000000 +0300
+++ gawk-mb/awk.h       2005-01-13 11:05:59.000000000 +0200
@@ -481,6 +481,10 @@
                        size_t slen;
                        long sref;
                        int idx;
+#ifdef MBS_SUPPORT
+                       wchar_t *wsp;
+                       size_t wslen;
+#endif
                } val;
                struct {
                        struct exp_node *next;
@@ -516,6 +520,9 @@
                                         * function name; see awkgram.y */
 #              define  FIELD   1024    /* this is a field */
 #              define  INTLSTR 2048    /* use localized version */
+#ifdef MBS_SUPPORT
+#              define  WSTRCUR 4096    /* wide str value is current */
+#endif
 } NODE;
 
 #define vname sub.nodep.name
@@ -548,6 +555,9 @@
 #define stref  sub.val.sref
 #define        stfmt   sub.val.idx
 
+#define wstptr sub.val.wsp
+#define wstlen sub.val.wslen
+
 #define numbr  sub.val.fltnum
 
 /* Node_var: */
@@ -1126,6 +1136,12 @@
 #endif
 extern void unref P((NODE *tmp));
 extern int parse_escape P((const char **string_ptr));
+#ifdef MBS_SUPPORT
+extern NODE *str2wstr P((NODE *n, size_t **ptr));
+#define force_wstring(n)       str2wstr(n, NULL)
+extern const wchar_t *wstrstr P((const wchar_t *haystack, size_t hs_len, const 
wchar_t *needle, size_t needle_len));
+extern const wchar_t *wcasestrstr P((const wchar_t *haystack, size_t hs_len, 
const wchar_t *needle, size_t needle_len));
+#endif
 /* re.c */
 extern Regexp *make_regexp P((const char *s, size_t len, int ignorecase, int 
dfa));
 extern int research P((Regexp *rp, char *str, int start,
diff -ur gawk-3.1.4/builtin.c gawk-mb/builtin.c
--- gawk-3.1.4/builtin.c        2004-07-13 10:55:28.000000000 +0300
+++ gawk-mb/builtin.c   2005-01-09 11:50:48.000000000 +0200
@@ -364,6 +364,7 @@
                                break;
 #ifdef MBS_SUPPORT
                        if (gawk_mb_cur_max > 1) {
+#if 0
                                if (strncasecmpmbs(p1, mbs1, p2, mbs2, l2) == 
0) {
                                        ret = 1 + s1->stlen - l1;
                                        break;
@@ -377,6 +378,19 @@
                                }
                                l1 -= mbclen;
                                p1 += mbclen;
+#else
+                               const wchar_t *pos;
+
+                               s1 = force_wstring(s1);
+                               s2 = force_wstring(s2);
+
+                               pos = wcasestrstr(s1->wstptr, s1->wstlen, 
s2->wstptr, s2->wstlen);
+                               if (pos == NULL)
+                                       ret = 0;
+                               else
+                                       ret = pos - s1->wstptr + 1;     /* 
1-based */
+                               goto out;
+#endif
                        } else {
 #endif
                        /*
@@ -405,6 +419,7 @@
                        }
 #ifdef MBS_SUPPORT
                        if (gawk_mb_cur_max > 1) {
+#if 0
                                mbclen = mbrlen(p1, l1, &mbs1);
                                if ((mbclen == 1) || (mbclen == (size_t) -1) ||
                                        (mbclen == (size_t) -2) || (mbclen == 
0)) {
@@ -413,6 +428,19 @@
                                }
                                l1 -= mbclen;
                                p1 += mbclen;
+#else
+                               const wchar_t *pos;
+
+                               s1 = force_wstring(s1);
+                               s2 = force_wstring(s2);
+
+                               pos = wstrstr(s1->wstptr, s1->wstlen, 
s2->wstptr, s2->wstlen);
+                               if (pos == NULL)
+                                       ret = 0;
+                               else
+                                       ret = pos - s1->wstptr + 1;     /* 
1-based */
+                               goto out;
+#endif
                        } else {
                                l1--;
                                p1++;
@@ -469,7 +497,15 @@
        tmp = tree_eval(tree->lnode);
        if (do_lint && (tmp->flags & (STRING|STRCUR)) == 0)
                lintwarn(_("length: received non-string argument"));
-       len = force_string(tmp)->stlen;
+       tmp = force_string(tmp);
+#ifdef MBS_SUPPORT
+       if (gawk_mb_cur_max > 1) {
+               tmp = force_wstring(tmp);
+               len = tmp->wstlen;
+       } else
+#endif
+               len = tmp->stlen;
+
        free_temp(tmp);
        return tmp_number((AWKNUM) len);
 }
@@ -1289,6 +1325,7 @@
        register size_t indx;
        size_t length;
        double d_index, d_length;
+       size_t src_len;
 
        t1 = force_string(tree_eval(tree->lnode));
        t2 = tree_eval(tree->rnode->lnode);
@@ -1352,21 +1389,71 @@
                free_temp(t1);
                return Nnull_string;
        }
-       if (indx >= t1->stlen) {
+
+       /* get total len of input string, for following checks */
+#ifdef MBS_SUPPORT
+       if (gawk_mb_cur_max > 1) {
+               t1 = force_wstring(t1);
+               src_len = t1->wstlen;
+       } else
+#endif
+               src_len = t1->stlen;
+
+       if (indx >= src_len) {
                if (do_lint)
                        lintwarn(_("substr: start index %g is past end of 
string"),
                                d_index);
                free_temp(t1);
                return Nnull_string;
        }
-       if (length > t1->stlen - indx) {
+       if (length > src_len - indx) {
                if (do_lint)
                        lintwarn(
        _("substr: length %g at start index %g exceeds length of first argument 
(%lu)"),
-                       d_length, d_index, (unsigned long int) t1->stlen);
-               length = t1->stlen - indx;
+                       d_length, d_index, (unsigned long int) src_len);
+               length = src_len - indx;
        }
+
+#ifdef MBS_SUPPORT
+       if (gawk_mb_cur_max > 1) {
+               /* multibyte case, more work */
+               size_t result;
+               wchar_t *wp;
+               mbstate_t mbs;
+               char *substr, *cp;
+
+               /* force_wstring() already called */
+
+               if (t1->stlen == t1->wstlen)
+                       goto single_byte_case;
+
+               /*
+                * Convert the wide chars in t1->wstptr back into m.b. chars.
+                * This is pretty grotty, but it's the most straightforward
+                * way to do things.
+                */
+               memset(& mbs, 0, sizeof(mbs));
+               emalloc(substr, char *, (length * gawk_mb_cur_max) + 2, 
"do_substr");
+               wp = t1->wstptr + indx;
+               for (cp = substr; length > 0; length--) {
+                       result = wcrtomb(cp, *wp, & mbs);
+                       if (result == (size_t) -1)      /* what to do? break 
seems best */
+                               break;
+                       cp += result;
+                       wp++;
+               }
+               *cp = '\0';
+               r = make_str_node(substr, cp - substr, ALREADY_MALLOCED);
+               r->flags |= TEMP;
+       } else {
+               /* single byte case, easy */
+single_byte_case:
+               r = tmp_string(t1->stptr + indx, length);
+       }
+#else
        r = tmp_string(t1->stptr + indx, length);
+#endif
+
        free_temp(t1);
        return r;
 }
@@ -1866,7 +1953,7 @@
 {
        NODE *t1, *dest, *it;
        int rstart, len, ii;
-       AWKNUM rlength;
+       int rlength;
        Regexp *rp;
        regoff_t s;
        char *start;
@@ -1891,8 +1978,17 @@
        
        rstart = research(rp, t1->stptr, 0, t1->stlen, TRUE);
        if (rstart >= 0) {      /* match succeded */
-               rstart++;       /* 1-based indexing */
-               rlength = REEND(rp, t1->stptr) - RESTART(rp, t1->stptr);
+               size_t *wc_indices = NULL;
+
+               rlength = REEND(rp, t1->stptr) - RESTART(rp, t1->stptr);        
/* byte length */
+#ifdef MBS_SUPPORT
+               if (gawk_mb_cur_max > 1) {
+                       t1 = str2wstr(t1, & wc_indices);
+                       rlength = wc_indices[rstart + rlength - 1] - 
wc_indices[rstart] + 1;
+                       rstart = wc_indices[rstart];
+               }
+#endif
+               rstart++;       /* now it's 1-based indexing */
        
                /* Build the array only if the caller wants the optional 
subpatterns */
                if (dest != NULL) {
@@ -1905,8 +2001,18 @@
                                 * matched even if all of them did not.
                                 */
                                if ((s = SUBPATSTART(rp, t1->stptr, ii)) != -1) 
{
+                                       size_t subpat_start;
+                                       size_t subpat_len;
+
                                        start = t1->stptr + s;
-                                       len = SUBPATEND(rp, t1->stptr, ii) - s;
+                                       subpat_start = s;
+                                       subpat_len = len = SUBPATEND(rp, 
t1->stptr, ii) - s;
+#ifdef MBS_SUPPORT
+                                       if (gawk_mb_cur_max > 1) {
+                                               subpat_start = wc_indices[s];
+                                               subpat_len = wc_indices[s + len 
- 1] - subpat_start + 1;
+                                       }
+#endif
        
                                        it = make_string(start, len);
                                        /*
@@ -1930,7 +2036,7 @@
        
                                        slen = ilen + subseplen + 5;
        
-                                       it = make_number((AWKNUM) s + 1);
+                                       it = make_number((AWKNUM) subpat_start 
+ 1);
                                        *assoc_lookup(dest, tmp_string(buf, 
slen), FALSE) = it;
        
                                        memcpy(buf, buff, ilen);
@@ -1939,22 +2045,24 @@
        
                                        slen = ilen + subseplen + 6;
        
-                                       it = make_number((AWKNUM) len);
+                                       it = make_number((AWKNUM) subpat_len);
                                        *assoc_lookup(dest, tmp_string(buf, 
slen), FALSE) = it;
                                }
                        }
 
                        free(buf);
+                       if (wc_indices != NULL)
+                               free(wc_indices);
                }
        } else {                /* match failed */
                rstart = 0;
-               rlength = -1.0;
+               rlength = -1;
        }
        free_temp(t1);
        unref(RSTART_node->var_value);
        RSTART_node->var_value = make_number((AWKNUM) rstart);
        unref(RLENGTH_node->var_value);
-       RLENGTH_node->var_value = make_number(rlength);
+       RLENGTH_node->var_value = make_number((AWKNUM) rlength);
        return tmp_number((AWKNUM) rstart);
 }
 
Only in gawk-mb: builtin.c.save
diff -ur gawk-3.1.4/dfa.c gawk-mb/dfa.c
--- gawk-3.1.4/dfa.c    2004-07-26 17:11:41.000000000 +0300
+++ gawk-mb/dfa.c       2004-12-06 12:32:53.000000000 +0200
@@ -405,9 +405,8 @@
                                   inputstring[i] is a single-byte char,
                                   or 1st byte of a multibyte char.
                                   And inputwcs[i] is the codepoint.  */
-static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */
-static unsigned char const *buf_end;   /* refference to end in dfaexec().  */
-static unsigned long buf_offset; /* Go fast. */
+static unsigned char const *buf_begin;/* reference to begin in dfaexec().  */
+static unsigned char const *buf_end;   /* reference to end in dfaexec().  */
 #endif /* MBS_SUPPORT  */
 
 #ifdef MBS_SUPPORT
@@ -596,6 +595,9 @@
                {
                  wctype_t wt;
                  /* Query the character class as wctype_t.  */
+                 if (case_fold && (strcmp(str, "upper") == 0 || strcmp(str, 
"lower") == 0))
+                    strcpy(str, "alpha");
+
                  wt = wctype (str);
 
                  if (ch_classes_al == 0)
@@ -634,7 +636,7 @@
                      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;
                    }
                }
-             wc = WEOF;
+             wc = wc1 = WEOF;
            }
          else
            /* We treat '[' as a normal character here.  */
@@ -682,6 +684,28 @@
          REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
                               range_ends_al, work_mbc->nranges + 1);
          work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)wc2;
+         if (case_fold && (iswlower((wint_t)wc) || iswupper((wint_t)wc))
+                         && (iswlower((wint_t)wc2) || iswupper((wint_t)wc2)))
+           {
+               wint_t altcase;
+               altcase = wc;
+               if (iswlower((wint_t)wc))
+                  altcase = towupper((wint_t)wc);
+               else
+                  altcase = towlower((wint_t)wc);
+               REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,
+                               range_sts_al, work_mbc->nranges + 1);
+               work_mbc->range_sts[work_mbc->nranges] = (wchar_t)altcase;
+
+               altcase = wc2;
+               if (iswlower((wint_t)wc2))
+                  altcase = towupper((wint_t)wc2);
+               else
+                  altcase = towlower((wint_t)wc2);
+               REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,
+                               range_ends_al, work_mbc->nranges + 1);
+               work_mbc->range_ends[work_mbc->nranges++] = (wchar_t)altcase;
+          }
        }
       else if (wc != WEOF)
        /* build normal characters.  */
@@ -2412,12 +2436,14 @@
 #define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p)         \
   if (s == 0)                                          \
     {                                                  \
-      while (inputwcs[p - buf_begin + buf_offset] == 0         \
-            && mblen_buf[p - buf_begin + buf_offset] > 0               \
+      while (inputwcs[p - buf_begin] == 0              \
+            && mblen_buf[p - buf_begin] > 0            \
              && (unsigned char const *)p < buf_end)    \
         ++p;                                           \
       if ((char *)p >= end)                            \
         {                                              \
+          free(mblen_buf);                             \
+          free(inputwcs);                              \
          return NULL;                                  \
        }                                               \
     }
@@ -2511,8 +2537,8 @@
   wchar_t wc;
   int mbclen;
 
-  wc = inputwcs[index + buf_offset];
-  mbclen = (mblen_buf[index + buf_offset] == 0)? 1 : mblen_buf[index + 
buf_offset];
+  wc = inputwcs[index];
+  mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
 
   /* Check context.  */
   if (wc == (wchar_t)eolbyte)
@@ -2554,14 +2580,14 @@
   char buffer[128];
   wchar_t wcbuf[6];
 
-  /* Pointer to the structure to which we are currently reffering.  */
+  /* Pointer to the structure to which we are currently referring.  */
   struct mb_char_classes *work_mbc;
 
   int newline = 0;
   int letter = 0;
-  wchar_t wc;          /* Current reffering character.  */
+  wchar_t wc;          /* Current referring character.  */
 
-  wc = inputwcs[index + buf_offset];
+  wc = inputwcs[index];
 
   /* Check context.  */
   if (wc == (wchar_t)eolbyte)
@@ -2582,10 +2608,10 @@
                           newline, d->states[s].letter, letter))
     return 0;
 
-  /* Assign the current reffering operator to work_mbc.  */
+  /* Assign the current referring operator to work_mbc.  */
   work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]);
   match = !work_mbc->invert;
-  match_len = (mblen_buf[index + buf_offset] == 0)? 1 : mblen_buf[index + 
buf_offset];
+  match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index];
 
   /* match with a character class?  */
   for (i = 0; i<work_mbc->nch_classes; i++)
@@ -2700,8 +2726,8 @@
 
   /* Calculate the length of the (single/multi byte) character
      to which p points.  */
-  *mbclen = (mblen_buf[*pp - buf_begin + buf_offset] == 0)? 1
-    : mblen_buf[*pp - buf_begin + buf_offset];
+  *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1
+    : mblen_buf[*pp - buf_begin];
 
   /* Calculate the state which can be reached from the state `s' by
      consuming `*mbclen' single bytes from the buffer.  */
@@ -2795,7 +2821,7 @@
      `maxlen' bytes.  */
   rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows);
 
-  wc = inputwcs[*pp - mbclen - buf_begin + buf_offset];
+  wc = inputwcs[*pp - mbclen - buf_begin];
   s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
   realloc_trans_if_necessary(d, s1);
 
@@ -2813,7 +2839,7 @@
                     &follows);
        }
 
-      wc = inputwcs[*pp - mbclen - buf_begin + buf_offset];
+      wc = inputwcs[*pp - mbclen - buf_begin];
       s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc));
       realloc_trans_if_necessary(d, s1);
     }
@@ -2871,21 +2897,13 @@
   if (MB_CUR_MAX > 1)
     {
       int remain_bytes, i;
-      buf_begin -= buf_offset;
-      if (buf_begin <= (unsigned char const *)begin && (unsigned char const *) 
end <= buf_end) {
-       buf_offset = (unsigned char const *)begin - buf_begin;
-       buf_begin = begin;
-       buf_end = end;
-       goto go_fast;
-      }
 
-      buf_offset = 0;
       buf_begin = begin;
       buf_end = end;
 
       /* initialize mblen_buf, and inputwcs.  */
-      REALLOC(mblen_buf, unsigned char, end - begin + 2);
-      REALLOC(inputwcs, wchar_t, end - begin + 2);
+      MALLOC(mblen_buf, unsigned char, end - begin + 2);
+      MALLOC(inputwcs, wchar_t, end - begin + 2);
       memset(&mbs, 0, sizeof(mbstate_t));
       remain_bytes = 0;
       for (i = 0; i < end - begin + 1; i++)
@@ -2916,7 +2934,6 @@
       mblen_buf[i] = 0;
       inputwcs[i] = 0; /* sentinel */
     }
-go_fast:
 #endif /* MBS_SUPPORT */
 
   for (;;)
@@ -2930,7 +2947,7 @@
             s1 = s;
            if (d->states[s].mbps.nelem != 0)
              {
-               /* Can match with a multibyte character( and multi character
+               /* Can match with a multibyte character (and multi character
                   collating element).  */
                unsigned char const *nextp;
 
@@ -2966,6 +2983,13 @@
            {
              if (backref)
                *backref = (d->states[s].backref != 0);
+#ifdef MBS_SUPPORT
+              if (MB_CUR_MAX > 1)
+               {
+                 free(mblen_buf);
+                 free(inputwcs);
+               }
+#endif /* MBS_SUPPORT */
               return (char *) p;
            }
 
@@ -2994,6 +3018,13 @@
       /* Check if we've run off the end of the buffer. */
       if ((char *) p > end)
         {
+#ifdef MBS_SUPPORT
+          if (MB_CUR_MAX > 1)
+           {
+             free(mblen_buf);
+             free(inputwcs);
+           }
+#endif /* MBS_SUPPORT */
           return NULL;
         }
 
@@ -3358,14 +3389,14 @@
   for (lcp = left; *lcp != '\0'; ++lcp)
     {
       len = 0;
-      rcp = strchr(right, *lcp);
+      rcp = strchr (right, *lcp);
       while (rcp != NULL)
        {
          for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
            continue;
          if (i > len)
            len = i;
-         rcp = strchr(rcp + 1, *lcp);
+         rcp = strchr (rcp + 1, *lcp);
        }
       if (len == 0)
        continue;
@@ -3668,9 +3699,9 @@
  done:
   if (strlen(result))
     {
-      dm = (struct dfamust *) malloc(sizeof (struct dfamust));
+      MALLOC(dm, struct dfamust, 1);
       dm->exact = exact;
-      dm->must = malloc(strlen(result) + 1);
+      MALLOC(dm->must, char, strlen(result) + 1);
       strcpy(dm->must, result);
       dm->next = dfa->musts;
       dfa->musts = dm;
diff -ur gawk-3.1.4/eval.c gawk-mb/eval.c
--- gawk-3.1.4/eval.c   2004-07-26 17:11:57.000000000 +0300
+++ gawk-mb/eval.c      2005-01-04 17:10:44.000000000 +0200
@@ -322,6 +322,9 @@
                { FUNC, "FUNC" },
                { FIELD, "FIELD" },
                { INTLSTR, "INTLSTR" },
+#ifdef WSTRCUR
+               { WSTRCUR, "WSTRCUR" },
+#endif
                { 0,    NULL },
        };
 
Only in gawk-mb: mb.awk
diff -ur gawk-3.1.4/node.c gawk-mb/node.c
--- gawk-3.1.4/node.c   2004-07-28 16:45:04.000000000 +0300
+++ gawk-mb/node.c      2005-01-13 15:00:38.620478340 +0200
@@ -212,6 +212,14 @@
 no_malloc:
        s->stref = 1;
        s->flags |= STRCUR;
+#if defined MBS_SUPPORT
+       if ((s->flags & WSTRCUR) != 0) {
+               free(s->wstptr);
+               s->wstptr = NULL;
+               s->wstlen = 0;
+               s->flags &= ~WSTRCUR;
+       }
+#endif
        return s;
 }
 
@@ -279,6 +287,15 @@
                emalloc(r->stptr, char *, r->stlen + 2, "dupnode");
                memcpy(r->stptr, n->stptr, r->stlen);
                r->stptr[r->stlen] = '\0';
+#if defined MBS_SUPPORT
+               if ((n->flags & WSTRCUR) != 0) {
+                       r->wstlen = n->wstlen;  /* in wide chars, not bytes */
+                       emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (r->wstlen + 2), "dupnode");
+                       /* memcpy() counts bytes, so scale by the element size */
+                       memcpy(r->wstptr, n->wstptr, sizeof(wchar_t) * r->wstlen);
+                       r->wstptr[r->wstlen] = L'\0';
+                       r->flags |= WSTRCUR;
+               }
+#endif /* defined MBS_SUPPORT */
        } else if (n->type == Node_ahash && (n->flags & MALLOC) != 0) {
                r->ahname_ref = 1;
                emalloc(r->ahname_str, char *, r->ahname_len + 2, "dupnode");
@@ -319,7 +336,12 @@
        r->stref = 1;
        r->stptr = NULL;
        r->stlen = 0;
-#endif
+#if defined MBS_SUPPORT
+       r->wstptr = NULL;
+       r->wstlen = 0;
+       r->flags &= ~WSTRCUR;
+#endif /* MBS_SUPPORT */
+#endif /* GAWKDEBUG */
        return r;
 }
 
@@ -333,6 +355,10 @@
        getnode(r);
        r->type = Node_val;
        r->flags = (STRING|STRCUR|MALLOC);
+#if defined MBS_SUPPORT
+       r->wstptr = NULL;
+       r->wstlen = 0;
+#endif
        if (flags & ALREADY_MALLOCED)
                r->stptr = s;
        else {
@@ -398,11 +424,8 @@
 
        /* get more nodes and initialize list */
        emalloc(nextfree, NODE *, NODECHUNK * sizeof(NODE), "more_nodes");
+       memset(nextfree, 0, NODECHUNK * sizeof(NODE));
        for (np = nextfree; np <= &nextfree[NODECHUNK - 1]; np++) {
-               np->flags = 0;
-#ifndef NO_PROFILING
-               np->exec_count = 0;
-#endif
                np->nextp = np + 1;
        }
        --np;
@@ -456,6 +479,13 @@
                                return;
                        }
                        free(tmp->stptr);
+#if defined MBS_SUPPORT
+                       if (tmp->wstptr != NULL)
+                               free(tmp->wstptr);
+                       tmp->flags &= ~WSTRCUR;
+                       tmp->wstptr = NULL;
+                       tmp->wstlen = 0;
+#endif
                }
                freenode(tmp);
                return;
@@ -584,3 +614,163 @@
                return c;
        }
 }
+
+#if defined MBS_SUPPORT
+/*
+ * str2wstr --- convert a multibyte string to a wide string
+ *
+ * Sets n->wstptr, n->wstlen and WSTRCUR, and returns n.  If ptr is not
+ * NULL, *ptr gets a malloc'ed array of n->stlen + 1 entries for the
+ * caller to free: entry `i' is the index of the wide char holding byte
+ * `i' of n->stptr; the last entry is the count of wide chars converted.
+ */
+
+NODE *
+str2wstr(NODE *n, size_t **ptr)
+{
+       size_t i, count, src_count;
+       char *sp;
+       mbstate_t mbs;
+       wchar_t wc, *wsp;
+
+       assert((n->flags & (STRING|STRCUR)) != 0);
+
+       if ((n->flags & WSTRCUR) != 0) {
+               if (ptr == NULL)
+                       return n;
+               /* otherwise fall through and recompute to fill in the array */
+       }
+
+       if (n->wstptr != NULL) {
+               free(n->wstptr);
+               n->wstptr = NULL;
+               n->wstlen = 0;
+       }
+
+       /*
+        * This code trades space for time: allocate room for n->stlen
+        * wide chars, the worst case (each input byte maps to one
+        * wchar_t), so that the string is converted only once instead
+        * of once to count and once more to fill in.  If a lot is
+        * left over, the wide string is realloc'ed down at the end.
+        */
+       emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 2), "str2wstr");
+       wsp = n->wstptr;
+
+       /* For use by do_match, create and clear the byte -> wchar_t index array. */
+       if (ptr != NULL) {
+               emalloc(*ptr, size_t *, sizeof(size_t) * (n->stlen + 1), "str2wstr");
+               memset(*ptr, 0, sizeof(size_t) * (n->stlen + 1));
+       }
+
+       sp = n->stptr;
+       src_count = n->stlen;
+       memset(& mbs, 0, sizeof(mbs));
+       for (i = 0; src_count > 0; i++) {
+               count = mbrtowc(& wc, sp, src_count, & mbs);
+               switch (count) {
+               case (size_t) -2:
+               case (size_t) -1:
+                       /* incomplete or invalid sequence: stop converting */
+                       goto done;
+
+               case 0:
+                       /* embedded NUL: mbrtowc() says 0, but one byte was used */
+                       count = 1;
+                       /* FALL THROUGH */
+               default:
+                       *wsp++ = wc;
+                       src_count -= count;
+                       while (count--)  {
+                               if (ptr != NULL)
+                                       (*ptr)[sp - n->stptr] = i;
+                               sp++;
+                       }
+                       break;
+               }
+       }
+
+done:
+       *wsp = L'\0';
+       n->wstlen = i;
+       n->flags |= WSTRCUR;
+       if (ptr != NULL)
+               (*ptr)[n->stlen] = i;   /* entry for the one-past-the-end byte */
+#define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
+       /* wstlen never exceeds stlen; this order keeps size_t from wrapping */
+       if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
+               erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 2), "str2wstr");
+
+       return n;
+}
+
+static void
+dump_wstr(FILE *fp, const wchar_t *str, size_t len)
+{
+       /* write len wide chars to fp via putc(); NULL or empty writes nothing */
+       size_t k;
+
+       for (k = 0; str != NULL && k < len; k++)
+               putc((int) str[k], fp);
+}
+
+/*
+ * wstrstr --- walk haystack, looking for needle, wide char version.
+ * Lengths are in wide chars; returns NULL if not found or needle is empty.
+ */
+
+const wchar_t *
+wstrstr(const wchar_t *haystack, size_t hs_len,
+       const wchar_t *needle, size_t needle_len)
+{
+       size_t i;
+
+       if (haystack == NULL || needle == NULL
+           || needle_len == 0 || needle_len > hs_len)
+               return NULL;
+
+       /* needle_len <= hs_len here, so the subtraction cannot wrap */
+       for (i = 0; i <= hs_len - needle_len; i++) {
+               if (haystack[i] == needle[0]
+                   && memcmp(haystack + i, needle, sizeof(wchar_t) * needle_len) == 0)
+                       return haystack + i;
+       }
+
+       return NULL;
+}
+
+/*
+ * wcasestrstr --- walk haystack, nocase look for needle, wide char version.
+ *
+ * hs_len and needle_len count wide characters.  Both sides are folded
+ * with towlower() before being compared.
+ *
+ * Returns a pointer to the first match inside haystack, or NULL if there
+ * is no match, if either pointer is NULL, or if the needle is empty.
+ */
+
+const wchar_t *
+wcasestrstr(const wchar_t *haystack, size_t hs_len,
+       const wchar_t *needle, size_t needle_len)
+{
+       size_t i, j;
+
+       if (haystack == NULL || needle == NULL
+           || needle_len == 0 || needle_len > hs_len)
+               return NULL;
+
+       /* needle_len <= hs_len here, so the subtraction cannot wrap */
+       for (i = 0; i <= hs_len - needle_len; i++) {
+               /* compare the candidate window one wide char at a time */
+               for (j = 0; j < needle_len; j++) {
+                       if (towlower(haystack[i + j]) != towlower(needle[j]))
+                               break;
+               }
+
+               if (j == needle_len)
+                       return haystack + i;
+       }
+
+       return NULL;
+}
+#endif /* defined MBS_SUPPORT */
diff -ur gawk-3.1.4/version.c gawk-mb/version.c
--- gawk-3.1.4/version.c        2004-08-19 14:42:25.000000000 +0300
+++ gawk-mb/version.c   2005-01-13 15:06:49.653442092 +0200
@@ -1,6 +1,6 @@
 #include "config.h"
 
-const char *version_string = "@(#)GNU Awk 3.1.4";
+const char *version_string = "@(#)GNU Awk 3.1.4 + unofficial multibyte patch 
#2";
 
 /* 1.02                fixed /= += *= etc to return the new Left Hand Side 
instead
                of the Right Hand Side */
diff -ur gawk-3.1.4/version.in gawk-mb/version.in
--- gawk-3.1.4/version.in       2004-06-06 19:15:43.000000000 +0300
+++ gawk-mb/version.in  2005-01-13 15:06:48.721790691 +0200
@@ -1,6 +1,6 @@
 #include "config.h"
 
-const char *version_string = "@(#)GNU Awk @VERSION@";
+const char *version_string = "@(#)GNU Awk @VERSION@ + unofficial multibyte 
patch #2";
 
 /* 1.02                fixed /= += *= etc to return the new Left Hand Side 
instead
                of the Right Hand Side */
-- 
Aharon (Arnold) Robbins --- Pioneer Consulting Ltd.     arnold AT skeeve DOT com
P.O. Box 354            Home Phone: +972  8 979-0381    Fax: +1 206 350 8765
Nof Ayalon              Cell Phone: +972 50  729-7545
D.N. Shimshon 99785     ISRAEL




reply via email to

[Prev in Thread] Current Thread [Next in Thread]