[SCM] gawk branch, master, updated. gawk-4.1.0-5322-g90479e23

gawk-diffs
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[SCM] gawk branch, master, updated. gawk-4.1.0-5322-g90479e23

From:	Arnold Robbins
Subject:	[SCM] gawk branch, master, updated. gawk-4.1.0-5322-g90479e23
Date:	Mon, 21 Aug 2023 01:06:38 -0400 (EDT)
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gawk".

The branch, master has been updated
       via  90479e2302628bf514abcc4e45e7f1b82eb327dc (commit)
       via  9ca9214ed9ea0ea714ca9a2b66285b94341e8f18 (commit)
       via  cc8ef530839dfd2f784149fa04a708d73aad0381 (commit)
      from  8d73db577ce2d25f8f5bf5bbb6c0e7a74901869c (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.sv.gnu.org/cgit/gawk.git/commit/?id=90479e2302628bf514abcc4e45e7f1b82eb327dc

commit 90479e2302628bf514abcc4e45e7f1b82eb327dc
Author: Arnold D. Robbins <arnold@skeeve.com>
Date:   Sun Aug 20 22:06:07 2023 -0700

    Improve parse_escape code.

diff --git a/ChangeLog b/ChangeLog
index 6b8508f4..63b98f48 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2023-08-20         Arnold D. Robbins     <arnold@skeeve.com>
+
+       Change parse_escape() to return an enum instead of relying
+       on manifest constants.
+
+       * awk.h (enum escape_results): New enum.
+       (parse_escape): Now returns a value from it, add nbytes pointer
+       parameter.
+       * node.c (make_str_node): Revised to use switch/case on
+       parse_escape return value.
+       (parse_escape): Update code.
+       * re.c (make_regexp): Revised to use switch/case on parse_escape
+       return value.
+
 2023-08-20         Miguel Pineiro Jr     <mpj@pineiro.cc>
 
        * awk.h (ELIDE_BACK_NL): Removed.
diff --git a/awk.h b/awk.h
index b9b8a346..034f7a2e 100644
--- a/awk.h
+++ b/awk.h
@@ -294,6 +294,13 @@ enum commenttype {
        FOR_COMMENT     // special case
 };
 
+enum escape_results {
+       ESCAPE_OK,              // nbytes == 1 to MB_CUR_MAX: the length of the translated escape sequence
+       ESCAPE_CONV_ERR,        // wcrtomb conversion error
+       ESCAPE_TERM_BACKSLASH,  // terminal backslash (to be preserved in cmdline strings)
+       ESCAPE_LINE_CONINUATION // line continuation  (backslash-newline pair)
+};
+
 /* string hash table */
 #define ahnext         hs.next
 #define        ahname          hs.name /* a string index node */
@@ -1741,7 +1748,7 @@ extern NODE *make_str_node(const char *s, size_t len, int flags);
 extern NODE *make_bool_node(bool value);
 extern NODE *make_typed_regex(const char *re, size_t len);
 extern void *more_blocks(int id);
-extern int parse_escape(const char **string_ptr, const char **escseq);
+extern enum escape_results parse_escape(const char **string_ptr, const char **escseq, int *nbytes);
 extern NODE *str2wstr(NODE *n, size_t **ptr);
 extern NODE *wstr2str(NODE *n);
 #define force_wstring(n)       str2wstr(n, NULL)
diff --git a/node.c b/node.c
index 910be3c2..7d731b9e 100644
--- a/node.c
+++ b/node.c
@@ -456,23 +456,31 @@ make_str_node(const char *s, size_t len, int flags)
                        if (c == '\\') {
                                const char *result;
                                int nbytes;
+                               enum escape_results ret;
 
-                               nbytes = parse_escape(&pf, &result);
-                               if (nbytes > 0) {
+                               ret = parse_escape(& pf, & result, & nbytes);
+                               switch (ret) {
+                               case ESCAPE_OK:
+                                       assert(nbytes > 0);
                                        while (nbytes--)
                                                *ptm++ = *result++;
-                               } else if (nbytes == 0) {
+                                       break;
+                               case ESCAPE_CONV_ERR:
                                        *ptm++ = '?';
-                               } else if (nbytes == -1) {
+                                       break;
+                               case ESCAPE_TERM_BACKSLASH:
                                        if (do_lint)
                                                lintwarn(_("backslash at end of string"));
                                        *ptm++ = '\\';
-                               } else if (nbytes == -2) {
+                                       break;
+                               case ESCAPE_LINE_CONINUATION:
                                        if (do_lint)
                                                lintwarn(_("backslash string continuation is not portable"));
                                        continue;
-                               } else {
-                                       cant_happen("received bad result %d from parse_escape()", nbytes);
+                               default:
+                                       cant_happen(N_("received bad result %d from parse_escape(), nbytes = %d"),
+                                                       (int) ret, nbytes);
+                                       break;
                                }
                        } else
                                *ptm++ = c;
@@ -544,25 +552,26 @@ r_unref(NODE *tmp)
  * translated escape sequence.
  *
  * Return values:
- *     1 to MB_CUR_MAX: the length of the translated escape sequence
- *     0 wcrtomb conversion error
- *    -1 terminal backslash (to be preserved in cmdline strings)
- *    -2 line continuation  (backslash-newline pair)
+ *     ESCAPE_OK,              // nbytes == 1 to MB_CUR_MAX: the length of the translated escape sequence
+ *     ESCAPE_CONV_ERR,        // wcrtomb conversion error
+ *     ESCAPE_TERM_BACKSLASH,  // terminal backslash (to be preserved in cmdline strings)
+ *     ESCAPE_LINE_CONINUATION // line continuation  (backslash-newline pair)
  *
  * POSIX doesn't allow \x or \u.
  */
 
-int
-parse_escape(const char **string_ptr, const char **result)
+enum escape_results
+parse_escape(const char **string_ptr, const char **result, int *nbytes)
 {
        static char buf[MB_LEN_MAX];
-       int retval = 1;
+       enum escape_results retval = ESCAPE_OK;
        int c = *(*string_ptr)++;
        int i;
        int count;
        int j;
        const char *start;
 
+       *nbytes = 1;
        if (do_lint_old) {
                switch (c) {
                case 'a':
@@ -597,11 +606,11 @@ parse_escape(const char **string_ptr, const char **result)
                buf[0] = '\v';
                break;
        case '\n':
-               retval = -2;
+               retval = ESCAPE_LINE_CONINUATION;
                break;
        case 0:
                (*string_ptr)--;
-               retval = -1;
+               retval = ESCAPE_TERM_BACKSLASH;
                break;
        case '0':
        case '1':
@@ -706,10 +715,11 @@ parse_escape(const char **string_ptr, const char **result)
                n = wcrtomb(buf, i, & mbs);
                if (n == (size_t) -1) {
                        warning(_("invalid \\u' escape sequence"));
-                       retval = 0;
+                       retval = ESCAPE_CONV_ERR;
+                       *nbytes = 0;
                } else {
                        /* MB_LEN_MAX is an int, so n fits */
-                       retval = (int) n;
+                       *nbytes = (int) n;
                }
                break;
        }
diff --git a/re.c b/re.c
index 2b35a52f..088f19da 100644
--- a/re.c
+++ b/re.c
@@ -148,10 +148,19 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
                        {
                                const char *result;
                                int nbytes;
+                               enum escape_results ret;
 
-                               nbytes = parse_escape(&src, &result);
-                               if (nbytes < 0)
-                                       cant_happen("received bad result %d from parse_escape()", nbytes);
+                               ret = parse_escape(& src, & result, & nbytes);
+                               switch (ret) {
+                               case ESCAPE_OK:
+                               case ESCAPE_CONV_ERR:
+                                       break;
+                               case ESCAPE_TERM_BACKSLASH:
+                               case ESCAPE_LINE_CONINUATION:
+                                       cant_happen(N_("received bad result %d from parse_escape(), nbytes = %d"),
+                                                       (int) ret, nbytes);
+                                       break;
+                               }
                                /*
                                 * Invalid code points produce '?' (0x3F).
                                 * These are quoted so that they're taken

http://git.sv.gnu.org/cgit/gawk.git/commit/?id=9ca9214ed9ea0ea714ca9a2b66285b94341e8f18

commit 9ca9214ed9ea0ea714ca9a2b66285b94341e8f18
Author: Arnold D. Robbins <arnold@skeeve.com>
Date:   Sun Aug 20 22:01:43 2023 -0700

    Update pc/Makefile.tst.

diff --git a/pc/ChangeLog b/pc/ChangeLog
index 70a5ab55..da989e60 100644
--- a/pc/ChangeLog
+++ b/pc/ChangeLog
@@ -1,3 +1,7 @@
+2023-08-20         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * Makefile.tst: Regenerated.
+
 2023-07-09         Arnold D. Robbins     <arnold@skeeve.com>
 
        * Makefile.tst: Regenerated.
diff --git a/pc/Makefile.tst b/pc/Makefile.tst
index 360639ed..4f356acc 100644
--- a/pc/Makefile.tst
+++ b/pc/Makefile.tst
@@ -149,7 +149,7 @@ BASIC_TESTS = \
        arysubnm aryunasgn asgext assignnumfield assignnumfield2 awkpath \
        back89 backgsub badassign1 badbuild callparam childin clobber \
        closebad close_status clsflnam compare compare2 concat1 concat2 \
-       cmdlinefsbacknl \
+       cmdlinefsbacknl cmdlinefsbacknl2 \
        concat3 concat4 concat5 convfmt datanonl defref delargv delarpm2 \
        delarprm delfunc dfacheck2 dfamb1 dfastress divzero divzero2 \
        dynlj eofsplit \
@@ -1527,6 +1527,11 @@ cmdlinefsbacknl:
        @-$(LOCALES) AWK="$(AWKPROG)" "$(srcdir)"/$@.sh  > _$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+cmdlinefsbacknl2:
+       @echo $@
+       @-$(LOCALES) AWK="$(AWKPROG)" "$(srcdir)"/$@.sh  > _$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 concat3:
        @echo $@
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@

http://git.sv.gnu.org/cgit/gawk.git/commit/?id=cc8ef530839dfd2f784149fa04a708d73aad0381

commit cc8ef530839dfd2f784149fa04a708d73aad0381
Author: Arnold D. Robbins <arnold@skeeve.com>
Date:   Sun Aug 20 21:34:58 2023 -0700

    Revise handling of backslash+newline and other escapes.

diff --git a/ChangeLog b/ChangeLog
index 7eefe83d..6b8508f4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2023-08-20         Miguel Pineiro Jr     <mpj@pineiro.cc>
+
+       * awk.h (ELIDE_BACK_NL): Removed.
+       (parse_escape): Update declaration.
+       * main.c (cmdline_fs): Don't use ELIDE_BACK_NL.
+       (arg_assign): Ditto.
+       * node.c (make_str_node): Rework handling of return value
+       from parse_escape.
+       (parse_escape): Change return code to convey more info,
+       fully parse escapes and handle conversion from unicode to
+       UTF-8.
+       * re.c (make_regexp): Rework for new interface to parse_escape.
+
 2023-07-09         Arnold D. Robbins     <arnold@skeeve.com>
 
        * re.c (make_regexp): In error message, use the original text
diff --git a/awk.h b/awk.h
index 57d4f8bd..b9b8a346 100644
--- a/awk.h
+++ b/awk.h
@@ -1376,7 +1376,6 @@ extern void r_freeblock(void *, int id);
 // Flags for making string nodes
 #define                SCAN                    1
 #define                ALREADY_MALLOCED        2
-#define                ELIDE_BACK_NL           4
 
 #define        cant_happen(format, ...)        r_fatal("internal error: file %s, line %d: " format, \
                                __FILE__, __LINE__, __VA_ARGS__)
@@ -1742,7 +1741,7 @@ extern NODE *make_str_node(const char *s, size_t len, int flags);
 extern NODE *make_bool_node(bool value);
 extern NODE *make_typed_regex(const char *re, size_t len);
 extern void *more_blocks(int id);
-extern int parse_escape(const char **string_ptr, bool *unicode);
+extern int parse_escape(const char **string_ptr, const char **escseq);
 extern NODE *str2wstr(NODE *n, size_t **ptr);
 extern NODE *wstr2str(NODE *n);
 #define force_wstring(n)       str2wstr(n, NULL)
diff --git a/main.c b/main.c
index cc5ae9a7..9b63bad3 100644
--- a/main.c
+++ b/main.c
@@ -776,7 +776,7 @@ cmdline_fs(char *str)
                        str[0] = '\t';
        }
 
-       *tmp = make_str_node(str, strlen(str), SCAN | ELIDE_BACK_NL); /* do process escapes */
+       *tmp = make_str_node(str, strlen(str), SCAN); /* do process escapes */
        set_FS();
 }
 
@@ -1276,7 +1276,7 @@ arg_assign(char *arg, bool initing)
                 * This makes sense, so we do it too.
                 * In addition, remove \-<newline> as in scanning.
                 */
-               it = make_str_node(cp, strlen(cp), SCAN | ELIDE_BACK_NL);
+               it = make_str_node(cp, strlen(cp), SCAN);
                it->flags |= USER_INPUT;
 #ifdef LC_NUMERIC
                /*
diff --git a/node.c b/node.c
index 2a476847..910be3c2 100644
--- a/node.c
+++ b/node.c
@@ -454,32 +454,26 @@ make_str_node(const char *s, size_t len, int flags)
 
                        c = *pf++;
                        if (c == '\\') {
-                               bool unicode;
-                               c = parse_escape(&pf, &unicode);
-                               if (c < 0) {
+                               const char *result;
+                               int nbytes;
+
+                               nbytes = parse_escape(&pf, &result);
+                               if (nbytes > 0) {
+                                       while (nbytes--)
+                                               *ptm++ = *result++;
+                               } else if (nbytes == 0) {
+                                       *ptm++ = '?';
+                               } else if (nbytes == -1) {
+                                       if (do_lint)
+                                               lintwarn(_("backslash at end of string"));
+                                       *ptm++ = '\\';
+                               } else if (nbytes == -2) {
                                        if (do_lint)
                                                lintwarn(_("backslash string continuation is not portable"));
-                                       if ((flags & ELIDE_BACK_NL) != 0)
-                                               continue;
-                                       c = '\\';
+                                       continue;
+                               } else {
+                                       cant_happen("received bad result %d from parse_escape()", nbytes);
                                }
-                               if (unicode) {
-                                       char buf[20];
-                                       size_t n;
-                                       mbstate_t mbs;
-                                       int i;
-
-                                       memset(& mbs, 0, sizeof(mbs));
-
-                                       n = wcrtomb(buf, c, & mbs);
-                                       if (n == (size_t) -1)   // bad value
-                                               *ptm++ = '?';
-                                       else {
-                                               for (i = 0; i < n; i++)
-                                                       *ptm++ = buf[i];
-                                       }
-                               } else
-                                       *ptm++ = c;
                        } else
                                *ptm++ = c;
                }
@@ -544,33 +538,31 @@ r_unref(NODE *tmp)
 /*
  * parse_escape:
  *
- * Parse a C escape sequence.  STRING_PTR points to a variable containing a
- * pointer to the string to parse.  That pointer is updated past the
- * characters we use.  The value of the escape sequence is returned.
- *
- * A negative value means the sequence \ newline was seen, which is supposed to
- * be equivalent to nothing at all.
+ * Parse a C escape sequence.  string_ptr points to a variable containing
+ * a pointer to the string to parse.  result points to a pointer which will
+ * be set to the address of the internal buffer holding the bytes of the
+ * translated escape sequence.
  *
- * If \ is followed by a null character, we return a negative value and leave
- * the string pointer pointing at the null character.
- *
- * If \ is followed by 000, we return 0 and leave the string pointer after the
- * zeros.  A value of 0 does not mean end of string.
+ * Return values:
+ *     1 to MB_CUR_MAX: the length of the translated escape sequence
+ *     0 wcrtomb conversion error
+ *    -1 terminal backslash (to be preserved in cmdline strings)
+ *    -2 line continuation  (backslash-newline pair)
  *
  * POSIX doesn't allow \x or \u.
  */
 
 int
-parse_escape(const char **string_ptr, bool *unicode)
+parse_escape(const char **string_ptr, const char **result)
 {
+       static char buf[MB_LEN_MAX];
+       int retval = 1;
        int c = *(*string_ptr)++;
        int i;
        int count;
        int j;
        const char *start;
 
-       *unicode = false;
-
        if (do_lint_old) {
                switch (c) {
                case 'a':
@@ -584,24 +576,33 @@ parse_escape(const char **string_ptr, bool *unicode)
 
        switch (c) {
        case 'a':
-               return '\a';
+               buf[0] = '\a';
+               break;
        case 'b':
-               return '\b';
+               buf[0] = '\b';
+               break;
        case 'f':
-               return '\f';
+               buf[0] = '\f';
+               break;
        case 'n':
-               return '\n';
+               buf[0] = '\n';
+               break;
        case 'r':
-               return '\r';
+               buf[0] = '\r';
+               break;
        case 't':
-               return '\t';
+               buf[0] = '\t';
+               break;
        case 'v':
-               return '\v';
+               buf[0] = '\v';
+               break;
        case '\n':
-               return -2;
+               retval = -2;
+               break;
        case 0:
                (*string_ptr)--;
-               return -1;
+               retval = -1;
+               break;
        case '0':
        case '1':
        case '2':
@@ -621,7 +622,8 @@ parse_escape(const char **string_ptr, bool *unicode)
                                break;
                        }
                }
-               return i;
+               buf[0] = i;
+               break;
        case 'x':
                if (do_lint) {
                        static bool warned = false;
@@ -631,11 +633,14 @@ parse_escape(const char **string_ptr, bool *unicode)
                                lintwarn(_("POSIX does not allow `\\x' escapes"));
                        }
                }
-               if (do_posix)
-                       return ('x');
+               if (do_posix) {
+                       buf[0] = 'x';
+                       break;
+               }
                if (! isxdigit((unsigned char) (*string_ptr)[0])) {
                        warning(_("no hex digits in `\\x' escape sequence"));
-                       return ('x');
+                       buf[0] = 'x';
+                       break;
                }
                start = *string_ptr;
                for (i = j = 0; j < 2; j++) {
@@ -656,8 +661,13 @@ parse_escape(const char **string_ptr, bool *unicode)
                }
                if (do_lint && j == 2 && isxdigit((unsigned char)*(*string_ptr)))
                        lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), 3, start, 3);
-               return i;
+               buf[0] = i;
+               break;
        case 'u':
+       {
+               size_t n;
+               mbstate_t mbs;
+
                if (do_lint) {
                        static bool warned = false;
 
@@ -666,11 +676,14 @@ parse_escape(const char **string_ptr, bool *unicode)
                                lintwarn(_("POSIX does not allow `\\u' escapes"));
                        }
                }
-               if (do_posix)
-                       return ('u');
+               if (do_posix) {
+                       buf[0] = 'u';
+                       break;
+               }
                if (! isxdigit((unsigned char) (*string_ptr)[0])) {
                        warning(_("no hex digits in `\\u' escape sequence"));
-                       return ('u');
+                       buf[0] = 'u';
+                       break;
                }
                start = *string_ptr;
                for (i = j = 0; j < 8; j++) {
@@ -689,11 +702,21 @@ parse_escape(const char **string_ptr, bool *unicode)
                                break;
                        }
                }
-               *unicode = true;
-               return i;
+               memset(& mbs, 0, sizeof(mbs));
+               n = wcrtomb(buf, i, & mbs);
+               if (n == (size_t) -1) {
+                       warning(_("invalid \\u' escape sequence"));
+                       retval = 0;
+               } else {
+                       /* MB_LEN_MAX is an int, so n fits */
+                       retval = (int) n;
+               }
+               break;
+       }
        case '\\':
        case '"':
-               return c;
+               buf[0] = c;
+               break;
        default:
        {
                static bool warned[256];
@@ -707,8 +730,12 @@ parse_escape(const char **string_ptr, bool *unicode)
                        warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
                }
        }
-               return c;
+               buf[0] = c;
+               break;
        }
+
+       *result = buf;
+       return retval;
 }
 
 /* get_numbase --- return the base to use for the number in 's' */
diff --git a/re.c b/re.c
index 5ea27d09..2b35a52f 100644
--- a/re.c
+++ b/re.c
@@ -46,7 +46,7 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
        static size_t buflen;
        const char *end = s + len;
        char *dest;
-       int c, c2;
+       int c;
        static bool first = true;
        static bool no_dfa = false;
        int i;
@@ -120,8 +120,6 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
                   character.  */
                if ((gawk_mb_cur_max == 1 || ! is_multibyte) &&
                    (*src == '\\')) {
-                       bool unicode;
-
                        c = *++src;
                        switch (c) {
                        case '\0':      /* \\ before \0, either dynamic data or real end of string */
@@ -147,43 +145,48 @@ make_regexp(const char *s, size_t len, bool ignorecase, bool dfa, bool canfatal)
                        case '5':
                        case '6':
                        case '7':
-                               c2 = parse_escape(&src, &unicode);
-                               if (c2 < 0)
-                                       cant_happen("received bad result %d from parse_escape()", c2);
+                       {
+                               const char *result;
+                               int nbytes;
+
+                               nbytes = parse_escape(&src, &result);
+                               if (nbytes < 0)
+                                       cant_happen("received bad result %d from parse_escape()", nbytes);
+                               /*
+                                * Invalid code points produce '?' (0x3F).
+                                * These are quoted so that they're taken
+                                * literally. Unlike \u3F, a metachar.
+                                */
+                               if (nbytes == 0) {
+                                       *dest++ = '\\';
+                                       *dest++ = '?';
+                                       break;
+                               }
+
                                /*
                                 * Unix awk treats octal (and hex?) chars
                                 * literally in re's, so escape regexp
                                 * metacharacters.
                                 */
-                               if (do_traditional
+                               if (nbytes == 1
+                                   && do_traditional
                                    && ! do_posix
                                    && (isdigit(c) || c == 'x')
-                                   && strchr("()|*+?.^$\\[]", c2) != NULL)
+                                   && strchr("()|*+?.^$\\[]", *result) != NULL)
                                        *dest++ = '\\';
-                               if (unicode) {
-                                       char buf[20];
-                                       size_t n;
-                                       mbstate_t mbs;
-                                       int i;
-
-                                       memset(& mbs, 0, sizeof(mbs));
-
-                                       n = wcrtomb(buf, c, & mbs);
-                                       if (n == (size_t) -1)   // bad value
-                                               *dest++ = '?';
-                                       else {
-                                               for (i = 0; i < n; i++)
-                                                       *dest++ = buf[i];
-                                       }
-                               } else
-                                       *dest++ = (char) c2;
+
                                if (do_lint
                                    && ! nul_warned
-                                   && c2 == '\0') {
+                                   && *result == '\0') {
                                        nul_warned = true;
                                        lintwarn(_("behavior of matching a regexp containing NUL characters is not defined by POSIX"));
                                }
+
+                               /* nbytes is now > 0 */
+                               while (nbytes--)
+                                       *dest++ = *result++;
                                break;
+                       }
                        case '8':
                        case '9':       /* a\9b not valid */
                                *dest++ = c;
diff --git a/test/ChangeLog b/test/ChangeLog
index 7f0fa1b9..be048dba 100644
--- a/test/ChangeLog
+++ b/test/ChangeLog
@@ -1,3 +1,9 @@
+2023-08-20         Arnold D. Robbins     <arnold@skeeve.com>
+
+       * Makefile.am (EXTRA_DIST): New test, cmdlinefsbacknl2.
+       * cmdlinefsbacknl2.sh, cmdlinefsbacknl2.ok: New files.
+       Thanks to Miguel Pineiro Jr <mpj@pineiro.cc> for the test.
+
 2023-07-09         Arnold D. Robbins     <arnold@skeeve.com>
 
        * Makefile.am (EXTRA_DIST): New test, regexpbad.
diff --git a/test/Makefile.am b/test/Makefile.am
index cf6f6526..d1e8eeb5 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -202,6 +202,8 @@ EXTRA_DIST = \
        clsflnam.ok \
        cmdlinefsbacknl.sh \
        cmdlinefsbacknl.ok \
+       cmdlinefsbacknl2.sh \
+       cmdlinefsbacknl2.ok \
        colonwarn.awk \
        colonwarn.in \
        colonwarn.ok \
@@ -1484,7 +1486,7 @@ BASIC_TESTS = \
        arysubnm aryunasgn asgext assignnumfield assignnumfield2 awkpath \
        back89 backgsub badassign1 badbuild callparam childin clobber \
        closebad close_status clsflnam compare compare2 concat1 concat2 \
-       cmdlinefsbacknl \
+       cmdlinefsbacknl cmdlinefsbacknl2 \
        concat3 concat4 concat5 convfmt datanonl defref delargv delarpm2 \
        delarprm delfunc dfacheck2 dfamb1 dfastress divzero divzero2 \
        dynlj eofsplit \
diff --git a/test/Makefile.in b/test/Makefile.in
index 827fe246..7669b391 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -466,6 +466,8 @@ EXTRA_DIST = \
        clsflnam.ok \
        cmdlinefsbacknl.sh \
        cmdlinefsbacknl.ok \
+       cmdlinefsbacknl2.sh \
+       cmdlinefsbacknl2.ok \
        colonwarn.awk \
        colonwarn.in \
        colonwarn.ok \
@@ -1748,7 +1750,7 @@ BASIC_TESTS = \
        arysubnm aryunasgn asgext assignnumfield assignnumfield2 awkpath \
        back89 backgsub badassign1 badbuild callparam childin clobber \
        closebad close_status clsflnam compare compare2 concat1 concat2 \
-       cmdlinefsbacknl \
+       cmdlinefsbacknl cmdlinefsbacknl2 \
        concat3 concat4 concat5 convfmt datanonl defref delargv delarpm2 \
        delarprm delfunc dfacheck2 dfamb1 dfastress divzero divzero2 \
        dynlj eofsplit \
@@ -3315,6 +3317,11 @@ cmdlinefsbacknl:
        @-$(LOCALES) AWK="$(AWKPROG)" "$(srcdir)"/$@.sh  > _$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+cmdlinefsbacknl2:
+       @echo $@
+       @-$(LOCALES) AWK="$(AWKPROG)" "$(srcdir)"/$@.sh  > _$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 concat3:
        @echo $@
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/Maketests b/test/Maketests
index fb1f98d7..b86f2afa 100644
--- a/test/Maketests
+++ b/test/Maketests
@@ -215,6 +215,11 @@ cmdlinefsbacknl:
        @-$(LOCALES) AWK="$(AWKPROG)" "$(srcdir)"/$@.sh  > _$@ 2>&1 || echo EXIT CODE: $$? >>_$@
        @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
 
+cmdlinefsbacknl2:
+       @echo $@
+       @-$(LOCALES) AWK="$(AWKPROG)" "$(srcdir)"/$@.sh  > _$@ 2>&1 || echo EXIT CODE: $$? >>_$@
+       @-$(CMP) "$(srcdir)"/$@.ok _$@ && rm -f _$@
+
 concat3:
        @echo $@
        @-AWKPATH="$(srcdir)" $(AWK) -f $@.awk  >_$@ 2>&1 || echo EXIT CODE: $$? >>_$@
diff --git a/test/cmdlinefsbacknl2.ok b/test/cmdlinefsbacknl2.ok
new file mode 100644
index 00000000..7f4c6951
--- /dev/null
+++ b/test/cmdlinefsbacknl2.ok
@@ -0,0 +1,23 @@
+ 5c 0a
+ 5c 0a
+ 5c 0a
+ 5c 0a
+ 5c 0a
+ 5c 0a
+gawk: cmd. line:2: warning: invalid \u' escape sequence
+gawk: cmd. line:3: warning: invalid \u' escape sequence
+ 3f 0a 3f 0a
+gawk: cmd. line:2: warning: invalid \u' escape sequence
+right: literal ? is literal
+gawk: cmd. line:2: warning: invalid \u' escape sequence
+gawk: cmd. line:5: warning: invalid \u' escape sequence
+gawk: cmd. line:8: warning: invalid \u' escape sequence
+1 == 1
+gawk: cmd. line:3: warning: invalid \u' escape sequence
+1 == 1
+0 == 0
+gawk: cmd. line:6: warning: invalid \u' escape sequence
+0 == 0
+0 == 0
+gawk: cmd. line:9: warning: invalid \u' escape sequence
+0 == 0
diff --git a/test/cmdlinefsbacknl2.sh b/test/cmdlinefsbacknl2.sh
new file mode 100755
index 00000000..7d6ece9d
--- /dev/null
+++ b/test/cmdlinefsbacknl2.sh
@@ -0,0 +1,73 @@
+#!/bin/sh
+
+awk=$AWK
+
+# Both of these succeeded:
+# export LC_CTYPE=C
+export LC_CTYPE="en_US.UTF-8"
+
+
+# Test terminal backslash handling. Each test should produce "5c 0a"
+# (a backslash followed by a newline). Presently, master produces "0a".
+#
+$awk -F'\' 'BEGIN { print FS }' | od -An -tx1
+$awk -v s='\' 'BEGIN { print s }' | od -An -tx1
+echo | $awk '{ print s }' s='\' | od -An -tx1
+
+
+# These are the same terminal backslash handling tests we just ran, but
+# with a line-continuation pair immediately preceding the terminal
+# backslash. Let's make sure these two special sequences play well.
+# Their output should be the same as before, "5c 0a". Again, master
+# produces only "0a". The backslash is lost.
+#
+$awk -F'\
+\' 'BEGIN { print FS }' | od -An -tx1
+$awk -v s='\
+\' 'BEGIN { print s }' | od -An -tx1
+echo | $awk '{ print s }' s='\
+\' | od -An -tx1
+
+
+# \uffffffff (-1)
+# \ufffffffe (-2)
+#
+# A properly handled invalid \u sequence yields a literal '?' (3f).
+# This test should print "3f 0a 3f 0a", but master prints "5c 0a 5c 0a".
+#
+# The code point overflows parse_escape's int return value, yielding one
+# of the reserved, negative values. In this case, make_str_node is
+# confused into preserving a terminal backslash that doesn't exist.
+#
+$awk 'BEGIN {
+       print "\uFFFFFFFF"      # "?\n"
+       print "\uFFFFFFFE"      # "?\n"
+}' | od -An -tx1               # 3f 0a 3f 0a
+
+
+# Again, a negative return value causes confusion, this time producing a
+# fatal error in make_regexp.
+#
+$awk 'BEGIN {
+       if ("xy?z" ~ /xy\uFFFFFFFFz/)
+               print "right: literal ? is literal"
+       else
+               print "wrong: literal ? is meta"
+}'
+
+# \uD800 to \uDFFF are invalid Unicode code points. They're reserved
+# for UTF-16 surrogates. Let's use them to test \u conversion error
+# handling. The correct output is a true expression, 0 == 0 or 1 == 1.
+#
+$awk 'BEGIN {
+       print "1 == " ("xy?z" ~ /xy\uD800z/)
+       print "1 == " ("xy?z" ~ "xy\\uD800z")
+
+       print "0 == " ("xyz" ~ /xy\uD800z/)
+       print "0 == " ("xyz" ~ "xy\\uD800z")
+
+       print "0 == " ("xz" ~ /xy\uD800z/)
+       print "0 == " ("xz" ~ "xy\\uD800z")
+}'
+
+exit 0

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                |  27 +++++++++
 awk.h                    |  10 ++-
 main.c                   |   4 +-
 node.c                   | 155 +++++++++++++++++++++++++++++------------------
 pc/ChangeLog             |   4 ++
 pc/Makefile.tst          |   7 ++-
 re.c                     |  64 +++++++++++--------
 test/ChangeLog           |   6 ++
 test/Makefile.am         |   4 +-
 test/Makefile.in         |   9 ++-
 test/Maketests           |   5 ++
 test/cmdlinefsbacknl2.ok |  23 +++++++
 test/cmdlinefsbacknl2.sh |  73 ++++++++++++++++++++++
 13 files changed, 299 insertions(+), 92 deletions(-)
 create mode 100644 test/cmdlinefsbacknl2.ok
 create mode 100755 test/cmdlinefsbacknl2.sh


hooks/post-receive
-- 
gawk
[Prev in Thread]
Current Thread
[Next in Thread]
[SCM] gawk branch, master, updated. gawk-4.1.0-5322-g90479e23, Arnold Robbins <=
Prev by Date: [SCM] gawk branch, master, updated. gawk-4.1.0-5319-g8d73db57
Next by Date: [SCM] gawk branch, master, updated. gawk-4.1.0-5323-g4bf84cda
Previous by thread: [SCM] gawk branch, master, updated. gawk-4.1.0-5319-g8d73db57
Next by thread: [SCM] gawk branch, master, updated. gawk-4.1.0-5323-g4bf84cda
Index(es):
- Date
- Thread