grep branch, master, updated. v2.20-19-g16fc7fa

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.20-19-g16fc7fa

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.20-19-g16fc7fa
Date:	Tue, 09 Sep 2014 19:44:00 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  16fc7fa0e0f273fa81e382b4303617d707364902 (commit)
      from  2199f20d9d89268d4382d739a598fe86ba560320 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=16fc7fa0e0f273fa81e382b4303617d707364902


commit 16fc7fa0e0f273fa81e382b4303617d707364902
Author: Paul Eggert <address@hidden>
Date:   Tue Sep 9 12:41:54 2014 -0700

    grep: -P now treats invalid UTF-8 input as non-matching
    
    Problem reported by Santiago Vila in: http://bugs.gnu.org/18266
    * NEWS: Mention this.
    * src/pcresearch.c (Pexecute): Treat UTF-8 encoding errors
    as non-matching data, instead of exiting 'grep'.
    * tests/pcre-infloop: grep now exits with status 1, not 2.
    * tests/pcre-invalid-utf8-input: grep now exits with status 0, not 2.

diff --git a/NEWS b/NEWS
index 550bf4c..ca79525 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,9 @@ GNU grep NEWS                                    -*- outline -*-
 
   Performance has improved for very long strings in patterns.
 
+  grep -P no longer reports an error and exits when given invalid UTF-8 data.
+  Instead, it considers the data to be non-matching.
+
 ** Bug fixes
 
   grep -E rejected unmatched ')', instead of treating it like '\)'.
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 820dd00..2a01e6d 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -136,34 +136,41 @@ Pexecute (char const *buf, size_t size, size_t 
*match_size,
 #else
   /* This array must have at least two elements; everything after that
      is just for performance improvement in pcre_exec.  */
-  int sub[300];
+  enum { nsub = 300 };
+  int sub[nsub];
 
-  const char *line_buf, *line_end, *line_next;
+  char const *p = start_ptr ? start_ptr : buf;
+  int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
+  char const *line_start = buf;
   int e = PCRE_ERROR_NOMATCH;
-  ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+  char const *line_end;
 
   /* PCRE can't limit the matching to single lines, therefore we have to
      match each line in the buffer separately.  */
-  for (line_next = buf;
-       e == PCRE_ERROR_NOMATCH && line_next < buf + size;
-       start_ofs -= line_next - line_buf)
+  for (; p < buf + size; p = line_start = line_end + 1)
     {
-      line_buf = line_next;
-      line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
-      if (line_end == NULL)
-        line_next = line_end = buf + size;
-      else
-        line_next = line_end + 1;
-
-      if (start_ptr && start_ptr >= line_end)
-        continue;
+      line_end = memchr (p, eolbyte, buf + size - p);
 
-      if (INT_MAX < line_end - line_buf)
+      if (INT_MAX < line_end - p)
         error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
 
-      e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
-                     start_ofs < 0 ? 0 : start_ofs, 0,
-                     sub, sizeof sub / sizeof *sub);
+      /* Treat encoding-error bytes as data that cannot match.  */
+      for (;;)
+        {
+          e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
+          if (e != PCRE_ERROR_BADUTF8)
+            break;
+          e = pcre_exec (cre, extra, p, sub[0], 0,
+                         options | PCRE_NO_UTF8_CHECK, sub, nsub);
+          if (e != PCRE_ERROR_NOMATCH)
+            break;
+          p += sub[0] + 1;
+          options = PCRE_NOTBOL;
+        }
+
+      if (e != PCRE_ERROR_NOMATCH)
+        break;
+      options = 0;
     }
 
   if (e <= 0)
@@ -180,10 +187,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
           error (EXIT_TROUBLE, 0,
                  _("exceeded PCRE's backtracking limit"));
 
-        case PCRE_ERROR_BADUTF8:
-          error (EXIT_TROUBLE, 0,
-                 _("invalid UTF-8 byte sequence in input"));
-
         default:
           /* For now, we lump all remaining PCRE failures into this basket.
              If anyone cares to provide sample grep usage that can trigger
@@ -197,25 +200,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
     }
   else
     {
-      /* Narrow down to the line we've found.  */
-      char const *beg = line_buf + sub[0];
-      char const *end = line_buf + sub[1];
-      char const *buflim = buf + size;
-      char eol = eolbyte;
-      if (!start_ptr)
-        {
-          /* FIXME: The case when '\n' is not found indicates a bug:
-             Since grep is line oriented, the match should never contain
-             a newline, so there _must_ be a newline following.
-           */
-          if (!(end = memchr (end, eol, buflim - end)))
-            end = buflim;
-          else
-            end++;
-          while (buf < beg && beg[-1] != eol)
-            --beg;
-        }
-
+      char const *beg = start_ptr ? p + sub[0] : line_start;
+      char const *end = start_ptr ? p + sub[1] : line_end + 1;
       *match_size = end - beg;
       return beg - buf;
     }
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
index 1b33e72..b92f8e1 100755
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
 fail=0
 
 LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
 
 Exit $fail
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index 913e8ee..f42e0dd 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -16,6 +16,6 @@ fail=0
 printf 'j\202\nj\n' > in || framework_failure_
 
 LC_ALL=en_US.UTF-8 grep -P j in
-test $? -eq 2 || fail=1
+test $? -eq 0 || fail=1
 
 Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS                          |    3 ++
 src/pcresearch.c              |   70 ++++++++++++++++------------------------
 tests/pcre-infloop            |    2 +-
 tests/pcre-invalid-utf8-input |    2 +-
 4 files changed, 33 insertions(+), 44 deletions(-)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.20-19-g16fc7fa, Paul Eggert <=
Next by Date: grep branch, master, updated. v2.20-20-g5610ec0
Next by thread: grep branch, master, updated. v2.20-20-g5610ec0
Index(es):
- Date
- Thread