[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.20-19-g16fc7fa
From: Paul Eggert
Subject: grep branch, master, updated. v2.20-19-g16fc7fa
Date: Tue, 09 Sep 2014 19:44:00 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".
The branch, master has been updated
via 16fc7fa0e0f273fa81e382b4303617d707364902 (commit)
from 2199f20d9d89268d4382d739a598fe86ba560320 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=16fc7fa0e0f273fa81e382b4303617d707364902
commit 16fc7fa0e0f273fa81e382b4303617d707364902
Author: Paul Eggert <address@hidden>
Date: Tue Sep 9 12:41:54 2014 -0700
grep: -P now treats invalid UTF-8 input as non-matching
Problem reported by Santiago Vila in: http://bugs.gnu.org/18266
* NEWS: Mention this.
* src/pcresearch.c (Pexecute): Treat UTF-8 encoding errors
as non-matching data, instead of exiting 'grep'.
* tests/pcre-infloop: grep now exits with status 1, not 2.
* tests/pcre-invalid-utf8-input: grep now exits with status 0, not 2.
diff --git a/NEWS b/NEWS
index 550bf4c..ca79525 100644
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,9 @@ GNU grep NEWS -*- outline -*-
Performance has improved for very long strings in patterns.
+ grep -P no longer reports an error and exits when given invalid UTF-8 data.
+ Instead, it considers the data to be non-matching.
+
** Bug fixes
grep -E rejected unmatched ')', instead of treating it like '\)'.
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 820dd00..2a01e6d 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -136,34 +136,41 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
#else
/* This array must have at least two elements; everything after that
is just for performance improvement in pcre_exec. */
- int sub[300];
+ enum { nsub = 300 };
+ int sub[nsub];
- const char *line_buf, *line_end, *line_next;
+ char const *p = start_ptr ? start_ptr : buf;
+ int options = p == buf || p[-1] == eolbyte ? 0 : PCRE_NOTBOL;
+ char const *line_start = buf;
int e = PCRE_ERROR_NOMATCH;
- ptrdiff_t start_ofs = start_ptr ? start_ptr - buf : 0;
+ char const *line_end;
/* PCRE can't limit the matching to single lines, therefore we have to
match each line in the buffer separately. */
- for (line_next = buf;
- e == PCRE_ERROR_NOMATCH && line_next < buf + size;
- start_ofs -= line_next - line_buf)
+ for (; p < buf + size; p = line_start = line_end + 1)
{
- line_buf = line_next;
- line_end = memchr (line_buf, eolbyte, (buf + size) - line_buf);
- if (line_end == NULL)
- line_next = line_end = buf + size;
- else
- line_next = line_end + 1;
-
- if (start_ptr && start_ptr >= line_end)
- continue;
+ line_end = memchr (p, eolbyte, buf + size - p);
- if (INT_MAX < line_end - line_buf)
+ if (INT_MAX < line_end - p)
error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
- e = pcre_exec (cre, extra, line_buf, line_end - line_buf,
- start_ofs < 0 ? 0 : start_ofs, 0,
- sub, sizeof sub / sizeof *sub);
+ /* Treat encoding-error bytes as data that cannot match. */
+ for (;;)
+ {
+ e = pcre_exec (cre, extra, p, line_end - p, 0, options, sub, nsub);
+ if (e != PCRE_ERROR_BADUTF8)
+ break;
+ e = pcre_exec (cre, extra, p, sub[0], 0,
+ options | PCRE_NO_UTF8_CHECK, sub, nsub);
+ if (e != PCRE_ERROR_NOMATCH)
+ break;
+ p += sub[0] + 1;
+ options = PCRE_NOTBOL;
+ }
+
+ if (e != PCRE_ERROR_NOMATCH)
+ break;
+ options = 0;
}
if (e <= 0)
@@ -180,10 +187,6 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
error (EXIT_TROUBLE, 0,
_("exceeded PCRE's backtracking limit"));
- case PCRE_ERROR_BADUTF8:
- error (EXIT_TROUBLE, 0,
- _("invalid UTF-8 byte sequence in input"));
-
default:
/* For now, we lump all remaining PCRE failures into this basket.
If anyone cares to provide sample grep usage that can trigger
@@ -197,25 +200,8 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
}
else
{
- /* Narrow down to the line we've found. */
- char const *beg = line_buf + sub[0];
- char const *end = line_buf + sub[1];
- char const *buflim = buf + size;
- char eol = eolbyte;
- if (!start_ptr)
- {
- /* FIXME: The case when '\n' is not found indicates a bug:
- Since grep is line oriented, the match should never contain
- a newline, so there _must_ be a newline following.
- */
- if (!(end = memchr (end, eol, buflim - end)))
- end = buflim;
- else
- end++;
- while (buf < beg && beg[-1] != eol)
- --beg;
- }
-
+ char const *beg = start_ptr ? p + sub[0] : line_start;
+ char const *end = start_ptr ? p + sub[1] : line_end + 1;
*match_size = end - beg;
return beg - buf;
}
diff --git a/tests/pcre-infloop b/tests/pcre-infloop
index 1b33e72..b92f8e1 100755
--- a/tests/pcre-infloop
+++ b/tests/pcre-infloop
@@ -28,6 +28,6 @@ printf 'a\201b\r' > in || framework_failure_
fail=0
LC_ALL=en_US.UTF-8 timeout 3 grep -P 'a.?..b' in
-test $? = 2 || fail_ "libpcre's match function appears to infloop"
+test $? = 1 || fail_ "libpcre's match function appears to infloop"
Exit $fail
diff --git a/tests/pcre-invalid-utf8-input b/tests/pcre-invalid-utf8-input
index 913e8ee..f42e0dd 100755
--- a/tests/pcre-invalid-utf8-input
+++ b/tests/pcre-invalid-utf8-input
@@ -16,6 +16,6 @@ fail=0
printf 'j\202\nj\n' > in || framework_failure_
LC_ALL=en_US.UTF-8 grep -P j in
-test $? -eq 2 || fail=1
+test $? -eq 0 || fail=1
Exit $fail
-----------------------------------------------------------------------
Summary of changes:
NEWS | 3 ++
src/pcresearch.c | 70 ++++++++++++++++------------------------
tests/pcre-infloop | 2 +-
tests/pcre-invalid-utf8-input | 2 +-
4 files changed, 33 insertions(+), 44 deletions(-)
hooks/post-receive
--
grep
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- grep branch, master, updated. v2.20-19-g16fc7fa, Paul Eggert <=