[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.22-17-gf6603c4
From: Paul Eggert
Subject: grep branch, master, updated. v2.22-17-gf6603c4
Date: Wed, 06 Jan 2016 08:26:54 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".
The branch, master, has been updated
via f6603c4e1e04dbb87a7232c4b44acc6afdf65fef (commit)
from 71c206b5042a11c976c25a9f77aff04ebb29fcd9 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=f6603c4e1e04dbb87a7232c4b44acc6afdf65fef
commit f6603c4e1e04dbb87a7232c4b44acc6afdf65fef
Author: Paul Eggert <address@hidden>
Date: Wed Jan 6 00:26:26 2016 -0800
grep: restore -P PCRE_NO_UTF8_CHECK optimization
On my platform in the en_US.utf8 locale, this makes 'grep -P "z.*a" k'
220x faster, where k is created by the shell command:
yes 'abcdefg hijklmn opqrstu vwxyz' | head -n 10000000 >k
* src/dfasearch.c (EGexecute):
* src/grep.c (execute_fp_t):
* src/kwsearch.c (Fexecute):
* src/pcresearch.c (Pexecute):
First arg is now char *, not char const *, since Pexecute now
temporarily modifies this argument.
* src/grep.c, src/grep.h (buf_has_encoding_errors): Now extern.
* src/pcresearch.c (Pexecute): Use it. If the input is free of
encoding errors, use a multiline search and the PCRE_NO_UTF8_CHECK
option, as this is typically way faster. This restores an
optimization that was removed with the recent changes for binary
file detection.
diff --git a/src/dfasearch.c b/src/dfasearch.c
index 0205011..a330eac 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -202,7 +202,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits)
}
size_t
-EGexecute (char const *buf, size_t size, size_t *match_size,
+EGexecute (char *buf, size_t size, size_t *match_size,
char const *start_ptr)
{
char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start;
diff --git a/src/grep.c b/src/grep.c
index f6fb0bc..10aabf9 100644
--- a/src/grep.c
+++ b/src/grep.c
@@ -462,7 +462,7 @@ enum { SEEK_HOLE = SEEK_SET };
/* Functions we'll use to search. */
typedef void (*compile_fp_t) (char const *, size_t);
-typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *);
+typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *);
static compile_fp_t compile;
static execute_fp_t execute;
@@ -561,7 +561,7 @@ skip_easy_bytes (char const *buf)
/* Return true if BUF, of size SIZE, has an encoding error.
BUF must be followed by at least sizeof (uword) bytes,
the first of which may be modified. */
-static bool
+bool
buf_has_encoding_errors (char *buf, size_t size)
{
if (! unibyte_mask)
diff --git a/src/grep.h b/src/grep.h
index 577fb72..75b7ef7 100644
--- a/src/grep.h
+++ b/src/grep.h
@@ -29,4 +29,6 @@ extern bool match_words; /* -w */
extern bool match_lines; /* -x */
extern char eolbyte; /* -z */
+extern bool buf_has_encoding_errors (char *, size_t);
+
#endif
diff --git a/src/kwsearch.c b/src/kwsearch.c
index e33caaf..e9966d4 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size)
}
size_t
-Fexecute (char const *buf, size_t size, size_t *match_size,
+Fexecute (char *buf, size_t size, size_t *match_size,
char const *start_ptr)
{
char const *beg, *try, *end, *mb_start;
diff --git a/src/pcresearch.c b/src/pcresearch.c
index a647514..8f3d935 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -174,7 +174,7 @@ Pcompile (char const *pattern, size_t size)
}
size_t
-Pexecute (char const *buf, size_t size, size_t *match_size,
+Pexecute (char *buf, size_t size, size_t *match_size,
char const *start_ptr)
{
#if !HAVE_LIBPCRE
@@ -194,13 +194,31 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
error. */
char const *subject = buf;
+ /* If the input is free of encoding errors a multiline search is
+ typically more efficient. Otherwise, a single-line search is
+ typically faster, so that pcre_exec doesn't waste time validating
+ the entire input buffer. */
+ bool multiline = ! buf_has_encoding_errors (buf, size - 1);
+ buf[size - 1] = eolbyte;
+
for (; p < buf + size; p = line_start = line_end + 1)
{
- /* A single-line search is typically faster, so that
- pcre_exec doesn't waste time validating the entire input
- buffer. */
- line_end = memchr (p, eolbyte, buf + size - p);
- if (INT_MAX < line_end - p)
+ bool too_big;
+
+ if (multiline)
+ {
+ size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1);
+ size_t scan_size = MIN (pcre_size_max + 1, buf + size - p);
+ line_end = memrchr (p, eolbyte, scan_size);
+ too_big = ! line_end;
+ }
+ else
+ {
+ line_end = memchr (p, eolbyte, buf + size - p);
+ too_big = INT_MAX < line_end - p;
+ }
+
+ if (too_big)
error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit"));
for (;;)
@@ -228,11 +246,27 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
int options = 0;
if (!bol)
options |= PCRE_NOTBOL;
+ if (multiline)
+ options |= PCRE_NO_UTF8_CHECK;
e = jit_exec (subject, line_end - subject, search_offset,
options, sub);
if (e != PCRE_ERROR_BADUTF8)
- break;
+ {
+ if (0 < e && multiline && sub[1] - sub[0] != 0)
+ {
+ char const *nl = memchr (subject + sub[0], eolbyte,
+ sub[1] - sub[0]);
+ if (nl)
+ {
+ /* This match crosses a line boundary; reject it. */
+ p = subject + sub[0];
+ line_end = nl;
+ continue;
+ }
+ }
+ break;
+ }
int valid_bytes = sub[0];
/* Try to match the string before the encoding error. */
@@ -304,6 +338,15 @@ Pexecute (char const *buf, size_t size, size_t *match_size,
beg = matchbeg;
end = matchend;
}
+ else if (multiline)
+ {
+ char const *prev_nl = memrchr (line_start - 1, eolbyte,
+ matchbeg - (line_start - 1));
+ char const *next_nl = memchr (matchend, eolbyte,
+ line_end + 1 - matchend);
+ beg = prev_nl + 1;
+ end = next_nl + 1;
+ }
else
{
beg = line_start;
-----------------------------------------------------------------------
Summary of changes:
src/dfasearch.c | 2 +-
src/grep.c | 4 +-
src/grep.h | 2 +
src/kwsearch.c | 2 +-
src/pcresearch.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++------
5 files changed, 56 insertions(+), 11 deletions(-)
hooks/post-receive
--
grep
[Prev in Thread] Current Thread [Next in Thread]
- grep branch, master, updated. v2.22-17-gf6603c4, Paul Eggert <=