>From 6e8f5b27ab033f4551e61740c1bdd6ffa13e9047 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Wed, 6 Jan 2016 00:26:26 -0800 Subject: [PATCH] grep: restore -P PCRE_NO_UTF8_CHECK optimization On my platform in the en_US.utf8 locale, this makes 'grep -P "z.*a" k' 220x faster, where k is created by the shell command: yes 'abcdefg hijklmn opqrstu vwxyz' | head -n 10000000 >k * src/dfasearch.c (EGexecute): * src/grep.c (execute_fp_t): * src/kwsearch.c (Fexecute): * src/pcresearch.c (Pexecute): First arg is now char *, not char const *, since Pexecute now temporarily modifies this argument. * src/grep.c, src/grep.h (buf_has_encoding_errors): Now extern. * src/pcresearch.c (Pexecute): Use it. If the input is free of encoding errors, use a multiline search and the PCRE_NO_UTF8_CHECK option, as this is typically way faster. This restores an optimization that was removed with the recent changes for binary file detection. --- src/dfasearch.c | 2 +- src/grep.c | 4 ++-- src/grep.h | 2 ++ src/kwsearch.c | 2 +- src/pcresearch.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++------- 5 files changed, 56 insertions(+), 11 deletions(-) diff --git a/src/dfasearch.c b/src/dfasearch.c index 0205011..a330eac 100644 --- a/src/dfasearch.c +++ b/src/dfasearch.c @@ -202,7 +202,7 @@ GEAcompile (char const *pattern, size_t size, reg_syntax_t syntax_bits) } size_t -EGexecute (char const *buf, size_t size, size_t *match_size, +EGexecute (char *buf, size_t size, size_t *match_size, char const *start_ptr) { char const *buflim, *beg, *end, *ptr, *match, *best_match, *mb_start; diff --git a/src/grep.c b/src/grep.c index f6fb0bc..10aabf9 100644 --- a/src/grep.c +++ b/src/grep.c @@ -462,7 +462,7 @@ enum { SEEK_HOLE = SEEK_SET }; /* Functions we'll use to search. */ typedef void (*compile_fp_t) (char const *, size_t); -typedef size_t (*execute_fp_t) (char const *, size_t, size_t *, char const *); +typedef size_t (*execute_fp_t) (char *, size_t, size_t *, char const *); static compile_fp_t compile; static execute_fp_t execute; @@ -561,7 +561,7 @@ skip_easy_bytes (char const *buf) /* Return true if BUF, of size SIZE, has an encoding error. BUF must be followed by at least sizeof (uword) bytes, the first of which may be modified. */ -static bool +bool buf_has_encoding_errors (char *buf, size_t size) { if (! unibyte_mask) diff --git a/src/grep.h b/src/grep.h index 577fb72..75b7ef7 100644 --- a/src/grep.h +++ b/src/grep.h @@ -29,4 +29,6 @@ extern bool match_words; /* -w */ extern bool match_lines; /* -x */ extern char eolbyte; /* -z */ +extern bool buf_has_encoding_errors (char *, size_t); + #endif diff --git a/src/kwsearch.c b/src/kwsearch.c index e33caaf..e9966d4 100644 --- a/src/kwsearch.c +++ b/src/kwsearch.c @@ -78,7 +78,7 @@ Fcompile (char const *pattern, size_t size) } size_t -Fexecute (char const *buf, size_t size, size_t *match_size, +Fexecute (char *buf, size_t size, size_t *match_size, char const *start_ptr) { char const *beg, *try, *end, *mb_start; diff --git a/src/pcresearch.c b/src/pcresearch.c index a647514..8f3d935 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -174,7 +174,7 @@ Pcompile (char const *pattern, size_t size) } size_t -Pexecute (char const *buf, size_t size, size_t *match_size, +Pexecute (char *buf, size_t size, size_t *match_size, char const *start_ptr) { #if !HAVE_LIBPCRE @@ -194,13 +194,31 @@ Pexecute (char const *buf, size_t size, size_t *match_size, error. */ char const *subject = buf; + /* If the input is free of encoding errors a multiline search is + typically more efficient. Otherwise, a single-line search is + typically faster, so that pcre_exec doesn't waste time validating + the entire input buffer. */ + bool multiline = ! buf_has_encoding_errors (buf, size - 1); + buf[size - 1] = eolbyte; + for (; p < buf + size; p = line_start = line_end + 1) { - /* A single-line search is typically faster, so that - pcre_exec doesn't waste time validating the entire input - buffer. */ - line_end = memchr (p, eolbyte, buf + size - p); - if (INT_MAX < line_end - p) + bool too_big; + + if (multiline) + { + size_t pcre_size_max = MIN (INT_MAX, SIZE_MAX - 1); + size_t scan_size = MIN (pcre_size_max + 1, buf + size - p); + line_end = memrchr (p, eolbyte, scan_size); + too_big = ! line_end; + } + else + { + line_end = memchr (p, eolbyte, buf + size - p); + too_big = INT_MAX < line_end - p; + } + + if (too_big) error (EXIT_TROUBLE, 0, _("exceeded PCRE's line length limit")); for (;;) @@ -228,11 +246,27 @@ Pexecute (char const *buf, size_t size, size_t *match_size, int options = 0; if (!bol) options |= PCRE_NOTBOL; + if (multiline) + options |= PCRE_NO_UTF8_CHECK; e = jit_exec (subject, line_end - subject, search_offset, options, sub); if (e != PCRE_ERROR_BADUTF8) - break; + { + if (0 < e && multiline && sub[1] - sub[0] != 0) + { + char const *nl = memchr (subject + sub[0], eolbyte, + sub[1] - sub[0]); + if (nl) + { + /* This match crosses a line boundary; reject it. */ + p = subject + sub[0]; + line_end = nl; + continue; + } + } + break; + } int valid_bytes = sub[0]; /* Try to match the string before the encoding error. */ @@ -304,6 +338,15 @@ Pexecute (char const *buf, size_t size, size_t *match_size, beg = matchbeg; end = matchend; } + else if (multiline) + { + char const *prev_nl = memrchr (line_start - 1, eolbyte, + matchbeg - (line_start - 1)); + char const *next_nl = memchr (matchend, eolbyte, + line_end + 1 - matchend); + beg = prev_nl + 1; + end = next_nl + 1; + } else { beg = line_start; -- 2.5.0