From 069f2986cc2f954709ae18de78f6f512bcf4854c Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 2 Apr 2023 10:48:52 -0700 Subject: [PATCH] grep: go back to 3.9 -P '\d' behavior Treating \d differently from Perl was more trouble than it was worth. * NEWS, doc/grep.texi (grep Programs): Document this. * src/pcresearch.c (PCRE2_EXTRA_ASCII_BSD): Remove. All uses removed. * tests/pcre-ascii-digits: Adjust to this change. * tests/pcre-utf8-w: Revert to 3.9. --- NEWS | 11 ++++++----- doc/grep.texi | 17 ++++++----------- src/pcresearch.c | 23 ++++------------------- tests/pcre-ascii-digits | 20 ++++++++++---------- tests/pcre-utf8-w | 2 -- 5 files changed, 26 insertions(+), 47 deletions(-) diff --git a/NEWS b/NEWS index 6ebade3..82c213b 100644 --- a/NEWS +++ b/NEWS @@ -4,11 +4,12 @@ GNU grep NEWS -*- outline -*- ** Bug fixes - With -P, patterns like [\d] now work again. The fix relies on PCRE2 - support for the PCRE2_EXTRA_ASCII_BSD flag planned for PCRE2 10.43. - With PCRE2 version 10.42 or earlier, behavior reverts to that of - grep 3.8, in that patterns like \w and \b use ASCII rather than - Unicode interpretations. + With -P, \d now matches all decimal digits, not just ASCII digits. + That is, \d is equivalent to [[:digit:]], not to [0-9]. + This is more compatible with plain Perl, and reverts to the + behavior of grep 3.9. If you prefer \d to mean [0-9] and + have a PCRE2 version later than 10.42 installed, you can + prefix your regular expression with (?aD). * Noteworthy changes in release 3.10 (2023-03-22) [stable] diff --git a/doc/grep.texi b/doc/grep.texi index e05ae25..705569c 100644 --- a/doc/grep.texi +++ b/doc/grep.texi @@ -1149,23 +1149,18 @@ combined with the @option{-z} (@option{--null-data}) option, and note that For documentation, refer to @url{https://www.pcre.org/}, with these caveats: @itemize -@item -@samp{\d} matches only the ten ASCII digits -(and @samp{\D} matches the complement), regardless of locale. -Use @samp{\p@{Nd@}} to also match non-ASCII digits. - -When @command{grep} is built with PCRE2 10.42 and earlier, -@samp{\d} and @samp{\D} ignore in-regexp directives like @samp{(?aD)} -and work like @samp{[0-9]} and @samp{[^0-9]} respectively. -However, later versions of PCRE2 likely will fix this, -and the plan is for @command{grep} to respect those directives if possible. - @item Although PCRE tracks the syntax and semantics of Perl's regular expressions, the match is not always exact, partly because Perl evolves and a Perl installation may predate or postdate the PCRE2 installation on the same host. +For example, @samp{\d} ordinarily is like @samp{\p@{Nd@}} and matches +all decimal digits, whereas @samp{[0-9]} matches only ASCII digits. +PCRE2 versions after 10.42 let you change this behavior: in a regular +expression that starts with @samp{(?aD)}, @samp{\d} is like @samp{[0-9]}. +However, PCRE2 10.42 and earlier do not support @samp{(?aD)}. + @item By default, @command{grep} applies each regexp to a line at a time, so the @samp{(?s)} directive (making @samp{.} match line breaks) diff --git a/src/pcresearch.c b/src/pcresearch.c index e77509c..222386f 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -35,9 +35,6 @@ # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT # define pcre2_set_depth_limit pcre2_set_recursion_limit #endif -#ifndef PCRE2_EXTRA_ASCII_BSD -# define PCRE2_EXTRA_ASCII_BSD 0 -#endif struct pcre_comp { @@ -157,15 +154,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) if (! localeinfo.using_utf8) die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales")); - flags |= PCRE2_UTF; - - /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP - so that \d does not have the undesirable effect of matching - non-ASCII digits. Otherwise (i.e., with PCRE2 10.42 and earlier), - escapes like \w have only their ASCII interpretations, - but that's better than the confusion that would ensue if \d - matched non-ASCII digits. */ - flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0; + flags |= PCRE2_UTF | PCRE2_UCP; #if 0 /* Do not match individual code units but only UTF-8. */ @@ -181,16 +170,12 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) if (rawmemchr (pattern, '\n') != patlim) die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern")); -#ifdef PCRE2_EXTRA_MATCH_LINE - uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD - | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0)); - pcre2_set_compile_extra_options (ccontext, extra_options); -#endif - void *re_storage = NULL; if (match_lines) { -#ifndef PCRE2_EXTRA_MATCH_LINE +#ifdef PCRE2_EXTRA_MATCH_LINE + pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE); +#else static char const /* These sizes omit trailing NUL. */ xprefix[4] = "^(?:", xsuffix[2] = ")$"; idx_t re_size = size + sizeof xprefix + sizeof xsuffix; diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits index 9dfc0fa..cd92c3c 100755 --- a/tests/pcre-ascii-digits +++ b/tests/pcre-ascii-digits @@ -18,7 +18,7 @@ require_pcre_ echo . | grep -qP '(*UTF).' 2>/dev/null \ || skip_ 'PCRE unicode support is compiled out' echo 0 | grep -qP '(?aD)\d' \ - || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD' + || skip_ 'PCRE 10.42 and older lack (?aD)' fail=0 @@ -29,27 +29,27 @@ printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_ printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_ printf '\n' >> in || framework_failure_ -# Ensure that \d matches no character. -returns_ 1 grep -P '\d' in > out || fail=1 +# Ensure that (?aD)\d matches no character. +returns_ 1 grep -P '(?aD)\d' in > out || fail=1 compare /dev/null out || fail=1 -# Ensure that ^\D+$ matches the entire line. -grep -P '^\D+$' in > out || fail=1 +# Ensure that (?aD)^\D+$ matches the entire line. +grep -P '(?aD)^\D+$' in > out || fail=1 compare in out || fail=1 # When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD) -# to toggle between modes. (?aD) is the default (making \d == [0-9]). -# (?-aD) relaxes \d, making it match "all" digits. +# to toggle between modes. (?aD) makes \d == [0-9]. +# (?-aD), the default, makes \d match all digits. # Use mixed digits as input: Arabic 0 and ASCII 4: ٠4 printf '\331\2404\n' > in2 || framework_failure_ -returns_ 1 grep -P '\d\d' in2 > out || fail=1 +returns_ 1 grep -P '(?aD)\d\d' in2 > out || fail=1 compare /dev/null out || fail=1 -grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1 +grep -P '\d(?aD)\d' in2 > out || fail=1 compare in2 out || fail=1 -returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1 +returns_ 1 grep -P '(?aD)\d(?-aD)\d' in2 > out || fail=1 compare /dev/null out || fail=1 Exit $fail diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w index aa34784..a88ace4 100755 --- a/tests/pcre-utf8-w +++ b/tests/pcre-utf8-w @@ -16,8 +16,6 @@ require_pcre_ echo . | grep -qP '(*UTF).' 2>/dev/null \ || skip_ 'PCRE unicode support is compiled out' -echo 0 | grep -qP '(?aD)\d' \ - || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD' fail=0 -- 2.37.2