From 069f2986cc2f954709ae18de78f6f512bcf4854c Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sun, 2 Apr 2023 10:48:52 -0700
Subject: [PATCH] grep: go back to 3.9 -P '\d' behavior

Treating \d differently from Perl was more trouble than it was worth.
* NEWS, doc/grep.texi (grep Programs): Document this.
* src/pcresearch.c (PCRE2_EXTRA_ASCII_BSD):
Remove.  All uses removed.
* tests/pcre-ascii-digits: Adjust to this change.
* tests/pcre-utf8-w: Revert to 3.9.
---
 NEWS                    | 11 ++++++-----
 doc/grep.texi           | 17 ++++++-----------
 src/pcresearch.c        | 23 ++++-------------------
 tests/pcre-ascii-digits | 20 ++++++++++----------
 tests/pcre-utf8-w       |  2 --
 5 files changed, 26 insertions(+), 47 deletions(-)

diff --git a/NEWS b/NEWS
index 6ebade3..82c213b 100644
--- a/NEWS
+++ b/NEWS
@@ -4,11 +4,12 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
-  With -P, patterns like [\d] now work again.  The fix relies on PCRE2
-  support for the PCRE2_EXTRA_ASCII_BSD flag planned for PCRE2 10.43.
-  With PCRE2 version 10.42 or earlier, behavior reverts to that of
-  grep 3.8, in that patterns like \w and \b use ASCII rather than
-  Unicode interpretations.
+  With -P, \d now matches all decimal digits, not just ASCII digits.
+  That is, \d is equivalent to [[:digit:]], not to [0-9].
+  This is more compatible with plain Perl, and reverts to the
+  behavior of grep 3.9.  If you prefer \d to mean [0-9] and
+  have a PCRE2 version later than 10.42 installed, you can
+  prefix your regular expression with (?aD).
 
 
 * Noteworthy changes in release 3.10 (2023-03-22) [stable]
diff --git a/doc/grep.texi b/doc/grep.texi
index e05ae25..705569c 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1149,23 +1149,18 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
 
 For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
 @itemize
-@item
-@samp{\d} matches only the ten ASCII digits
-(and @samp{\D} matches the complement), regardless of locale.
-Use @samp{\p@{Nd@}} to also match non-ASCII digits.
-
-When @command{grep} is built with PCRE2 10.42 and earlier,
-@samp{\d} and @samp{\D} ignore in-regexp directives like @samp{(?aD)}
-and work like @samp{[0-9]} and @samp{[^0-9]} respectively.
-However, later versions of PCRE2 likely will fix this,
-and the plan is for @command{grep} to respect those directives if possible.
-
 @item
 Although PCRE tracks the syntax and semantics of Perl's regular
 expressions, the match is not always exact, partly because Perl
 evolves and a Perl installation may predate or postdate the PCRE2
 installation on the same host.
 
+For example, @samp{\d} ordinarily is like @samp{\p@{Nd@}} and matches
+all decimal digits, whereas @samp{[0-9]} matches only ASCII digits.
+PCRE2 versions after 10.42 let you change this behavior: in a regular
+expression that starts with @samp{(?aD)}, @samp{\d} is like @samp{[0-9]}.
+However, PCRE2 10.42 and earlier do not support @samp{(?aD)}.
+
 @item
 By default, @command{grep} applies each regexp to a line at a time,
 so the @samp{(?s)} directive (making @samp{.} match line breaks)
diff --git a/src/pcresearch.c b/src/pcresearch.c
index e77509c..222386f 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -35,9 +35,6 @@
 # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
 # define pcre2_set_depth_limit pcre2_set_recursion_limit
 #endif
-#ifndef PCRE2_EXTRA_ASCII_BSD
-# define PCRE2_EXTRA_ASCII_BSD 0
-#endif
 
 struct pcre_comp
 {
@@ -157,15 +154,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
 
-      flags |= PCRE2_UTF;
-
-      /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
-         so that \d does not have the undesirable effect of matching
-         non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42 and earlier),
-         escapes like \w have only their ASCII interpretations,
-         but that's better than the confusion that would ensue if \d
-         matched non-ASCII digits.  */
-      flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
+      flags |= PCRE2_UTF | PCRE2_UCP;
 
 #if 0
       /* Do not match individual code units but only UTF-8.  */
@@ -181,16 +170,12 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
   if (rawmemchr (pattern, '\n') != patlim)
     die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
 
-#ifdef PCRE2_EXTRA_MATCH_LINE
-  uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD
-                            | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0));
-  pcre2_set_compile_extra_options (ccontext, extra_options);
-#endif
-
   void *re_storage = NULL;
   if (match_lines)
     {
-#ifndef PCRE2_EXTRA_MATCH_LINE
+#ifdef PCRE2_EXTRA_MATCH_LINE
+      pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE);
+#else
       static char const /* These sizes omit trailing NUL.  */
         xprefix[4] = "^(?:", xsuffix[2] = ")$";
       idx_t re_size = size + sizeof xprefix + sizeof xsuffix;
diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits
index 9dfc0fa..cd92c3c 100755
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -18,7 +18,7 @@ require_pcre_
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
 echo 0 | grep -qP '(?aD)\d' \
-  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
+  || skip_ 'PCRE 10.42 and older lack (?aD)'
 
 fail=0
 
@@ -29,27 +29,27 @@ printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
 printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
 printf '\n' >> in || framework_failure_
 
-# Ensure that \d matches no character.
-returns_ 1 grep -P '\d' in > out || fail=1
+# Ensure that (?aD)\d matches none of the Arabic digits in the input.
+returns_ 1 grep -P '(?aD)\d' in > out || fail=1
 compare /dev/null out || fail=1
 
-# Ensure that ^\D+$ matches the entire line.
-grep -P '^\D+$' in > out || fail=1
+# Ensure that (?aD)^\D+$ matches the entire line.
+grep -P '(?aD)^\D+$' in > out || fail=1
 compare in out || fail=1
 
 # When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD)
-# to toggle between modes.  (?aD) is the default (making \d == [0-9]).
-# (?-aD) relaxes \d, making it match "all" digits.
+# to toggle between modes.  (?aD) makes \d == [0-9].
+# (?-aD), the default, makes \d match all digits.
 # Use mixed digits as input: Arabic 0 and ASCII 4: ٠4
 printf '\331\2404\n' > in2 || framework_failure_
 
-returns_ 1 grep -P '\d\d' in2 > out || fail=1
+returns_ 1 grep -P '(?aD)\d\d' in2 > out || fail=1
 compare /dev/null out || fail=1
 
-grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
+grep -P '\d(?aD)\d' in2 > out || fail=1
 compare in2 out || fail=1
 
-returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
+returns_ 1 grep -P '(?aD)\d(?-aD)\d' in2 > out || fail=1
 compare /dev/null out || fail=1
 
 Exit $fail
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
index aa34784..a88ace4 100755
--- a/tests/pcre-utf8-w
+++ b/tests/pcre-utf8-w
@@ -16,8 +16,6 @@ require_pcre_
 
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
-echo 0 | grep -qP '(?aD)\d' \
-  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
 
 fail=0
 
-- 
2.37.2