emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master be91192ecb1 1/3: Straighten regexp postfix operator after zero-width assertion parse


From: Mattias Engdegård
Subject: master be91192ecb1 1/3: Straighten regexp postfix operator after zero-width assertion parse
Date: Wed, 21 Jun 2023 12:00:45 -0400 (EDT)

branch: master
commit be91192ecb1e0dff794582cd463f0a6480d160ef
Author: Mattias Engdegård <mattiase@acm.org>
Commit: Mattias Engdegård <mattiase@acm.org>

    Straighten regexp postfix operator after zero-width assertion parse
    
    The zero-width assertions \` \' \b \B were parsed in a sloppy way so
    that a following postfix repetition operator could yield surprising
    results.  For instance, "\\b*" would act as "\\b\\*", and "xy\\b*"
    would act as "\\(?:xy\\b\\)*".
    
    Except for \` and ^, any following postfix operator now applies to the
    zero-width assertion itself only, which is predictable and consistent
    with other assertions, if useless in practice.
    For historical compatibility, an operator character following \` and ^
    always becomes a literal. (Bug#64128)
    
    * src/regex-emacs.c (regex_compile):
    Set `laststart` appropriately for each zero-width assertion instead
    of leaving it with whatever value it had before.
    Remove a redundant condition.
    * test/src/regex-emacs-tests.el
    (regexp-tests-zero-width-assertion-repetition): New test.
    * doc/lispref/searching.texi (Regexp Special):
    Say that repetition operators are not special after \`,
    and that they work as expected after other backslash escapes.
    * etc/NEWS: Announce.
---
 doc/lispref/searching.texi    |  6 +---
 etc/NEWS                      |  8 ++++++
 src/regex-emacs.c             | 15 ++++++++--
 test/src/regex-emacs-tests.el | 66 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index 28230cea643..7c9893054d9 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -546,15 +546,11 @@ example, the regular expression that matches the @samp{\} character is
 
 For historical compatibility, a repetition operator is treated as ordinary
 if it appears at the start of a regular expression
-or after @samp{^}, @samp{\(}, @samp{\(?:} or @samp{\|}.
+or after @samp{^}, @samp{\`}, @samp{\(}, @samp{\(?:} or @samp{\|}.
 For example, @samp{*foo} is treated as @samp{\*foo}, and
 @samp{two\|^\@{2\@}} is treated as @samp{two\|^@{2@}}.
 It is poor practice to depend on this behavior; use proper backslash
 escaping anyway, regardless of where the repetition operator appears.
-Also, a repetition operator should not immediately follow a backslash escape
-that matches only empty strings, as Emacs has bugs in this area.
-For example, it is unwise to use @samp{\b*}, which can be omitted
-without changing the documented meaning of the regular expression.
 
 As a @samp{\} is not special inside a bracket expression, it can
 never remove the special meaning of @samp{-}, @samp{^} or @samp{]}.
diff --git a/etc/NEWS b/etc/NEWS
index d703b7e77be..7552640663f 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -475,6 +475,14 @@ symbol, and either that symbol is ':eval' and the second element of
 the list evaluates to 'nil' or the symbol's value as a variable is
 'nil' or void.
 
++++
+** Regexp zero-width assertions followed by operators are better defined.
+Previously, regexps such as "xy\\B*" would have ill-defined behaviour.
+Now any operator following a zero-width assertion applies to that
+assertion only (which is useless).  For historical compatibility, an
+operator character following '^' or '\`' becomes literal, but we
+advise against relying on this.
+
 
 * Lisp Changes in Emacs 30.1
 
diff --git a/src/regex-emacs.c b/src/regex-emacs.c
index fea34df991b..9e298b81ebb 100644
--- a/src/regex-emacs.c
+++ b/src/regex-emacs.c
@@ -1716,7 +1716,8 @@ regex_compile (re_char *pattern, ptrdiff_t size,
 
   /* Address of start of the most recently finished expression.
      This tells, e.g., postfix * where to find the start of its
-     operand.  Reset at the beginning of groups and alternatives.  */
+     operand.  Reset at the beginning of groups and alternatives,
+     and after ^ and \` for dusty-deck compatibility.  */
   unsigned char *laststart = 0;
 
   /* Address of beginning of regexp, or inside of last group.  */
@@ -1847,12 +1848,16 @@ regex_compile (re_char *pattern, ptrdiff_t size,
        case '^':
          if (! (p == pattern + 1 || at_begline_loc_p (pattern, p)))
            goto normal_char;
+         /* Special case for compatibility: postfix ops after ^ become
+            literals.  */
+         laststart = 0;
          BUF_PUSH (begline);
          break;
 
        case '$':
          if (! (p == pend || at_endline_loc_p (p, pend)))
            goto normal_char;
+         laststart = b;
          BUF_PUSH (endline);
          break;
 
@@ -1892,7 +1897,7 @@ regex_compile (re_char *pattern, ptrdiff_t size,
 
            /* Star, etc. applied to an empty pattern is equivalent
               to an empty pattern.  */
-           if (!laststart || laststart == b)
+           if (laststart == b)
              break;
 
            /* Now we know whether or not zero matches is allowed
@@ -2544,18 +2549,24 @@ regex_compile (re_char *pattern, ptrdiff_t size,
               break;
 
            case 'b':
+             laststart = b;
              BUF_PUSH (wordbound);
              break;
 
            case 'B':
+             laststart = b;
              BUF_PUSH (notwordbound);
              break;
 
            case '`':
+             /* Special case for compatibility: postfix ops after \` become
+                literals, as for ^ (see above).  */
+             laststart = 0;
              BUF_PUSH (begbuf);
              break;
 
            case '\'':
+             laststart = b;
              BUF_PUSH (endbuf);
              break;
 
diff --git a/test/src/regex-emacs-tests.el b/test/src/regex-emacs-tests.el
index 52d43775b8e..08a93dbf30e 100644
--- a/test/src/regex-emacs-tests.el
+++ b/test/src/regex-emacs-tests.el
@@ -883,4 +883,70 @@ This evaluates the TESTS test cases from glibc."
     (should (looking-at "x*\\(=\\|:\\)*"))
     (should (looking-at "x*=*?"))))
 
+(ert-deftest regexp-tests-zero-width-assertion-repetition ()
+  ;; Check compatibility behaviour with repetition operators after
+  ;; certain zero-width assertions (bug#64128).
+
+  ;; This function is just to hide ugly regexps from relint so that it
+  ;; doesn't complain about them.
+  (cl-flet ((smatch (re str) (string-match re str)))
+    ;; Postfix operators after ^ and \` become literals, for historical
+    ;; compatibility.  Only the first character of a lazy operator (like *?)
+    ;; becomes a literal.
+    (should (equal (smatch "^*a" "x\n*a") 2))
+    (should (equal (smatch "^*?a" "x\n*a") 2))
+    (should (equal (smatch "^*?a" "x\na") 2))
+    (should (equal (smatch "^*?a" "x\n**a") nil))
+
+    (should (equal (smatch "\\`*a" "*a") 0))
+    (should (equal (smatch "\\`*?a" "*a") 0))
+    (should (equal (smatch "\\`*?a" "a") 0))
+    (should (equal (smatch "\\`*?a" "**a") nil))
+
+    ;; Other zero-width assertions are treated as normal elements, so postfix
+    ;; operators apply to them alone (which is pointless but valid).
+    (should (equal (smatch "\\b*!" "*!") 1))
+    (should (equal (smatch "!\\b+;" "!;") nil))
+    (should (equal (smatch "!\\b+a" "!a") 0))
+
+    (should (equal (smatch "\\B*!" "*!") 1))
+    (should (equal (smatch "!\\B+;" "!;") 0))
+    (should (equal (smatch "!\\B+a" "!a") nil))
+
+    (should (equal (smatch "\\<*b" "*b") 1))
+    (should (equal (smatch "a\\<*b" "ab") 0))
+    (should (equal (smatch ";\\<*b" ";b") 0))
+    (should (equal (smatch "a\\<+b" "ab") nil))
+    (should (equal (smatch ";\\<+b" ";b") 0))
+
+    (should (equal (smatch "\\>*;" "*;") 1))
+    (should (equal (smatch "a\\>*b" "ab") 0))
+    (should (equal (smatch "a\\>*;" "a;") 0))
+    (should (equal (smatch "a\\>+b" "ab") nil))
+    (should (equal (smatch "a\\>+;" "a;") 0))
+
+    (should (equal (smatch "a\\'" "ab") nil))
+    (should (equal (smatch "b\\'" "ab") 1))
+    (should (equal (smatch "a\\'*b" "ab") 0))
+    (should (equal (smatch "a\\'+" "ab") nil))
+    (should (equal (smatch "b\\'+" "ab") 1))
+    (should (equal (smatch "\\'+" "+") 1))
+
+    (should (equal (smatch "\\_<*b" "*b") 1))
+    (should (equal (smatch "a\\_<*b" "ab") 0))
+    (should (equal (smatch " \\_<*b" " b") 0))
+    (should (equal (smatch "a\\_<+b" "ab") nil))
+    (should (equal (smatch " \\_<+b" " b") 0))
+
+    (should (equal (smatch "\\_>*;" "*;") 1))
+    (should (equal (smatch "a\\_>*b" "ab") 0))
+    (should (equal (smatch "a\\_>* " "a ") 0))
+    (should (equal (smatch "a\\_>+b" "ab") nil))
+    (should (equal (smatch "a\\_>+ " "a ") 0))
+
+    (should (equal (smatch "\\=*b" "*b") 1))
+    (should (equal (smatch "a\\=*b" "a*b") nil))
+    (should (equal (smatch "a\\=*b" "ab") 0))
+    ))
+
 ;;; regex-emacs-tests.el ends here



reply via email to

[Prev in Thread] Current Thread [Next in Thread]