emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master f16766a: Use `or' instead of `union' for charset union in rx


From: Mattias Engdegård
Subject: master f16766a: Use `or' instead of `union' for charset union in rx
Date: Thu, 12 Dec 2019 17:48:33 -0500 (EST)

branch: master
commit f16766a0eb2a78b58a4856d31306fc37f913d70e
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>

    Use `or' instead of `union' for charset union in rx
    
    Design change suggested by Stefan Monnier.
    
    * doc/lispref/searching.texi (Rx Constructs):
    * etc/NEWS: Document.
    * lisp/emacs-lisp/rx.el (rx--translate-or): Detect charset arguments.
    (rx--charset-p): New.
    (rx--translate-not, rx--charset-intervals, rx--translate-union):
    Change from `union' to `or'.
    (rx--translate-form, rx--builtin-forms, rx): Remove `union'.
    * test/lisp/emacs-lisp/rx-tests.el (rx-union, rx-def-in-union)
    (rx-intersection): Rename tests and change `union' to `or' and `|'.
---
 doc/lispref/searching.texi       | 22 ++++++++++----------
 etc/NEWS                         |  6 +++---
 lisp/emacs-lisp/rx.el            | 41 +++++++++++++++++++++++--------------
 test/lisp/emacs-lisp/rx-tests.el | 44 +++++++++++++++++++++++-----------------
 4 files changed, 65 insertions(+), 48 deletions(-)

diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index bf18f80..0c6c7cc 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -1214,20 +1214,19 @@ Corresponding string regexp: @samp{[@dots{}]}
 @item @code{(not @var{charspec})}
 @cindex @code{not} in rx
 Match a character not included in @var{charspec}.  @var{charspec} can
-be an @code{any}, @code{not}, @code{union}, @code{intersection},
-@code{syntax} or @code{category} form, or a character class.@*
+be an @code{any}, @code{not}, @code{or}, @code{intersection},
+@code{syntax} or @code{category} form, or a character class.
+If @var{charspec} is an @code{or} form, its arguments have the same
+restrictions as those of @code{intersection}; see below.@*
 Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}},
 @samp{\C@var{code}}
 
-@item @code{(union @var{charset}@dots{})}
-@itemx @code{(intersection @var{charset}@dots{})}
-@cindex @code{union} in rx
+@item @code{(intersection @var{charset}@dots{})}
 @cindex @code{intersection} in rx
-Match a character that matches the union or intersection,
-respectively, of the @var{charset}s.  Each @var{charset} can be an
-@code{any} form without character classes, or a @code{union},
-@code{intersection} or @code{not} form whose arguments are also
-@var{charset}s.
+Match a character included in all of the @var{charset}s.
+Each @var{charset} can be an @code{any} form without character
+classes, or an @code{intersection}, @code{or} or @code{not} form whose
+arguments are also @var{charset}s.
 
 @item @code{not-newline}, @code{nonl}
 @cindex @code{not-newline} in rx
@@ -1591,7 +1590,8 @@ when they are used, not when they are defined.
 User-defined forms are allowed wherever arbitrary @code{rx}
 expressions are expected; for example, in the body of a
 @code{zero-or-one} form, but not inside @code{any} or @code{category}
-forms.  They are also allowed inside @code{not} forms.
+forms.  They are also allowed inside @code{not} and
+@code{intersection} forms.
 @end itemize
 
 @defmac rx-define name [arglist] rx-form
diff --git a/etc/NEWS b/etc/NEWS
index 4df123d..1e0422c 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -2120,9 +2120,9 @@ These macros add new forms to the rx notation.
 Both match any single character; 'anychar' is more descriptive.
 
 +++
-*** New 'union' and 'intersection' forms for character sets.
-These permit composing character-matching expressions from simpler
-parts.
+*** New 'intersection' form for character sets.
+With 'or' and 'not', it can be used to compose character-matching
+expressions from simpler parts.
 
 ** Frames
 
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index d4b21c3..a5cab1d 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -273,10 +273,8 @@ Return (REGEXP . PRECEDENCE)."
   ;;   (or (+ digit) "CHARLIE" "CHAN" (+ blank))
   ;;   -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
   ;;
-  ;; - Fuse patterns into a single character alternative if they fit.
-  ;;   regexp-opt will do that if all are strings, but we want to do that for:
-  ;;     * symbols that expand to classes: space, alpha, ...
-  ;;     * character alternatives: (any ...)
+  ;; - Optimise single-character alternatives better:
+  ;;     * classes: space, alpha, ...
   ;;     * (syntax S), for some S (whitespace, word)
   ;;   so that (or "@" "%" digit (any "A-Z" space) (syntax word))
   ;;        -> (any "@" "%" digit "A-Z" space word)
@@ -294,6 +292,8 @@ Return (REGEXP . PRECEDENCE)."
    ((rx--every #'stringp body)     ; All strings.
     (cons (list (regexp-opt body nil t))
           t))
+   ((rx--every #'rx--charset-p body)  ; All charsets.
+    (rx--translate-union nil body))
    (t
     (cons (append (car (rx--translate (car body)))
                   (mapcan (lambda (item)
@@ -301,6 +301,19 @@ Return (REGEXP . PRECEDENCE)."
                           (cdr body)))
           nil))))
 
+(defun rx--charset-p (form)
+  "Whether FORM looks like a charset, only consisting of character intervals
+and set operations."
+  (or (and (consp form)
+           (or (and (memq (car form) '(any 'in 'char))
+                    (rx--every (lambda (x) (not (symbolp x))) (cdr form)))
+               (and (memq (car form) '(not or | intersection))
+                    (rx--every #'rx--charset-p (cdr form)))))
+      (and (or (symbolp form) (consp form))
+           (let ((expanded (rx--expand-def form)))
+             (and expanded
+                  (rx--charset-p expanded))))))
+
 (defun rx--string-to-intervals (str)
   "Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
 character X becomes (?X . ?X).  Return the intervals in a list."
@@ -477,7 +490,7 @@ If NEGATED, negate the sense."
      (not negated) (rx--complement-intervals intervals) nil)))
 
 ;; FIXME: Consider turning `not' into a variadic operator, following SRE:
-;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and
+;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
 ;; (not) = anychar.
 ;; Maybe allow singleton characters as arguments.
 
@@ -498,7 +511,7 @@ If NEGATED, negate the sense (thus making it positive)."
               (rx--translate-category (not negated) (cdr arg)))
              ('not
               (rx--translate-not      (not negated) (cdr arg)))
-             ('union
+             ((or 'or '|)
               (rx--translate-union    (not negated) (cdr arg)))
              ('intersection
               (rx--translate-intersection (not negated) (cdr arg))))))
@@ -558,7 +571,7 @@ If NEGATED, negate the sense (thus making it positive)."
 (defun rx--charset-intervals (charset)
   "Return a sorted list of non-adjacent disjoint intervals from CHARSET.
 CHARSET is any expression allowed in a character set expression:
-either `any' (no classes permitted), or `not', `union' or `intersection'
+either `any' (no classes permitted), or `not', `or' or `intersection'
 forms whose arguments are charsets."
   (pcase charset
     (`(,(or 'any 'in 'char) . ,body)
@@ -569,8 +582,8 @@ forms whose arguments are charsets."
           (cadr parsed)))
        (car parsed)))
     (`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
-    (`(union . ,xs) (rx--charset-union xs))
-    (`(intersection . ,xs) (rx--charset-intersection xs))
+    (`(,(or 'or '|) . ,body) (rx--charset-union body))
+    (`(intersection . ,body) (rx--charset-intersection body))
     (_ (let ((expanded (rx--expand-def charset)))
          (if expanded
              (rx--charset-intervals expanded)
@@ -589,7 +602,7 @@ forms whose arguments are charsets."
              (mapcar #'rx--charset-intervals charsets)))
 
 (defun rx--translate-union (negated body)
-  "Translate a (union ...) construct.  Return (REGEXP . PRECEDENCE).
+  "Translate an (or ...) construct of charsets.  Return (REGEXP . PRECEDENCE).
 If NEGATED, negate the sense."
   (rx--intervals-to-alt negated (rx--charset-union body)))
 
@@ -976,7 +989,6 @@ can expand to any number of values."
       ((or 'any 'in 'char)      (rx--translate-any nil body))
       ('not-char                (rx--translate-any t body))
       ('not                     (rx--translate-not nil body))
-      ('union                   (rx--translate-union nil body))
       ('intersection            (rx--translate-intersection nil body))
 
       ('repeat                  (rx--translate-repeat body))
@@ -1036,7 +1048,7 @@ can expand to any number of values."
         (t (error "Unknown rx form `%s'" op)))))))
 
 (defconst rx--builtin-forms
-  '(seq sequence : and or | any in char not-char not union intersection
+  '(seq sequence : and or | any in char not-char not intersection
     repeat = >= **
     zero-or-more 0+ *
     one-or-more 1+ +
@@ -1149,11 +1161,10 @@ CHAR           Match a literal character.
                 character, a string, a range as string \"A-Z\" or cons
                 (?A . ?Z), or a character class (see below).  Alias: in, char.
 (not CHARSPEC)  Match one character not matched by CHARSPEC.  CHARSPEC
-                can be (any ...), (union ...), (intersection ...),
+                can be (any ...), (or ...), (intersection ...),
                 (syntax ...), (category ...), or a character class.
-(union CHARSET...)        Union of CHARSETs.
 (intersection CHARSET...) Intersection of CHARSETs.
-                CHARSET is (any...), (not...), (union...) or (intersection...).
+                CHARSET is (any...), (not...), (or...) or (intersection...).
 not-newline     Match any character except a newline.  Alias: nonl.
 anychar         Match any character.  Alias: anything.
 unmatchable     Never match anything at all.
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 0cd2c95..344f467 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -274,33 +274,36 @@
   (should (equal (rx (not (not ascii)) (not (not (not (any "a-z")))))
                  "[[:ascii:]][^a-z]")))
 
-(ert-deftest rx-union ()
-  (should (equal (rx (union))
+(ert-deftest rx-charset-or ()
+  (should (equal (rx (or))
                  "\\`a\\`"))
-  (should (equal (rx (union (any "ba")))
+  (should (equal (rx (or (any "ba")))
                  "[ab]"))
-  (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
+  (should (equal (rx (| (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
                  "[a-krx-z]"))
-  (should (equal (rx (union (not (any "a-m")) (not (any "f-p"))))
+  (should (equal (rx (or (not (any "a-m")) (not (any "f-p"))))
                  "[^f-m]"))
-  (should (equal (rx (union (any "e-m") (not (any "a-z"))))
+  (should (equal (rx (| (any "e-m") (not (any "a-z"))))
                  "[^a-dn-z]"))
-  (should (equal (rx (union (not (any "g-r")) (not (any "t"))))
+  (should (equal (rx (or (not (any "g-r")) (not (any "t"))))
                  "[^z-a]"))
-  (should (equal (rx (not (union (not (any "g-r")) (not (any "t")))))
+  (should (equal (rx (not (or (not (any "g-r")) (not (any "t")))))
                  "\\`a\\`"))
-  (should (equal (rx (union (union (any "a-f") (any "u-z"))
-                            (any "g-r")))
+  (should (equal (rx (or (| (any "a-f") (any "u-z"))
+                         (any "g-r")))
                  "[a-ru-z]"))
-  (should (equal (rx (union (intersection (any "c-z") (any "a-g"))
-                            (not (any "a-k"))))
+  (should (equal (rx (or (intersection (any "c-z") (any "a-g"))
+                         (not (any "a-k"))))
                  "[^abh-k]")))
 
-(ert-deftest rx-def-in-union ()
+(ert-deftest rx-def-in-charset-or ()
   (rx-let ((a (any "badc"))
-           (b (union a (any "def"))))
-    (should (equal(rx (union b (any "q")))
-                  "[a-fq]"))))
+           (b (| a (any "def"))))
+    (should (equal (rx (or b (any "q")))
+                   "[a-fq]")))
+  (rx-let ((diff-| (a b) (not (or (not a) b))))
+    (should (equal (rx (diff-| (any "a-z") (any "gr")))
+                   "[a-fh-qs-z]"))))
 
 (ert-deftest rx-intersection ()
   (should (equal (rx (intersection))
@@ -321,15 +324,18 @@
   (should (equal (rx (intersection (any "d-u")
                                    (intersection (any "e-z") (any "a-m"))))
                  "[e-m]"))
-  (should (equal (rx (intersection (union (any "a-f") (any "f-t"))
+  (should (equal (rx (intersection (or (any "a-f") (any "f-t"))
                                    (any "e-w")))
                  "[e-t]")))
 
 (ert-deftest rx-def-in-intersection ()
   (rx-let ((a (any "a-g"))
            (b (intersection a (any "d-j"))))
-    (should (equal(rx (intersection b (any "e-k")))
-                  "[e-g]"))))
+    (should (equal (rx (intersection b (any "e-k")))
+                   "[e-g]")))
+  (rx-let ((diff-& (a b) (intersection a (not b))))
+    (should (equal (rx (diff-& (any "a-z") (any "m-p")))
+                   "[a-lq-z]"))))
 
 (ert-deftest rx-group ()
   (should (equal (rx (group nonl) (submatch "x")



reply via email to

[Prev in Thread] Current Thread [Next in Thread]