emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

master ea93326: Add `union' and `intersection' to rx (bug#37849)


From: Mattias Engdegård
Subject: master ea93326: Add `union' and `intersection' to rx (bug#37849)
Date: Tue, 10 Dec 2019 16:38:43 -0500 (EST)

branch: master
commit ea93326cc046cb1beb7535cdf6d69b216b767685
Author: Mattias Engdegård <address@hidden>
Commit: Mattias Engdegård <address@hidden>

    Add `union' and `intersection' to rx (bug#37849)
    
    These character set operations, together with `not' for set
    complement, improve the compositionality of rx, and reduce duplication
    in complicated cases.  Named character classes are not permitted in
    set operations.
    
    * lisp/emacs-lisp/rx.el (rx--translate-any): Split into multiple
    functions.
    (rx--foldl, rx--parse-any, rx--generate-alt, rx--intervals-to-alt)
    (rx--complement-intervals, rx--intersect-intervals)
    (rx--union-intervals, rx--charset-intervals, rx--charset-union)
    (rx--charset-all, rx--charset-intersection, rx--translate-union)
    (rx--translate-intersection): New.
    (rx--translate-not, rx--translate-form, rx--builtin-forms, rx):
    Add `union' and `intersection'.
    * test/lisp/emacs-lisp/rx-tests.el (rx-union ,rx-def-in-union)
    (rx-intersection, rx-def-in-intersection): New tests.
    * doc/lispref/searching.texi (Rx Constructs):
    * etc/NEWS:
    Document `union' and `intersection'.
---
 doc/lispref/searching.texi       |  14 +-
 etc/NEWS                         |   7 +-
 lisp/emacs-lisp/rx.el            | 309 +++++++++++++++++++++++++++------------
 test/lisp/emacs-lisp/rx-tests.el |  57 ++++++++
 4 files changed, 289 insertions(+), 98 deletions(-)

diff --git a/doc/lispref/searching.texi b/doc/lispref/searching.texi
index 0cb3001..5bf3c5b 100644
--- a/doc/lispref/searching.texi
+++ b/doc/lispref/searching.texi
@@ -1214,11 +1214,21 @@ Corresponding string regexp: @samp{[@dots{}]}
 @item @code{(not @var{charspec})}
 @cindex @code{not} in rx
 Match a character not included in @var{charspec}.  @var{charspec} can
-be an @code{any}, @code{not}, @code{syntax} or @code{category} form, or a
-character class.@*
+be an @code{any}, @code{not}, @code{union}, @code{intersection},
+@code{syntax} or @code{category} form, or a character class.@*
 Corresponding string regexp: @samp{[^@dots{}]}, @samp{\S@var{code}},
 @samp{\C@var{code}}
 
+@item @code{(union @var{charset}@dots{})}
+@itemx @code{(intersection @var{charset}@dots{})}
+@cindex @code{union} in rx
+@cindex @code{intersection} in rx
+Match a character that matches the union or intersection,
+respectively, of the @var{charset}s.  Each @var{charset} can be an
+@code{any} form without character classes, or a @code{union},
+@code{intersection} or @code{not} form whose arguments are also
+@var{charset}s.
+
 @item @code{not-newline}, @code{nonl}
 @cindex @code{not-newline} in rx
 @cindex @code{nonl} in rx
diff --git a/etc/NEWS b/etc/NEWS
index 923890d..69b51b7 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -2110,9 +2110,14 @@ at run time, instead of a constant string.
 These macros add new forms to the rx notation.
 
 +++
-*** 'anychar' is now an alias for 'anything'
+*** 'anychar' is now an alias for 'anything'.
 Both match any single character; 'anychar' is more descriptive.
 
++++
+*** New 'union' and 'intersection' forms for character sets.
+These permit composing character-matching expressions from simpler
+parts.
+
 ** Frames
 
 +++
diff --git a/lisp/emacs-lisp/rx.el b/lisp/emacs-lisp/rx.el
index a92c613..d4b21c3 100644
--- a/lisp/emacs-lisp/rx.el
+++ b/lisp/emacs-lisp/rx.el
@@ -246,6 +246,14 @@ Return (REGEXP . PRECEDENCE)."
     (setq list (cdr list)))
   (null list))
 
+(defun rx--foldl (f x l)
+  "(F (F (F X L0) L1) L2) ...
+Left-fold the list L, starting with X, by the binary function F."
+  (while l
+    (setq x (funcall f x (car l)))
+    (setq l (cdr l)))
+  x)
+
 (defun rx--translate-or (body)
   "Translate an or-pattern of zero or more rx items.
 Return (REGEXP . PRECEDENCE)."
@@ -343,22 +351,11 @@ INTERVALS is a list of (START . END) with START ≤ END, 
sorted by START."
         (setq tail d)))
     intervals))
 
-;; FIXME: Consider expanding definitions inside (any ...) and (not ...),
-;; and perhaps allow (any ...) inside (any ...).
-;; It would be benefit composability (build a character alternative by pieces)
-;; and be handy for obtaining the complement of a defined set of
-;; characters.  (See, for example, python.el:421, `not-simple-operator'.)
-;; (Expansion in other non-rx positions is probably not a good idea:
-;; syntax, category, backref, and the integer parameters of group-n,
-;; =, >=, **, repeat)
-;; Similar effect could be attained by ensuring that
-;; (or (any X) (any Y)) -> (any X Y), and find a way to compose negative
-;; sets.  `and' is taken, but we could add
-;; (intersection (not (any X)) (not (any Y))) -> (not (any X Y)).
-
-(defun rx--translate-any (negated body)
-  "Translate an (any ...) construct.  Return (REGEXP . PRECEDENCE).
-If NEGATED, negate the sense."
+(defun rx--parse-any (body)
+  "Parse arguments of an (any ...) construct.
+Return (INTERVALS . CLASSES), where INTERVALS is a sorted list of
+disjoint intervals (each a cons of chars), and CLASSES
+a list of named character classes in the order they occur in BODY."
   (let ((classes nil)
         (strings nil)
         (conses nil))
@@ -380,81 +377,109 @@ If NEGATED, negate the sense."
                          (or (memq class classes)
                              (progn (push class classes) t))))))
             (t (error "Invalid rx `any' argument: %s" arg))))
-    (let ((items
-           ;; Translate strings and conses into nonoverlapping intervals,
-           ;; and add classes as symbols at the end.
-           (append
-            (rx--condense-intervals
-             (sort (append conses
-                           (mapcan #'rx--string-to-intervals strings))
-                   #'car-less-than-car))
-            (reverse classes))))
-
-      ;; Move lone ] and range ]-x to the start.
-      (let ((rbrac-l (assq ?\] items)))
-        (when rbrac-l
-          (setq items (cons rbrac-l (delq rbrac-l items)))))
-
-      ;; Split x-] and move the lone ] to the start.
-      (let ((rbrac-r (rassq ?\] items)))
-        (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
-          (setcdr rbrac-r ?\\)
-          (setq items (cons '(?\] . ?\]) items))))
-
-      ;; Split ,-- (which would end up as ,- otherwise).
-      (let ((dash-r (rassq ?- items)))
-        (when (eq (car dash-r) ?,)
-          (setcdr dash-r ?,)
-          (setq items (nconc items '((?- . ?-))))))
-
-      ;; Remove - (lone or at start of interval)
-      (let ((dash-l (assq ?- items)))
-        (when dash-l
-          (if (eq (cdr dash-l) ?-)
-              (setq items (delq dash-l items))   ; Remove lone -
-            (setcar dash-l ?.))                  ; Reduce --x to .-x
-          (setq items (nconc items '((?- . ?-))))))
-
-      ;; Deal with leading ^ and range ^-x.
-      (when (and (consp (car items))
-                 (eq (caar items) ?^)
-                 (cdr items))
-        ;; Move ^ and ^-x to second place.
-        (setq items (cons (cadr items)
-                          (cons (car items) (cddr items)))))
+    (cons (rx--condense-intervals
+           (sort (append conses
+                         (mapcan #'rx--string-to-intervals strings))
+                 #'car-less-than-car))
+          (reverse classes))))
+
+(defun rx--generate-alt (negated intervals classes)
+  "Generate a character alternative.  Return (REGEXP . PRECEDENCE).
+If NEGATED is non-nil, negate the result; INTERVALS is a sorted
+list of disjoint intervals and CLASSES a list of named character
+classes."
+  (let ((items (append intervals classes)))
+    ;; Move lone ] and range ]-x to the start.
+    (let ((rbrac-l (assq ?\] items)))
+      (when rbrac-l
+        (setq items (cons rbrac-l (delq rbrac-l items)))))
+
+    ;; Split x-] and move the lone ] to the start.
+    (let ((rbrac-r (rassq ?\] items)))
+      (when (and rbrac-r (not (eq (car rbrac-r) ?\])))
+        (setcdr rbrac-r ?\\)
+        (setq items (cons '(?\] . ?\]) items))))
+
+    ;; Split ,-- (which would end up as ,- otherwise).
+    (let ((dash-r (rassq ?- items)))
+      (when (eq (car dash-r) ?,)
+        (setcdr dash-r ?,)
+        (setq items (nconc items '((?- . ?-))))))
+
+    ;; Remove - (lone or at start of interval)
+    (let ((dash-l (assq ?- items)))
+      (when dash-l
+        (if (eq (cdr dash-l) ?-)
+            (setq items (delq dash-l items))   ; Remove lone -
+          (setcar dash-l ?.))                  ; Reduce --x to .-x
+        (setq items (nconc items '((?- . ?-))))))
+
+    ;; Deal with leading ^ and range ^-x.
+    (when (and (consp (car items))
+               (eq (caar items) ?^)
+               (cdr items))
+      ;; Move ^ and ^-x to second place.
+      (setq items (cons (cadr items)
+                        (cons (car items) (cddr items)))))
 
-      (cond
-       ;; Empty set: if negated, any char, otherwise match-nothing.
-       ((null items)
-        (if negated
-            (rx--translate-symbol 'anything)
-          (rx--empty)))
-       ;; Single non-negated character.
-       ((and (null (cdr items))
-             (consp (car items))
-             (eq (caar items) (cdar items))
-             (not negated))
-        (cons (list (regexp-quote (char-to-string (caar items))))
-              t))
-       ;; At least one character or class, possibly negated.
-       (t
-        (cons
-         (list
-          (concat
-           "["
-           (and negated "^")
-           (mapconcat (lambda (item)
-                        (cond ((symbolp item)
-                               (format "[:%s:]" item))
-                              ((eq (car item) (cdr item))
-                               (char-to-string (car item)))
-                              ((eq (1+ (car item)) (cdr item))
-                               (string (car item) (cdr item)))
-                              (t
-                               (string (car item) ?- (cdr item)))))
-                      items nil)
-           "]"))
-         t))))))
+    (cond
+     ;; Empty set: if negated, any char, otherwise match-nothing.
+     ((null items)
+      (if negated
+          (rx--translate-symbol 'anything)
+        (rx--empty)))
+     ;; Single non-negated character.
+     ((and (null (cdr items))
+           (consp (car items))
+           (eq (caar items) (cdar items))
+           (not negated))
+      (cons (list (regexp-quote (char-to-string (caar items))))
+            t))
+     ;; At least one character or class, possibly negated.
+     (t
+      (cons
+       (list
+        (concat
+         "["
+         (and negated "^")
+         (mapconcat (lambda (item)
+                      (cond ((symbolp item)
+                             (format "[:%s:]" item))
+                            ((eq (car item) (cdr item))
+                             (char-to-string (car item)))
+                            ((eq (1+ (car item)) (cdr item))
+                             (string (car item) (cdr item)))
+                            (t
+                             (string (car item) ?- (cdr item)))))
+                    items nil)
+         "]"))
+       t)))))
+
+(defun rx--translate-any (negated body)
+  "Translate an (any ...) construct.  Return (REGEXP . PRECEDENCE).
+If NEGATED, negate the sense."
+  (let ((parsed (rx--parse-any body)))
+    (rx--generate-alt negated (car parsed) (cdr parsed))))
+
+(defun rx--intervals-to-alt (negated intervals)
+  "Generate a character alternative from an interval set.
+Return (REGEXP . PRECEDENCE).
+INTERVALS is a sorted list of disjoint intervals.
+If NEGATED, negate the sense."
+  ;; Detect whether the interval set is better described in
+  ;; complemented form.  This is not just a matter of aesthetics: any
+  ;; range from ASCII to raw bytes will automatically exclude the
+  ;; entire non-ASCII Unicode range by the regexp engine.
+  (if (rx--every (lambda (iv) (not (<= (car iv) #x3ffeff (cdr iv))))
+                 intervals)
+      (rx--generate-alt negated intervals nil)
+    (rx--generate-alt
+     (not negated) (rx--complement-intervals intervals) nil)))
+
+;; FIXME: Consider turning `not' into a variadic operator, following SRE:
+;; (not A B) = (not (union A B)) = (intersection (not A) (not B)), and
+;; (not) = anychar.
+;; Maybe allow singleton characters as arguments.
 
 (defun rx--translate-not (negated body)
   "Translate a (not ...) construct.  Return (REGEXP . PRECEDENCE).
@@ -472,10 +497,14 @@ If NEGATED, negate the sense (thus making it positive)."
              ('category
               (rx--translate-category (not negated) (cdr arg)))
              ('not
-              (rx--translate-not      (not negated) (cdr arg))))))
+              (rx--translate-not      (not negated) (cdr arg)))
+             ('union
+              (rx--translate-union    (not negated) (cdr arg)))
+             ('intersection
+              (rx--translate-intersection (not negated) (cdr arg))))))
      ((let ((class (cdr (assq arg rx--char-classes))))
         (and class
-             (rx--translate-any (not negated) (list class)))))
+             (rx--generate-alt (not negated) nil (list class)))))
      ((eq arg 'word-boundary)
       (rx--translate-symbol
        (if negated 'word-boundary 'not-word-boundary)))
@@ -484,6 +513,91 @@ If NEGATED, negate the sense (thus making it positive)."
              (rx--translate-not negated (list expanded)))))
      (t (error "Illegal argument to rx `not': %S" arg)))))
 
+(defun rx--complement-intervals (intervals)
+  "Complement of the interval list INTERVALS."
+  (let ((compl nil)
+        (c 0))
+    (dolist (iv intervals)
+      (when (< c (car iv))
+        (push (cons c (1- (car iv))) compl))
+      (setq c (1+ (cdr iv))))
+    (when (< c (max-char))
+      (push (cons c (max-char)) compl))
+    (nreverse compl)))
+
+(defun rx--intersect-intervals (ivs-a ivs-b)
+  "Intersection of the interval lists IVS-A and IVS-B."
+  (let ((isect nil))
+    (while (and ivs-a ivs-b)
+      (let ((a (car ivs-a))
+            (b (car ivs-b)))
+        (cond
+         ((< (cdr a) (car b)) (setq ivs-a (cdr ivs-a)))
+         ((> (car a) (cdr b)) (setq ivs-b (cdr ivs-b)))
+         (t
+          (push (cons (max (car a) (car b))
+                      (min (cdr a) (cdr b)))
+                isect)
+          (setq ivs-a (cdr ivs-a))
+          (setq ivs-b (cdr ivs-b))
+          (cond ((< (cdr a) (cdr b))
+                 (push (cons (1+ (cdr a)) (cdr b))
+                       ivs-b))
+                ((> (cdr a) (cdr b))
+                 (push (cons (1+ (cdr b)) (cdr a))
+                       ivs-a)))))))
+    (nreverse isect)))
+
+(defun rx--union-intervals (ivs-a ivs-b)
+  "Union of the interval lists IVS-A and IVS-B."
+  (rx--complement-intervals
+   (rx--intersect-intervals
+    (rx--complement-intervals ivs-a)
+    (rx--complement-intervals ivs-b))))
+
+(defun rx--charset-intervals (charset)
+  "Return a sorted list of non-adjacent disjoint intervals from CHARSET.
+CHARSET is any expression allowed in a character set expression:
+either `any' (no classes permitted), or `not', `union' or `intersection'
+forms whose arguments are charsets."
+  (pcase charset
+    (`(,(or 'any 'in 'char) . ,body)
+     (let ((parsed (rx--parse-any body)))
+       (when (cdr parsed)
+         (error
+          "Character class not permitted in set operations: %S"
+          (cadr parsed)))
+       (car parsed)))
+    (`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
+    (`(union . ,xs) (rx--charset-union xs))
+    (`(intersection . ,xs) (rx--charset-intersection xs))
+    (_ (let ((expanded (rx--expand-def charset)))
+         (if expanded
+             (rx--charset-intervals expanded)
+           (error "Bad character set: %S" charset))))))
+
+(defun rx--charset-union (charsets)
+  "Union of CHARSETS, as a set of intervals."
+  (rx--foldl #'rx--union-intervals nil
+             (mapcar #'rx--charset-intervals charsets)))
+
+(defconst rx--charset-all (list (cons 0 (max-char))))
+
+(defun rx--charset-intersection (charsets)
+  "Intersection of CHARSETS, as a set of intervals."
+  (rx--foldl #'rx--intersect-intervals rx--charset-all
+             (mapcar #'rx--charset-intervals charsets)))
+
+(defun rx--translate-union (negated body)
+  "Translate a (union ...) construct.  Return (REGEXP . PRECEDENCE).
+If NEGATED, negate the sense."
+  (rx--intervals-to-alt negated (rx--charset-union body)))
+
+(defun rx--translate-intersection (negated body)
+  "Translate an (intersection ...) construct.  Return (REGEXP . PRECEDENCE).
+If NEGATED, negate the sense."
+  (rx--intervals-to-alt negated (rx--charset-intersection body)))
+
 (defun rx--atomic-regexp (item)
   "ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t."
   (if (eq (cdr item) t)
@@ -862,6 +976,8 @@ can expand to any number of values."
       ((or 'any 'in 'char)      (rx--translate-any nil body))
       ('not-char                (rx--translate-any t body))
       ('not                     (rx--translate-not nil body))
+      ('union                   (rx--translate-union nil body))
+      ('intersection            (rx--translate-intersection nil body))
 
       ('repeat                  (rx--translate-repeat body))
       ('=                       (rx--translate-= body))
@@ -920,7 +1036,7 @@ can expand to any number of values."
         (t (error "Unknown rx form `%s'" op)))))))
 
 (defconst rx--builtin-forms
-  '(seq sequence : and or | any in char not-char not
+  '(seq sequence : and or | any in char not-char not union intersection
     repeat = >= **
     zero-or-more 0+ *
     one-or-more 1+ +
@@ -1033,8 +1149,11 @@ CHAR           Match a literal character.
                 character, a string, a range as string \"A-Z\" or cons
                 (?A . ?Z), or a character class (see below).  Alias: in, char.
 (not CHARSPEC)  Match one character not matched by CHARSPEC.  CHARSPEC
-                can be (any ...), (syntax ...), (category ...),
-                or a character class.
+                can be (any ...), (union ...), (intersection ...),
+                (syntax ...), (category ...), or a character class.
+(union CHARSET...)        Union of CHARSETs.
+(intersection CHARSET...) Intersection of CHARSETs.
+                CHARSET is (any...), (not...), (union...) or (intersection...).
 not-newline     Match any character except a newline.  Alias: nonl.
 anychar         Match any character.  Alias: anything.
 unmatchable     Never match anything at all.
diff --git a/test/lisp/emacs-lisp/rx-tests.el b/test/lisp/emacs-lisp/rx-tests.el
index 317dae2..0cd2c95 100644
--- a/test/lisp/emacs-lisp/rx-tests.el
+++ b/test/lisp/emacs-lisp/rx-tests.el
@@ -274,6 +274,63 @@
   (should (equal (rx (not (not ascii)) (not (not (not (any "a-z")))))
                  "[[:ascii:]][^a-z]")))
 
+(ert-deftest rx-union ()
+  (should (equal (rx (union))
+                 "\\`a\\`"))
+  (should (equal (rx (union (any "ba")))
+                 "[ab]"))
+  (should (equal (rx (union (any "a-f") (any "c-k" ?y) (any ?r "x-z")))
+                 "[a-krx-z]"))
+  (should (equal (rx (union (not (any "a-m")) (not (any "f-p"))))
+                 "[^f-m]"))
+  (should (equal (rx (union (any "e-m") (not (any "a-z"))))
+                 "[^a-dn-z]"))
+  (should (equal (rx (union (not (any "g-r")) (not (any "t"))))
+                 "[^z-a]"))
+  (should (equal (rx (not (union (not (any "g-r")) (not (any "t")))))
+                 "\\`a\\`"))
+  (should (equal (rx (union (union (any "a-f") (any "u-z"))
+                            (any "g-r")))
+                 "[a-ru-z]"))
+  (should (equal (rx (union (intersection (any "c-z") (any "a-g"))
+                            (not (any "a-k"))))
+                 "[^abh-k]")))
+
+(ert-deftest rx-def-in-union ()
+  (rx-let ((a (any "badc"))
+           (b (union a (any "def"))))
+    (should (equal(rx (union b (any "q")))
+                  "[a-fq]"))))
+
+(ert-deftest rx-intersection ()
+  (should (equal (rx (intersection))
+                 "[^z-a]"))
+  (should (equal (rx (intersection (any "ba")))
+                 "[ab]"))
+  (should (equal (rx (intersection (any "a-j" "u-z") (any "c-k" ?y)
+                                   (any "a-i" "x-z")))
+                 "[c-iy]"))
+  (should (equal (rx (intersection (not (any "a-m")) (not (any "f-p"))))
+                 "[^a-p]"))
+  (should (equal (rx (intersection (any "a-z") (not (any "g-q"))))
+                 "[a-fr-z]"))
+  (should (equal (rx (intersection (any "a-d") (any "e")))
+                 "\\`a\\`"))
+  (should (equal (rx (not (intersection (any "a-d") (any "e"))))
+                 "[^z-a]"))
+  (should (equal (rx (intersection (any "d-u")
+                                   (intersection (any "e-z") (any "a-m"))))
+                 "[e-m]"))
+  (should (equal (rx (intersection (union (any "a-f") (any "f-t"))
+                                   (any "e-w")))
+                 "[e-t]")))
+
+(ert-deftest rx-def-in-intersection ()
+  (rx-let ((a (any "a-g"))
+           (b (intersection a (any "d-j"))))
+    (should (equal(rx (intersection b (any "e-k")))
+                  "[e-g]"))))
+
 (ert-deftest rx-group ()
   (should (equal (rx (group nonl) (submatch "x")
                      (group-n 3 "y") (submatch-n 13 "z") (backref 1))



reply via email to

[Prev in Thread] Current Thread [Next in Thread]