emacs-elpa-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[elpa] externals/xr 6d02ab45b6 1/2: Recognise implicit gap in ASCII-raw


From: Mattias Engdegård
Subject: [elpa] externals/xr 6d02ab45b6 1/2: Recognise implicit gap in ASCII-raw ranges
Date: Mon, 11 Jul 2022 14:21:03 -0400 (EDT)

branch: externals/xr
commit 6d02ab45b6e8a99b65d792c3e91b95827f0ac3e5
Author: Mattias Engdegård <mattiase@acm.org>
Commit: Mattias Engdegård <mattiase@acm.org>

    Recognise implicit gap in ASCII-raw ranges
    
    Character ranges between ASCII and raw bytes, such as "[a-\xc0]", do
    not include the intervening codes 80..3fff7f. That is, "[a-\xc0]"
    really means "[a-\x7f\x80-\xc0]" and does not match any non-ASCII
    Unicode character. This is also true for skip-sets.
    
    This change does two things: xr-lint no longer complains about
    "[a-\xc0λ]" since the λ is in the gap not matched by the range, and
    the conversion to rx now represents the gap explicitly:
    (any "a-\x7f\x80-\xc0" "λ").
---
 xr-test.el |  24 ++++++---
 xr.el      | 174 ++++++++++++++++++++++++++++++++++++-------------------------
 2 files changed, 120 insertions(+), 78 deletions(-)

diff --git a/xr-test.el b/xr-test.el
index 2d5d8859a4..e78c72d4f8 100644
--- a/xr-test.el
+++ b/xr-test.el
@@ -441,6 +441,13 @@
              (xr-lint "\\`\\{2\\}\\(a\\|\\|b\\)\\{,8\\}")
              '((2 . "Repetition of zero-width assertion")
                (17 . "Repetition of expression matching an empty string"))))
+    ;; The range "[\x70-\x8f]" only includes 70..7f and 3fff80..3fff8f;
+    ;; the gap 80..3fff7f is excluded.
+    (should (equal (xr-lint "[\x70-\x8f∃]") nil))
+    (should (equal (xr-lint "[\x70-\x8f\x7e-å]")
+                   '((4 . "Ranges `\x70-\\x7f' and `\x7e-å' overlap"))))
+    (should (equal (xr-lint "[\x70-\x8få-\x82]")
+                   '((4 . "Ranges `å-\\x82' and `\\x80-\\x8f' overlap"))))
     ))
 
 (ert-deftest xr-lint-repetition-of-empty ()
@@ -714,11 +721,11 @@
 
 (ert-deftest xr-skip-set ()
   (should (equal (xr-skip-set "0-9a-fA-F+*")
-                 '(any "0-9a-fA-F" "+*")))
+                 '(any "0-9A-Fa-f" "*+")))
   (should (equal (xr-skip-set "^ab-ex-")
                  '(not (any "b-e" "ax-"))))
   (should (equal (xr-skip-set "-^][\\")
-                 '(any "^][-")))
+                 '(any "[]^-")))
   (should (equal (xr-skip-set "\\^a\\-bc-\\fg")
                  '(any "c-f" "^abg-")))
   (should (equal (xr-skip-set "\\")
@@ -728,15 +735,15 @@
   (should (equal (xr-skip-set "^Q-\\c-\\n")
                  '(not (any "Q-c" "n-"))))
   (should (equal (xr-skip-set "\\\\A-")
-                 '(any "\\A-")))
+                 '(any "A\\-")))
   (should (equal (xr-skip-set "[a-z]")
                  '(any "a-z" "[]")))
   (should (equal (xr-skip-set "[:ascii:]-[:digit:]")
                  '(any "-" ascii digit)))
   (should (equal (xr-skip-set "A-[:blank:]")
-                 '(any "A-[" ":blank]")))
+                 '(any "A-[" ":]abkln")))
   (should (equal (xr-skip-set "\\[:xdigit:]-b")
-                 '(any "]-b" "[:xdigt")))
+                 '(any "]-b" ":[dgitx")))
   (should (equal (xr-skip-set "^a-z+" 'terse)
                  '(not (in "a-z" "+"))))
   (should-error (xr-skip-set "[::]"))
@@ -763,7 +770,7 @@
     (should (equal (xr-skip-set-lint "A-Fa-z3D-KM-N!3-7\\!b")
                    '((7 . "Ranges `A-F' and `D-K' overlap")
                      (10 . "Two-element range `M-N'")
-                     (14 . "Range `3-7' includes character `3'")
+                     (14 . "Character `3' included in range `3-7'")
                      (17 . "Duplicated character `!'")
                      (17 . "Unnecessarily escaped `!'")
                      (19 . "Character `b' included in range `a-z'"))))
@@ -787,6 +794,11 @@
                    nil))
     (should (equal (xr-skip-set-lint "A-Z-z")
                    '((3 . "Literal `-' not first or last"))))
+    (should (equal (xr-skip-set-lint "\x70-\x8f∃") nil))
+    (should (equal (xr-skip-set-lint "\x70-\x8f\x7e-å")
+                   '((3 . "Ranges `\x70-\\x7f' and `\x7e-å' overlap"))))
+    (should (equal (xr-skip-set-lint "\x70-\x8få-\x82")
+                   '((3 . "Ranges `å-\\x82' and `\\x80-\\x8f' overlap"))))
     ))
 
 (provide 'xr-test)
diff --git a/xr.el b/xr.el
index aa4089d552..95c0df27b2 100644
--- a/xr.el
+++ b/xr.el
@@ -129,6 +129,11 @@
         (let ((start ch)
               (end (char-after (+ (point) 2))))
           (cond
+           ((<= start #x7f #x3fff80 end)
+            ;; Intervals that go from ASCII (0-7f) to raw bytes
+            ;; (3fff80-3fffff) always exclude the intervening (Unicode) points.
+            (push (vector start #x7f (point)) intervals)
+            (push (vector #x3fff80 end (point)) intervals))
            ((<= start end)
             (push (vector start end (point)) intervals))
            ;; It's unlikely that anyone writes z-a by mistake; don't complain.
@@ -1458,7 +1463,7 @@ A-SETS and B-SETS are arguments to `any'."
 
   (let ((negated (eq (following-char) ?^))
         (start-pos (point))
-        (ranges nil)
+        (intervals nil)
         (classes nil))
     (when negated
       (forward-char)
@@ -1531,39 +1536,16 @@ A-SETS and B-SETS are arguments to `any'."
                            (format-message "Two-element range `%c-%c'"
                                            start end)
                            nil))))
-            (let ((tail ranges))
-              (while tail
-                (let ((range (car tail)))
-                  (if (and (<= (car range) (or end start))
-                           (<= start (cdr range)))
-                      (let ((msg
-                             (cond
-                              ((and end (< start end)
-                                    (< (car range) (cdr range)))
-                               (format-message
-                                "Ranges `%c-%c' and `%c-%c' overlap"
-                                (car range) (cdr range) start end))
-                              ((and end (< start end))
-                               (format-message
-                                "Range `%c-%c' includes character `%c'"
-                                start end (car range)))
-                              ((< (car range) (cdr range))
-                               (format-message
-                                "Character `%c' included in range `%c-%c'"
-                                start (car range) (cdr range)))
-                              (t
-                               (format-message "Duplicated character `%c'"
-                                               start)))))
-                        (xr--report warnings (point)
-                                    (xr--escape-string msg nil))
-                        ;; Expand previous interval to include this range.
-                        (setcar range (min (car range) start))
-                        (setcdr range (max (cdr range) (or end start)))
-                        (setq start nil)
-                        (setq tail nil))
-                    (setq tail (cdr tail))))))
-            (when start
-              (push (cons start (or end start)) ranges)))))
+          (cond
+           ((not end)
+            (push (vector start start (point)) intervals))
+           ((<= start #x7f #x3fff80 end)
+            ;; Intervals that go from ASCII (0-7f) to raw bytes
+            ;; (3fff80-3fffff) always exclude the intervening (Unicode) points.
+            (push (vector start #x7f (point)) intervals)
+            (push (vector #x3fff80 end (point)) intervals))
+           (t
+            (push (vector start end (point)) intervals))))))
 
        ((looking-at (rx "\\" eos))
         (xr--report warnings (point)
@@ -1571,51 +1553,99 @@ A-SETS and B-SETS are arguments to `any'."
 
       (goto-char (match-end 0)))
 
-    (when (and (null ranges) (null classes))
+    (when (and (null intervals) (null classes))
       (xr--report warnings (point-min)
                   (if negated
                       "Negated empty set matches anything"
                     "Empty set matches nothing")))
 
-    (cond
-     ;; Single non-negated character, like "-": make a string.
-     ((and (not negated)
-           (null classes)
-           (= (length ranges) 1)
-           (eq (caar ranges) (cdar ranges)))
-      (regexp-quote (char-to-string (caar ranges))))
-     ;; Negated empty set, like "^": anything.
-     ((and negated
-           (null classes)
-           (null ranges))
-      'anything)
-     ;; Single named class, like "[:nonascii:]": use the symbol.
-     ((and (= (length classes) 1)
-           (null ranges))
-      (if negated
-          (list 'not (car classes))
-        (car classes)))
-     ;; Anything else: produce (any ...)
-     (t
-      (let ((intervals nil)
+    (let* ((sorted (sort (nreverse intervals)
+                         (lambda (a b) (< (aref a 0) (aref b 0)))))
+           (s sorted))
+      (while (cdr s)
+        (let ((this (car s))
+              (next (cadr s)))
+          (if (>= (aref this 1) (aref next 0))
+              ;; Overlap.
+              (let ((message
+                     (cond
+                      ;; Duplicate character: drop it and warn.
+                      ((and (eq (aref this 0) (aref this 1))
+                            (eq (aref next 0) (aref next 1)))
+                       (format-message
+                        "Duplicated character `%c'"
+                        (aref this 0)))
+                      ;; Duplicate range: drop it and warn.
+                      ((and (eq (aref this 0) (aref next 0))
+                            (eq (aref this 1) (aref next 1)))
+                       (format-message
+                        "Duplicated range `%c-%c'"
+                        (aref this 0) (aref this 1)))
+                      ;; Character in range: drop it and warn.
+                      ((eq (aref this 0) (aref this 1))
+                       (setcar s next)
+                       (format-message
+                        "Character `%c' included in range `%c-%c'"
+                        (aref this 0) (aref next 0) (aref next 1)))
+                      ;; Same but other way around.
+                      ((eq (aref next 0) (aref next 1))
+                       (format-message
+                        "Character `%c' included in range `%c-%c'"
+                        (aref next 0) (aref this 0) (aref this 1)))
+                      ;; Overlapping ranges: merge and warn.
+                      (t
+                       (let ((this-end (aref this 1)))
+                         (aset this 1 (max (aref this 1) (aref next 1)))
+                         (format-message "Ranges `%c-%c' and `%c-%c' overlap"
+                                         (aref this 0) this-end
+                                         (aref next 0) (aref next 1)))))))
+                (xr--report warnings (max (aref this 2) (aref next 2))
+                            (xr--escape-string message nil))
+                (setcdr s (cddr s)))
+            ;; No overlap.
+            (setq s (cdr s)))))
+
+      (let ((ranges nil)
             (chars nil))
-        (dolist (range ranges)
-          (if (eq (car range) (cdr range))
-              (push (car range) chars)
-            (push (string (car range) ?- (cdr range)) intervals)))
-        ;; Put a single `-' last.
-        (when (memq ?- chars)
-          (setq chars (append (delq ?- chars) (list ?-))))
-        (let ((set (cons 'any
-                         (append
-                          (and intervals
-                               (list (apply #'concat intervals)))
-                          (and chars
-                               (list (apply #'string chars)))
-                          (nreverse classes)))))
+        (dolist (interv sorted)
+          (if (eq (aref interv 0) (aref interv 1))
+              (push (aref interv 0) chars)
+            (push (string (aref interv 0) ?- (aref interv 1))
+                  ranges)))
+
+        (cond
+         ;; Single non-negated character, like "-": make a string.
+         ((and (not negated)
+               (null classes)
+               (null ranges)
+               (= (length chars) 1))
+          (regexp-quote (char-to-string (car chars))))
+         ;; Negated empty set, like "^": anything.
+         ((and negated
+               (null classes)
+               (null intervals))
+          'anything)
+         ;; Single named class, like "[:nonascii:]": use the symbol.
+         ((and (= (length classes) 1)
+               (null intervals))
           (if negated
-              (list 'not set)
-            set)))))))
+              (list 'not (car classes))
+            (car classes)))
+         ;; Anything else: produce (any ...)
+         (t
+          ;; Put a single `-' last.
+          (when (memq ?- chars)
+            (setq chars (cons ?- (delq ?- chars))))
+          (let ((set (cons 'any
+                           (append
+                            (and ranges
+                                 (list (apply #'concat (nreverse ranges))))
+                            (and chars
+                                 (list (apply #'string (nreverse chars))))
+                            (nreverse classes)))))
+            (if negated
+                (list 'not set)
+              set))))))))
 
 (defun xr--parse-skip-set (skip-string warnings)
   (with-temp-buffer



reply via email to

[Prev in Thread] Current Thread [Next in Thread]