[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/xr 6d02ab45b6 1/2: Recognise implicit gap in ASCII-raw
From: |
Mattias Engdegård |
Subject: |
[elpa] externals/xr 6d02ab45b6 1/2: Recognise implicit gap in ASCII-raw ranges |
Date: |
Mon, 11 Jul 2022 14:21:03 -0400 (EDT) |
branch: externals/xr
commit 6d02ab45b6e8a99b65d792c3e91b95827f0ac3e5
Author: Mattias Engdegård <mattiase@acm.org>
Commit: Mattias Engdegård <mattiase@acm.org>
Recognise implicit gap in ASCII-raw ranges
Character ranges between ASCII and raw bytes, such as "[a-\xc0]", do
not include the intervening codes 80..3fff7f. That is, "[a-\xc0]"
really means "[a-\x7f\x80-\xc0]" and does not match any non-ASCII
Unicode character. This is also true for skip-sets.
This change does two things: xr-lint no longer complains about
"[a-\xc0λ]" since the λ is in the gap not matched by the range, and
the conversion to rx now represents the gap explicitly:
(any "a-\x7f\x80-\xc0" "λ").
---
xr-test.el | 24 ++++++---
xr.el | 174 ++++++++++++++++++++++++++++++++++++-------------------------
2 files changed, 120 insertions(+), 78 deletions(-)
diff --git a/xr-test.el b/xr-test.el
index 2d5d8859a4..e78c72d4f8 100644
--- a/xr-test.el
+++ b/xr-test.el
@@ -441,6 +441,13 @@
(xr-lint "\\`\\{2\\}\\(a\\|\\|b\\)\\{,8\\}")
'((2 . "Repetition of zero-width assertion")
(17 . "Repetition of expression matching an empty string"))))
+ ;; The range "[\x70-\x8f]" only includes 70..7f and 3fff80..3fff8f;
+ ;; the gap 80..3fff7f is excluded.
+ (should (equal (xr-lint "[\x70-\x8f∃]") nil))
+ (should (equal (xr-lint "[\x70-\x8f\x7e-å]")
+ '((4 . "Ranges `\x70-\\x7f' and `\x7e-å' overlap"))))
+ (should (equal (xr-lint "[\x70-\x8få-\x82]")
+ '((4 . "Ranges `å-\\x82' and `\\x80-\\x8f' overlap"))))
))
(ert-deftest xr-lint-repetition-of-empty ()
@@ -714,11 +721,11 @@
(ert-deftest xr-skip-set ()
(should (equal (xr-skip-set "0-9a-fA-F+*")
- '(any "0-9a-fA-F" "+*")))
+ '(any "0-9A-Fa-f" "*+")))
(should (equal (xr-skip-set "^ab-ex-")
'(not (any "b-e" "ax-"))))
(should (equal (xr-skip-set "-^][\\")
- '(any "^][-")))
+ '(any "[]^-")))
(should (equal (xr-skip-set "\\^a\\-bc-\\fg")
'(any "c-f" "^abg-")))
(should (equal (xr-skip-set "\\")
@@ -728,15 +735,15 @@
(should (equal (xr-skip-set "^Q-\\c-\\n")
'(not (any "Q-c" "n-"))))
(should (equal (xr-skip-set "\\\\A-")
- '(any "\\A-")))
+ '(any "A\\-")))
(should (equal (xr-skip-set "[a-z]")
'(any "a-z" "[]")))
(should (equal (xr-skip-set "[:ascii:]-[:digit:]")
'(any "-" ascii digit)))
(should (equal (xr-skip-set "A-[:blank:]")
- '(any "A-[" ":blank]")))
+ '(any "A-[" ":]abkln")))
(should (equal (xr-skip-set "\\[:xdigit:]-b")
- '(any "]-b" "[:xdigt")))
+ '(any "]-b" ":[dgitx")))
(should (equal (xr-skip-set "^a-z+" 'terse)
'(not (in "a-z" "+"))))
(should-error (xr-skip-set "[::]"))
@@ -763,7 +770,7 @@
(should (equal (xr-skip-set-lint "A-Fa-z3D-KM-N!3-7\\!b")
'((7 . "Ranges `A-F' and `D-K' overlap")
(10 . "Two-element range `M-N'")
- (14 . "Range `3-7' includes character `3'")
+ (14 . "Character `3' included in range `3-7'")
(17 . "Duplicated character `!'")
(17 . "Unnecessarily escaped `!'")
(19 . "Character `b' included in range `a-z'"))))
@@ -787,6 +794,11 @@
nil))
(should (equal (xr-skip-set-lint "A-Z-z")
'((3 . "Literal `-' not first or last"))))
+ (should (equal (xr-skip-set-lint "\x70-\x8f∃") nil))
+ (should (equal (xr-skip-set-lint "\x70-\x8f\x7e-å")
+ '((3 . "Ranges `\x70-\\x7f' and `\x7e-å' overlap"))))
+ (should (equal (xr-skip-set-lint "\x70-\x8få-\x82")
+ '((3 . "Ranges `å-\\x82' and `\\x80-\\x8f' overlap"))))
))
(provide 'xr-test)
diff --git a/xr.el b/xr.el
index aa4089d552..95c0df27b2 100644
--- a/xr.el
+++ b/xr.el
@@ -129,6 +129,11 @@
(let ((start ch)
(end (char-after (+ (point) 2))))
(cond
+ ((<= start #x7f #x3fff80 end)
+ ;; Intervals that go from ASCII (0-7f) to raw bytes
+ ;; (3fff80-3fffff) always exclude the intervening (Unicode) points.
+ (push (vector start #x7f (point)) intervals)
+ (push (vector #x3fff80 end (point)) intervals))
((<= start end)
(push (vector start end (point)) intervals))
;; It's unlikely that anyone writes z-a by mistake; don't complain.
@@ -1458,7 +1463,7 @@ A-SETS and B-SETS are arguments to `any'."
(let ((negated (eq (following-char) ?^))
(start-pos (point))
- (ranges nil)
+ (intervals nil)
(classes nil))
(when negated
(forward-char)
@@ -1531,39 +1536,16 @@ A-SETS and B-SETS are arguments to `any'."
(format-message "Two-element range `%c-%c'"
start end)
nil))))
- (let ((tail ranges))
- (while tail
- (let ((range (car tail)))
- (if (and (<= (car range) (or end start))
- (<= start (cdr range)))
- (let ((msg
- (cond
- ((and end (< start end)
- (< (car range) (cdr range)))
- (format-message
- "Ranges `%c-%c' and `%c-%c' overlap"
- (car range) (cdr range) start end))
- ((and end (< start end))
- (format-message
- "Range `%c-%c' includes character `%c'"
- start end (car range)))
- ((< (car range) (cdr range))
- (format-message
- "Character `%c' included in range `%c-%c'"
- start (car range) (cdr range)))
- (t
- (format-message "Duplicated character `%c'"
- start)))))
- (xr--report warnings (point)
- (xr--escape-string msg nil))
- ;; Expand previous interval to include this range.
- (setcar range (min (car range) start))
- (setcdr range (max (cdr range) (or end start)))
- (setq start nil)
- (setq tail nil))
- (setq tail (cdr tail))))))
- (when start
- (push (cons start (or end start)) ranges)))))
+ (cond
+ ((not end)
+ (push (vector start start (point)) intervals))
+ ((<= start #x7f #x3fff80 end)
+ ;; Intervals that go from ASCII (0-7f) to raw bytes
+ ;; (3fff80-3fffff) always exclude the intervening (Unicode) points.
+ (push (vector start #x7f (point)) intervals)
+ (push (vector #x3fff80 end (point)) intervals))
+ (t
+ (push (vector start end (point)) intervals))))))
((looking-at (rx "\\" eos))
(xr--report warnings (point)
@@ -1571,51 +1553,99 @@ A-SETS and B-SETS are arguments to `any'."
(goto-char (match-end 0)))
- (when (and (null ranges) (null classes))
+ (when (and (null intervals) (null classes))
(xr--report warnings (point-min)
(if negated
"Negated empty set matches anything"
"Empty set matches nothing")))
- (cond
- ;; Single non-negated character, like "-": make a string.
- ((and (not negated)
- (null classes)
- (= (length ranges) 1)
- (eq (caar ranges) (cdar ranges)))
- (regexp-quote (char-to-string (caar ranges))))
- ;; Negated empty set, like "^": anything.
- ((and negated
- (null classes)
- (null ranges))
- 'anything)
- ;; Single named class, like "[:nonascii:]": use the symbol.
- ((and (= (length classes) 1)
- (null ranges))
- (if negated
- (list 'not (car classes))
- (car classes)))
- ;; Anything else: produce (any ...)
- (t
- (let ((intervals nil)
+ (let* ((sorted (sort (nreverse intervals)
+ (lambda (a b) (< (aref a 0) (aref b 0)))))
+ (s sorted))
+ (while (cdr s)
+ (let ((this (car s))
+ (next (cadr s)))
+ (if (>= (aref this 1) (aref next 0))
+ ;; Overlap.
+ (let ((message
+ (cond
+ ;; Duplicate character: drop it and warn.
+ ((and (eq (aref this 0) (aref this 1))
+ (eq (aref next 0) (aref next 1)))
+ (format-message
+ "Duplicated character `%c'"
+ (aref this 0)))
+ ;; Duplicate range: drop it and warn.
+ ((and (eq (aref this 0) (aref next 0))
+ (eq (aref this 1) (aref next 1)))
+ (format-message
+ "Duplicated range `%c-%c'"
+ (aref this 0) (aref this 1)))
+ ;; Character in range: drop it and warn.
+ ((eq (aref this 0) (aref this 1))
+ (setcar s next)
+ (format-message
+ "Character `%c' included in range `%c-%c'"
+ (aref this 0) (aref next 0) (aref next 1)))
+ ;; Same but other way around.
+ ((eq (aref next 0) (aref next 1))
+ (format-message
+ "Character `%c' included in range `%c-%c'"
+ (aref next 0) (aref this 0) (aref this 1)))
+ ;; Overlapping ranges: merge and warn.
+ (t
+ (let ((this-end (aref this 1)))
+ (aset this 1 (max (aref this 1) (aref next 1)))
+ (format-message "Ranges `%c-%c' and `%c-%c' overlap"
+ (aref this 0) this-end
+ (aref next 0) (aref next 1)))))))
+ (xr--report warnings (max (aref this 2) (aref next 2))
+ (xr--escape-string message nil))
+ (setcdr s (cddr s)))
+ ;; No overlap.
+ (setq s (cdr s)))))
+
+ (let ((ranges nil)
(chars nil))
- (dolist (range ranges)
- (if (eq (car range) (cdr range))
- (push (car range) chars)
- (push (string (car range) ?- (cdr range)) intervals)))
- ;; Put a single `-' last.
- (when (memq ?- chars)
- (setq chars (append (delq ?- chars) (list ?-))))
- (let ((set (cons 'any
- (append
- (and intervals
- (list (apply #'concat intervals)))
- (and chars
- (list (apply #'string chars)))
- (nreverse classes)))))
+ (dolist (interv sorted)
+ (if (eq (aref interv 0) (aref interv 1))
+ (push (aref interv 0) chars)
+ (push (string (aref interv 0) ?- (aref interv 1))
+ ranges)))
+
+ (cond
+ ;; Single non-negated character, like "-": make a string.
+ ((and (not negated)
+ (null classes)
+ (null ranges)
+ (= (length chars) 1))
+ (regexp-quote (char-to-string (car chars))))
+ ;; Negated empty set, like "^": anything.
+ ((and negated
+ (null classes)
+ (null intervals))
+ 'anything)
+ ;; Single named class, like "[:nonascii:]": use the symbol.
+ ((and (= (length classes) 1)
+ (null intervals))
(if negated
- (list 'not set)
- set)))))))
+ (list 'not (car classes))
+ (car classes)))
+ ;; Anything else: produce (any ...)
+ (t
+ ;; Put a single `-' last.
+ (when (memq ?- chars)
+ (setq chars (cons ?- (delq ?- chars))))
+ (let ((set (cons 'any
+ (append
+ (and ranges
+ (list (apply #'concat (nreverse ranges))))
+ (and chars
+ (list (apply #'string (nreverse chars))))
+ (nreverse classes)))))
+ (if negated
+ (list 'not set)
+ set))))))))
(defun xr--parse-skip-set (skip-string warnings)
(with-temp-buffer