From 3de0366cb1a098ab205f2003b824107447d6e9e6 Mon Sep 17 00:00:00 2001 From: Peter Bex Date: Thu, 2 Jan 2020 20:46:44 +0100 Subject: [PATCH] Update irregex to the 0.9.7 release (upstream commit 353b8db8) This makes the behaviour of irregex-fold and irregex-split more in line with expectations (also as compared to other regex engines). --- NEWS | 2 ++ irregex-core.scm | 22 ++++++++++++++++++---- manual/Module (chicken irregex) | 18 ++++++++++++++++++ tests/test-irregex.scm | 10 ++++++++-- 4 files changed, 46 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index 004baf15..6c30dece 100644 --- a/NEWS +++ b/NEWS @@ -17,6 +17,8 @@ letter is present in the supplied path string. - irregex-replace[/all] have been fixed for empty matches, so they will no longer drop characters and ignore the replacement (#1661). + - Irregex has been updated to upstream 0.9.7, which also improves + how empty matches are treated in irregex-fold and irregex-split. - Runtime system - Quoted empty keywords like ||: and :|| are now read like prescribed diff --git a/irregex-core.scm b/irregex-core.scm index badc11c0..9bcf7e0b 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -30,6 +30,8 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History +;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold, +;; -replace and -split ;; 0.9.6: 2016/12/05 - fixed exponential memory use of + in compilation ;; of backtracking matcher (CVE-2016-9954).
;; 0.9.5: 2016/09/10 - fixed a bug in irregex-fold handling of bow @@ -4024,9 +4026,9 @@ irx (lambda (i m a) (cond - ;; ((= i (%irregex-match-end-index m 0)) - ;; ;; empty match, just include the char - ;; (cons (substring str i (+ i 1)) a)) + ((= i (%irregex-match-end-index m 0)) + ;; empty match, include the skipped char to rejoin in finish + (cons (string-ref str i) a)) ((= i (%irregex-match-start-index m 0)) a) (else @@ -4034,6 +4036,18 @@ '() str (lambda (i a) - (reverse (if (= i end) a (cons (substring str i end) a)))) + (let lp ((ls (if (= i end) a (cons (substring str i end) a))) + (res '()) + (was-char? #f)) + (cond + ((null? ls) res) + ((char? (car ls)) + (lp (cdr ls) + (if (or was-char? (null? res)) + (cons (string (car ls)) res) + (cons (string-append (string (car ls)) (car res)) + (cdr res))) + #t)) + (else (lp (cdr ls) (cons (car ls) res) #f))))) start end))) diff --git a/manual/Module (chicken irregex) b/manual/Module (chicken irregex) index 82061397..ba6b7c36 100644 --- a/manual/Module (chicken irregex) +++ b/manual/Module (chicken irregex) @@ -218,6 +218,8 @@ Examples: (irregex-replace/all "[aeiou]" "hello world" "*") => "h*ll* w*rld" +(irregex-replace/all '(* "poo ") "poo poo platter" "*") => "**p*l*a*t*t*e*r" + (irregex-replace "(.)(.)" "ab" 2 1 "*") => "ba*" (irregex-replace "...bar" "xxfoobar" (lambda (m) @@ -238,6 +240,16 @@ by the pattern in {{<irx>}}. {{irregex-extract}} does the opposite, returning a list of each instance of the pattern matched disregarding the substrings in between. +Empty matches will result in subsequent single-character strings in +{{irregex-split}}, or empty strings in {{irregex-extract}}.
+ + +(irregex-split "[aeiou]*" "foobarbaz") => '("f" "b" "r" "b" "z") + +(irregex-extract "[aeiou]*" "foobarbaz") => '("" "oo" "" "a" "" "" "a" "") + + + ==== irregex-fold (irregex-fold <irx> <kons> <knil> <str> [<finish> <start> <end>]) @@ -285,6 +297,12 @@ To extract all instances of a match out of a string, you can use (lambda (i s) (reverse s)))) +Note that if an empty match is found, {{<kons>}} will be called on that +empty string, and to avoid an infinite loop matching will resume at +the next char. It is up to the programmer to do something sensible +with the skipped char in this case. + + === Extended SRE Syntax Irregex provides the first native implementation of SREs (Scheme diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 1bb63a58..59268364 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -397,8 +397,14 @@ (irregex-replace/all '(* "poo ") "poo poo platter" "*")) (test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo") (irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo")) - ;; (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z") - ;; (irregex-split (irregex "") "foobarbaz")) + (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z") + (irregex-split (irregex "") "foobarbaz")) + (test-equal '("f" "b" "r" "b" "z") + (irregex-split (irregex "[aeiou]*") "foobarbaz")) + (test-equal '("" "oo" "" "a" "" "" "a" "") + (irregex-extract (irregex "[aeiou]*") "foobarbaz")) + (test-equal '("Line 1\n" "Line 2\n" "Line 3") + (irregex-split 'bol "Line 1\nLine 2\nLine 3")) ) -- 2.20.1