From efe932f4fa7afbc56865d33edfbf6836c34ce919 Mon Sep 17 00:00:00 2001 From: Peter Bex Date: Tue, 6 Jul 2021 15:15:34 +0200 Subject: [PATCH] Bump irregex to upstream commit 29334af, bringing us to version 0.9.10 This fixes upstream ticket #25, where newlines would overlap with "bol" in situations where a string matches multiple times due to inconsistent handling. --- NEWS | 6 ++++-- irregex-core.scm | 16 +++++++++++----- tests/test-irregex.scm | 4 ++++ 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index 53a40f0f..2e254e48 100644 --- a/NEWS +++ b/NEWS @@ -6,15 +6,17 @@ - Fixed a bug where optimisations for `irregex-match?` would cause runtime errors due to the inlined specialisations not being fully-expanded (see #1690). - - Irregex has been updated to upstream 0.9.9, which fixes behaviour + - Irregex has been updated to upstream 0.9.10, which fixes behaviour of irregex-replace/all with positive lookbehind so all matches are replaced instead of only the first (reported by Kay Rhodes), and a regression regarding replacing empty matches which was introduced by the fixes in 0.9.7 (reported by Sandra Snan). Also, the http-url shorthand now allows any top-level domain and the old "top-level-domain" now also supports "edu" (fixed by Sandra Snan). - Finally, a problem was fixed with capturing groups inside a kleene + Also, a problem was fixed with capturing groups inside a kleene star, which could sometimes return incorrect parts of the match. + Finally, "bol" handling was fixed to handle newlines consistently + so that multiple matches don't overlap (reported by Sandra Snan). - current-milliseconds has been deprecated in favor of the name current-process-milliseconds, to avoid confusion due to naming of current-milliseconds versus current-seconds, which do something diff --git a/irregex-core.scm b/irregex-core.scm index f86b7992..55e9a6c0 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -30,6 +30,10 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;; History +;; 0.9.10: 2021/07/06 - fixes for submatches under kleene star, empty seqs +;; in alternations, and bol in folds for backtracking +;; matcher (thanks John Clements and snan for reporting +;; and Peter Bex for fixing) ;; 0.9.9: 2021/05/14 - more comprehensive fix for repeated empty matches ;; 0.9.8: 2020/07/13 - fix irregex-replace/all with look-behind patterns ;; 0.9.7: 2019/12/31 - more intuitive handling of empty matches in -fold, @@ -3508,9 +3512,10 @@ (fail)))) ((bol) (lambda (cnk init src str i end matches fail) - (if (or (and (eq? src (car init)) (eqv? i (cdr init))) - (and (> i ((chunker-get-start cnk) src)) - (eqv? #\newline (string-ref str (- i 1))))) + (if (let ((ch (if (> i ((chunker-get-start cnk) src)) + (string-ref str (- i 1)) + (chunker-prev-char cnk init src)))) + (or (not ch) (eqv? #\newline ch))) (next cnk init src str i end matches fail) (fail)))) ((bow) @@ -3908,13 +3913,14 @@ matches))) (if (not m) (finish from acc) - (let ((j (%irregex-match-end-index m 0)) + (let ((j-start (%irregex-match-start-index m 0)) + (j (%irregex-match-end-index m 0)) (acc (kons from m acc))) (irregex-reset-matches! matches) (cond ((flag-set? (irregex-flags irx) ~consumer?) (finish j acc)) - ((= j i) + ((= j j-start) ;; skip one char forward if we match the empty string (lp (list str j end) j (+ j 1) acc)) (else diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 5cf5b685..0888f09b 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -451,6 +451,10 @@ (irregex-extract (irregex "[aeiou]*") "foobarbaz")) (test-equal '("Line 1\n" "Line 2\n" "Line 3") (irregex-split 'bol "Line 1\nLine 2\nLine 3")) + (test-equal '("foo\n" "bar\n" "baz\n") + (irregex-extract '(: bol (+ alpha) newline) "\nfoo\nbar\nbaz\n")) + (test-equal '("\nblah" "\nblah" "\nblah") + (irregex-extract '(: newline "blah" eol) "\nblah\nblah\nblah\n")) ) -- 2.20.1