From 15993532a4cdda8f5d4d942508854e3f157af5c8 Mon Sep 17 00:00:00 2001 From: Peter Bex Date: Mon, 30 Dec 2019 18:31:35 +0100 Subject: [PATCH] Update irregex to latest upstream (b194cab) to fix #1661 irregex-replace and irregex-replace/all now work properly for empty matches: they don't drop characters and will actually insert the replacement in the output string. --- NEWS | 2 + irregex-core.scm | 88 +++++++++++++++++++++++++----------------- irregex-utils.scm | 4 +- tests/test-irregex.scm | 11 +++++- 4 files changed, 66 insertions(+), 39 deletions(-) diff --git a/NEWS b/NEWS index 05b4dfe9..004baf15 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,8 @@ `define-record-printer` which isn't a "real" definition (see #1294). - On Windows, `decompose-directory` no longer crashes when a drive letter is present in the supplied path string. + - irregex-replace[/all] have been fixed for empty matches, so they + will no longer drop characters and ignore the replacement (#1661). - Runtime system - Quoted empty keywords like ||: and :|| are now read like prescribed diff --git a/irregex-core.scm b/irregex-core.scm index f26e8de6..badc11c0 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -1653,25 +1653,36 @@ (null? (cddr sre)) (sre-repeater? (cadr sre)))))) -(define (sre-searcher? sre) +(define (sre-bos? sre) (if (pair? sre) (case (car sre) - ((* +) (sre-any? (sre-sequence (cdr sre)))) ((seq : $ submatch => submatch-named) - (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) - ((or) (every sre-searcher? (cdr sre))) + (and (pair? (cdr sre)) (sre-bos? (cadr sre)))) + ((or) (every sre-bos? (cdr sre))) (else #f)) (eq? 'bos sre))) +;; a searcher doesn't need explicit iteration to find the first match +(define (sre-searcher? sre) + (or (sre-bos? sre) + (and (pair? sre) + (case (car sre) + ((* +) (sre-any? (sre-sequence (cdr sre)))) + ((seq : $ submatch => submatch-named) + (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) + ((or) (every sre-searcher? (cdr sre))) + (else #f))))) + +;; a consumer doesn't need to match more than once (define (sre-consumer? sre) - (if (pair? sre) - (case (car sre) - ((* +) (sre-any? (sre-sequence (cdr sre)))) - ((seq : $ submatch => submatch-named) - (and (pair? (cdr sre)) (sre-consumer? (last sre)))) - ((or) (every sre-consumer? (cdr sre))) - (else #f)) - (eq? 'eos sre))) + (or (sre-bos? sre) + (and (pair? sre) + (case (car sre) + ((* +) (sre-any? (sre-sequence (cdr sre)))) + ((seq : $ submatch => submatch-named) + (and (pair? (cdr sre)) (sre-consumer? (last sre)))) + ((or) (every sre-consumer? (cdr sre))) + (else #f))))) (define (sre-has-submatches? sre) (and (pair? sre) @@ -3877,18 +3888,17 @@ matches))) (if (not m) (finish i acc) - (let ((j (%irregex-match-end-index m 0))) - (if (= j i) - ;; skip one char forward if we match the empty string - (lp (list str (+ j 1) end) (+ j 1) acc) - (let ((acc (kons i m acc))) - (irregex-reset-matches! matches) - ;; no need to continue looping if this is a - ;; searcher - it's already consumed the only - ;; available match - (if (flag-set? (irregex-flags irx) ~searcher?) - (finish j acc) - (lp (list str j end) j acc))))))))))) + (let ((j (%irregex-match-end-index m 0)) + (acc (kons i m acc))) + (irregex-reset-matches! matches) + (cond + ((flag-set? (irregex-flags irx) ~consumer?) + (finish j acc)) + ((= j i) + ;; skip one char forward if we match the empty string + (lp (list str (+ j 1) end) (+ j 1) acc)) + (else + (lp (list str j end) j acc)))))))))) (define (irregex-fold irx kons . args) (if (not (procedure? kons)) (error 'irregex-fold "not a procedure" kons)) @@ -3920,10 +3930,7 @@ (lp end-src (+ end-index 1) acc)) (let ((acc (kons start i m acc))) (irregex-reset-matches! matches) - ;; no need to continue looping if this is a - ;; searcher - it's already consumed the only - ;; available match - (if (flag-set? (irregex-flags irx) ~searcher?) + (if (flag-set? (irregex-flags irx) ~consumer?) (finish end-src end-index acc) (lp end-src end-index acc))))))))))) @@ -3948,11 +3955,15 @@ (irregex-fold/fast irx (lambda (i m acc) - (let ((m-start (%irregex-match-start-index m 0))) - (append (irregex-apply-match m o) - (if (>= i m-start) - acc - (cons (substring str i m-start) acc))))) + (let* ((m-start (%irregex-match-start-index m 0)) + (res (if (>= i m-start) + (append (irregex-apply-match m o) acc) + (append (irregex-apply-match m o) + (cons (substring str i m-start) acc))))) + ;; include the skipped char on empty matches + (if (= i (%irregex-match-end-index m 0)) + (cons (substring str i (+ i 1)) res) + res))) '() str (lambda (i acc) @@ -4012,9 +4023,14 @@ (irregex-fold/fast irx (lambda (i m a) - (if (= i (%irregex-match-start-index m 0)) - a - (cons (substring str i (%irregex-match-start-index m 0)) a))) + (cond + ;; ((= i (%irregex-match-end-index m 0)) + ;; ;; empty match, just include the char + ;; (cons (substring str i (+ i 1)) a)) + ((= i (%irregex-match-start-index m 0)) + a) + (else + (cons (substring str i (%irregex-match-start-index m 0)) a)))) '() str (lambda (i a) diff --git a/irregex-utils.scm b/irregex-utils.scm index a2195a91..291b03ea 100644 --- a/irregex-utils.scm +++ b/irregex-utils.scm @@ -121,10 +121,10 @@ (display "]" out)) ((- & / ~) (cond - ((or (eq? #\~ (car x)) + ((or (eqv? #\~ (car x)) (and (eq? '- (car x)) (pair? (cdr x)) (eq? 'any (cadr x)))) (display "[^" out) - (display (cset->string (if (eq? #\~ (car x)) (cdr x) (cddr x))) out) + (display (cset->string (if (eqv? #\~ (car x)) (cdr x) (cddr x))) out) (display "]" out)) (else (lp `(cset ,@(sre->cset x)))))) diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 8626b82c..1bb63a58 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -361,7 +361,10 @@ (lambda (src i s) (reverse s)))) (test-equal '("poo poo ") (irregex-fold '(* "poo ") - (lambda (i m s) (cons (irregex-match-substring m) s)) + (lambda (i m s) + (if (< i (irregex-match-end-index m 0)) + (cons (irregex-match-substring m) s) + s)) '() "poo poo platter")) (test-equal "* x " @@ -388,8 +391,14 @@ (test-equal "xaac" (irregex-replace/all (irregex '(or (seq bos "a") "b") 'dfa) "aaac" "x")) + (test-equal "*Line 1\n*Line 2" + (irregex-replace/all 'bol "Line 1\nLine 2" "*")) + (test-equal "**p*l*a*t*t*e*r" + (irregex-replace/all '(* "poo ") "poo poo platter" "*")) (test-equal '("foo" " " "foo" " " "b" "a" "r" " " "foo") (irregex-extract '(or (: bow "foo" eow) any) "foo foo bar foo")) + ;; (test-equal '("f" "o" "o" "b" "a" "r" "b" "a" "z") + ;; (irregex-split (irregex "") "foobarbaz")) ) -- 2.20.1