From d210eaac4762a7b5d95405b8a6b990329f941760 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Thu, 9 Nov 2017 13:29:08 +0100 Subject: [PATCH] Fix an error in unicode-range->utf8-pattern The sequence generated for a utf8 character class contained an unintended trailing '(), causing the code to fail when `sre-length-ranges' is called. Reported by Chunyang Xu at CHICKEN-users. Signed-off-by: Peter Bex --- NEWS | 2 ++ irregex-core.scm | 7 +++---- tests/test-irregex.scm | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index c849aede..eff9865b 100644 --- a/NEWS +++ b/NEWS @@ -138,6 +138,8 @@ on s8vectors (thanks to Kristian Lein-Mathisen). - Large literals no longer crash with "invalid encoded numeric literal" on mingw-64 (#1344, thanks to Lemonboy). + - Unit irregex: Fix bug that prevented multibyte UTF-8 character sets + from being matched correctly (Thanks to Lemonboy and Chunyang Xu). - Runtime system: - The profiler no longer uses malloc from a signal handler which may diff --git a/irregex-core.scm b/irregex-core.scm index c83aff9b..bef8336e 100644 --- a/irregex-core.scm +++ b/irregex-core.scm @@ -1402,12 +1402,11 @@ (unicode-range-up-to hi-ls))) (let lp ((lo-ls lo-ls) (hi-ls hi-ls)) (cond - ((null? lo-ls) - '()) ((= (car lo-ls) (car hi-ls)) (sre-sequence - (list (integer->char (car lo-ls)) - (lp (cdr lo-ls) (cdr hi-ls))))) + (cons (integer->char (car lo-ls)) + (if (null? (cdr lo-ls)) '() + (cons (lp (cdr lo-ls) (cdr hi-ls)) '()))))) ((= (+ (car lo-ls) 1) (car hi-ls)) (sre-alternate (list (unicode-range-up-from lo-ls) (unicode-range-up-to hi-ls)))) diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm index 19218bd8..d7bfaf59 100644 --- a/tests/test-irregex.scm +++ b/tests/test-irregex.scm @@ -539,6 +539,8 @@ (test-assert (not (irregex-search "(?u:<[^あ-ん語]*>)" "<ひらがな>"))) (test-assert (not (irregex-search "(?u:<[^あ-ん語]*>)" "<語>"))) +(test-assert (not (irregex-search (irregex "[一二]" 'utf8 #t) "三四"))) + (test-end) (test-exit) -- 2.11.0