From 9aa8bbc642d1b4bb9870327dd3dff6e200f0bd27 Mon Sep 17 00:00:00 2001
From: LemonBoy
Date: Thu, 9 Nov 2017 13:29:08 +0100
Subject: [PATCH] Fix an error in unicode-range->utf8-pattern

The sequence generated for a utf8 character class contained an unintended
trailing '(), causing the code to fail when `sre-length-ranges' is
called.

Reported by Chunyang Xu at CHICKEN-users.

Signed-off-by: Peter Bex
---
 NEWS                   | 2 ++
 irregex-core.scm       | 7 +++----
 tests/test-irregex.scm | 2 ++
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/NEWS b/NEWS
index 397fd532..46d2c0eb 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,8 @@
     on s8vectors (thanks to Kristian Lein-Mathisen).
   - Large literals no longer crash with "invalid encoded numeric literal"
     on mingw-64 (#1344, thanks to Lemonboy).
+  - Unit irregex: Fix bug that prevented multibyte UTF-8 character sets
+    from being matched correctly (Thanks to Lemonboy and Chunyang Xu).
 
 - Runtime system:
   - The profiler no longer uses malloc from a signal handler which may
diff --git a/irregex-core.scm b/irregex-core.scm
index 7ac043d3..ba6d1f72 100644
--- a/irregex-core.scm
+++ b/irregex-core.scm
@@ -1407,12 +1407,11 @@
                              (unicode-range-up-to hi-ls)))
         (let lp ((lo-ls lo-ls) (hi-ls hi-ls))
           (cond
-           ((null? lo-ls)
-            '())
            ((= (car lo-ls) (car hi-ls))
             (sre-sequence
-             (list (integer->char (car lo-ls))
-                   (lp (cdr lo-ls) (cdr hi-ls)))))
+             (cons (integer->char (car lo-ls))
+                   (if (null? (cdr lo-ls)) '()
+                       (cons (lp (cdr lo-ls) (cdr hi-ls)) '())))))
            ((= (+ (car lo-ls) 1) (car hi-ls))
             (sre-alternate (list (unicode-range-up-from lo-ls)
                                  (unicode-range-up-to hi-ls))))
diff --git a/tests/test-irregex.scm b/tests/test-irregex.scm
index 1a460549..9a5402c4 100644
--- a/tests/test-irregex.scm
+++ b/tests/test-irregex.scm
@@ -538,5 +538,7 @@
 (test-assert (not (irregex-search "(?u:<[^あ-ん語]*>)" "<ひらがな>")))
 (test-assert (not (irregex-search "(?u:<[^あ-ん語]*>)" "<語>")))
 
+(test-assert (not (irregex-search (irregex "[一二]" 'utf8 #t) "三四")))
+
 (test-end)
 
-- 
2.11.0