>From 9194c8e9f9ca7315c2e8c25a7986d0690fb31d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlo=20Marcelo=20Arenas=20Bel=C3=B3n?= Date: Thu, 20 Apr 2023 18:37:20 -0700 Subject: [PATCH] pcre: workaround bug affecting \W or \D PCRE2 has a bug when using PCRE2_MATCH_INVALID_UTF that would randomly fail to match patterns using \W or \D. * NEWS: mention this * src/pcresearch.c: do not use the problematic flag with the affected versions of PCRE2 * tests: add new pcre-utf8-bug224 test --- NEWS | 5 +++++ src/pcresearch.c | 23 ++++++++++++++--------- tests/Makefile.am | 1 + tests/pcre-utf8-bug224 | 31 +++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+), 9 deletions(-) create mode 100755 tests/pcre-utf8-bug224 diff --git a/NEWS b/NEWS index f16c576..8e371dc 100644 --- a/NEWS +++ b/NEWS @@ -15,6 +15,11 @@ GNU grep NEWS -*- outline -*- when running on 32-bit x86 and ARM hosts using glibc 2.34+. [bug introduced in grep 3.9] + grep no longer fails to match patterns with \D or \W when linked to + PCRE2 10.34 or newer. + [bug introduced in grep 3.8] + + ** Changes in behavior grep --version now prints a line describing the version of PCRE2 it uses. diff --git a/src/pcresearch.c b/src/pcresearch.c index 1f82932..6ef0d2e 100644 --- a/src/pcresearch.c +++ b/src/pcresearch.c @@ -58,6 +58,9 @@ struct pcre_comp /* Table, indexed by ! (flag & PCRE2_NOTBOL), of whether the empty string matches when that flag is used. */ int empty_match[2]; + + /* Flags */ + unsigned binary_safe:1; }; /* Memory allocation functions for PCRE. */ @@ -130,16 +133,11 @@ jit_exec (struct pcre_comp *pc, char const *subject, idx_t search_bytes, } } -/* Return true if E is an error code for bad UTF-8, and if pcre2_match - could return E because PCRE lacks PCRE2_MATCH_INVALID_UTF. 
*/ +/* Return true if E is an error code for bad UTF-8 */ static bool bad_utf8_from_pcre2 (int e) { -#ifdef PCRE2_MATCH_INVALID_UTF - return false; -#else return PCRE2_ERROR_UTF8_ERR21 <= e && e <= PCRE2_ERROR_UTF8_ERR1; -#endif } /* Compile the -P style PATTERN, containing SIZE bytes that are @@ -157,6 +155,7 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) = pcre2_general_context_create (private_malloc, private_free, NULL); pcre2_compile_context *ccontext = pcre2_compile_context_create (gcontext); + pc->binary_safe = false; if (localeinfo.multibyte) { uint32_t unicode; @@ -181,8 +180,14 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact) flags |= PCRE2_NEVER_BACKSLASH_C; #endif #ifdef PCRE2_MATCH_INVALID_UTF - /* Consider invalid UTF-8 as a barrier, instead of error. */ - flags |= PCRE2_MATCH_INVALID_UTF; + /* workaround PCRE2 bug + https://github.com/PCRE2Project/pcre2/issues/224 */ +#if PCRE2_MAJOR == 10 && PCRE2_MINOR <= 42 + pc->binary_safe = !strstr (pattern, "\\D") && !strstr (pattern, "\\W"); + if (pc->binary_safe) + /* Consider invalid UTF-8 as a barrier, instead of error. 
*/ + flags |= PCRE2_MATCH_INVALID_UTF; +#endif #endif } @@ -313,7 +318,7 @@ Pexecute (void *vcp, char const *buf, idx_t size, idx_t *match_size, e = jit_exec (pc, subject, line_end - subject, search_offset, options); - if (!bad_utf8_from_pcre2 (e)) + if (pc->binary_safe || !bad_utf8_from_pcre2 (e)) break; idx_t valid_bytes = pcre2_get_startchar (pc->data); diff --git a/tests/Makefile.am b/tests/Makefile.am index 7718f24..9b4422e 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -155,6 +155,7 @@ TESTS = \ pcre-jitstack \ pcre-o \ pcre-utf8 \ + pcre-utf8-bug224 \ pcre-utf8-w \ pcre-w \ pcre-wx-backref \ diff --git a/tests/pcre-utf8-bug224 b/tests/pcre-utf8-bug224 new file mode 100755 index 0000000..739e7b5 --- /dev/null +++ b/tests/pcre-utf8-bug224 @@ -0,0 +1,31 @@ +#!/bin/sh +# Ensure \D and \W matches multibyte characters in UTF mode +# +# Copyright (C) 2023 Free Software Foundation, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. + +. "${srcdir=.}/init.sh"; path_prepend_ ../src +require_en_utf8_locale_ +LC_ALL=en_US.UTF-8 +export LC_ALL +require_pcre_ + +echo . | grep -qP '(*UTF).' 2>/dev/null \ + || skip_ 'PCRE unicode support is compiled out' + +fail=0 + +# 'ñ' (U+00F1) +printf '\302\221\n' > in || framework_failure_ +grep -P '\D' in > out || fail=1 +compare in out || fail=1 + +# “𝄞” (U+1D11E) +printf '\360\235\204\236\n' > in || framework_failure_ +grep -P '\W' in > out || fail=1 +compare in out || fail=1 + +Exit $fail -- 2.39.2 (Apple Git-143)