From d31900e3fc2406ccb8aa972dd173f27f583c95ec Mon Sep 17 00:00:00 2001 From: Paul Eggert
Date: Fri, 1 Jan 2016 21:16:12 -0800 Subject: [PATCH] grep: fix bug with invalid unibyte sequence This was introduced by the recent binary-data-detection changes. Problem reported by Norihiro Tanaka in: http://bugs.gnu.org/20526#86 * src/grep.c (HIBYTE, easy_encoding, init_easy_encoding): Remove, replacing with ... (uword_max, unibyte_mask, initialize_unibyte_mask): ... this new constant, static var, and function. All uses changed. The unibyte_mask var generalizes the old local var hibyte_mask, which worked only for encodings where every byte with 0x80 turned off is a single-byte character. (buf_has_encoding_errors): Return false immediately if unibyte_mask is zero, not whether the current encoding is unibyte. The old test was incorrect in unibyte locales in which some bytes were encoding errors. * tests/pcre-z: Require UTF-8 locale, since the grep -z . test now needs this. Use printf \0 rather than tr. Port the 'grep -z .' test to platforms where the C locale says '\200' is an encoding error. Use cmp rather than compare, as the file is binary and so non-GNU diff might not work. * tests/unibyte-binary: New file. * tests/Makefile.am (TESTS): Add it. --- src/grep.c | 57 +++++++++++++++++++++++++--------------------------- tests/Makefile.am | 1 + tests/pcre-z | 9 +++++---- tests/unibyte-binary | 28 ++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 34 deletions(-) create mode 100755 tests/unibyte-binary diff --git a/src/grep.c b/src/grep.c index 1207a76..a5f1fa2 100644 --- a/src/grep.c +++ b/src/grep.c @@ -484,21 +484,6 @@ clean_up_stdout (void) close_stdout (); } -/* The high-order bit of a byte. */ -enum { HIBYTE = 0x80 }; - -/* True if every byte with HIBYTE off is a single-byte character. - UTF-8 has this property. */ -static bool easy_encoding; - -static void -init_easy_encoding (void) -{ - easy_encoding = true; - for (int i = 0; i < HIBYTE; i++) - easy_encoding &= mbclen_cache[i] == 1; -} - /* A cast to TYPE of VAL. 
Use this when TYPE is a pointer type, VAL is properly aligned for TYPE, and 'gcc -Wcast-align' cannot infer the alignment and would otherwise complain about the cast. */ @@ -517,21 +502,33 @@ init_easy_encoding (void) /* An unsigned type suitable for fast matching. */ typedef uintmax_t uword; +/* All bytes that are not unibyte characters, ANDed together, and then + with the pattern repeated to fill a uword. For an encoding where + all bytes are unibyte characters, this is 0. For UTF-8, this is + 0x808080.... For encodings where unibyte characters have no useful + pattern, this is all 1s. The unsigned char C is a unibyte + character if C & UNIBYTE_MASK is zero. If the uword W is the + concatenation of bytes, the bytes are all unibyte characters + if W & UNIBYTE_MASK is zero. */ +static uword unibyte_mask; + +static void +initialize_unibyte_mask (void) +{ + unsigned char mask = UCHAR_MAX; + for (int i = 1; i <= UCHAR_MAX; i++) + if (mbclen_cache[i] != 1) + mask &= i; + uword uword_max = -1; + unibyte_mask = uword_max / UCHAR_MAX * mask; +} + /* Skip the easy bytes in a buffer that is guaranteed to have a sentinel that is not easy, and return a pointer to the first non-easy byte. - In easy encodings, the easy bytes all have HIBYTE off. - In other encodings, no byte is easy. */ + The easy bytes all have UNIBYTE_MASK off. */ static char const * _GL_ATTRIBUTE_PURE skip_easy_bytes (char const *buf) { - if (!easy_encoding) - return buf; - - uword uword_max = -1; - - /* 0x8080..., extended to be wide enough for uword. */ - uword hibyte_mask = uword_max / UCHAR_MAX * HIBYTE; - /* Search a byte at a time until the pointer is aligned, then a uword at a time until a match is found, then a byte at a time to identify the exact byte. 
The uword search may go slightly past @@ -539,11 +536,11 @@ skip_easy_bytes (char const *buf) char const *p; uword const *s; for (p = buf; (uintptr_t) p % sizeof (uword) != 0; p++) - if (*p & HIBYTE) + if (to_uchar (*p) & unibyte_mask) return p; - for (s = CAST_ALIGNED (uword const *, p); ! (*s & hibyte_mask); s++) + for (s = CAST_ALIGNED (uword const *, p); ! (*s & unibyte_mask); s++) continue; - for (p = (char const *) s; ! (*p & HIBYTE); p++) + for (p = (char const *) s; ! (to_uchar (*p) & unibyte_mask); p++) continue; return p; } @@ -554,7 +551,7 @@ skip_easy_bytes (char const *buf) static bool buf_has_encoding_errors (char *buf, size_t size) { - if (MB_CUR_MAX <= 1) + if (! unibyte_mask) return false; mbstate_t mbs = { 0 }; @@ -2592,7 +2589,7 @@ main (int argc, char **argv) usage (EXIT_TROUBLE); build_mbclen_cache (); - init_easy_encoding (); + initialize_unibyte_mask (); /* In a unibyte locale, switch from fgrep to grep if the pattern matches words (where grep is typically faster). diff --git a/tests/Makefile.am b/tests/Makefile.am index f349aa3..a38303c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -133,6 +133,7 @@ TESTS = \ turkish-I-without-dot \ turkish-eyes \ two-files \ + unibyte-binary \ unibyte-bracket-expr \ unibyte-negated-circumflex \ utf8-bracket \ diff --git a/tests/pcre-z b/tests/pcre-z index 6bbde94..4ce9a93 100755 --- a/tests/pcre-z +++ b/tests/pcre-z @@ -2,10 +2,11 @@ # Test Perl regex with NUL-separated input . "${srcdir=.}/init.sh"; path_prepend_ ../src require_pcre_ +require_en_utf8_locale_ REGEX=a -printf "%s\n0" abc def ghi aaa gah | tr 0 \\0 > in +printf '%s\n\0' abc def ghi aaa gah > in || framework_failure_ grep -z "$REGEX" in > exp 2>err || fail_ 'Cannot do BRE (grep -z) match.' compare /dev/null err || fail_ 'stderr not empty on grep -z.' @@ -20,8 +21,8 @@ grep -Pz "$REGEX" in > out 2>err || fail=1 compare exp out || fail=1 compare /dev/null err || fail=1 -printf '\200\0' >in0 -LC_ALL=C grep -z . 
in0 >out || fail=1 -compare in0 out || fail=1 +printf '\303\200\0' >in0 # "À" followed by a NUL. +LC_ALL=en_US.UTF-8 grep -z . in0 >out || fail=1 +cmp in0 out || fail=1 Exit $fail diff --git a/tests/unibyte-binary b/tests/unibyte-binary new file mode 100755 index 0000000..78735b8 --- /dev/null +++ b/tests/unibyte-binary @@ -0,0 +1,28 @@ +#!/bin/sh +# Test binary files in unibyte locales with encoding errors + +# Copyright 2016 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see