grep-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

grep branch, master, updated. v2.18-11-gc50283a


From: Paul Eggert
Subject: grep branch, master, updated. v2.18-11-gc50283a
Date: Sat, 01 Mar 2014 06:47:53 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  c50283a3a8d15d382691c447c3a1e1fde1c548fc (commit)
      from  cf1c98cca8844f851596dc03a9c4d6eebcf44bf2 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=c50283a3a8d15d382691c447c3a1e1fde1c548fc


commit c50283a3a8d15d382691c447c3a1e1fde1c548fc
Author: Paul Eggert <address@hidden>
Date:   Fri Feb 28 22:46:02 2014 -0800

    grep: fix bugs with -i and titlecase
    
    * NEWS: Document this.
    * src/dfa.c (setbit_wc): Simplify.
    (setbit_c): Remove; no longer used.
    (setbit_case_fold_c, parse_bracket_exp, atom):
    Don't mishandle titlecase.  For 'atom', this removes the need for
    the refactoring of Bug#16729.
    (lex): Use the slower approach only for letters that have a
    differing case.
    * tests/case-fold-titlecase: New file.
    * tests/Makefile.am (TESTS): Add it.

diff --git a/NEWS b/NEWS
index 6cfcaba..4b1364c 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,11 @@ GNU grep NEWS                                    -*- outline -*-
   echo a@@a| grep -w @@ would not.  Now, they both fail to match,
   per the documentation on how grep's -w works.
 
+  grep -i no longer mishandles patterns containing titlecase characters.
+  For example, in a locale containing the titlecase character
+  'Lj' (U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J),
+  'grep -i Lj' now matches 'LJ' (U+01C7 LATIN CAPITAL LETTER LJ).
+
 
 * Noteworthy changes in release 2.18 (2014-02-20) [stable]
 
diff --git a/src/dfa.c b/src/dfa.c
index 4708895..b3d9da8 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -694,42 +694,27 @@ dfasyntax (reg_syntax_t bits, int fold, unsigned char eol)
    this may happen when folding case in weird Turkish locales where
    dotless i/dotted I are not included in the chosen character set.
    Return whether a bit was set in the charclass.  */
-#if MBS_SUPPORT
 static bool
 setbit_wc (wint_t wc, charclass c)
 {
+#if MBS_SUPPORT
   int b = wctob (wc);
   if (b == EOF)
     return false;
 
   setbit (b, c);
   return true;
-}
-
-/* Set a bit in the charclass for the given single byte character,
-   if it is valid in the current character set.  */
-static void
-setbit_c (int b, charclass c)
-{
-  /* Do nothing if b is invalid in this character set.  */
-  if (MB_CUR_MAX > 1 && btowc (b) == WEOF)
-    return;
-  setbit (b, c);
-}
 #else
-# define setbit_c setbit
-static inline bool
-setbit_wc (wint_t wc, charclass c)
-{
   abort ();
    /*NOTREACHED*/ return false;
-}
 #endif
+}
 
-/* Like setbit_c, but if case is folded, set both cases of a letter.  For
-   MB_CUR_MAX > 1, the resulting charset is only used as an optimization,
-   and the caller takes care of setting the appropriate field of struct
-   mb_char_classes.  */
+/* Set a bit for B in the charclass C, if B is a valid single byte
+   character in the current character set.  If case is folded, set B's
+   lower and upper case variants similarly.  If MB_CUR_MAX > 1, the
+   resulting charset is used only as an optimization, and the caller
+   should set the appropriate field of struct mb_char_classes.  */
 static void
 setbit_case_fold_c (int b, charclass c)
 {
@@ -738,16 +723,21 @@ setbit_case_fold_c (int b, charclass c)
       wint_t wc = btowc (b);
       if (wc == WEOF)
         return;
-      setbit (b, c);
-      if (case_fold && iswalpha (wc))
-        setbit_wc (iswupper (wc) ? towlower (wc) : towupper (wc), c);
+      if (case_fold)
+        {
+          setbit_wc (towlower (wc), c);
+          setbit_wc (towupper (wc), c);
+        }
     }
   else
     {
-      setbit (b, c);
-      if (case_fold && isalpha (b))
-        setbit_c (isupper (b) ? tolower (b) : toupper (b), c);
+      if (case_fold)
+        {
+          setbit (tolower (b), c);
+          setbit (toupper (b), c);
+        }
     }
+  setbit (b, c);
 }
 
 
@@ -1104,52 +1094,51 @@ parse_bracket_exp (void)
               c2 = ']';
             }
 
-          if (c2 == ']')
+          if (c2 != ']')
             {
-              /* In the case [x-], the - is an ordinary hyphen,
-                 which is left in c1, the lookahead character.  */
-              lexptr -= cur_mb_len;
-              lexleft += cur_mb_len;
-            }
-        }
-
-      if (c1 == '-' && c2 != ']')
-        {
-          if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
-            FETCH_WC (c2, wc2, _("unbalanced ["));
+              if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
+                FETCH_WC (c2, wc2, _("unbalanced ["));
 
-          if (MB_CUR_MAX > 1)
-            {
-              /* When case folding map a range, say [m-z] (or even [M-z])
-                 to the pair of ranges, [m-z] [M-Z].  */
-              REALLOC_IF_NECESSARY (work_mbc->range_sts,
-                                    range_sts_al, work_mbc->nranges + 1);
-              REALLOC_IF_NECESSARY (work_mbc->range_ends,
-                                    range_ends_al, work_mbc->nranges + 1);
-              work_mbc->range_sts[work_mbc->nranges] =
-                case_fold ? towlower (wc) : (wchar_t) wc;
-              work_mbc->range_ends[work_mbc->nranges++] =
-                case_fold ? towlower (wc2) : (wchar_t) wc2;
-
-              if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+              if (MB_CUR_MAX > 1)
                 {
+                  /* When case folding map a range, say [m-z] (or even [M-z])
+                     to the pair of ranges, [m-z] [M-Z].  Although this code
+                     is wrong in multiple ways, it's never used in practice.
+                     FIXME: Remove this (and related) unused code.  */
                   REALLOC_IF_NECESSARY (work_mbc->range_sts,
                                         range_sts_al, work_mbc->nranges + 1);
-                  work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
                   REALLOC_IF_NECESSARY (work_mbc->range_ends,
                                         range_ends_al, work_mbc->nranges + 1);
-                  work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                  work_mbc->range_sts[work_mbc->nranges] =
+                    case_fold ? towlower (wc) : (wchar_t) wc;
+                  work_mbc->range_ends[work_mbc->nranges++] =
+                    case_fold ? towlower (wc2) : (wchar_t) wc2;
+
+                  if (case_fold && (iswalpha (wc) || iswalpha (wc2)))
+                    {
+                      REALLOC_IF_NECESSARY (work_mbc->range_sts,
+                                            range_sts_al, work_mbc->nranges + 1);
+                      work_mbc->range_sts[work_mbc->nranges] = towupper (wc);
+                      REALLOC_IF_NECESSARY (work_mbc->range_ends,
+                                            range_ends_al, work_mbc->nranges + 1);
+                      work_mbc->range_ends[work_mbc->nranges++] = towupper (wc2);
+                    }
                 }
+              else if (using_simple_locale ())
+                for (; c <= c2; c++)
+                  setbit_case_fold_c (c, ccl);
+              else
+                known_bracket_exp = false;
+
+              colon_warning_state |= 8;
+              FETCH_WC (c1, wc1, _("unbalanced ["));
+              continue;
             }
-          else if (using_simple_locale ())
-            for (; c <= c2; c++)
-              setbit_case_fold_c (c, ccl);
-          else
-            known_bracket_exp = false;
 
-          colon_warning_state |= 8;
-          FETCH_WC (c1, wc1, _("unbalanced ["));
-          continue;
+          /* In the case [x-], the - is an ordinary hyphen,
+             which is left in c1, the lookahead character.  */
+          lexptr -= cur_mb_len;
+          lexleft += cur_mb_len;
         }
 
       colon_warning_state |= (c == ':') ? 2 : 4;
@@ -1160,16 +1149,22 @@ parse_bracket_exp (void)
           continue;
         }
 
-      if (case_fold && iswalpha (wc))
+      if (case_fold)
         {
-          wc = towlower (wc);
-          if (!setbit_wc (wc, ccl))
+          wint_t folded = towlower (wc);
+          if (folded != wc && !setbit_wc (folded, ccl))
+            {
+              REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
+                                    work_mbc->nchars + 1);
+              work_mbc->chars[work_mbc->nchars++] = folded;
+            }
+          folded = towupper (wc);
+          if (folded != wc && !setbit_wc (folded, ccl))
             {
               REALLOC_IF_NECESSARY (work_mbc->chars, chars_al,
                                     work_mbc->nchars + 1);
-              work_mbc->chars[work_mbc->nchars++] = wc;
+              work_mbc->chars[work_mbc->nchars++] = folded;
             }
-          wc = towupper (wc);
         }
       if (!setbit_wc (wc, ccl))
         {
@@ -1515,7 +1510,7 @@ lex (void)
           if (MB_CUR_MAX > 1)
             return lasttok = WCHAR;
 
-          if (case_fold && isalpha (c))
+          if (case_fold && (tolower (c) != c || toupper (c) != c))
             {
               zeroset (ccl);
               setbit_case_fold_c (c, ccl);
@@ -1759,17 +1754,23 @@ add_utf8_anychar (void)
 static void
 atom (void)
 {
-  if (0)
+  if (MBS_SUPPORT && tok == WCHAR)
     {
-      /* empty */
-    }
-  else if (MBS_SUPPORT && tok == WCHAR)
-    {
-      addtok_wc (case_fold ? towlower (wctok) : wctok);
-      if (case_fold && iswalpha (wctok))
+      addtok_wc (wctok);
+      if (case_fold)
         {
-          addtok_wc (towupper (wctok));
-          addtok (OR);
+          wint_t folded = towlower (wctok);
+          if (folded != wctok)
+            {
+              addtok_wc (folded);
+              addtok (OR);
+            }
+          folded = towupper (wctok);
+          if (folded != wctok)
+            {
+              addtok_wc (folded);
+              addtok (OR);
+            }
         }
 
       tok = lex ();
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 972ffc5..219e96a 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -47,6 +47,7 @@ TESTS =                                               \
   case-fold-char-class                         \
   case-fold-char-range                         \
   case-fold-char-type                          \
+  case-fold-titlecase                          \
   char-class-multibyte                         \
   char-class-multibyte2                                \
   dfa-coverage                                 \
diff --git a/tests/case-fold-titlecase b/tests/case-fold-titlecase
new file mode 100755
index 0000000..0ece5c8
--- /dev/null
+++ b/tests/case-fold-titlecase
@@ -0,0 +1,41 @@
+#!/bin/sh
+# Check that case folding works even with titlecase characters.
+
+# Copyright 2014 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+require_en_utf8_locale_
+require_compiled_in_MB_support
+LC_ALL=en_US.UTF-8
+export LC_ALL
+
+fail=0
+
+LJ='\307\207' # U+01C7 LATIN CAPITAL LETTER LJ
+Lj='\307\210' # U+01C8 LATIN CAPITAL LETTER L WITH SMALL LETTER J
+lj='\307\211' # U+01C9 LATIN SMALL LETTER LJ
+pattern=$(printf "$Lj\n") || framework_failure_
+printf "$lj$lj\n$Lj$Lj\n$LJ$LJ\n" >in || framework_failure_
+
+grep -i "$pattern" in >out || fail=1
+compare in out || fail=1
+
+pattern="($pattern)\\1"
+grep -Ei "$pattern" in >out || fail=1
+compare in out || fail=1
+
+Exit $fail

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                          |    5 +
 src/dfa.c                                     |  159 +++++++++++++------------
 tests/Makefile.am                             |    1 +
 tests/{surrogate-pair => case-fold-titlecase} |   31 ++---
 4 files changed, 100 insertions(+), 96 deletions(-)
 copy tests/{surrogate-pair => case-fold-titlecase} (58%)


hooks/post-receive
-- 
grep



reply via email to

[Prev in Thread] Current Thread [Next in Thread]