grep branch, master, updated. v2.18-125-g94555dd

grep-commit
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
grep branch, master, updated. v2.18-125-g94555dd

From:	Paul Eggert
Subject:	grep branch, master, updated. v2.18-125-g94555dd
Date:	Tue, 06 May 2014 06:56:08 +0000
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "grep".

The branch, master has been updated
       via  94555dd281cdcd7530bc2c4466f0bbfd8d47d5c0 (commit)
       via  00024ede52a0d9cb04ac6549cabffab485f5e8aa (commit)
      from  eb3292b3b205e50d0373f26ff0950ec82f49c14a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://git.savannah.gnu.org/cgit/grep.git/commit/?id=94555dd281cdcd7530bc2c4466f0bbfd8d47d5c0


commit 94555dd281cdcd7530bc2c4466f0bbfd8d47d5c0
Author: Paul Eggert <address@hidden>
Date:   Mon May 5 23:53:59 2014 -0700

    grep: fix -w match next to a multibyte letter
    
    * NEWS: Document this.
    * src/dfasearch.c, src/kwsearch.c (WCHAR): Remove.
    (wordchar): New static function.
    * src/dfasearch.c (EGexecute):
    * src/kwsearch.c (Fexecute): Use the new functions, so that the
    code works correctly if a multibyte character adjacent to the
    match has two or more bytes.
    * src/search.h, src/searchutils.c (mb_prev_wc, mb_next_wc):
    New functions.
    * tests/word-delim-multibyte: Add a test for grep -w (which now
    passes), and a test for \> (which still fails).  The \< test also
    still fails.

diff --git a/NEWS b/NEWS
index 3bf7b69..7269d57 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,10 @@ GNU grep NEWS                                    -*- outline -*-
   to whether the errors can match parts of multibyte characters in data.
   [bug present since "the beginning"]
 
+  grep -w no longer mishandles a potential match adjacent to a letter that
+  takes up two or more bytes in a multibyte encoding.
+  [bug present since "the beginning"]
+
   grep -P now reports an error and exits when given invalid UTF-8 data.
   Previously it was unreliable, and sometimes crashed or looped.
   [bug introduced in grep-2.16]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index d16bc47..4202666 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -22,8 +22,12 @@
 #include "intprops.h"
 #include "search.h"
 
-/* For -w, we also consider _ to be word constituent.  */
-#define WCHAR(C) (isalnum (C) || (C) == '_')
+/* Whether -w considers WC to be a word constituent.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
 
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
@@ -372,9 +376,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
                 while (match <= best_match)
                   {
                     regoff_t shorter_len = 0;
-                    if ((match == beg || !WCHAR (to_uchar (match[-1])))
-                        && (start + len == end - beg - 1
-                            || !WCHAR (to_uchar (match[len]))))
+                    if (!wordchar (mb_prev_wc (beg, match, end - 1))
+                        && !wordchar (mb_next_wc (match + len, end - 1)))
                       goto assess_pattern_match;
                     if (len > 0)
                       {
diff --git a/src/kwsearch.c b/src/kwsearch.c
index c16b1be..6bd516a 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,8 +21,12 @@
 #include <config.h>
 #include "search.h"
 
-/* For -w, we also consider _ to be word constituent.  */
-#define WCHAR(C) (isalnum (C) || (C) == '_')
+/* Whether -w considers WC to be a word constituent.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
 
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
@@ -142,9 +146,9 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       if (match_words)
         for (try = beg; ; )
           {
-            if (try > buf && WCHAR(to_uchar (try[-1])))
+            if (wordchar (mb_prev_wc (buf, try, buf + size)))
               break;
-            if (try + len < buf + size && WCHAR(to_uchar (try[len])))
+            if (wordchar (mb_next_wc (try + len, buf + size)))
               {
                 if (!len)
                   break;
diff --git a/src/search.h b/src/search.h
index 44d5a20..14877bc 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,8 @@ extern void kwsinit (kwset_t *);
 extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
 extern void build_mbclen_cache (void);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+extern wint_t mb_prev_wc (char const *, char const *, char const *);
+extern wint_t mb_next_wc (char const *, char const *);
 
 /* dfasearch.c */
 extern void GEAcompile (char const *, size_t, reg_syntax_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index d92ede5..5eb9a12 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -263,3 +263,27 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
   *mb_start = p;
   return p == cur ? 0 : cur - p0;
 }
+
+/* In the buffer BUF, return the wide character that is encoded just
+   before CUR.  The buffer ends at END.  Return WEOF if there is no
+   wide character just before CUR.  */
+wint_t
+mb_prev_wc (char const *buf, char const *cur, char const *end)
+{
+  if (cur == buf)
+    return WEOF;
+  char const *p = buf;
+  cur--;
+  cur -= mb_goback (&p, cur, end);
+  return mb_next_wc (cur, end);
+}
+
+/* Return the wide character that is encoded at CUR.  The buffer ends
+   at END.  Return WEOF if there is no wide character encoded at CUR.  */
+wint_t
+mb_next_wc (char const *cur, char const *end)
+{
+  wchar_t wc;
+  mbstate_t mbs = { 0 };
+  return mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 ? wc : WEOF;
+}
diff --git a/tests/word-delim-multibyte b/tests/word-delim-multibyte
index b7f1576..f65298f 100755
--- a/tests/word-delim-multibyte
+++ b/tests/word-delim-multibyte
@@ -11,8 +11,16 @@ LC_ALL=en_US.UTF-8
 export LC_ALL
 
 fail=0
-grep "\<$e_acute" in > out 2>err || fail=1
 
+grep "\\<$e_acute" in > out 2>err || fail=1
+compare out in || fail=1
+compare /dev/null err || fail=1
+
+grep "$e_acute\\>" in > out 2>err || fail=1
+compare out in || fail=1
+compare /dev/null err || fail=1
+
+grep -w "$e_acute" in > out 2>err || fail=1
 compare out in || fail=1
 compare /dev/null err || fail=1
 

http://git.savannah.gnu.org/cgit/grep.git/commit/?id=00024ede52a0d9cb04ac6549cabffab485f5e8aa


commit 94555dd281cdcd7530bc2c4466f0bbfd8d47d5c0
Author: Paul Eggert <address@hidden>
Date:   Mon May 5 23:53:59 2014 -0700

    grep: fix -w match next to a multibyte letter
    
    * NEWS: Document this.
    * src/dfasearch.c, src/kwsearch.c (WCHAR): Remove.
    (wordchar): New static function.
    * src/dfasearch.c (EGexecute):
    * src/kwsearch.c (Fexecute): Use the new functions, so that the
    code works correctly if a multibyte character adjacent to the
    match has two or more bytes.
    * src/search.h, src/searchutils.c (mb_prev_wc, mb_next_wc):
    New functions.
    * tests/word-delim-multibyte: Add a test for grep -w (which now
    passes), and a test for \> (which still fails).  The \< test also
    still fails.

diff --git a/NEWS b/NEWS
index 3bf7b69..7269d57 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,10 @@ GNU grep NEWS                                    -*- outline -*-
   to whether the errors can match parts of multibyte characters in data.
   [bug present since "the beginning"]
 
+  grep -w no longer mishandles a potential match adjacent to a letter that
+  takes up two or more bytes in a multibyte encoding.
+  [bug present since "the beginning"]
+
   grep -P now reports an error and exits when given invalid UTF-8 data.
   Previously it was unreliable, and sometimes crashed or looped.
   [bug introduced in grep-2.16]
diff --git a/src/dfasearch.c b/src/dfasearch.c
index d16bc47..4202666 100644
--- a/src/dfasearch.c
+++ b/src/dfasearch.c
@@ -22,8 +22,12 @@
 #include "intprops.h"
 #include "search.h"
 
-/* For -w, we also consider _ to be word constituent.  */
-#define WCHAR(C) (isalnum (C) || (C) == '_')
+/* Whether -w considers WC to be a word constituent.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
 
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
@@ -372,9 +376,8 @@ EGexecute (char const *buf, size_t size, size_t *match_size,
                 while (match <= best_match)
                   {
                     regoff_t shorter_len = 0;
-                    if ((match == beg || !WCHAR (to_uchar (match[-1])))
-                        && (start + len == end - beg - 1
-                            || !WCHAR (to_uchar (match[len]))))
+                    if (!wordchar (mb_prev_wc (beg, match, end - 1))
+                        && !wordchar (mb_next_wc (match + len, end - 1)))
                       goto assess_pattern_match;
                     if (len > 0)
                       {
diff --git a/src/kwsearch.c b/src/kwsearch.c
index c16b1be..6bd516a 100644
--- a/src/kwsearch.c
+++ b/src/kwsearch.c
@@ -21,8 +21,12 @@
 #include <config.h>
 #include "search.h"
 
-/* For -w, we also consider _ to be word constituent.  */
-#define WCHAR(C) (isalnum (C) || (C) == '_')
+/* Whether -w considers WC to be a word constituent.  */
+static bool
+wordchar (wint_t wc)
+{
+  return wc == L'_' || iswalnum (wc);
+}
 
 /* KWset compiled pattern.  For Ecompile and Gcompile, we compile
    a list of strings, at least one of which is known to occur in
@@ -142,9 +146,9 @@ Fexecute (char const *buf, size_t size, size_t *match_size,
       if (match_words)
         for (try = beg; ; )
           {
-            if (try > buf && WCHAR(to_uchar (try[-1])))
+            if (wordchar (mb_prev_wc (buf, try, buf + size)))
               break;
-            if (try + len < buf + size && WCHAR(to_uchar (try[len])))
+            if (wordchar (mb_next_wc (try + len, buf + size)))
               {
                 if (!len)
                   break;
diff --git a/src/search.h b/src/search.h
index 44d5a20..14877bc 100644
--- a/src/search.h
+++ b/src/search.h
@@ -46,6 +46,8 @@ extern void kwsinit (kwset_t *);
 extern char *mbtoupper (char const *, size_t *, mb_len_map_t **);
 extern void build_mbclen_cache (void);
 extern ptrdiff_t mb_goback (char const **, char const *, char const *);
+extern wint_t mb_prev_wc (char const *, char const *, char const *);
+extern wint_t mb_next_wc (char const *, char const *);
 
 /* dfasearch.c */
 extern void GEAcompile (char const *, size_t, reg_syntax_t);
diff --git a/src/searchutils.c b/src/searchutils.c
index d92ede5..5eb9a12 100644
--- a/src/searchutils.c
+++ b/src/searchutils.c
@@ -263,3 +263,27 @@ mb_goback (char const **mb_start, char const *cur, char const *end)
   *mb_start = p;
   return p == cur ? 0 : cur - p0;
 }
+
+/* In the buffer BUF, return the wide character that is encoded just
+   before CUR.  The buffer ends at END.  Return WEOF if there is no
+   wide character just before CUR.  */
+wint_t
+mb_prev_wc (char const *buf, char const *cur, char const *end)
+{
+  if (cur == buf)
+    return WEOF;
+  char const *p = buf;
+  cur--;
+  cur -= mb_goback (&p, cur, end);
+  return mb_next_wc (cur, end);
+}
+
+/* Return the wide character that is encoded at CUR.  The buffer ends
+   at END.  Return WEOF if there is no wide character encoded at CUR.  */
+wint_t
+mb_next_wc (char const *cur, char const *end)
+{
+  wchar_t wc;
+  mbstate_t mbs = { 0 };
+  return mbrtowc (&wc, cur, end - cur, &mbs) < (size_t) -2 ? wc : WEOF;
+}
diff --git a/tests/word-delim-multibyte b/tests/word-delim-multibyte
index b7f1576..f65298f 100755
--- a/tests/word-delim-multibyte
+++ b/tests/word-delim-multibyte
@@ -11,8 +11,16 @@ LC_ALL=en_US.UTF-8
 export LC_ALL
 
 fail=0
-grep "\<$e_acute" in > out 2>err || fail=1
 
+grep "\\<$e_acute" in > out 2>err || fail=1
+compare out in || fail=1
+compare /dev/null err || fail=1
+
+grep "$e_acute\\>" in > out 2>err || fail=1
+compare out in || fail=1
+compare /dev/null err || fail=1
+
+grep -w "$e_acute" in > out 2>err || fail=1
 compare out in || fail=1
 compare /dev/null err || fail=1
 

-----------------------------------------------------------------------

Summary of changes:
 NEWS                       |    4 +++
 src/dfasearch.c            |   16 ++++++++------
 src/kwsearch.c             |   14 ++++++++----
 src/search.h               |    6 +++-
 src/searchutils.c          |   49 +++++++++++++++++++++++++++++++++++++------
 tests/word-delim-multibyte |   10 ++++++++-
 6 files changed, 77 insertions(+), 22 deletions(-)


hooks/post-receive
-- 
grep
[Prev in Thread]
Current Thread
[Next in Thread]
grep branch, master, updated. v2.18-125-g94555dd, Paul Eggert <=
Prev by Date: grep branch, master, updated. v2.18-123-geb3292b
Next by Date: grep branch, master, updated. v2.18-128-g51f9654
Previous by thread: grep branch, master, updated. v2.18-123-geb3292b
Next by thread: grep branch, master, updated. v2.18-128-g51f9654
Index(es):
- Date
- Thread