Re: diff -y + UTF-8 = irregular columns

bug-gnu-utils

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: diff -y + UTF-8 = irregular columns

From:	Paul Eggert
Subject:	Re: diff -y + UTF-8 = irregular columns
Date:	Wed, 23 Jan 2008 15:59:46 -0800
User-agent:	Gnus/5.11 (Gnus v5.11) Emacs/22.1 (gnu/linux)

John Cowan <address@hidden> writes:

> it's incomplete, for it assumes that characters are monowidth

Bruno's patch uses MB_WIDTH to compute the character width.  Unless I'm
missing something, that should work.

For this particular case, though, the savings in code duplication are
not that great; I think it's simpler just to repeat the code (a putc
for unibyte and a fwrite for multibyte).  I installed the following.
It might be nice to move that list of printable ASCII characters into
an include file somewhere, since I stole it from lib/mbswidth.c.

2008-01-23  Paul Eggert  <address@hidden>

        Don't mishandle multibyte characters in side-by-side format.
        Problem reported by Erik Karlsson in
        <http://lists.gnu.org/archive/html/bug-gnu-utils/2008-01/msg00030.html>.
        * bootstrap.conf (gnulib_modules): Add wcwidth.
        * src/side.c: Include <wchar.h>.
        (print_half_line): Compute number of bytes and print widths of
        multibyte characters.  Inspired by an alternative solution from
        Bruno Haible in
        <http://lists.gnu.org/archive/html/bug-gnu-utils/2008-01/msg00032.html>.

Index: bootstrap.conf
===================================================================
RCS file: /cvsroot/diffutils/diffutils/bootstrap.conf,v
retrieving revision 1.4
diff -u -p -r1.4 bootstrap.conf
--- bootstrap.conf      17 Aug 2007 23:35:47 -0000      1.4
+++ bootstrap.conf      23 Jan 2008 23:56:27 -0000
@@ -22,7 +22,7 @@ gnulib_modules='
        extensions fcntl fdl file-type fnmatch-gnu getopt gettext
        gettime hard-locale inttostr inttypes mkstemp regex sh-quote
        stat-macros stat-time strcase strftime strtoumax unistd
-       unlocked-io verify version-etc version-etc-fsf xalloc
+       unlocked-io verify version-etc version-etc-fsf wcwidth xalloc
        xstrtoumax
 '

Index: src/side.c
===================================================================
RCS file: /cvsroot/diffutils/diffutils/src/side.c,v
retrieving revision 1.16
diff -u -p -r1.16 side.c
--- src/side.c  19 Jul 2007 17:19:39 -0000      1.16
+++ src/side.c  23 Jan 2008 23:56:27 -0000
@@ -22,6 +22,8 @@

 #include "diff.h"

+#include <wchar.h>
+
 static void print_sdiff_common_lines (lin, lin);
 static void print_sdiff_hunk (struct change *);

@@ -73,9 +75,11 @@ print_half_line (char const *const *line
   register size_t out_position = 0;
   register char const *text_pointer = line[0];
   register char const *text_limit = line[1];
+  mbstate_t mbstate = { 0 };

   while (text_pointer < text_limit)
     {
+      char const *tp0 = text_pointer;
       register unsigned char c = *text_pointer++;

       switch (c)
@@ -127,18 +131,53 @@ print_half_line (char const *const *line
            }
          break;

+       default:
+         {
+           wchar_t wc;
+           size_t bytes = mbrtowc (&wc, tp0, text_limit - tp0, &mbstate);
+
+           if (0 < bytes && bytes < (size_t) -2)
+             {
+               int width = wcwidth (wc);
+               if (0 < width)
+                 in_position += width;
+               if (in_position <= out_bound)
+                 {
+                   out_position = in_position;
+                   fwrite (tp0, 1, bytes, stdout);
+                 }
+               text_pointer = tp0 + bytes;
+               break;
+             }
+         }
+         /* Fall through.  */
        case '\f':
        case '\v':
-       control_char:
          if (in_position < out_bound)
            putc (c, out);
          break;

-       default:
-         if (! isprint (c))
-           goto control_char;
-         /* falls through */
-       case ' ':
+       case ' ': case '!': case '"': case '#': case '%':
+       case '&': case '\'': case '(': case ')': case '*':
+       case '+': case ',': case '-': case '.': case '/':
+       case '0': case '1': case '2': case '3': case '4':
+       case '5': case '6': case '7': case '8': case '9':
+       case ':': case ';': case '<': case '=': case '>':
+       case '?':
+       case 'A': case 'B': case 'C': case 'D': case 'E':
+       case 'F': case 'G': case 'H': case 'I': case 'J':
+       case 'K': case 'L': case 'M': case 'N': case 'O':
+       case 'P': case 'Q': case 'R': case 'S': case 'T':
+       case 'U': case 'V': case 'W': case 'X': case 'Y':
+       case 'Z':
+       case '[': case '\\': case ']': case '^': case '_':
+       case 'a': case 'b': case 'c': case 'd': case 'e':
+       case 'f': case 'g': case 'h': case 'i': case 'j':
+       case 'k': case 'l': case 'm': case 'n': case 'o':
+       case 'p': case 'q': case 'r': case 's': case 't':
+       case 'u': case 'v': case 'w': case 'x': case 'y':
+       case 'z': case '{': case '|': case '}': case '~':
+         /* These characters are printable ASCII characters.  */
          if (in_position++ < out_bound)
            {
              out_position = in_position;

[Prev in Thread]

Current Thread

[Next in Thread]

diff -y + UTF-8 = irregular columns, Erik Karlsson, 2008/01/21
- Re: diff -y + UTF-8 = irregular columns, Paul Eggert, 2008/01/22
  - Re: diff -y + UTF-8 = irregular columns, Bruno Haible, 2008/01/23
    - Re: diff -y + UTF-8 = irregular columns, John Cowan, 2008/01/23
    - Re: diff -y + UTF-8 = irregular columns, Paul Eggert <=
    - Re: diff -y + UTF-8 = irregular columns, Bruno Haible, 2008/01/23
    - Re: diff -y + UTF-8 = irregular columns, Paul Eggert, 2008/01/24
    - Re: diff -y + UTF-8 = irregular columns, Bruno Haible, 2008/01/23

Prev by Date: Re: diff -y + UTF-8 = irregular columns
Next by Date: Re: diff -y + UTF-8 = irregular columns
Previous by thread: Re: diff -y + UTF-8 = irregular columns
Next by thread: Re: diff -y + UTF-8 = irregular columns
Index(es):
- Date
- Thread