From e371037316da8be04e7797cb766f41eec6e9374a Mon Sep 17 00:00:00 2001 From: Ondrej Oprala Date: Wed, 20 Feb 2013 18:53:38 +0100 Subject: [PATCH] expand,unexpand: add multibyte support * NEWS: Mention the changes. * bootstrap.conf: Include libunistring in the list of modules. * configure.ac: Use libunistring if available. * po/POTFILES.in: Add new source file. * src/expand-core.c: Move functions common to both expand and unexpand to this file. * src/expand-core.h: Add function prototypes from expand-core.c. * src/expand.c (expand): Iterate over multibyte characters properly. * src/local.mk: Link expand and unexpand with libunistring and add expand-core.c to their lists of source codes. * src/unexpand.c (unexpand): Iterate over multibyte characters properly. * tests/local.mk: Add new tests. * tests/{expand,unexpand}/mb.sh: New tests. --- NEWS | 4 + bootstrap.conf | 1 + configure.ac | 2 + po/POTFILES.in | 1 + src/expand-core.c | 199 +++++++++++++++++++++++++++++++++++++++ src/expand-core.h | 35 +++++++ src/expand.c | 215 ++++++++++++------------------------------ src/local.mk | 5 + src/unexpand.c | 256 ++++++++++++++++++++------------------------------- tests/expand/mb.sh | 69 ++++++++++++++ tests/local.mk | 2 + tests/unexpand/mb.sh | 74 +++++++++++++++ 12 files changed, 551 insertions(+), 312 deletions(-) create mode 100644 src/expand-core.c create mode 100644 src/expand-core.h create mode 100755 tests/expand/mb.sh create mode 100755 tests/unexpand/mb.sh diff --git a/NEWS b/NEWS index 37bcdf7..c997007 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,10 @@ GNU coreutils NEWS -*- outline -*- * Noteworthy changes in release ?.? (????-??-??) [?] +** New features + + expand and unexpand now handle multibyte characters properly. 
+ * Noteworthy changes in release 8.21 (2013-02-14) [stable] diff --git a/bootstrap.conf b/bootstrap.conf index bb6c145..455f0e5 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -140,6 +140,7 @@ gnulib_modules=" lchown ldtoastr lib-ignore + libunistring linebuffer link link-follow diff --git a/configure.ac b/configure.ac index 3f0c58b..79bc920 100644 --- a/configure.ac +++ b/configure.ac @@ -373,6 +373,8 @@ fi # I'm leaving it here for now. This whole thing needs to be modernized... gl_WINSIZE_IN_PTEM +gl_LIBUNISTRING + gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \ diff --git a/po/POTFILES.in b/po/POTFILES.in index 21617cc..27b19e5 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -59,6 +59,7 @@ src/dirname.c src/du.c src/echo.c src/env.c +src/expand-core.c src/expand.c src/expr.c src/factor.c diff --git a/src/expand-core.c b/src/expand-core.c new file mode 100644 index 0000000..184bb70 --- /dev/null +++ b/src/expand-core.c @@ -0,0 +1,199 @@ +/* expand-core.c - elementary functions for the expand and unexpand utilities + Copyright (C) 1989-2013 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ + +#include + +#include +#include + +#if HAVE_LIBUNISTRING +# include +# include +# include +# include +# include +#endif + +#include "expand-core.h" + +#include "system.h" +#include "error.h" +#include "fadvise.h" +#include "quote.h" +#include "xstrndup.h" + +extern size_t first_free_tab; + +extern size_t n_tabs_allocated; + +extern uintmax_t *tab_list; + +extern int exit_status; + +extern char **file_list; + +extern bool have_read_stdin; + +extern bool u8_locale; + +/* Add the comma or blank separated list of tab stops STOPS + to the list of tab stops. */ + +extern void +parse_tab_stops (char const *stops) +{ + bool have_tabval = false; + uintmax_t tabval IF_LINT ( = 0); + char const *num_start IF_LINT ( = NULL); + bool ok = true; + + for (; *stops; stops++) + { + if (*stops == ',' || isblank (to_uchar (*stops))) + { + if (have_tabval) + add_tab_stop (tabval); + have_tabval = false; + } + else if (ISDIGIT (*stops)) + { + if (!have_tabval) + { + tabval = 0; + have_tabval = true; + num_start = stops; + } + + /* Detect overflow. */ + if (!DECIMAL_DIGIT_ACCUMULATE (tabval, *stops - '0', uintmax_t)) + { + size_t len = strspn (num_start, "0123456789"); + char *bad_num = xstrndup (num_start, len); + error (0, 0, _("tab stop is too large %s"), quote (bad_num)); + free (bad_num); + ok = false; + stops = num_start + len - 1; + } + } + else + { + error (0, 0, _("tab size contains invalid character(s): %s"), + quote (stops)); + ok = false; + break; + } + } + + if (!ok) + exit (EXIT_FAILURE); + + if (have_tabval) + add_tab_stop (tabval); +} + +/* Add tab stop TABVAL to the end of 'tab_list'. */ + +extern void +add_tab_stop (uintmax_t tabval) +{ + if (first_free_tab == n_tabs_allocated) + tab_list = X2NREALLOC (tab_list, &n_tabs_allocated); + tab_list[first_free_tab++] = tabval; +} + +/* Check that the list of tab stops TABS, with ENTRIES entries, + contains only nonzero, ascending values. 
*/ + +extern void +validate_tab_stops (uintmax_t const *tabs, size_t entries) +{ + uintmax_t prev_tab = 0; + size_t i; + + for (i = 0; i < entries; i++) + { + if (tabs[i] == 0) + error (EXIT_FAILURE, 0, _("tab size cannot be 0")); + if (tabs[i] <= prev_tab) + error (EXIT_FAILURE, 0, _("tab sizes must be ascending")); + prev_tab = tabs[i]; + } +} + +/* Close the old stream pointer FP if it is non-NULL, + and return a new one opened to read the next input file. + Open a filename of '-' as the standard input. + Return NULL if there are no more input files. */ + +extern FILE * +next_file (FILE *fp) +{ + static char *prev_file; + char *file; + + if (fp) + { + if (ferror (fp)) + { + error (0, errno, "%s", prev_file); + exit_status = EXIT_FAILURE; + } + if (STREQ (prev_file, "-")) + clearerr (fp); /* Also clear EOF. */ + else if (fclose (fp) != 0) + { + error (0, errno, "%s", prev_file); + exit_status = EXIT_FAILURE; + } + } + + while ((file = *file_list++) != NULL) + { + if (STREQ (file, "-")) + { + have_read_stdin = true; + fp = stdin; + } + else + fp = fopen (file, "r"); + if (fp) + { + prev_file = file; + fadvise (fp, FADVISE_SEQUENTIAL); + return fp; + } + error (0, errno, "%s", file); + exit_status = EXIT_FAILURE; + } + return NULL; +} + +/* Return the character at LINE[OFFT]; if UTF-8, set *CLEN >= 1, *DLEN >= 0. */ +extern uint32_t +mb_index (uint8_t *line, size_t *clen, ssize_t *dlen, size_t offt) +{ + uint32_t c = line[offt]; +#if HAVE_LIBUNISTRING + if (u8_locale) + { + *clen = u8_mbtouc (&c, line + offt, MB_CUR_MAX); + *dlen = uc_width (c, "UTF-8"); + if (*dlen < 0) + *dlen = 0; + } +#endif + return c; +} diff --git a/src/expand-core.h b/src/expand-core.h new file mode 100644 index 0000000..c3fb44c --- /dev/null +++ b/src/expand-core.h @@ -0,0 +1,35 @@ +/* expand-core.h - function prototypes for the expand and unexpand utilities + Copyright (C) 1989-2013 Free Software Foundation, Inc. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef EXP_COMMON_H_ +#define EXP_COMMON_H_ + +void +parse_tab_stops (char const *stops); + +void +add_tab_stop (uintmax_t tabval); + +void +validate_tab_stops (uintmax_t const *tabs, size_t entries); + +FILE * +next_file (FILE *fp); + +uint32_t +mb_index (uint8_t *line, size_t *clen, ssize_t *dlen, size_t offt); + +#endif /* EXP_COMMON_H_ */ diff --git a/src/expand.c b/src/expand.c index 0b12b02..d9e8b3f 100644 --- a/src/expand.c +++ b/src/expand.c @@ -37,12 +37,22 @@ #include #include #include + +#if HAVE_LIBUNISTRING +# include +# include +# include +# include +# include +#endif + #include "system.h" #include "error.h" #include "fadvise.h" -#include "quote.h" #include "xstrndup.h" +#include "expand-core.h" + /* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "expand" @@ -58,17 +68,17 @@ static uintmax_t tab_size; /* Array of the explicit column numbers of the tab stops; after 'tab_list' is exhausted, each additional tab is replaced by a space. The first column is column 0. */ -static uintmax_t *tab_list; +uintmax_t *tab_list; /* The number of allocated entries in 'tab_list'. */ -static size_t n_tabs_allocated; +size_t n_tabs_allocated; /* The index of the first invalid element of 'tab_list', where the next element can be added. */ -static size_t first_free_tab; +size_t first_free_tab; /* Null-terminated array of input filenames. 
*/ -static char **file_list; +char **file_list; /* Default for 'file_list' if no files are given on the command line. */ static char *stdin_argv[] = @@ -77,10 +87,13 @@ static char *stdin_argv[] = }; /* True if we have ever read standard input. */ -static bool have_read_stdin; +bool have_read_stdin; /* The desired exit status. */ -static int exit_status; +int exit_status; + +/* True if we are operating under UTF-8 locale. */ +bool u8_locale; static char const shortopts[] = "it:0::1::2::3::4::5::6::7::8::9::"; @@ -125,138 +138,6 @@ With no FILE, or when FILE is -, read standard input.\n\ exit (status); } -/* Add tab stop TABVAL to the end of 'tab_list'. */ - -static void -add_tab_stop (uintmax_t tabval) -{ - if (first_free_tab == n_tabs_allocated) - tab_list = X2NREALLOC (tab_list, &n_tabs_allocated); - tab_list[first_free_tab++] = tabval; -} - -/* Add the comma or blank separated list of tab stops STOPS - to the list of tab stops. */ - -static void -parse_tab_stops (char const *stops) -{ - bool have_tabval = false; - uintmax_t tabval IF_LINT ( = 0); - char const *num_start IF_LINT ( = NULL); - bool ok = true; - - for (; *stops; stops++) - { - if (*stops == ',' || isblank (to_uchar (*stops))) - { - if (have_tabval) - add_tab_stop (tabval); - have_tabval = false; - } - else if (ISDIGIT (*stops)) - { - if (!have_tabval) - { - tabval = 0; - have_tabval = true; - num_start = stops; - } - - /* Detect overflow. 
*/ - if (!DECIMAL_DIGIT_ACCUMULATE (tabval, *stops - '0', uintmax_t)) - { - size_t len = strspn (num_start, "0123456789"); - char *bad_num = xstrndup (num_start, len); - error (0, 0, _("tab stop is too large %s"), quote (bad_num)); - free (bad_num); - ok = false; - stops = num_start + len - 1; - } - } - else - { - error (0, 0, _("tab size contains invalid character(s): %s"), - quote (stops)); - ok = false; - break; - } - } - - if (!ok) - exit (EXIT_FAILURE); - - if (have_tabval) - add_tab_stop (tabval); -} - -/* Check that the list of tab stops TABS, with ENTRIES entries, - contains only nonzero, ascending values. */ - -static void -validate_tab_stops (uintmax_t const *tabs, size_t entries) -{ - uintmax_t prev_tab = 0; - size_t i; - - for (i = 0; i < entries; i++) - { - if (tabs[i] == 0) - error (EXIT_FAILURE, 0, _("tab size cannot be 0")); - if (tabs[i] <= prev_tab) - error (EXIT_FAILURE, 0, _("tab sizes must be ascending")); - prev_tab = tabs[i]; - } -} - -/* Close the old stream pointer FP if it is non-NULL, - and return a new one opened to read the next input file. - Open a filename of '-' as the standard input. - Return NULL if there are no more input files. */ - -static FILE * -next_file (FILE *fp) -{ - static char *prev_file; - char *file; - - if (fp) - { - if (ferror (fp)) - { - error (0, errno, "%s", prev_file); - exit_status = EXIT_FAILURE; - } - if (STREQ (prev_file, "-")) - clearerr (fp); /* Also clear EOF. */ - else if (fclose (fp) != 0) - { - error (0, errno, "%s", prev_file); - exit_status = EXIT_FAILURE; - } - } - - while ((file = *file_list++) != NULL) - { - if (STREQ (file, "-")) - { - have_read_stdin = true; - fp = stdin; - } - else - fp = fopen (file, "r"); - if (fp) - { - prev_file = file; - fadvise (fp, FADVISE_SEQUENTIAL); - return fp; - } - error (0, errno, "%s", file); - exit_status = EXIT_FAILURE; - } - return NULL; -} - /* Change tabs to spaces, writing to stdout. Read each file in 'file_list', in order. 
*/ @@ -265,19 +146,20 @@ expand (void) { /* Input stream. */ FILE *fp = next_file (NULL); + char *rawline = NULL; + uint8_t *line; + uint32_t c; + size_t clen = 0, offt; + ssize_t rawlen, dlen; if (!fp) return; while (true) { - /* Input character, or EOF. */ - int c; - /* If true, perform translations. */ bool convert = true; - /* The following variables have valid values only when CONVERT is true: */ @@ -287,14 +169,33 @@ expand (void) /* Index in TAB_LIST of next tab stop to examine. */ size_t tab_index = 0; - /* Convert a line of text. */ + while ((rawlen = getline (&rawline, &offt, fp)) == -1 + && (fp = next_file (fp))) + continue; + + if (!fp) + { + free (rawline); + return; + } +#if HAVE_LIBUNISTRING + if (u8_locale) + line = u8_strconv_from_locale (rawline); + else +#endif + { + line = (uint8_t *) xstrdup (rawline); + clen = dlen = 1; + } - do - { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + free (rawline); + rawline = NULL; + offt = 0; + while (offt != rawlen) + { + c = mb_index (line, &clen, &dlen, offt); if (convert) { if (c == '\t') @@ -339,7 +240,7 @@ expand (void) } else { - column++; + column += dlen; if (!column) error (EXIT_FAILURE, 0, _("input line is too long")); } @@ -347,13 +248,15 @@ expand (void) convert &= convert_entire_line || !! 
isblank (c); } - if (c < 0) - return; + if (c == ' ') + putchar (c); + else + fwrite (line + offt, 1, clen, stdout); + + offt += clen; - if (putchar (c) < 0) - error (EXIT_FAILURE, errno, _("write error")); } - while (c != '\n'); + free (line); } } @@ -376,6 +279,10 @@ main (int argc, char **argv) tab_list = NULL; first_free_tab = 0; +#if HAVE_LIBUNISTRING + u8_locale = STREQ ("UTF-8", locale_charset ()); +#endif + while ((c = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1) { switch (c) diff --git a/src/local.mk b/src/local.mk index 982cd4d..35616e4 100644 --- a/src/local.mk +++ b/src/local.mk @@ -118,6 +118,7 @@ src_du_LDADD = $(LDADD) src_echo_LDADD = $(LDADD) src_env_LDADD = $(LDADD) src_expand_LDADD = $(LDADD) +src_expand_LDADD += $(LIBUNISTRING) src_expr_LDADD = $(LDADD) src_factor_LDADD = $(LDADD) src_false_LDADD = $(LDADD) @@ -195,6 +196,7 @@ src_tsort_LDADD = $(LDADD) src_tty_LDADD = $(LDADD) src_uname_LDADD = $(LDADD) src_unexpand_LDADD = $(LDADD) +src_unexpand_LDADD += $(LIBUNISTRING) src_uniq_LDADD = $(LDADD) src_unlink_LDADD = $(LDADD) src_uptime_LDADD = $(LDADD) @@ -288,6 +290,7 @@ src_stdbuf_LDADD += $(LIBICONV) src_timeout_LDADD += $(LIBICONV) src_truncate_LDADD += $(LIBICONV) + # for canon_host src_pinky_LDADD += $(GETADDRINFO_LIB) src_who_LDADD += $(GETADDRINFO_LIB) @@ -325,6 +328,8 @@ src___SOURCES = src/lbracket.c src_cp_SOURCES = src/cp.c $(copy_sources) src_dir_SOURCES = src/ls.c src/ls-dir.c +src_expand_SOURCES = src/expand.c src/expand-core.c +src_unexpand_SOURCES = src/unexpand.c src/expand-core.c src_vdir_SOURCES = src/ls.c src/ls-vdir.c src_id_SOURCES = src/id.c src/group-list.c src_groups_SOURCES = src/groups.c src/group-list.c diff --git a/src/unexpand.c b/src/unexpand.c index 1803cd5..3c1d212 100644 --- a/src/unexpand.c +++ b/src/unexpand.c @@ -38,10 +38,21 @@ #include #include #include + +#if HAVE_LIBUNISTRING +# include +# include +# include +# include +# include +# include +#endif + +#include "expand-core.h" + #include 
"system.h" #include "error.h" #include "fadvise.h" -#include "quote.h" #include "xstrndup.h" /* The official name of this program (e.g., no 'g' prefix). */ @@ -62,17 +73,17 @@ static size_t max_column_width; /* Array of the explicit column numbers of the tab stops; after 'tab_list' is exhausted, the rest of the line is printed unchanged. The first column is column 0. */ -static uintmax_t *tab_list; +uintmax_t *tab_list; /* The number of allocated entries in 'tab_list'. */ -static size_t n_tabs_allocated; +size_t n_tabs_allocated; /* The index of the first invalid element of 'tab_list', where the next element can be added. */ -static size_t first_free_tab; +size_t first_free_tab; /* Null-terminated array of input filenames. */ -static char **file_list; +char **file_list; /* Default for 'file_list' if no files are given on the command line. */ static char *stdin_argv[] = @@ -81,10 +92,13 @@ static char *stdin_argv[] = }; /* True if we have ever read standard input. */ -static bool have_read_stdin; +bool have_read_stdin; /* The desired exit status. */ -static int exit_status; +int exit_status; + +/* True if we are operating under UTF-8 locale. */ +bool u8_locale; /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. */ @@ -134,17 +148,13 @@ With no FILE, or when FILE is -, read standard input.\n\ exit (status); } -/* Add tab stop TABVAL to the end of 'tab_list'. */ - static void -add_tab_stop (uintmax_t tabval) +set_max_width (uintmax_t tabval) { uintmax_t prev_column = first_free_tab ? tab_list[first_free_tab - 1] : 0; uintmax_t column_width = prev_column <= tabval ? 
tabval - prev_column : 0; - if (first_free_tab == n_tabs_allocated) - tab_list = X2NREALLOC (tab_list, &n_tabs_allocated); - tab_list[first_free_tab++] = tabval; + add_tab_stop (tabval); if (max_column_width < column_width) { @@ -154,128 +164,6 @@ add_tab_stop (uintmax_t tabval) } } -/* Add the comma or blank separated list of tab stops STOPS - to the list of tab stops. */ - -static void -parse_tab_stops (char const *stops) -{ - bool have_tabval = false; - uintmax_t tabval IF_LINT ( = 0); - char const *num_start IF_LINT ( = NULL); - bool ok = true; - - for (; *stops; stops++) - { - if (*stops == ',' || isblank (to_uchar (*stops))) - { - if (have_tabval) - add_tab_stop (tabval); - have_tabval = false; - } - else if (ISDIGIT (*stops)) - { - if (!have_tabval) - { - tabval = 0; - have_tabval = true; - num_start = stops; - } - - /* Detect overflow. */ - if (!DECIMAL_DIGIT_ACCUMULATE (tabval, *stops - '0', uintmax_t)) - { - size_t len = strspn (num_start, "0123456789"); - char *bad_num = xstrndup (num_start, len); - error (0, 0, _("tab stop is too large %s"), quote (bad_num)); - free (bad_num); - ok = false; - stops = num_start + len - 1; - } - } - else - { - error (0, 0, _("tab size contains invalid character(s): %s"), - quote (stops)); - ok = false; - break; - } - } - - if (!ok) - exit (EXIT_FAILURE); - - if (have_tabval) - add_tab_stop (tabval); -} - -/* Check that the list of tab stops TABS, with ENTRIES entries, - contains only nonzero, ascending values. */ - -static void -validate_tab_stops (uintmax_t const *tabs, size_t entries) -{ - uintmax_t prev_tab = 0; - size_t i; - - for (i = 0; i < entries; i++) - { - if (tabs[i] == 0) - error (EXIT_FAILURE, 0, _("tab size cannot be 0")); - if (tabs[i] <= prev_tab) - error (EXIT_FAILURE, 0, _("tab sizes must be ascending")); - prev_tab = tabs[i]; - } -} - -/* Close the old stream pointer FP if it is non-NULL, - and return a new one opened to read the next input file. - Open a filename of '-' as the standard input. 
- Return NULL if there are no more input files. */ - -static FILE * -next_file (FILE *fp) -{ - static char *prev_file; - char *file; - - if (fp) - { - if (ferror (fp)) - { - error (0, errno, "%s", prev_file); - exit_status = EXIT_FAILURE; - } - if (STREQ (prev_file, "-")) - clearerr (fp); /* Also clear EOF. */ - else if (fclose (fp) != 0) - { - error (0, errno, "%s", prev_file); - exit_status = EXIT_FAILURE; - } - } - - while ((file = *file_list++) != NULL) - { - if (STREQ (file, "-")) - { - have_read_stdin = true; - fp = stdin; - } - else - fp = fopen (file, "r"); - if (fp) - { - prev_file = file; - fadvise (fp, FADVISE_SEQUENTIAL); - return fp; - } - error (0, errno, "%s", file); - exit_status = EXIT_FAILURE; - } - return NULL; -} - /* Change blanks to tabs, writing to stdout. Read each file in 'file_list', in order. */ @@ -284,11 +172,16 @@ unexpand (void) { /* Input stream. */ FILE *fp = next_file (NULL); + char *rawline = NULL; + uint8_t *line, *pending_start = NULL; + uint32_t c; + size_t clen, offt, pending_first = 0, pending_len = 0; + ssize_t rawlen, dlen; /* The array of pending blanks. In non-POSIX locales, blanks can include characters other than spaces, so the blanks must be stored, not merely counted. */ - char *pending_blank; + uint32_t *pending_blank; if (!fp) return; @@ -296,13 +189,10 @@ unexpand (void) /* The worst case is a non-blank character, then one blank, then a tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ - pending_blank = xmalloc (max_column_width); + pending_blank = xmalloc (max_column_width * sizeof (uint32_t)); while (true) { - /* Input character, or EOF. */ - int c; - /* If true, perform translations. */ bool convert = true; @@ -333,17 +223,49 @@ unexpand (void) /* Convert a line of text. 
*/ - do - { - while ((c = getc (fp)) < 0 && (fp = next_file (fp))) - continue; + while ((rawlen = getline (&rawline, &offt, fp)) == -1 + && (fp = next_file (fp))) + continue; + + if (!fp) + { + free (rawline); + free (pending_blank); + return; + } +#if HAVE_LIBUNISTRING + if (u8_locale) + line = u8_strconv_from_locale (rawline); + else +#endif + { + line = (uint8_t *) xstrdup (rawline); + clen = dlen = 1; + } + offt = 0; + free (rawline); + rawline = NULL; + + while (offt != rawlen) + { + c = mb_index (line, &clen, &dlen, offt); if (convert) { +#if HAVE_LIBUNISTRING + bool blank = !! uc_is_blank (c); +#else bool blank = !! isblank (c); +#endif if (blank) { + /* First pending character. */ + if (!pending_first) + { + pending_start = line + offt; + pending_first = clen; + } if (next_tab_column <= column) { if (tab_size) @@ -381,7 +303,7 @@ unexpand (void) } else { - column++; + column += dlen; if (! (prev_blank && column == next_tab_column)) { @@ -390,7 +312,9 @@ unexpand (void) if (column == next_tab_column) one_blank_before_tab_stop = true; pending_blank[pending++] = c; + pending_len += clen; prev_blank = true; + offt += clen; continue; } @@ -401,6 +325,12 @@ unexpand (void) /* Discard pending blanks, unless it was a single blank just before the previous tab stop. */ pending = one_blank_before_tab_stop; + if (!pending) + { + pending_len = 0; + pending_first = 0; + pending_start = NULL; + } } } else if (c == '\b') @@ -413,7 +343,7 @@ unexpand (void) } else { - column++; + column += dlen; if (!column) error (EXIT_FAILURE, 0, _("input line is too long")); } @@ -422,9 +352,18 @@ unexpand (void) { if (pending > 1 && one_blank_before_tab_stop) pending_blank[0] = '\t'; - if (fwrite (pending_blank, 1, pending, stdout) != pending) - error (EXIT_FAILURE, errno, _("write error")); + /* There's never a tab in pending_blank[0] taken from input, + so skip past first pending character. 
*/ + if (pending_blank[0] == '\t') + { + pending_len += pending_first - 1; + putchar ('\t'); + } + fwrite (pending_start, 1, pending_len, stdout); pending = 0; + pending_len = 0; + pending_first = 0; + pending_start = NULL; one_blank_before_tab_stop = false; } @@ -432,16 +371,14 @@ unexpand (void) convert &= convert_entire_line || blank; } - if (c < 0) - { - free (pending_blank); - return; - } + if (c == '\t') + putchar (c); + else + fwrite (line + offt, 1, clen, stdout); - if (putchar (c) < 0) - error (EXIT_FAILURE, errno, _("write error")); + offt += clen; } - while (c != '\n'); + free (line); } } @@ -469,6 +406,9 @@ main (int argc, char **argv) convert_entire_line = false; tab_list = NULL; first_free_tab = 0; +#if HAVE_LIBUNISTRING + u8_locale = STREQ ("UTF-8", locale_charset ()); +#endif while ((c = getopt_long (argc, argv, ",0123456789at:", longopts, NULL)) != -1) @@ -489,7 +429,7 @@ main (int argc, char **argv) break; case ',': if (have_tabval) - add_tab_stop (tabval); + set_max_width (tabval); have_tabval = false; break; case_GETOPT_HELP_CHAR; @@ -510,7 +450,7 @@ main (int argc, char **argv) convert_entire_line = false; if (have_tabval) - add_tab_stop (tabval); + set_max_width (tabval); validate_tab_stops (tab_list, first_free_tab); diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh new file mode 100755 index 0000000..136f6d1 --- /dev/null +++ b/tests/expand/mb.sh @@ -0,0 +1,69 @@ +#!/bin/sh + +# Copyright (C) 2012-2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ expand + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat > in <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +cat > exp <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with a display width larger than 1 +cat > in <<\EOF +12345678 +e |ascii(1) +é |composed(1) +é |decomposed(1) +  |ideo-space(2) +- |full-hypen(2) +EOF + +cat > exp <<\EOF +12345678 +e |ascii(1) +é |composed(1) +é |decomposed(1) +  |ideo-space(2) +- |full-hypen(2) +EOF + +expand < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +exit $fail diff --git a/tests/local.mk b/tests/local.mk index 9be1970..9dcba02 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -502,6 +502,7 @@ all_tests = \ tests/du/threshold.sh \ tests/du/trailing-slash.sh \ tests/du/two-args.sh \ + tests/expand/mb.sh \ tests/id/gnu-zero-uids.sh \ tests/id/no-context.sh \ tests/install/basic-1.sh \ @@ -631,6 +632,7 @@ all_tests = \ tests/touch/read-only.sh \ tests/touch/relative.sh \ tests/touch/trailing-slash.sh \ + tests/unexpand/mb.sh \ $(all_root_tests) # See tests/factor/create-test.sh. diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh new file mode 100755 index 0000000..64a997a --- /dev/null +++ b/tests/unexpand/mb.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright (C) 2012-2013 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+ +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ unexpand + +export LC_ALL=en_US.UTF-8 + +#input containing multibyte characters +cat > in <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +cat > exp <<\EOF +1234567812345678123456781 +. . . . +a b c d +. . . . +ä ö ü ß +. . . . + äöü . öüä. ä xx +EOF + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test characters with a display width larger than 1 +cat > in <<\EOF +12345678 +e |ascii(1) +é |composed(1) +é |decomposed(1) +e  |ideo-space(2) +- |full-hypen(2) +EOF + +cat > exp <<\EOF +12345678 +e |ascii(1) +é |composed(1) +é |decomposed(1) +e |ideo-space(2) +- |full-hypen(2) +EOF + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 + +#test input where a blank of width > 1 is not being substituted +in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')" +exp='   ö ü ß' + +unexpand -a < in > out || fail=1 +compare exp out > /dev/null 2>&1 || fail=1 -- 1.7.11.7