[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: split the linebreak module
From: |
Bruno Haible |
Subject: |
Re: split the linebreak module |
Date: |
Sat, 10 May 2008 15:26:28 +0200 |
User-agent: |
KMail/1.5.4 |
And this patch changes the linebreak functions to use the u8_conv_from_encoding
function rather than rolling their own copy of the iconv-based conversion.
2008-05-10 Bruno Haible <address@hidden>
* lib/unilbrk/ulc-common.c: Don't include <stdlib.h>.
(iconv_string_length, iconv_string_keeping_offsets): Remove functions.
* lib/unilbrk/ulc-common.h (iconv_string_length,
iconv_string_keeping_offsets): Remove declarations.
* lib/unilbrk/ulc-possible-linebreaks.c: Include <string.h>, uniconv.h.
Don't include <iconv.h>, streq.h, xsize.h.
(ulc_possible_linebreaks): Use u8_conv_from_encoding for doing the
conversion.
* lib/unilbrk/ulc-width-linebreaks.c: Include uniconv.h. Don't include
<iconv.h>, streq.h, xsize.h.
(ulc_width_linebreaks): Use u8_conv_from_encoding for doing the
conversion.
* modules/unilbrk/ulc-common (Depends-on): Remove iconv.
* modules/unilbrk/ulc-possible-linebreaks (Depends-on): Add
uniconv/u8-conv-from-enc. Remove iconv_open, streq, xsize.
* modules/unilbrk/ulc-width-linebreaks (Depends-on): Likewise.
*** lib/unilbrk/ulc-possible-linebreaks.c.orig 2008-05-10 15:18:18.000000000 +0200
--- lib/unilbrk/ulc-possible-linebreaks.c 2008-05-10 15:15:33.000000000 +0200
***************
*** 21,33 ****
#include "unilbrk.h"
#include <stdlib.h>
! #if HAVE_ICONV
! # include <iconv.h>
! #endif
#include "c-ctype.h"
! #include "streq.h"
! #include "xsize.h"
#include "unilbrk/ulc-common.h"
/* Line breaking of a string in an arbitrary encoding.
--- 21,30 ----
#include "unilbrk.h"
#include <stdlib.h>
! #include <string.h>
#include "c-ctype.h"
! #include "uniconv.h"
#include "unilbrk/ulc-common.h"
/* Line breaking of a string in an arbitrary encoding.
***************
*** 47,138 ****
ulc_possible_linebreaks (const char *s, size_t n, const char *encoding,
char *p)
{
! if (n == 0)
! return;
! if (is_utf8_encoding (encoding))
! u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
! else
{
! #if HAVE_ICONV
! iconv_t to_utf8;
! /* Avoid glibc-2.1 bug with EUC-KR. */
! # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined
_LIBICONV_VERSION
! if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
! to_utf8 = (iconv_t)(-1);
! else
! # endif
! /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
! GB18030. */
! # if defined __sun && !defined _LIBICONV_VERSION
! if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
! || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
! || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
! || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K',
'S', 'C')
! || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
! || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0,
0))
! to_utf8 = (iconv_t)(-1);
else
- # endif
- to_utf8 = iconv_open (UTF8_NAME, encoding);
- if (to_utf8 != (iconv_t)(-1))
{
! /* Determine the length of the resulting UTF-8 string. */
! size_t m = iconv_string_length (to_utf8, s, n);
! if (m != (size_t)(-1))
{
! /* Convert the string to UTF-8 and build a translation table
! from offsets into s to offsets into the translated string. */
! size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
! char *memory =
! (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
! if (memory != NULL)
{
! size_t *offtable = (size_t *) memory;
! char *t = (char *) (offtable + n);
! char *q = (char *) (t + m);
! size_t i;
!
! iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
!
! /* Determine the possible line breaks of the UTF-8 string. */
! u8_possible_linebreaks ((const uint8_t *) t, m, encoding, q);
!
! /* Translate the result back to the original string. */
! memset (p, UC_BREAK_PROHIBITED, n);
! for (i = 0; i < n; i++)
! if (offtable[i] != (size_t)(-1))
! p[i] = q[offtable[i]];
!
! free (memory);
! iconv_close (to_utf8);
! return;
}
}
! iconv_close (to_utf8);
! }
! #endif
! /* Impossible to convert. */
#if C_CTYPE_ASCII
! if (is_all_ascii (s, n))
! {
! /* ASCII is a subset of UTF-8. */
! u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
! return;
! }
#endif
! /* We have a non-ASCII string and cannot convert it.
! Don't produce line breaks except those already present in the
! input string. All we assume here is that the encoding is
! minimally ASCII compatible. */
! {
! const char *s_end = s + n;
! while (s < s_end)
{
! *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
! s++;
! p++;
}
! }
}
}
--- 44,116 ----
ulc_possible_linebreaks (const char *s, size_t n, const char *encoding,
char *p)
{
! if (n > 0)
{
! if (is_utf8_encoding (encoding))
! u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
else
{
! /* Convert the string to UTF-8 and build a translation table
! from offsets into s to offsets into the translated string. */
! size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
!
! if (offsets != NULL)
{
! uint8_t *t = NULL;
! size_t m;
! if (u8_conv_from_encoding (encoding, iconveh_question_mark,
! s, n, offsets, &t, &m)
! == 0)
{
! char *q = (char *) malloc (m);
!
! if (q != NULL)
! {
! size_t i;
!
! /* Determine the possible line breaks of the UTF-8
! string. */
! u8_possible_linebreaks (t, m, encoding, q);
!
! /* Translate the result back to the original string. */
! memset (p, UC_BREAK_PROHIBITED, n);
! for (i = 0; i < n; i++)
! if (offsets[i] != (size_t)(-1))
! p[i] = q[offsets[i]];
!
! free (q);
! free (t);
! free (offsets);
! return;
! }
! free (t);
}
+ free (offsets);
}
!
! /* Impossible to convert. */
#if C_CTYPE_ASCII
! if (is_all_ascii (s, n))
! {
! /* ASCII is a subset of UTF-8. */
! u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
! return;
! }
#endif
! /* We have a non-ASCII string and cannot convert it.
! Don't produce line breaks except those already present in the
! input string. All we assume here is that the encoding is
! minimally ASCII compatible. */
{
! const char *s_end = s + n;
! while (s < s_end)
! {
! *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
! s++;
! p++;
! }
}
! }
}
}
*** lib/unilbrk/ulc-width-linebreaks.c.orig 2008-05-10 15:18:18.000000000 +0200
--- lib/unilbrk/ulc-width-linebreaks.c 2008-05-10 15:15:25.000000000 +0200
***************
*** 22,34 ****
#include <stdlib.h>
#include <string.h>
- #if HAVE_ICONV
- # include <iconv.h>
- #endif
#include "c-ctype.h"
! #include "streq.h"
! #include "xsize.h"
#include "unilbrk/ulc-common.h"
/* Line breaking of a string in an arbitrary encoding.
--- 22,30 ----
#include <stdlib.h>
#include <string.h>
#include "c-ctype.h"
! #include "uniconv.h"
#include "unilbrk/ulc-common.h"
/* Line breaking of a string in an arbitrary encoding.
***************
*** 50,162 ****
const char *o, const char *encoding,
char *p)
{
! if (n == 0)
! return start_column;
! if (is_utf8_encoding (encoding))
! return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column,
at_end_columns, o, encoding, p);
! else
{
! #if HAVE_ICONV
! iconv_t to_utf8;
! /* Avoid glibc-2.1 bug with EUC-KR. */
! # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined
_LIBICONV_VERSION
! if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
! to_utf8 = (iconv_t)(-1);
else
- # endif
- /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
- GB18030. */
- # if defined __sun && !defined _LIBICONV_VERSION
- if ( STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
- || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
- || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
- || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K',
'S', 'C')
- || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
- || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0,
0))
- to_utf8 = (iconv_t)(-1);
- else
- # endif
- to_utf8 = iconv_open (UTF8_NAME, encoding);
- if (to_utf8 != (iconv_t)(-1))
{
! /* Determine the length of the resulting UTF-8 string. */
! size_t m = iconv_string_length (to_utf8, s, n);
! if (m != (size_t)(-1))
{
! /* Convert the string to UTF-8 and build a translation table
! from offsets into s to offsets into the translated string. */
! size_t memory_size =
! xsum4 (xtimes (n, sizeof (size_t)), m, m,
! (o != NULL ? m : 0));
! char *memory =
! (char *)
! (size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
! if (memory != NULL)
{
! size_t *offtable = (size_t *) memory;
! char *t = (char *) (offtable + n);
! char *q = (char *) (t + m);
! char *o8 = (o != NULL ? (char *) (q + m) : NULL);
! int res_column;
! size_t i;
! iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
!
! /* Translate the overrides to the UTF-8 string. */
! if (o != NULL)
{
! memset (o8, UC_BREAK_UNDEFINED, m);
for (i = 0; i < n; i++)
! if (offtable[i] != (size_t)(-1))
! o8[offtable[i]] = o[i];
! }
! /* Determine the line breaks of the UTF-8 string. */
! res_column =
! u8_width_linebreaks ((const uint8_t *) t, m, width,
start_column, at_end_columns, o8, encoding, q);
!
! /* Translate the result back to the original string. */
! memset (p, UC_BREAK_PROHIBITED, n);
! for (i = 0; i < n; i++)
! if (offtable[i] != (size_t)(-1))
! p[i] = q[offtable[i]];
!
! free (memory);
! iconv_close (to_utf8);
! return res_column;
}
}
! iconv_close (to_utf8);
! }
! #endif
! /* Impossible to convert. */
#if C_CTYPE_ASCII
! if (is_all_ascii (s, n))
! {
! /* ASCII is a subset of UTF-8. */
! return u8_width_linebreaks ((const uint8_t *) s, n, width,
start_column, at_end_columns, o, encoding, p);
! }
#endif
! /* We have a non-ASCII string and cannot convert it.
! Don't produce line breaks except those already present in the
! input string. All we assume here is that the encoding is
! minimally ASCII compatible. */
! {
! const char *s_end = s + n;
! while (s < s_end)
{
! *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
! ? UC_BREAK_MANDATORY
! : UC_BREAK_PROHIBITED);
! s++;
! p++;
! if (o != NULL)
! o++;
}
! /* We cannot compute widths in this case. */
! return start_column;
! }
}
}
--- 46,135 ----
const char *o, const char *encoding,
char *p)
{
! if (n > 0)
{
! if (is_utf8_encoding (encoding))
! return u8_width_linebreaks ((const uint8_t *) s, n, width,
start_column, at_end_columns, o, encoding, p);
else
{
! /* Convert the string to UTF-8 and build a translation table
! from offsets into s to offsets into the translated string. */
! size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
!
! if (offsets != NULL)
{
! uint8_t *t = NULL;
! size_t m;
! if (u8_conv_from_encoding (encoding, iconveh_question_mark,
! s, n, offsets, &t, &m)
! == 0)
{
! char *memory = (char *) malloc (m + (o != NULL ? m : 0));
! if (memory != NULL)
{
! char *q = (char *) memory;
! char *o8 = (o != NULL ? (char *) (q + m) : NULL);
! int res_column;
! size_t i;
!
! /* Translate the overrides to the UTF-8 string. */
! if (o != NULL)
! {
! memset (o8, UC_BREAK_UNDEFINED, m);
! for (i = 0; i < n; i++)
! if (offsets[i] != (size_t)(-1))
! o8[offsets[i]] = o[i];
! }
!
! /* Determine the line breaks of the UTF-8 string. */
! res_column =
! u8_width_linebreaks (t, m, width, start_column,
at_end_columns, o8, encoding, q);
!
! /* Translate the result back to the original string. */
! memset (p, UC_BREAK_PROHIBITED, n);
for (i = 0; i < n; i++)
! if (offsets[i] != (size_t)(-1))
! p[i] = q[offsets[i]];
! free (memory);
! free (t);
! free (offsets);
! return res_column;
! }
! free (t);
}
+ free (offsets);
}
! /* Impossible to convert. */
#if C_CTYPE_ASCII
! if (is_all_ascii (s, n))
! {
! /* ASCII is a subset of UTF-8. */
! return u8_width_linebreaks ((const uint8_t *) s, n, width,
start_column, at_end_columns, o, encoding, p);
! }
#endif
! /* We have a non-ASCII string and cannot convert it.
! Don't produce line breaks except those already present in the
! input string. All we assume here is that the encoding is
! minimally ASCII compatible. */
{
! const char *s_end = s + n;
! while (s < s_end)
! {
! *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
! ? UC_BREAK_MANDATORY
! : UC_BREAK_PROHIBITED);
! s++;
! p++;
! if (o != NULL)
! o++;
! }
! /* We cannot compute widths in this case. */
}
! }
}
+ return start_column;
}
*** lib/unilbrk/ulc-common.h.orig 2008-05-10 15:18:18.000000000 +0200
--- lib/unilbrk/ulc-common.h 2008-05-10 15:01:37.000000000 +0200
***************
*** 23,44 ****
#define is_utf8_encoding unilbrk_is_utf8_encoding
extern int is_utf8_encoding (const char *encoding);
- #if HAVE_ICONV
-
- # include <iconv.h>
-
- /* Luckily, the encoding's name is platform independent. */
- # define UTF8_NAME "UTF-8"
-
- /* Return the length of a string after conversion through an iconv_t. */
- # define iconv_string_length unilbrk_iconv_string_length
- extern size_t iconv_string_length (iconv_t cd, const char *s, size_t n);
-
- # define iconv_string_keeping_offsets unilbrk_iconv_string_keeping_offsets
- extern void iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t
n, size_t *offtable, char *t, size_t m);
-
- #endif /* HAVE_ICONV */
-
#if C_CTYPE_ASCII
# define is_all_ascii unilbrk_is_all_ascii
--- 23,28 ----
*** lib/unilbrk/ulc-common.c.orig 2008-05-10 15:18:18.000000000 +0200
--- lib/unilbrk/ulc-common.c 2008-05-10 15:02:10.000000000 +0200
***************
*** 20,27 ****
/* Specification. */
#include "unilbrk/ulc-common.h"
- #include <stdlib.h>
-
#include "c-ctype.h"
#include "streq.h"
--- 20,25 ----
***************
*** 33,154 ****
return 0;
}
- #if HAVE_ICONV
-
- # include <errno.h>
-
- size_t
- iconv_string_length (iconv_t cd, const char *s, size_t n)
- {
- # define TMPBUFSIZE 4096
- size_t count = 0;
- char tmpbuf[TMPBUFSIZE];
- const char *inptr = s;
- size_t insize = n;
-
- while (insize > 0)
- {
- char *outptr = tmpbuf;
- size_t outsize = TMPBUFSIZE;
- size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr,
&outsize);
- if (res == (size_t)(-1) && errno != E2BIG
- # if !defined _LIBICONV_VERSION && !defined __GLIBC__
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. */
- || res > 0
- # endif
- )
- return (size_t)(-1);
- count += outptr - tmpbuf;
- }
- /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug. */
- # if defined _LIBICONV_VERSION \
- || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
- {
- char *outptr = tmpbuf;
- size_t outsize = TMPBUFSIZE;
- size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
- if (res == (size_t)(-1))
- return (size_t)(-1);
- count += outptr - tmpbuf;
- }
- /* Return to the initial state. */
- iconv (cd, NULL, NULL, NULL, NULL);
- # endif
- return count;
- # undef TMPBUFSIZE
- }
-
- void
- iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
- size_t *offtable, char *t, size_t m)
- {
- size_t i;
- const char *s_end;
- const char *inptr;
- char *outptr;
- size_t outsize;
- /* Avoid glibc-2.1 bug. */
- # if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0
<= 1)
- const size_t extra = 1;
- # else
- const size_t extra = 0;
- # endif
-
- for (i = 0; i < n; i++)
- offtable[i] = (size_t)(-1);
-
- s_end = s + n;
- inptr = s;
- outptr = t;
- outsize = m + extra;
- while (inptr < s_end)
- {
- const char *saved_inptr;
- size_t insize;
- size_t res;
-
- offtable[inptr - s] = outptr - t;
-
- saved_inptr = inptr;
- res = (size_t)(-1);
- for (insize = 1; inptr + insize <= s_end; insize++)
- {
- res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr,
&outsize);
- if (!(res == (size_t)(-1) && errno == EINVAL))
- break;
- /* We expect that no input bytes have been consumed so far. */
- if (inptr != saved_inptr)
- abort ();
- }
- /* After we verified the convertibility and computed the translation's
- size m, there shouldn't be any conversion error here. */
- if (res == (size_t)(-1)
- # if !defined _LIBICONV_VERSION && !defined __GLIBC__
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot convert.
- Only GNU libiconv and GNU libc are known to prefer to fail rather
- than doing a lossy conversion. */
- || res > 0
- # endif
- )
- abort ();
- }
- /* Avoid glibc-2.1 bug and Solaris 7 bug. */
- # if defined _LIBICONV_VERSION \
- || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
- if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
- abort ();
- # endif
- /* We should have produced exactly m output bytes. */
- if (outsize != extra)
- abort ();
- }
-
- #endif /* HAVE_ICONV */
-
#if C_CTYPE_ASCII
/* Tests whether a string is entirely ASCII. Returns 1 if yes.
--- 31,36 ----
*** modules/unilbrk/ulc-possible-linebreaks.orig 2008-05-10 15:18:18.000000000 +0200
--- modules/unilbrk/ulc-possible-linebreaks 2008-05-10 15:05:20.000000000 +0200
***************
*** 8,17 ****
unilbrk/base
unilbrk/u8-possible-linebreaks
unilbrk/ulc-common
c-ctype
- iconv_open
- streq
- xsize
configure.ac:
--- 8,15 ----
unilbrk/base
unilbrk/u8-possible-linebreaks
unilbrk/ulc-common
+ uniconv/u8-conv-from-enc
c-ctype
configure.ac:
*** modules/unilbrk/ulc-width-linebreaks.orig 2008-05-10 15:18:18.000000000 +0200
--- modules/unilbrk/ulc-width-linebreaks 2008-05-10 15:05:21.000000000 +0200
***************
*** 8,17 ****
unilbrk/base
unilbrk/u8-width-linebreaks
unilbrk/ulc-common
c-ctype
- iconv_open
- streq
- xsize
configure.ac:
--- 8,15 ----
unilbrk/base
unilbrk/u8-width-linebreaks
unilbrk/ulc-common
+ uniconv/u8-conv-from-enc
c-ctype
configure.ac:
*** modules/unilbrk/ulc-common.orig 2008-05-10 15:18:18.000000000 +0200
--- modules/unilbrk/ulc-common 2008-05-10 15:03:02.000000000 +0200
***************
*** 7,13 ****
Depends-on:
c-ctype
- iconv
streq
configure.ac:
--- 7,12 ----