new modules for Unicode normalization

bug-gnulib
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
new modules for Unicode normalization

From:	Bruno Haible
Subject:	new modules for Unicode normalization
Date:	Sat, 21 Feb 2009 13:00:09 +0100
User-agent:	KMail/1.9.9
Planned for a long time. I've added modules for Unicode normalization forms.

Here is the API, the ChangeLog entry (without the tests), and the core of
the code. For the rest, please look in lib/uninorm/*.

================================ lib/uninorm.h ================================
/* Normalization forms (composition and decomposition) of Unicode strings.
   Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc.
   Written by Bruno Haible <address@hidden>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

#ifndef _UNINORM_H
#define _UNINORM_H

/* Get size_t.  */
#include <stddef.h>

#include "unitypes.h"


#ifdef __cplusplus
extern "C" {
#endif


/* Conventions:

   All functions prefixed with u8_ operate on UTF-8 encoded strings.
   Their unit is an uint8_t (1 byte).

   All functions prefixed with u16_ operate on UTF-16 encoded strings.
   Their unit is an uint16_t (a 2-byte word).

   All functions prefixed with u32_ operate on UCS-4 encoded strings.
   Their unit is an uint32_t (a 4-byte word).

   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
   n units.

   Functions returning a string result take a (resultbuf, lengthp) argument
   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
   allocated string is returned.  In both cases, *lengthp is set to the
   length (number of units) of the returned string.  In case of error,
   NULL is returned and errno is set.  */


enum
{
  UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
  UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
  UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
  UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
  UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
  UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
  UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
  UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
  UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
  UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
  UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
  UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility 
character. */
  UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility 
character. */
  UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
  UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
  UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
  UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility 
character. */
};

/* Maximum size of decomposition of a single Unicode character.  */
#define UC_DECOMPOSITION_MAX_LENGTH 32

/* Return the character decomposition mapping of a Unicode character.
   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
   ucs_t elements.
   When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
   filled and N is returned.  Otherwise -1 is returned.  */
extern int
       uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);

/* Return the canonical character decomposition mapping of a Unicode character.
   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
   ucs_t elements.
   When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
   returned.  Otherwise -1 is returned.  */
extern int
       uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);


/* Attempt to combine the Unicode characters uc1, uc2.
   uc1 is known to have canonical combining class 0.
   Return the combination of uc1 and uc2, if it exists.
   Return 0 otherwise.
   Not all decompositions can be recombined using this function.  See the
   Unicode file CompositionExclusions.txt for details.  */
extern ucs4_t
       uc_composition (ucs4_t uc1, ucs4_t uc2);


/* An object of type uninorm_t denotes a Unicode normalization form.  */
struct unicode_normalization_form;
typedef const struct unicode_normalization_form *uninorm_t;

/* UNINORM_NFD: Normalization form D: canonical decomposition.  */
extern const struct unicode_normalization_form uninorm_nfd;
#define UNINORM_NFD (&uninorm_nfd)

/* UNINORM_NFC: Normalization form C: canonical decomposition, then
   canonical composition.  */
extern const struct unicode_normalization_form uninorm_nfc;
#define UNINORM_NFC (&uninorm_nfc)

/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
extern const struct unicode_normalization_form uninorm_nfkd;
#define UNINORM_NFKD (&uninorm_nfkd)

/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
   canonical composition.  */
extern const struct unicode_normalization_form uninorm_nfkc;
#define UNINORM_NFKC (&uninorm_nfkc)

/* Test whether a normalization form does compatibility decomposition.  */
#define uninorm_is_compat_decomposing(nf) \
  ((* (const unsigned int *) (nf) >> 0) & 1)

/* Test whether a normalization form includes canonical composition.  */
#define uninorm_is_composing(nf) \
  ((* (const unsigned int *) (nf) >> 1) & 1)


/* Return the specified normalization form of a string.  */
extern uint8_t *
       u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
                     uint8_t *resultbuf, size_t *lengthp);
extern uint16_t *
       u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
                      uint16_t *resultbuf, size_t *lengthp);
extern uint32_t *
       u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
                      uint32_t *resultbuf, size_t *lengthp);


#ifdef __cplusplus
}
#endif


#endif /* _UNINORM_H */
==============================================================================
2009-02-21  Bruno Haible  <address@hidden>

        New module 'uninorm/nfkc'.
        * lib/uninorm/nfkc.c: New file.
        * modules/uninorm/nfkc: New file.

        New module 'uninorm/nfkd'.
        * lib/uninorm/nfkd.c: New file.
        * modules/uninorm/nfkd: New file.

        New module 'uninorm/nfc'.
        * lib/uninorm/nfc.c: New file.
        * modules/uninorm/nfc: New file.

        New module 'uninorm/nfd'.
        * lib/uninorm/nfd.c: New file.
        * modules/uninorm/nfd: New file.

        New module 'uninorm/u32-normalize'.
        * lib/uninorm/u32-normalize.c: New file.
        * modules/uninorm/u32-normalize: New file.

        New module 'uninorm/u16-normalize'.
        * lib/uninorm/u16-normalize.c: New file.
        * modules/uninorm/u16-normalize: New file.

        New module 'uninorm/u8-normalize'.
        * lib/uninorm/u8-normalize.c: New file.
        * lib/uninorm/normalize-internal.h: New file.
        * lib/uninorm/u-normalize-internal.h: New file.
        * modules/uninorm/u8-normalize: New file.

        New module 'uninorm/decompose-internal'.
        * lib/uninorm/decompose-internal.c: New file.
        * modules/uninorm/decompose-internal: New file.

        New module 'uninorm/composition'.
        * lib/uninorm/composition.c: New file.
        * lib/uninorm/composition-table.gperf: New file, generated by
        gen-uni-tables.
        * modules/uninorm/composition: New file.

        New module 'uninorm/compat-decomposition'.
        * lib/uninorm/decompose-internal.h: New file.
        * lib/uninorm/compat-decomposition.c: New file.
        * modules/uninorm/compat-decomposition: New file.

        New module 'uninorm/canonical-decomposition'.
        * lib/uninorm/canonical-decomposition.c: New file.
        * modules/uninorm/canonical-decomposition: New file.

        New module 'uninorm/decomposition'.
        * lib/uninorm/decomposition.c: New file.
        * modules/uninorm/decomposition: New file.

        New module 'uninorm/decomposition-table'.
        * lib/uninorm/decomposition-table.h: New file.
        * lib/uninorm/decomposition-table.c: New file.
        * lib/uninorm/decomposition-table1.h: New file, generated by
        gen-uni-tables.
        * lib/uninorm/decomposition-table2.h: New file, generated by
        gen-uni-tables.
        * modules/uninorm/decomposition-table: New file.

        * lib/gen-uni-tables.c (MAX_DECOMP_LENGTH): New macro.
        (UC_DECOMP_*): New enumeration items.
        (get_decomposition): New function.
        (struct decomp_table): New type.
        (output_decomposition, output_decomposition_tables): New functions.
        (unicode_composition_exclusions): New variable.
        (fill_composition_exclusions, debug_output_composition_tables): New
        functions.
        (main): Accept one more argument. Invoke fill_composition_exclusions.
        Output decomposition and composition tables.

        New module 'uninorm/base'.
        * lib/uninorm.h: New file.
        * lib/unictype.h: Update comment.
        * modules/uninorm/base: New file.

===================== lib/uninorm/u-normalize-internal.h =====================
/* Decomposition and composition of Unicode strings.
   Copyright (C) 2009 Free Software Foundation, Inc.
   Written by Bruno Haible <address@hidden>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

UNIT *
FUNC (uninorm_t nf, const UNIT *s, size_t n,
      UNIT *resultbuf, size_t *lengthp)
{
  int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer;
  ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer;

  /* The result being accumulated.  */
  UNIT *result;
  size_t length;
  size_t allocated;
  /* The buffer for sorting.  */
  #define SORTBUF_PREALLOCATED 64
  struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED];
  struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */
  size_t sortbuf_allocated;
  size_t sortbuf_count;

  /* Initialize the accumulator.  */
  if (resultbuf == NULL)
    {
      result = NULL;
      allocated = 0;
    }
  else
    {
      result = resultbuf;
      allocated = *lengthp;
    }
  length = 0;

  /* Initialize the buffer for sorting.  */
  sortbuf = sortbuf_preallocated;
  sortbuf_allocated = SORTBUF_PREALLOCATED;
  sortbuf_count = 0;

  {
    const UNIT *s_end = s + n;

    for (;;)
      {
        int count;
        ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH];
        int decomposed_count;
        int i;

        if (s < s_end)
          {
            /* Fetch the next character.  */
            count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s);
            decomposed_count = 1;

            /* Decompose it, recursively.
               It would be possible to precompute the recursive decomposition
               and store it in a table.  But this would significantly increase
               the size of the decomposition tables, because for example for
               U+1FC1 the recursive canonical decomposition and the recursive
               compatibility decomposition are different.  */
            {
              int curr;

              for (curr = 0; curr < decomposed_count; )
                {
                  /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e.
                     all elements are atomic.  */
                  ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH];
                  int curr_decomposed_count;

                  curr_decomposed_count = decomposer (decomposed[curr], 
curr_decomposed);
                  if (curr_decomposed_count >= 0)
                    {
                      /* Move curr_decomposed[0..curr_decomposed_count-1] over
                         decomposed[curr], making room.  It's not worth using
                         memcpy() here, since the counts are so small.  */
                      int shift = curr_decomposed_count - 1;

                      if (shift < 0)
                        abort ();
                      if (shift > 0)
                        {
                          int j;

                          decomposed_count += shift;
                          if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH)
                            abort ();
                          for (j = decomposed_count - 1 - shift; j > curr; j--)
                            decomposed[j + shift] = decomposed[j];
                        }
                      for (; shift >= 0; shift--)
                        decomposed[curr + shift] = curr_decomposed[shift];
                    }
                  else
                    {
                      /* decomposed[curr] is atomic.  */
                      curr++;
                    }
                }
            }
          }
        else
          {
            count = 0;
            decomposed_count = 0;
          }

        i = 0;
        for (;;)
          {
            ucs4_t uc;
            int ccc;

            if (s < s_end)
              {
                /* Fetch the next character from the decomposition.  */
                if (i == decomposed_count)
                  break;
                uc = decomposed[i];
                ccc = uc_combining_class (uc);
              }
            else
              {
                /* End of string reached.  */
                uc = 0;
                ccc = 0;
              }

            if (ccc == 0)
              {
                size_t j;

                /* Apply the canonical ordering algorithm to the accumulated
                   sequence of characters.  */
                if (sortbuf_count > 1)
                  gl_uninorm_decompose_merge_sort_inplace (sortbuf, 
sortbuf_count,
                                                           sortbuf + 
sortbuf_count);

                if (composer != NULL)
                  {
                    /* Attempt to combine decomposed characters, as specified
                       in the Unicode Standard Annex #15 "Unicode Normalization
                       Forms".  We need to check
                         1. whether the first accumulated character is a
                            "starter" (i.e. has ccc = 0).  This is usually the
                            case.  But when the string starts with a
                            non-starter, the sortbuf also starts with a
                            non-starter.  Btw, this check could also be
                            omitted, because the composition table has only
                            entries (code1, code2) for which code1 is a
                            starter; if the first accumulated character is not
                            a starter, no lookup will succeed.
                         2. If the sortbuf has more than one character, check
                            for each of these characters that are not "blocked"
                            from the starter (i.e. have a ccc that is higher
                            than the ccc of the previous character) whether it
                            can be combined with the first character.
                         3. If only one character is left in sortbuf, check
                            whether it can be combined with the next character
                            (also a starter).  */
                    if (sortbuf_count > 0 && sortbuf[0].ccc == 0)
                      {
                        for (j = 1; j < sortbuf_count; )
                          {
                            if (sortbuf[j].ccc > sortbuf[j - 1].ccc)
                              {
                                ucs4_t combined =
                                  composer (sortbuf[0].code, sortbuf[j].code);
                                if (combined)
                                  {
                                    size_t k;

                                    sortbuf[0].code = combined;
                                    /* sortbuf[0].ccc = 0, still valid.  */
                                    for (k = j + 1; k < sortbuf_count; k++)
                                      sortbuf[k - 1] = sortbuf[k];
                                    sortbuf_count--;
                                    continue;
                                  }
                              }
                            j++;
                          }
                        if (s < s_end && sortbuf_count == 1)
                          {
                            ucs4_t combined =
                              composer (sortbuf[0].code, uc);
                            if (combined)
                              {
                                uc = combined;
                                ccc = 0;
                                /* uc could be further combined with subsequent
                                   characters.  So don't put it into sortbuf[0] 
in
                                   this round, only in the next round.  */
                                sortbuf_count = 0;
                              }
                          }
                      }
                  }

                for (j = 0; j < sortbuf_count; j++)
                  {
                    ucs4_t muc = sortbuf[j].code;

                    /* Append muc to the result accumulator.  */
                    if (length < allocated)
                      {
                        int ret =
                          U_UCTOMB (result + length, muc, allocated - length);
                        if (ret == -1)
                          {
                            errno = EINVAL;
                            goto fail;
                          }
                        if (ret >= 0)
                          {
                            length += ret;
                            goto done_appending;
                          }
                      }
                    {
                      size_t old_allocated = allocated;
                      size_t new_allocated = 2 * old_allocated;
                      if (new_allocated < 64)
                        new_allocated = 64;
                      if (new_allocated < old_allocated) /* integer overflow? */
                        abort ();
                      {
                        UNIT *larger_result;
                        if (result == NULL)
                          {
                            larger_result =
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
                            if (larger_result == NULL)
                              {
                                errno = ENOMEM;
                                goto fail;
                              }
                          }
                        else if (result == resultbuf)
                          {
                            larger_result =
                              (UNIT *) malloc (new_allocated * sizeof (UNIT));
                            if (larger_result == NULL)
                              {
                                errno = ENOMEM;
                                goto fail;
                              }
                            U_CPY (larger_result, resultbuf, length);
                          }
                        else
                          {
                            larger_result =
                              (UNIT *) realloc (result, new_allocated * sizeof 
(UNIT));
                            if (larger_result == NULL)
                              {
                                errno = ENOMEM;
                                goto fail;
                              }
                          }
                        result = larger_result;
                        allocated = new_allocated;
                        {
                          int ret =
                            U_UCTOMB (result + length, muc, allocated - length);
                          if (ret == -1)
                            {
                              errno = EINVAL;
                              goto fail;
                            }
                          if (ret < 0)
                            abort ();
                          length += ret;
                          goto done_appending;
                        }
                      }
                    }
                   done_appending: ;
                  }

                /* sortbuf is now empty.  */
                sortbuf_count = 0;
              }

            if (!(s < s_end))
              /* End of string reached.  */
              break;

            /* Append (uc, ccc) to sortbuf.  */
            if (sortbuf_count == sortbuf_allocated)
              {
                struct ucs4_with_ccc *new_sortbuf;

                sortbuf_allocated = 2 * sortbuf_allocated;
                if (sortbuf_allocated < sortbuf_count) /* integer overflow? */
                  abort ();
                new_sortbuf =
                  (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * 
sizeof (struct ucs4_with_ccc));
                memcpy (new_sortbuf, sortbuf,
                        sortbuf_count * sizeof (struct ucs4_with_ccc));
                if (sortbuf != sortbuf_preallocated)
                  free (sortbuf);
                sortbuf = new_sortbuf;
              }
            sortbuf[sortbuf_count].code = uc;
            sortbuf[sortbuf_count].ccc = ccc;
            sortbuf_count++;

            i++;
          }

        if (!(s < s_end))
          /* End of string reached.  */
          break;

        s += count;
      }
  }

  if (sortbuf_count > 0)
    abort ();
  if (sortbuf != sortbuf_preallocated)
    free (sortbuf);

  *lengthp = length;
  return result;

 fail:
  {
    int saved_errno = errno;
    if (sortbuf != sortbuf_preallocated)
      free (sortbuf);
    if (result != resultbuf)
      free (result);
    errno = saved_errno;
  }
  return NULL;
}
[Prev in Thread]
Current Thread
[Next in Thread]
new modules for Unicode normalization, Bruno Haible <=
- Re: new modules for Unicode normalization, Bruno Haible, 2009/02/21
  - Re: new modules for Unicode normalization, Jim Meyering, 2009/02/21
    - Re: new modules for Unicode normalization, Pádraig Brady, 2009/02/21
    - Re: new modules for Unicode normalization, Bruno Haible, 2009/02/21
Prev by Date: Re: [PATCH] Add 'safe-alloc' module
Next by Date: Re: new modules for Unicode normalization
Previous by thread: Typo/oddity in realpath.texi
Next by thread: Re: new modules for Unicode normalization
Index(es):
- Date
- Thread