texinfo-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[no subject]


From: Patrice Dumas
Date: Sat, 30 Dec 2023 04:08:34 -0500 (EST)

branch: master
commit d4da0502ce2d3628e451cede2ad01a43c3646f0c
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Sat Dec 30 10:07:46 2023 +0100

    * tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack): fix
    level in stack at the end of the first phase of gathering codepoints.
    Do not double free new_eight_bit.
    
    * tp/Texinfo/XS/main/utils.c (word_bytes_len_multibyte): add a
    function that counts the bytes in contiguous word characters in a
    string, emulating \p{Word} or \w with unicode in perl.
    
    * tp/maintain/setup_converters_code_tables.pl (unicode_diacritics),
    tp/Texinfo/XS/main/unicode.h (DIACRITIC_UNICODE),
    tp/Texinfo/XS/main/unicode.c (unicode_accent): add the diacritic
    numeric representation in addition to the UTF-8 encoded string.
    
    * tp/Texinfo/XS/convert/converter.c (convert_accents),
    tp/Texinfo/XS/main/convert_to_text.c (ascii_accent),
    tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack)
    (format_unicode_accents_stack_internal): add convert_accents. Use
    converter in accent formatting functions arguments passed to the
    accents stacks conversion functions.  Update callers.
    
    * tp/Texinfo/XS/convert/converter.c (next_for_tieaccent)
    (UNICODE_ACCENT_LETTER, xml_numeric_entity_accent): implement
    xml_numeric_entity_accent.
    
    * tp/Texinfo/XS/convert/convert_html.c
    (html_accent_entities_html_accent_internal)
    (html_accent_entities_html_accent)
    (html_accent_entities_numeric_entities_accent)
    (convert_accent_command, html_converter_initialize)
    (html_free_converter),
    tp/Texinfo/XS/convert/get_html_perl_info.c
    (html_converter_initialize_sv), tp/Texinfo/XS/main/converter_types.h
    (ACCENT_ENTITY_INFO, CONVERTER): add a list of accent commands,
    converter->accent_formatted_cmd, collected while getting the
    formatting references.  Get accent_entities information from perl to
    C.  Implement convert_accent_command in C.
---
 ChangeLog                                   |  39 ++++
 tp/Texinfo/XS/convert/convert_html.c        | 147 +++++++++++++++
 tp/Texinfo/XS/convert/converter.c           | 276 +++++++++++++++++++++++++---
 tp/Texinfo/XS/convert/converter.h           |   8 +
 tp/Texinfo/XS/convert/get_html_perl_info.c  |  68 +++++++
 tp/Texinfo/XS/main/convert_to_text.c        |  21 ++-
 tp/Texinfo/XS/main/converter_types.h        |   7 +
 tp/Texinfo/XS/main/unicode.c                |  47 ++---
 tp/Texinfo/XS/main/unicode.h                |  14 +-
 tp/Texinfo/XS/main/utils.c                  |  37 ++++
 tp/Texinfo/XS/main/utils.h                  |   1 +
 tp/maintain/setup_converters_code_tables.pl |   9 +-
 12 files changed, 606 insertions(+), 68 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index aea265909d..23298c3ab2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,42 @@
+2023-12-30  Patrice Dumas  <pertusus@free.fr>
+
+       * tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack): fix
+       level in stack at the end of the first phase of gathering codepoints.
+       Do not double free new_eight_bit.
+
+       * tp/Texinfo/XS/main/utils.c (word_bytes_len_multibyte): add a
+       function that counts the bytes in contiguous word characters in a
+       string, emulating \p{Word} or \w with unicode in perl.
+
+       * tp/maintain/setup_converters_code_tables.pl (unicode_diacritics),
+       tp/Texinfo/XS/main/unicode.h (DIACRITIC_UNICODE),
+       tp/Texinfo/XS/main/unicode.c (unicode_accent): add the diacritic
+       numeric representation in addition to the UTF-8 encoded string.
+
+       * tp/Texinfo/XS/convert/converter.c (convert_accents),
+       tp/Texinfo/XS/main/convert_to_text.c (ascii_accent),
+       tp/Texinfo/XS/main/unicode.c (format_eight_bit_accents_stack)
+       (format_unicode_accents_stack_internal): add convert_accents. Use
+       converter in accent formatting functions arguments passed to the
+       accents stacks conversion functions.  Update callers.
+
+       * tp/Texinfo/XS/convert/converter.c (next_for_tieaccent)
+       (UNICODE_ACCENT_LETTER, xml_numeric_entity_accent): implement
+       xml_numeric_entity_accent.
+
+       * tp/Texinfo/XS/convert/convert_html.c
+       (html_accent_entities_html_accent_internal)
+       (html_accent_entities_html_accent)
+       (html_accent_entities_numeric_entities_accent)
+       (convert_accent_command, html_converter_initialize)
+       (html_free_converter),
+       tp/Texinfo/XS/convert/get_html_perl_info.c
+       (html_converter_initialize_sv), tp/Texinfo/XS/main/converter_types.h
+       (ACCENT_ENTITY_INFO, CONVERTER): add a list of accent commands,
+       converter->accent_formatted_cmd, collected while getting the
+       formatting references.  Get accent_entities information from perl to
+       C.  Implement convert_accent_command in C.
+
 2023-12-30  Patrice Dumas  <pertusus@free.fr>
 
        * tp/Texinfo/Convert/Converter.pm (convert_accents): reorganize code
diff --git a/tp/Texinfo/XS/convert/convert_html.c 
b/tp/Texinfo/XS/convert/convert_html.c
index 99d2fabcb0..9be35468e6 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -9024,6 +9024,122 @@ convert_math_command (CONVERTER *self, const enum 
command_id cmd,
   free (attribute_class);
 }
 
+char *
+html_accent_entities_html_accent_internal (CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case,
+                         int use_numeric_entities)
+{
+  char *text_set;
+
+  if (set_case)
+    {
+      int str_len = strlen (text);
+      if (str_len != 1 || !isascii_alnum (*text))
+        {
+          int w_len = word_bytes_len_multibyte (text);
+          if (w_len != str_len)
+            set_case = 0;
+        }
+    }
+
+  if (set_case)
+    text_set = to_upper_or_lower_multibyte (text, set_case);
+  else
+    text_set = strdup (text);
+
+  /* do not return a dotless i or j as such if it is further composed
+     with an accented letter, return the letter as is */
+  if (element->cmd == CM_dotless
+      && (!strcmp (text_set, "i") || !strcmp (text_set, "j")))
+    {
+      if (element->parent && element->parent->parent
+          && element->parent->parent->cmd)
+        {
+          enum command_id p_cmd = element->parent->parent->cmd;
+          if (builtin_command_data[p_cmd].flags & CF_accent
+              && p_cmd != CM_tieaccent)
+            {
+              return text_set;
+            }
+        }
+    }
+
+  if (use_numeric_entities)
+    {
+      char *formatted_accent
+        = xml_numeric_entity_accent (element->cmd, text_set);
+      if (formatted_accent)
+        {
+          free (text_set);
+          return formatted_accent;
+        }
+    }
+  else
+    {
+      char *formatted_accent;
+      if (strlen (text_set) == 1 && isascii_alpha (*text_set)
+          && self->accent_entities[element->cmd].entity
+          && self->accent_entities[element->cmd].characters
+          && strlen (self->accent_entities[element->cmd].characters)
+          && strrchr (self->accent_entities[element->cmd].characters,
+                       *text_set))
+        {
+          xasprintf (&formatted_accent, "&%s%s;", text_set,
+                     self->accent_entities[element->cmd].entity);
+          free (text_set);
+          return formatted_accent;
+        }
+      formatted_accent = xml_numeric_entity_accent (element->cmd, text_set);
+      if (formatted_accent)
+        {
+          free (text_set);
+          return formatted_accent;
+        }
+    }
+  fprintf (stderr, "NNNNNNNNNNn %s %s\n", builtin_command_name (element->cmd), 
text_set);
+  return text_set;
+}
+
+char *
+html_accent_entities_html_accent (CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case)
+{
+  return html_accent_entities_html_accent_internal (self, text,
+                                            element, set_case, 0);
+}
+
+char *
+html_accent_entities_numeric_entities_accent (CONVERTER *self,
+             const char *text, const ELEMENT *element, int set_case)
+{
+  return html_accent_entities_html_accent_internal (self, text,
+                                            element, set_case, 1);
+}
+
+void
+convert_accent_command (CONVERTER *self, const enum command_id cmd,
+                    const ELEMENT *element,
+                    const HTML_ARGS_FORMATTED *args_formatted,
+                    const char *content, TEXT *result)
+{
+  char *(*format_accents)(CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case);
+
+  int output_encoded_characters = (self->conf->OUTPUT_CHARACTERS > 0);
+
+  if (self->conf->USE_NUMERIC_ENTITY > 0)
+    format_accents = &html_accent_entities_numeric_entities_accent;
+  else
+    format_accents = &html_accent_entities_html_accent;
+
+  char *accent_text = convert_accents (self, element, &html_convert_tree,
+                          format_accents, output_encoded_characters,
+                          html_in_upper_case (self));
+
+  text_append (result, accent_text);
+  free (accent_text);
+}
+
 void
 convert_indicateurl_command (CONVERTER *self, const enum command_id cmd,
                     const ELEMENT *element,
@@ -11461,6 +11577,26 @@ html_converter_initialize (CONVERTER *self)
         }
     }
 
+  /* accents commands implemented in C, but not css strings accents */
+  if (self->accent_formatted_cmd.number)
+    {
+      for (i = 0; i < self->accent_formatted_cmd.number; i++)
+        {
+          enum command_id cmd = self->accent_formatted_cmd.list[i];
+          COMMAND_CONVERSION_FUNCTION *command_conversion
+               = &self->command_conversion_function[cmd];
+          if (command_conversion->status == FRS_status_default_set)
+            {
+              command_conversion->formatting_reference = 0;
+              command_conversion->status = FRS_status_internal;
+              command_conversion->command_conversion
+                = &convert_accent_command;
+            }
+        }
+    }
+
+
+
   /* all the commands in style_formatted_cmd are implemented in C.
      It is not only the style commands, some others too.  indicateurl
      is not in style_formatted_cmd for now either */
@@ -11958,6 +12094,15 @@ html_free_converter (CONVERTER *self)
         }
     }
 
+  for (i = 0; i < self->accent_formatted_cmd.number; i++)
+    {
+      enum command_id cmd = self->accent_formatted_cmd.list[i];
+      ACCENT_ENTITY_INFO *accent_info
+          = &self->accent_entities[cmd];
+      free (accent_info->entity);
+      free (accent_info->characters);
+    }
+
   for (i = 0; i < self->style_formatted_cmd.number; i++)
     {
       enum command_id cmd = self->style_formatted_cmd.list[i];
@@ -12023,6 +12168,8 @@ html_free_converter (CONVERTER *self)
 
   free (self->no_arg_formatted_cmd.list);
 
+  free (self->accent_formatted_cmd.list);
+
   free (self->style_formatted_cmd.list);
 
   free (self->pending_closes.stack);
diff --git a/tp/Texinfo/XS/convert/converter.c 
b/tp/Texinfo/XS/convert/converter.c
index a353beb1a2..cc5754b442 100644
--- a/tp/Texinfo/XS/convert/converter.c
+++ b/tp/Texinfo/XS/convert/converter.c
@@ -20,6 +20,10 @@
 #include <string.h>
 #include <stdio.h>
 #include <stddef.h>
+#include <inttypes.h>
+#include <unistr.h>
+#include <uniconv.h>
+#include <unictype.h>
 
 #include "command_ids.h"
 #include "tree_types.h"
@@ -32,6 +36,7 @@
 #include "convert_utils.h"
 #include "translations.h"
 #include "manipulate_tree.h"
+#include "unicode.h"
 #include "converter.h"
 
 static CONVERTER **converter_list;
@@ -302,6 +307,94 @@ node_information_filename (CONVERTER *self, char 
*normalized,
   return filename;
 }
 
+ELEMENT *
+float_type_number (CONVERTER *self, const ELEMENT *float_e)
+{
+  ELEMENT *tree = 0;
+  ELEMENT *type_element = 0;
+  NAMED_STRING_ELEMENT_LIST *replaced_substrings
+     = new_named_string_element_list ();
+  char *float_type = lookup_extra_string (float_e, "float_type");
+  char *float_number = lookup_extra_string (float_e, "float_number");
+
+  if (float_type && strlen (float_type))
+    type_element = float_e->args.list[0];
+
+  if (float_number)
+    {
+      ELEMENT *e_number = new_element (ET_NONE);
+      text_append (&e_number->text, float_number);
+      add_element_to_named_string_element_list (replaced_substrings,
+                                     "float_number", e_number);
+    }
+
+  if (type_element)
+    {
+      ELEMENT *type_element_copy = copy_tree (type_element);
+      add_element_to_named_string_element_list (replaced_substrings,
+                                     "float_type", type_element_copy);
+      if (float_number)
+        tree = gdt_tree ("{float_type} {float_number}", self->document,
+                         self->conf, replaced_substrings, 0, 0);
+      else
+        tree = gdt_tree ("{float_type}", self->document, self->conf,
+                         replaced_substrings, 0, 0);
+    }
+  else if (float_number)
+    tree = gdt_tree ("{float_number}", self->document, self->conf,
+                     replaced_substrings, 0, 0);
+
+  destroy_named_string_element_list (replaced_substrings);
+
+  return tree;
+}
+
+char *
+convert_accents (CONVERTER *self, const ELEMENT *accent,
+ char *(*convert_tree)(CONVERTER *self, const ELEMENT *tree, char 
*explanation),
+ char *(*format_accent)(CONVERTER *self, const char *text, const ELEMENT 
*element,
+                        int set_case),
+  int output_encoded_characters,
+  int set_case)
+{
+  ACCENTS_STACK *accent_stack = find_innermost_accent_contents (accent);
+  const ELEMENT_STACK *stack;
+  char *arg_text;
+  char *result;
+  int i;
+
+  if (accent_stack->argument)
+    arg_text = (*convert_tree) (self, accent_stack->argument, 0);
+  else
+    arg_text = strdup ("");
+
+  if (output_encoded_characters)
+    {
+      char *encoded = encoded_accents (self, arg_text, &accent_stack->stack,
+                                 self->conf->OUTPUT_ENCODING_NAME, 
format_accent,
+                                 set_case);
+      if (encoded)
+        {
+          free (arg_text);
+          destroy_accent_stack (accent_stack);
+          return encoded;
+        }
+    }
+
+  stack = &accent_stack->stack;
+  result = arg_text;
+  for (i = stack->top - 1; i >= 0; i--)
+    {
+      const ELEMENT *accent_command = stack->stack[i];
+      char *formatted_accent = (*format_accent) (self, result, accent_command,
+                                                 set_case);
+      free (result);
+      result = formatted_accent;
+    }
+  destroy_accent_stack (accent_stack);
+  return result;
+}
+
 ELEMENT_LIST *
 comma_index_subentries_tree (const ELEMENT *current_entry,
                              char *separator)
@@ -652,44 +745,167 @@ xml_protect_text (const char *text, TEXT *result)
     }
 }
 
-ELEMENT *
-float_type_number (CONVERTER *self, const ELEMENT *float_e)
+static char *
+next_for_tieaccent (const char *text, const char **next)
 {
-  ELEMENT *tree = 0;
-  ELEMENT *type_element = 0;
-  NAMED_STRING_ELEMENT_LIST *replaced_substrings
-     = new_named_string_element_list ();
-  char *float_type = lookup_extra_string (float_e, "float_type");
-  char *float_number = lookup_extra_string (float_e, "float_number");
+  const char *p;
+  if (!strlen (text))
+    {
+      return 0;
+    }
+  if (text[0] == '&')
+    {
+      if (strlen (text) > 3 && isascii_alnum(*(text+1)))
+        {
+          p = text +2;
+          while (*p)
+            {
+              if (*p == ';')
+                {
+                  p++;
+                  *next = p;
+                  return strndup (text, p - text);
+                }
+              else if (isascii_alnum (*p))
+                {
+                  p++;
+                }
+              else
+                break;
+            }
+        }
+      return 0;
+    }
+  else
+    {
+      uint8_t *encoded_u8 = u8_strconv_from_encoding (text, "UTF-8",
+                                                  iconveh_question_mark);
+      ucs4_t first_char;
+      u8_next (&first_char, encoded_u8);
+      if (uc_is_general_category (first_char, UC_CATEGORY_L)
+          /* ASCII digits */
+          || (first_char >= 0x0030 && first_char <= 0x0039))
+        {
+          char *first_char_text;
+          uint8_t *first_char_u8 = malloc (7 * sizeof(uint8_t));
+          int first_char_len = u8_uctomb (first_char_u8, first_char, 6);
+          if (first_char_len < 0)
+            fatal ("u8_uctomb returns negative value");
+          first_char_u8[first_char_len] = 0;
+          first_char_text = u8_strconv_to_encoding (first_char_u8, "UTF-8",
+                                                    iconveh_question_mark);
+          free (first_char_u8);
+          p = text + strlen (first_char_text);
+          *next = p;
+          return first_char_text;
+        }
+      return 0;
+    }
+}
 
-  if (float_type && strlen (float_type))
-    type_element = float_e->args.list[0];
+typedef struct UNICODE_ACCENT_LETTER {
+    enum command_id cmd;
+    char *letter;
+    char *numerical_entity;
+} UNICODE_ACCENT_LETTER;
 
-  if (float_number)
+/* only those that are not obtained through diacritic + normalization */
+static UNICODE_ACCENT_LETTER unicode_accented_letters[] = {
+{CM_dotless, "i", "305"},
+{CM_dotless, "j", "567"},
+{0, 0, 0}
+};
+
+char *
+xml_numeric_entity_accent (enum command_id cmd, const char *text)
+{
+  char *result;
+  if (! builtin_command_data[cmd].flags & CF_accent)
     {
-      ELEMENT *e_number = new_element (ET_NONE);
-      text_append (&e_number->text, float_number);
-      add_element_to_named_string_element_list (replaced_substrings,
-                                     "float_number", e_number);
+      return 0;
     }
 
-  if (type_element)
+  if (strlen (text) == 1 && isascii_alpha (*text))
     {
-      ELEMENT *type_element_copy = copy_tree (type_element);
-      add_element_to_named_string_element_list (replaced_substrings,
-                                     "float_type", type_element_copy);
-      if (float_number)
-        tree = gdt_tree ("{float_type} {float_number}", self->document,
-                         self->conf, replaced_substrings, 0, 0);
-      else
-        tree = gdt_tree ("{float_type}", self->document, self->conf,
-                         replaced_substrings, 0, 0);
+      int i;
+      for (i = 0; unicode_accented_letters[i].cmd; i++)
+        {
+          UNICODE_ACCENT_LETTER *letter = &unicode_accented_letters[i];
+          if (cmd == letter->cmd && ! strcmp (text, letter->letter))
+            {
+              xasprintf (&result, "&#%s;", letter->numerical_entity);
+              return result;
+            }
+        }
     }
-  else if (float_number)
-    tree = gdt_tree ("{float_number}", self->document, self->conf,
-                     replaced_substrings, 0, 0);
 
-  destroy_named_string_element_list (replaced_substrings);
+  if (unicode_diacritics[cmd].text)
+    {
+      if (cmd != CM_tieaccent)
+        {
+          if (strlen (text) == 1 && isascii_alpha (*text))
+            {
+              char *accented_char;
+              char *normalized_char;
+              uint8_t *encoded_u8;
+              ucs4_t first_char;
+              const uint8_t *next;
+
+              xasprintf (&accented_char, "%s%s", text,
+                         unicode_diacritics[cmd].text);
+              normalized_char = normalize_NFC (accented_char);
+              encoded_u8 = u8_strconv_from_encoding (normalized_char, "UTF-8",
+                                                     iconveh_question_mark);
+              next = u8_next (&first_char, encoded_u8);
+              if (next)
+                {
+                  ucs4_t other_char;
+                  const uint8_t *after = u8_next (&other_char, next);
+                  next = after;
+                }
+              free (encoded_u8);
+              free (accented_char);
+              free (normalized_char);
+              if (!next)
+                {
+                  char *entity;
+              /* hex entity
+                 xasprintf (&entity, "&#%04lX;", first_char); */
+        /* seems to be the way for portable uint32_t unsigned integer format */
+                  xasprintf (&entity, "&#%" PRIu32 ";", first_char);
+                  return entity;
+                }
+            }
+          xasprintf (&result, "%s&#%s;", text, 
unicode_diacritics[cmd].codepoint);
+          return result;
+        }
+      else
+        {
+          char *result;
+          const char *p = 0;
+          const char *remaining = 0;
+          char *first = next_for_tieaccent (text, &p);
+          char *second;
+          if (!first)
+            goto invalid;
+          second = next_for_tieaccent (p, &remaining);
+          if (second)
+            {
+              xasprintf (&result, "%s&#%s;%s%s", first,
+                         unicode_diacritics[cmd].codepoint, second, remaining);
+              free (first);
+              free (second);
+              return result;
+            }
+          else
+            free (first);
 
-  return tree;
+         invalid:
+          xasprintf (&result, "%s&#%s;", text,
+                     unicode_diacritics[cmd].codepoint);
+          return result;
+        }
+    }
+  return 0;
 }
+
diff --git a/tp/Texinfo/XS/convert/converter.h 
b/tp/Texinfo/XS/convert/converter.h
index 0fb4de449d..364a1b052c 100644
--- a/tp/Texinfo/XS/convert/converter.h
+++ b/tp/Texinfo/XS/convert/converter.h
@@ -83,6 +83,13 @@ char *node_information_filename (CONVERTER *self, char 
*normalized,
 TARGET_FILENAME *normalized_sectioning_command_filename (CONVERTER *self,
                                                    const ELEMENT *command);
 
+char *convert_accents (CONVERTER *self, const ELEMENT *accent,
+ char *(*convert_tree)(CONVERTER *self, const ELEMENT *tree, char 
*explanation),
+ char *(*format_accent)(CONVERTER *self, const char *text, const ELEMENT 
*element,
+                        int set_case),
+  int output_encoded_characters,
+  int set_case);
+
 ELEMENT_LIST *comma_index_subentries_tree (const ELEMENT *current_entry,
                                            char *separator);
 void free_comma_index_subentries_tree (ELEMENT_LIST *element);
@@ -103,4 +110,5 @@ void free_generic_converter (CONVERTER *self);
 
 
 void xml_format_text_with_numeric_entities (const char *text, TEXT *result);
+char *xml_numeric_entity_accent (enum command_id cmd, const char *text);
 #endif
diff --git a/tp/Texinfo/XS/convert/get_html_perl_info.c 
b/tp/Texinfo/XS/convert/get_html_perl_info.c
index 223a59f875..12c4f01427 100644
--- a/tp/Texinfo/XS/convert/get_html_perl_info.c
+++ b/tp/Texinfo/XS/convert/get_html_perl_info.c
@@ -152,6 +152,7 @@ html_converter_initialize_sv (SV *converter_sv,
   SV **formatting_function_sv;
   SV **sorted_special_unit_varieties_sv;
   SV **no_arg_commands_formatting_sv;
+  SV **accent_entities_sv;
   SV **style_commands_formatting_sv;
   SV **types_open_sv;
   SV **types_conversion_sv;
@@ -175,6 +176,7 @@ html_converter_initialize_sv (SV *converter_sv,
   int nr_string_directions;
   int nr_dir_str_contexts = TDS_context_string +1;
   enum direction_string_type DS_type;
+  int nr_accent_cmd = 0;
 
   dTHX;
 
@@ -326,8 +328,16 @@ html_converter_initialize_sv (SV *converter_sv,
         conversion_formatting_reference, ref_name,
         default_commands_conversion_hv,
         commands_conversion_hv);
+
+  /* NOTE use the loop to collect the number of accent commands too */
+      if (builtin_command_data[i].flags & CF_accent)
+        nr_accent_cmd++;
     }
 
+  converter->accent_formatted_cmd.list = (enum command_id *)
+    malloc (nr_accent_cmd * sizeof (enum command_id));
+  converter->accent_formatted_cmd.number = 0;
+
   default_css_string_commands_conversion_hv
     = (HV *)SvRV (default_css_string_commands_conversion);
   /* copy the normal formatting references and replace the css strings
@@ -350,6 +360,14 @@ html_converter_initialize_sv (SV *converter_sv,
      register_formatting_reference_default ("css_command_conversion",
         conversion_formatting_reference, ref_name,
         default_css_string_commands_conversion_hv);
+
+  /* NOTE we use the loop to collect the accent commands too */
+     if (builtin_command_data[i].flags & CF_accent)
+       {
+         converter->accent_formatted_cmd.list[
+            converter->accent_formatted_cmd.number] = i;
+         converter->accent_formatted_cmd.number++;
+       }
     }
 
 
@@ -744,6 +762,56 @@ html_converter_initialize_sv (SV *converter_sv,
              sizeof (enum command_id), compare_ints);
     }
 
+  FETCH(accent_entities)
+
+  if (accent_entities_sv)
+    {
+      I32 hv_number;
+      I32 i;
+
+      HV *accent_entities_hv
+        = (HV *)SvRV (*accent_entities_sv);
+
+      hv_number = hv_iterinit (accent_entities_hv);
+
+      for (i = 0; i < hv_number; i++)
+        {
+          char *cmdname;
+          I32 retlen;
+          SV *spec_sv = hv_iternextsv (accent_entities_hv,
+                                          &cmdname, &retlen);
+          if (SvOK (spec_sv))
+            {
+              enum command_id cmd = lookup_builtin_command (cmdname);
+              if (!cmd)
+                fprintf (stderr, "ERROR: %s: no accent command\n", cmdname);
+              else
+                {
+                  ACCENT_ENTITY_INFO *accent_info
+                    = &converter->accent_entities[cmd];
+                  AV *spec_av = (AV *)SvRV (spec_sv);
+                  SV **entity_sv = av_fetch (spec_av, 0, 0);
+                  SV **characters_sv = av_fetch (spec_av, 1, 0);
+
+                  if (entity_sv)
+                    {
+                      char *entity = (char *) SvPVutf8_nolen (*entity_sv);
+                      accent_info->entity = strdup (entity);
+                    }
+
+                  if (characters_sv && SvOK (*characters_sv))
+                    {
+                      char *characters
+                        = (char *) SvPVutf8_nolen (*characters_sv);
+                      if (strlen (characters))
+                        accent_info->characters = strdup (characters);
+                    }
+                }
+            }
+        }
+    }
+
+
   FETCH(style_commands_formatting)
 
   if (style_commands_formatting_sv)
diff --git a/tp/Texinfo/XS/main/convert_to_text.c 
b/tp/Texinfo/XS/main/convert_to_text.c
index 2f8e280c3b..fa71a6b677 100644
--- a/tp/Texinfo/XS/main/convert_to_text.c
+++ b/tp/Texinfo/XS/main/convert_to_text.c
@@ -43,8 +43,12 @@
 #include "cmd_symbol.c"
 #include "cmd_text.c"
 
+
+/* the CONVERTER argument is not used, it is there solely to match the
+   calling prototype in accent formatting commands */
 char *
-ascii_accent (const char *text, const ELEMENT *command, int set_case)
+ascii_accent (CONVERTER *self, const char *text,
+              const ELEMENT *command, int set_case)
 {
   const enum command_id cmd = command->cmd;
   TEXT accent_text;
@@ -96,7 +100,8 @@ ascii_accents_internal (const char *text, const 
ELEMENT_STACK *stack,
   for (i = stack->top - 1; i >= 0; i--)
     {
       const ELEMENT *accent_command = stack->stack[i];
-      char *formatted_accent = ascii_accent (result, accent_command, set_case);
+      char *formatted_accent = ascii_accent (0, result, accent_command,
+                                             set_case);
       free (result);
       result = formatted_accent;
     }
@@ -180,7 +185,7 @@ char *
 text_accents (const ELEMENT *accent, char *encoding, int set_case)
 {
   ACCENTS_STACK *accent_stack = find_innermost_accent_contents (accent);
-  char *text;
+  char *arg_text;
   char *result;
   TEXT_OPTIONS *text_options = new_text_options ();
 
@@ -189,16 +194,16 @@ text_accents (const ELEMENT *accent, char *encoding, int 
set_case)
   text_options->set_case = set_case;
 
   if (accent_stack->argument)
-    text = convert_to_text (accent_stack->argument, text_options);
+    arg_text = convert_to_text (accent_stack->argument, text_options);
   else
-    text = strdup ("");
+    arg_text = strdup ("");
 
-  result = encoded_accents (text, &accent_stack->stack, encoding,
+  result = encoded_accents (0, arg_text, &accent_stack->stack, encoding,
                             ascii_accent, set_case);
 
   if (!result)
-    result = ascii_accents_internal (text, &accent_stack->stack, set_case);
-  free (text);
+    result = ascii_accents_internal (arg_text, &accent_stack->stack, set_case);
+  free (arg_text);
   destroy_accent_stack (accent_stack);
   destroy_text_options (text_options);
   return result;
diff --git a/tp/Texinfo/XS/main/converter_types.h 
b/tp/Texinfo/XS/main/converter_types.h
index a8ee33e43e..b9cee1b993 100644
--- a/tp/Texinfo/XS/main/converter_types.h
+++ b/tp/Texinfo/XS/main/converter_types.h
@@ -525,6 +525,11 @@ typedef struct HTML_ARGS_FORMATTED {
     HTML_ARG_FORMATTED *args;
 } HTML_ARGS_FORMATTED;
 
+typedef struct ACCENT_ENTITY_INFO {
+    char *entity;
+    char *characters;
+} ACCENT_ENTITY_INFO;
+
 typedef struct COMMAND_CONVERSION_FUNCTION {
     enum formatting_reference_status status;
     /* points to the perl formatting reference if it is used for
@@ -694,9 +699,11 @@ typedef struct CONVERTER {
     /* set for a converter */
     COMMAND_ID_LIST no_arg_formatted_cmd;
     COMMAND_ID_LIST style_formatted_cmd;
+    COMMAND_ID_LIST accent_formatted_cmd;
     int code_types[TXI_TREE_TYPES_NUMBER];
     char *pre_class_types[TXI_TREE_TYPES_NUMBER];
     int upper_case[BUILTIN_CMD_NUMBER];
+    ACCENT_ENTITY_INFO accent_entities[BUILTIN_CMD_NUMBER];
     STRING_WITH_LEN special_character[SC_non_breaking_space+1];
     STRING_WITH_LEN line_break_element;
     CSS_SELECTOR_STYLE_LIST css_element_class_styles;
diff --git a/tp/Texinfo/XS/main/unicode.c b/tp/Texinfo/XS/main/unicode.c
index 2095f0d1f4..1d293e50e0 100644
--- a/tp/Texinfo/XS/main/unicode.c
+++ b/tp/Texinfo/XS/main/unicode.c
@@ -91,7 +91,7 @@ unicode_accent (const char *text, const ELEMENT *e)
   if (e->cmd == CM_dotless)
     {
       if (!e->parent || !e->parent->parent || !e->parent->parent->cmd
-          || !unicode_diacritics[e->parent->parent->cmd])
+          || !unicode_diacritics[e->parent->parent->cmd].text)
         {
           if (!strcmp (text, "i"))
             /* dotless i in UTF-8 */
@@ -103,7 +103,7 @@ unicode_accent (const char *text, const ELEMENT *e)
       return strdup(text);
     }
 
-  if (unicode_diacritics[e->cmd])
+  if (unicode_diacritics[e->cmd].text)
     {
       static TEXT accented_text;
       if (e->cmd == CM_tieaccent)
@@ -139,7 +139,7 @@ unicode_accent (const char *text, const ELEMENT *e)
                   text_init (&accented_text);
                   text_append (&accented_text, first_char_text);
                   free (first_char_text);
-                  text_append (&accented_text, unicode_diacritics[e->cmd]);
+                  text_append (&accented_text, 
unicode_diacritics[e->cmd].text);
                   next_text = u8_strconv_to_encoding (next, "UTF-8",
                                                       iconveh_question_mark);
                   text_append (&accented_text, next_text);
@@ -154,7 +154,7 @@ unicode_accent (const char *text, const ELEMENT *e)
         }
       text_init (&accented_text);
       text_append (&accented_text, text);
-      text_append (&accented_text, unicode_diacritics[e->cmd]);
+      text_append (&accented_text, unicode_diacritics[e->cmd].text);
       result = normalize_NFC (accented_text.text);
       free (accented_text.text);
     }
@@ -172,9 +172,10 @@ compare_strings (const void *a, const void *b)
 }
 
 char *
-format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
-  int encoding_index,
-  char *(*format_accent)(const char *text, const ELEMENT *element, int set_case),
+format_eight_bit_accents_stack (CONVERTER *self, const char *text,
+                      const ELEMENT_STACK *stack, int encoding_index,
+  char *(*format_accent)(CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case),
   int set_case)
 {
   int i, j, k;
@@ -195,7 +196,10 @@ format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
       results_stack[i] = unicode_accent (results_stack[i+1],
                                          accent_command);
       if (!results_stack[i])
-        break;
+        {
+          i--;
+          break;
+        }
       else if (set_case)
         {
           char *cased = to_upper_or_lower_multibyte (results_stack[i], set_case);
@@ -203,6 +207,8 @@ format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
           results_stack[i] = cased;
         }
     }
+  /* undo the last decrease of i */
+  i++;
 
   /*
     At this point we have the unicode character results for the accent
@@ -282,7 +288,6 @@ format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
     }
 
   free (prev_eight_bit);
-  free (new_eight_bit);
 
   /*
     handle the remaining accents, that have not been converted to 8bit
@@ -292,7 +297,7 @@ format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
     {
       const ELEMENT *accent_command = stack->stack[j];
       char *formatted_result
-          = (*format_accent) (result, accent_command, set_case);
+          = (*format_accent) (self, result, accent_command, set_case);
       free (result);
       result = formatted_result;
     }
@@ -306,11 +311,11 @@ format_eight_bit_accents_stack (const char *text, const ELEMENT_STACK *stack,
   return result;
 }
 
-/* FIXME converter in perl for (*format_accent), see encoded_accents comment*/
 char *
-format_unicode_accents_stack_internal (const char *text,
+format_unicode_accents_stack_internal (CONVERTER *self, const char *text,
   const ELEMENT_STACK *stack,
-  char *(*format_accent)(const char *text, const ELEMENT *element, int set_case),
+  char *(*format_accent)(CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case),
   int set_case)
 {
   int i;
@@ -340,20 +345,18 @@ format_unicode_accents_stack_internal (const char *text,
     {
       const ELEMENT *accent_command = stack->stack[i];
       char *formatted_result
-          = (*format_accent) (result, accent_command, set_case);
+          = (*format_accent) (self, result, accent_command, set_case);
       free (result);
       result = formatted_result;
     }
   return result;
 }
 
-/* FIXME a converter is passed in perl to (*format_accent), both
-   directly and through functions.  It is not clear whether it is
-   actually used in perl, nor if it could be useful in C */
 char *
-encoded_accents (const char *text, const ELEMENT_STACK *stack,
+encoded_accents (CONVERTER *self, const char *text, const ELEMENT_STACK *stack,
   const char *encoding,
-  char *(*format_accent)(const char *text, const ELEMENT *element, int set_case),
+  char *(*format_accent)(CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case),
   int set_case)
 {
   if (encoding)
@@ -374,7 +377,7 @@ encoded_accents (const char *text, const ELEMENT_STACK *stack,
           if (!strcmp (normalized_encoding, "utf-8"))
             {
               free (normalized_encoding);
-              return format_unicode_accents_stack_internal (text, stack,
+              return format_unicode_accents_stack_internal (self, text, stack,
                                                 format_accent, set_case);
             }
           for (i = 0; i < sizeof (unicode_to_eight_bit)
@@ -390,8 +393,8 @@ encoded_accents (const char *text, const ELEMENT_STACK *stack,
           if (encoding_index >= 0)
             {
               free (normalized_encoding);
-              return format_eight_bit_accents_stack (text, stack, encoding_index,
-                                                     format_accent, set_case);
+              return format_eight_bit_accents_stack (self, text, stack,
+                                     encoding_index, format_accent, set_case);
             }
         }
       free (normalized_encoding);
diff --git a/tp/Texinfo/XS/main/unicode.h b/tp/Texinfo/XS/main/unicode.h
index 0850a4b497..bfcc7b53a0 100644
--- a/tp/Texinfo/XS/main/unicode.h
+++ b/tp/Texinfo/XS/main/unicode.h
@@ -18,6 +18,11 @@ typedef struct COMMAND_UNICODE {
     int is_extra;
 } COMMAND_UNICODE;
 
+typedef struct DIACRITIC_UNICODE {
+    char *text; /* UTF-8 encoded */
+    char *codepoint;
+} DIACRITIC_UNICODE;
+
 /* can be inlined in text parsing codes */
 #define OTXI_UNICODE_TEXT_CASES(var) \
         case '-': \
@@ -69,7 +74,7 @@ typedef struct COMMAND_UNICODE {
           break;
 
 
-extern char *unicode_diacritics[];
+extern DIACRITIC_UNICODE unicode_diacritics[];
 extern COMMAND_UNICODE unicode_character_brace_no_arg_commands[];
 
 int unicode_point_decoded_in_encoding (char *encoding, char *codepoint);
@@ -78,9 +83,10 @@ char *normalize_NFC (const char *text);
 char *normalize_NFKD (const char *text);
 char *unicode_accent (const char *text, const ELEMENT *e);
 
-char *encoded_accents (const char *text, const ELEMENT_STACK *stack,
-  const char *encoding,
-  char *(*format_accent)(const char *text, const ELEMENT *element, int set_case),
+char *encoded_accents (CONVERTER *self, const char *text,
+  const ELEMENT_STACK *stack, const char *encoding,
+  char *(*format_accent)(CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case),
   int set_case);
 char *unicode_brace_no_arg_command (enum command_id cmd, char *encoding);
 
diff --git a/tp/Texinfo/XS/main/utils.c b/tp/Texinfo/XS/main/utils.c
index a863e57f85..a5817fbbce 100644
--- a/tp/Texinfo/XS/main/utils.c
+++ b/tp/Texinfo/XS/main/utils.c
@@ -28,6 +28,7 @@
 #include "unistr.h"
 #include "unicase.h"
 #include "uniwidth.h"
+#include <unictype.h>
 
 #include "global_commands_types.h"
 #include "options_types.h"
@@ -242,6 +243,42 @@ width_multibyte (const char *text)
   return result;
 }
 
+/* length of next word in multibyte setting.  Should correspond to \w or
+   \p{Word} in perl */
+int
+word_bytes_len_multibyte (const char *text)
+{
+  uint8_t *encoded_u8 = u8_strconv_from_encoding (text, "UTF-8",
+                                                  iconveh_question_mark);
+  uint8_t *current_u8 = encoded_u8;
+  int len = 0;
+  while (1)
+    {
+      ucs4_t next_char;
+      int new_len = u8_strmbtouc (&next_char, current_u8);
+      if (!new_len)
+        {
+          break;
+        }
+      /* (\p{Alnum} = \p{Alphabetic} + \p{Nd}) + \pM + \p{Pc}
+                                              + \p{Join_Control} */
+      if (uc_is_general_category (next_char, UC_CATEGORY_M)
+          || uc_is_general_category (next_char, UC_CATEGORY_Nd)
+          || uc_is_property (next_char, UC_PROPERTY_ALPHABETIC)
+          || uc_is_property (next_char, UC_PROPERTY_JOIN_CONTROL))
+        {
+          len += new_len;
+          current_u8 += new_len;
+        }
+      else
+        {
+          break;
+        }
+    }
+  free (encoded_u8);
+  return len;
+}
+
 
 /* encoding and decoding. Use iconv. */
 /* conversion to or from utf-8 should always be set before other
diff --git a/tp/Texinfo/XS/main/utils.h b/tp/Texinfo/XS/main/utils.h
index 342cc1f57d..795ca1ebc9 100644
--- a/tp/Texinfo/XS/main/utils.h
+++ b/tp/Texinfo/XS/main/utils.h
@@ -182,6 +182,7 @@ int isascii_upper (int c);
 size_t count_multibyte (const char *text);
 char *to_upper_or_lower_multibyte (const char *text, int lower_or_upper);
 int width_multibyte (const char *text);
+int word_bytes_len_multibyte (const char *text);
 
 void delete_global_info (GLOBAL_INFO *global_info_ref);
 void delete_global_commands (GLOBAL_COMMANDS *global_commands_ref);
diff --git a/tp/maintain/setup_converters_code_tables.pl b/tp/maintain/setup_converters_code_tables.pl
index f2bcf013fb..bac7566369 100755
--- a/tp/maintain/setup_converters_code_tables.pl
+++ b/tp/maintain/setup_converters_code_tables.pl
@@ -145,7 +145,7 @@ my %extra_unicode_map = %Texinfo::Convert::Unicode::extra_unicode_map;
 open (UNIC, '>', $unicode_file) or die "Open $unicode_file: $!\n";
 
 print UNIC '#include "unicode.h"'."\n\n";
-print UNIC "char *unicode_diacritics[] = {\n";
+print UNIC "DIACRITIC_UNICODE unicode_diacritics[] = {\n";
 foreach my $command_name (@commands_order) {
   my $command = $command_name;
   if (exists($name_commands{$command_name})) {
@@ -154,11 +154,12 @@ foreach my $command_name (@commands_order) {
   #print UNIC "$command; ";
 
   if (defined($unicode_diacritics{$command_name})) {
-    my $result = chr(hex($unicode_diacritics{$command_name}));
+    my $numeric_codepoint = hex($unicode_diacritics{$command_name});
+    my $result = chr($numeric_codepoint);
     my $protected = join ('', map {_protect_char($_)} split ('', $result));
-    print UNIC "\"$protected\",   /* $command */\n";
+    print UNIC "{\"$protected\", \"$numeric_codepoint\"},  /* $command */\n";
   } else {
-    print UNIC "0,\n";
+    print UNIC "{0, 0},\n";
   }
 }
 print UNIC "};\n\n";



reply via email to

[Prev in Thread] Current Thread [Next in Thread]