[no subject]

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[no subject]

From:	Patrice Dumas
Date:	Sat, 13 Jan 2024 14:34:22 -0500 (EST)
branch: master
commit 40fd859eb21c9be2d0f5b677ece1c240796037ac
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Fri Jan 12 12:02:54 2024 +0100

    * tp/Texinfo/XS/main/unicode.h (DIACRITIC_UNICODE),
    tp/maintain/setup_converters_code_tables.pl (unicode_diacritics): add
    hexadecimal representation of diacritics to the unicode_diacritics
    table.
    
    * tp/Texinfo/XS/convert/convert_html.c
    (set_case_if_only_word_characters)
    (html_accent_entities_html_accent_internal)
    (after_escaped_characters, css_string_accent)
    (css_string_convert_accent_command, html_converter_initialize):
    implement css_string_convert_accent_command in C.  Create
    set_case_if_only_word_characters based on
    html_accent_entities_html_accent_internal to use common code.
---
 ChangeLog                                   |  16 ++
 tp/Texinfo/XS/convert/convert_html.c        | 246 +++++++++++++++++++++++++++-
 tp/Texinfo/XS/main/unicode.h                |   3 +-
 tp/maintain/setup_converters_code_tables.pl |   5 +-
 4 files changed, 261 insertions(+), 9 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index ed0d64da34..7a4f06fa93 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2024-01-12  Patrice Dumas  <pertusus@free.fr>
+
+       * tp/Texinfo/XS/main/unicode.h (DIACRITIC_UNICODE),
+       tp/maintain/setup_converters_code_tables.pl (unicode_diacritics): add
+       hexadecimal representation of diacritics to the unicode_diacritics
+       table.
+
+       * tp/Texinfo/XS/convert/convert_html.c
+       (set_case_if_only_word_characters)
+       (html_accent_entities_html_accent_internal)
+       (after_escaped_characters, css_string_accent)
+       (css_string_convert_accent_command, html_converter_initialize):
+       implement css_string_convert_accent_command in C.  Create
+       set_case_if_only_word_characters based on
+       html_accent_entities_html_accent_internal to use common code.
+
 2024-01-11  Patrice Dumas  <pertusus@free.fr>
 
        * tp/Texinfo/XS/main/convert_to_text.c: make two functions static.
diff --git a/tp/Texinfo/XS/convert/convert_html.c 
b/tp/Texinfo/XS/convert/convert_html.c
index c1be81ef30..1ea789534a 100644
--- a/tp/Texinfo/XS/convert/convert_html.c
+++ b/tp/Texinfo/XS/convert/convert_html.c
@@ -9220,10 +9220,8 @@ convert_math_command (CONVERTER *self, const enum 
command_id cmd,
   free (attribute_class);
 }
 
-char *
-html_accent_entities_html_accent_internal (CONVERTER *self, const char *text,
-                         const ELEMENT *element, int set_case,
-                         int use_numeric_entities)
+static char *
+set_case_if_only_word_characters (const char *text, int set_case)
 {
   char *text_set;
 
@@ -9244,6 +9242,16 @@ html_accent_entities_html_accent_internal (CONVERTER 
*self, const char *text,
   else
     text_set = strdup (text);
 
+  return text_set;
+}
+
+char *
+html_accent_entities_html_accent_internal (CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case,
+                         int use_numeric_entities)
+{
+  char *text_set = set_case_if_only_word_characters (text, set_case);
+
   /* do not return a dotless i or j as such if it is further composed
      with an accented letter, return the letter as is */
   if (element->cmd == CM_dotless
@@ -9322,6 +9330,7 @@ convert_accent_command (CONVERTER *self, const enum 
command_id cmd,
                     const HTML_ARGS_FORMATTED *args_formatted,
                     const char *content, TEXT *result)
 {
+  char *accent_text;
   char *(*format_accents)(CONVERTER *self, const char *text,
                          const ELEMENT *element, int set_case);
 
@@ -9332,7 +9341,225 @@ convert_accent_command (CONVERTER *self, const enum 
command_id cmd,
   else
     format_accents = &html_accent_entities_html_accent;
 
-  char *accent_text = convert_accents (self, element, &html_convert_tree,
+  accent_text = convert_accents (self, element, &html_convert_tree,
+                          format_accents, output_encoded_characters,
+                          html_in_upper_case (self));
+
+  text_append (result, accent_text);
+  free (accent_text);
+}
+
+/* Equivalent to matching the regex /^\\[a-zA-Z0-9]+ / at the beginning
+   of TEXT.  Return a pointer to the character following the match, or 0
+   if TEXT does not begin with a backslash followed by at least one ASCII
+   alphanumeric character and then a space.
+ */
+static char *
+after_escaped_characters (char *text)
+{
+  char *cursor;
+
+  /* text[1] is only read when text[0] is a backslash, so the read never
+     goes past the terminating NUL */
+  if (text[0] != '\\' || !isascii_alnum (text[1]))
+    return 0;
+
+  /* skip the remaining alphanumeric characters of the escape */
+  for (cursor = text + 2; isascii_alnum (*cursor); cursor++)
+    ;
+
+  return (*cursor == ' ') ? cursor + 1 : 0;
+}
+
+/* Format TEXT, the argument of the accent command ELEMENT, for use in a
+   CSS string, where characters are output as a backslash, an hexadecimal
+   code point and a space.
+
+   The case of TEXT is first set according to SET_CASE by
+   set_case_if_only_word_characters, and the result of that call is what
+   is formatted.  For @dotless with "i" or "j" the escaped dotless letter
+   is returned.  For @tieaccent the diacritic is put between the first two
+   letters if TEXT begins with two letters (a letter being a character in
+   the Unicode L category, an ASCII digit, or an escaped character).
+   Otherwise, the text and diacritic are normalized, and either the
+   composed character is output escaped, or the text followed by the
+   escaped diacritic is output.
+
+   SELF is not used.  The returned string is allocated in every code path
+   and is meant to be released by the caller.
+ */
+char *
+css_string_accent (CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case)
+{
+  char *text_set = set_case_if_only_word_characters (text, set_case);
+
+  if (element->cmd == CM_dotless)
+    {
+      /* corresponds in perl, and for dotless, to
+ Texinfo::Convert::Unicode::unicode_accented_letters{$accent}->{$text} */
+      if (!strcmp (text_set, "i"))
+        {
+          free (text_set);
+          return strdup ("\\" "0131" " ");
+        }
+      else if (!strcmp (text_set, "j"))
+        {
+          free (text_set);
+          return strdup ("\\" "0237" " ");
+        }
+    }
+
+  if (unicode_diacritics[element->cmd].text)
+    {
+      char *accent_and_diacritic;
+      char *normalized_accent_text;
+      static TEXT accented_text;
+      if (element->cmd == CM_tieaccent)
+        {
+          /* tieaccent diacritic is naturally and correctly composed
+             between two characters */
+          /* we consider that letters are either characters or
+             escaped characters as they appear in CSS strings */
+          /* p non NUL corresponds to escaped characters */
+          char *p = after_escaped_characters (text_set);
+          /* text after the first letter.  Points into text_set if the
+             first letter is escaped, points to allocated_next_text
+             otherwise.  Reset to 0 if there is no valid second letter */
+          char *next_text = 0;
+          char *allocated_next_text = 0;
+          ucs4_t first_char;
+          const uint8_t *next = 0;
+          uint8_t *encoded_u8 = 0;
+          int tie_accent_set = 0;
+
+          if (!p)
+            {
+              /* check if a character matches */
+              encoded_u8 = u8_strconv_from_encoding (text_set, "UTF-8",
+                                                  iconveh_question_mark);
+              next = u8_next (&first_char, encoded_u8);
+              if (next && (uc_is_general_category (first_char, UC_CATEGORY_L)
+                          /* ASCII digits */
+                          || (first_char >= 0x0030 && first_char <= 0x0039)))
+                {
+                  allocated_next_text
+                    = u8_strconv_to_encoding (next, "UTF-8",
+                                              iconveh_question_mark);
+                  next_text = allocated_next_text;
+                }
+            }
+          else
+            {
+              next_text = p;
+            }
+
+          if (next_text)
+            {
+              const char *q = after_escaped_characters (next_text);
+
+              if (!q)
+                {
+                  /* the second letter is not escaped, check that it is
+                     a character that matches */
+                  ucs4_t second_char;
+                  const uint8_t *remaining;
+                  if (!next)
+                    {
+                      /* the first letter was escaped, the text after
+                         it has not been converted yet */
+                      encoded_u8 = u8_strconv_from_encoding (next_text,
+                                         "UTF-8", iconveh_question_mark);
+                      next = encoded_u8;
+                    }
+                  remaining = u8_next (&second_char, next);
+                  if (!remaining
+                      || !(uc_is_general_category (second_char,
+                                                   UC_CATEGORY_L)
+                            /* ASCII digits */
+                           || (second_char >= 0x0030
+                               && second_char <= 0x0039)))
+                    next_text = 0;
+                  /* otherwise next_text remains as the text to add after
+                     the diacritic */
+                }
+            }
+
+          if (next_text)
+            {
+              /* add the first character or escaped text */
+              text_init (&accented_text);
+              if (!p)
+                {
+                  char *first_char_text;
+                  uint8_t first_char_u8[7];
+                  int first_char_len
+                    = u8_uctomb (first_char_u8, first_char, 6);
+                  if (first_char_len < 0)
+                    fatal ("u8_uctomb returns negative value");
+                  first_char_u8[first_char_len] = 0;
+                  first_char_text = u8_strconv_to_encoding (first_char_u8,
+                                          "UTF-8", iconveh_question_mark);
+                  text_append (&accented_text, first_char_text);
+                  free (first_char_text);
+                }
+              else
+                /* the escaped first letter, which ends right before p */
+                text_append_n (&accented_text, text_set, p - text_set);
+
+              /* add the tie accent */
+              text_printf (&accented_text, "\\%s ",
+                           unicode_diacritics[element->cmd].hex_codepoint);
+              /* add the remaining, second character or escaped text
+                 and everything else after (which is in general invalid
+                 but we do not care) */
+              text_append (&accented_text, next_text);
+              tie_accent_set = 1;
+            }
+
+          free (allocated_next_text);
+          free (encoded_u8);
+          if (tie_accent_set)
+            {
+              free (text_set);
+              return accented_text.text;
+            }
+        }
+
+      /* case of text and diacritic (including fallback for invalid tie
+         accent) */
+      text_init (&accented_text);
+      /* check if the normalization leads to merging text and diacritic,
+         if yes use the merged character, if not output text and diacritic
+         to be set up for composition */
+      xasprintf (&accent_and_diacritic, "%s%s",
+                 text_set, unicode_diacritics[element->cmd].text);
+      normalized_accent_text = normalize_NFC (accent_and_diacritic);
+      free (accent_and_diacritic);
+      /* check if the normalization led to merging text and diacritic
+         as one character.  If not, the leading text remains, this
+         is what the comparison checks */
+      if (!strncmp (normalized_accent_text, text_set, strlen (text_set)))
+        {
+          /* no normalization as one character, output text and diacritic
+             such that they could be composed */
+          text_append (&accented_text, text_set);
+          text_printf (&accented_text, "\\%s ",
+                   unicode_diacritics[element->cmd].hex_codepoint);
+        }
+      else
+        {
+          /* determine the hexadecimal unicode point of the normalized
+             character to output in the format expected in CSS strings */
+          char *next_text;
+          uint8_t *encoded_u8 = u8_strconv_from_encoding (
+                                 normalized_accent_text, "UTF-8",
+                                               iconveh_question_mark);
+          ucs4_t first_char;
+          const uint8_t *next = u8_next (&first_char, encoded_u8);
+          /* cast as the type of ucs4_t needs not be unsigned long */
+          text_printf (&accented_text, "\\%04lX ",
+                       (unsigned long) first_char);
+          next_text = u8_strconv_to_encoding (next, "UTF-8",
+                                              iconveh_question_mark);
+          free (encoded_u8);
+          text_append (&accented_text, next_text);
+          free (next_text);
+        }
+      free (normalized_accent_text);
+      free (text_set);
+      return accented_text.text;
+    }
+
+ /* There are diacritics for every accent command except for dotless.
+    We should only get there with dotless if the argument is not recognized.
+  */
+  return text_set;
+}
+
+void
+css_string_convert_accent_command (CONVERTER *self, const enum command_id cmd,
+                    const ELEMENT *element,
+                    const HTML_ARGS_FORMATTED *args_formatted,
+                    const char *content, TEXT *result)
+{
+  char *accent_text;
+  char *(*format_accents)(CONVERTER *self, const char *text,
+                         const ELEMENT *element, int set_case);
+
+  int output_encoded_characters = (self->conf->OUTPUT_CHARACTERS.integer > 0);
+
+  format_accents = &css_string_accent;
+
+  accent_text = convert_accents (self, element, &html_convert_tree,
                           format_accents, output_encoded_characters,
                           html_in_upper_case (self));
 
@@ -9340,6 +9567,7 @@ convert_accent_command (CONVERTER *self, const enum 
command_id cmd,
   free (accent_text);
 }
 
+
 void
 convert_indicateurl_command (CONVERTER *self, const enum command_id cmd,
                     const ELEMENT *element,
@@ -15969,7 +16197,7 @@ html_converter_initialize (CONVERTER *self)
         }
     }
 
-  /* accents commands implemented in C, but not css strings accents */
+  /* accents commands implemented in C */
   if (self->accent_cmd.number)
     {
       for (i = 0; i < self->accent_cmd.number; i++)
@@ -15977,6 +16205,8 @@ html_converter_initialize (CONVERTER *self)
           enum command_id cmd = self->accent_cmd.list[i];
           COMMAND_CONVERSION_FUNCTION *command_conversion
                = &self->command_conversion_function[cmd];
+          COMMAND_CONVERSION_FUNCTION *css_string_command_conversion
+               = &self->css_string_command_conversion_function[cmd];
           if (command_conversion->status == FRS_status_default_set)
             {
               command_conversion->formatting_reference = 0;
@@ -15984,6 +16214,10 @@ html_converter_initialize (CONVERTER *self)
               command_conversion->command_conversion
                 = &convert_accent_command;
             }
+          css_string_command_conversion->formatting_reference = 0;
+          css_string_command_conversion->status = FRS_status_internal;
+          css_string_command_conversion->command_conversion
+            = &css_string_convert_accent_command;
         }
     }
 
diff --git a/tp/Texinfo/XS/main/unicode.h b/tp/Texinfo/XS/main/unicode.h
index 15b3320a2f..2bd700b664 100644
--- a/tp/Texinfo/XS/main/unicode.h
+++ b/tp/Texinfo/XS/main/unicode.h
@@ -20,7 +20,8 @@ typedef struct COMMAND_UNICODE {
 
 typedef struct DIACRITIC_UNICODE {
     char *text; /* UTF-8 encoded */
-    char *codepoint;
+    char *codepoint; /* numeric */
+    char *hex_codepoint; /* hexadecimal */
 } DIACRITIC_UNICODE;
 
 /* can be inlined in text parsing codes */
diff --git a/tp/maintain/setup_converters_code_tables.pl 
b/tp/maintain/setup_converters_code_tables.pl
index bac7566369..30b5f2eabc 100755
--- a/tp/maintain/setup_converters_code_tables.pl
+++ b/tp/maintain/setup_converters_code_tables.pl
@@ -157,9 +157,10 @@ foreach my $command_name (@commands_order) {
     my $numeric_codepoint = hex($unicode_diacritics{$command_name});
     my $result = chr($numeric_codepoint);
     my $protected = join ('', map {_protect_char($_)} split ('', $result));
-    print UNIC "{\"$protected\", \"$numeric_codepoint\"},  /* $command */\n";
+    print UNIC "{\"$protected\", \"$numeric_codepoint\","
+              ." \"$unicode_diacritics{$command_name}\"},  /* $command */\n";
   } else {
-    print UNIC "{0, 0},\n";
+    print UNIC "{0, 0, 0},\n";
   }
 }
 print UNIC "};\n\n";
[Prev in Thread]
Current Thread
[Next in Thread]
master updated (1910257cc0 -> 114e10b2a1), Patrice Dumas, 2024/01/13
- [no subject], Patrice Dumas <=
- [no subject], Patrice Dumas, 2024/01/13
- [no subject], Patrice Dumas, 2024/01/13
- [no subject], Patrice Dumas, 2024/01/13
- [no subject], Patrice Dumas, 2024/01/13
Prev by Date: branch master updated: * tp/Texinfo/XS/main/convert_to_text.c: make two functions static.
Next by Date: [no subject]
Previous by thread: master updated (1910257cc0 -> 114e10b2a1)
Next by thread: [no subject]
Index(es):
- Date
- Thread