groff-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[groff] 17/23: Support CJK fonts encoded in UTF-16 (3/6).


From: G. Branden Robinson
Subject: [groff] 17/23: Support CJK fonts encoded in UTF-16 (3/6).
Date: Thu, 21 Nov 2024 14:47:49 -0500 (EST)

gbranden pushed a commit to branch master
in repository groff.

commit 64e5f5c687160592d1a47a8dff83d8088fdcc39b
Author: TANAKA Takuji <ttk@t-lab.opal.ne.jp>
AuthorDate: Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (3/6).
    
    * src/preproc/html/pre-html.cpp (scanArguments): Recognize but ignore
      new option `-U`, used by `grohtml` postprocessor.
    
    * src/devices/grohtml/post-html.cpp: Declare new constant integer
      objects `CHARSET_ASCII`, `CHARSET_MIXED`, and `CHARSET_UTF8` to
      configure representation of character entities in output.
    
      (main): New option `-U` takes argument configuring the means of
      encoding character entities.  If the argument is `0` or `-`, select
      `CHARSET_ASCII`; if `1`, select `CHARSET_MIXED`, and if `2` or `+`,
      select `CHARSET_UTF8`, which is also the default.
    
      (to_unicode): Replace this function with...  (to_numerical_char_ref):
      ...this, which generates a hexadecimal HTML character entity.
    
      (html_printer::add_to_sbuf): Write out UTF-8 sequence if
      `charset_encoding` is not `CHARSET_ASCII`, otherwise a numerical
      character reference.
    
      (get_html_entity): Return UTF-8 sequence if `charset_encoding` is
      `CHARSET_UTF8`.  Otherwise, return UTF-8 sequence if `charset_encoding`
      is not `CHARSET_ASCII`, otherwise a numerical character reference.
    
      (html_printer::writeHeadMetaStyle): Describe document {XHTML: encoding
      and} content as UTF-8 if `charset_encoding` is not `CHARSET_ASCII`,
      otherwise as US-ASCII.
---
 ChangeLog                         | 29 +++++++++++++++++
 src/devices/grohtml/post-html.cpp | 66 ++++++++++++++++++++++++++++++++-------
 src/preproc/html/pre-html.cpp     |  7 ++++-
 3 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index cb309aead..5fcfb050d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,32 @@
+2024-11-20  TANAKA Takuji <ttk@t-lab.opal.ne.jp>
+
+       Support CJK fonts encoded in UTF-16 (3/6).
+
+       * src/preproc/html/pre-html.cpp (scanArguments): Recognize but
+       ignore new option `-U`, used by `grohtml` postprocessor.
+
+       * src/devices/grohtml/post-html.cpp: Declare new constant
+       integer objects `CHARSET_ASCII`, `CHARSET_MIXED`, and
+       `CHARSET_UTF8` to configure representation of character entities
+       in output.
+       (main): New option `-U` takes argument configuring the means of
+       encoding character entities.  If the argument is `0` or `-`,
+       select `CHARSET_ASCII`; if `1`, select `CHARSET_MIXED`, and if
+       `2` or `+`, select `CHARSET_UTF8`, which is also the default.
+       (to_unicode): Replace this function with...
+       (to_numerical_char_ref): ...this, which generates a hexadecimal
+       HTML character entity.
+       (html_printer::add_to_sbuf): Write out UTF-8 sequence if
+       `charset_encoding` is not `CHARSET_ASCII`, otherwise a numerical
+       character reference.
+       (get_html_entity): Return UTF-8 sequence if `charset_encoding`
+       is `CHARSET_UTF8`.  Otherwise, return UTF-8 sequence if
+       `charset_encoding` is not `CHARSET_ASCII`, otherwise a numerical
+       character reference.
+       (html_printer::writeHeadMetaStyle): Describe document {XHTML:
+       encoding and} content as UTF-8 if `charset_encoding` is not
+       `CHARSET_ASCII`, otherwise as US-ASCII.
+
 2024-11-20  TANAKA Takuji <ttk@t-lab.opal.ne.jp>
 
        Support CJK fonts encoded in UTF-16 (2/6).
diff --git a/src/devices/grohtml/post-html.cpp 
b/src/devices/grohtml/post-html.cpp
index 70c0a2748..a8c040e56 100644
--- a/src/devices/grohtml/post-html.cpp
+++ b/src/devices/grohtml/post-html.cpp
@@ -28,6 +28,7 @@ along with this program.  If not, see 
<http://www.gnu.org/licenses/>. */
 #include "html-text.h"
 #include "html-table.h"
 #include "curtime.h"
+#include "unicode.h"
 
 #include <time.h>
 
@@ -94,6 +95,12 @@ static int valid_flag = FALSE;              /* has user 
requested a valid flag a
                                             /* end of each page?               
         */
 static int groff_sig = FALSE;               /* "This document was produced 
using"       */
 html_dialect dialect = html4;               /* which html dialect should 
grohtml output */
+static const int CHARSET_ASCII = 0;
+static const int CHARSET_MIXED = 1;
+static const int CHARSET_UTF8  = 2;
+static int charset_encoding = CHARSET_MIXED;/* The character set may be plain 
ASCII,    */
+                                            /* pure UTF-8, or a mixture of 
character    */
+                                            /* entity references.              
         */
 
 
 /*
@@ -1399,14 +1406,16 @@ void page::add_line (style *s,
 }
 
 /*
- *  to_unicode - returns a unicode translation of int, ch.
+ *  to_numerical_char_ref - returns a numerical character reference of
+ *                          unicode character code `ch`.
  */
 
-static char *to_unicode (unsigned int ch)
+static char *to_numerical_char_ref (unsigned int ch)
 {
-  static char buf[30];
-
-  sprintf(buf, "&#%u;", ch);
+  // Make static buffer large enough for a 32-bit `unsigned int` in
+  // hexadecimal (8 digits) plus '&#x;' plus null terminator.
+  static char buf[8 + 4 + 1];
+  sprintf(buf, "&#x%X;", ch);
   return buf;
 }
 
@@ -4416,7 +4425,9 @@ void html_printer::add_to_sbuf (glyph *g, const string &s)
       html_glyph = 0;
 
     if ((0 /* nullptr */ == html_glyph) && (code >= UNICODE_DESC_START))
-      html_glyph = to_unicode(code);
+      html_glyph = static_cast<bool>(charset_encoding)
+                    ? to_utf8_string(code)
+                    : to_numerical_char_ref(code);
   } else
     html_glyph = get_html_translation(sbuf_style.f, s);
 
@@ -4497,6 +4508,8 @@ static const char *get_html_entity (unsigned int code)
       case 0x003E: return "&gt;";
       default: return 0;
     }
+  } else if (CHARSET_UTF8 == charset_encoding) {
+      return to_utf8_string(code);
   } else {
     switch (code) {
       case 0x00A0: return "&nbsp;";
@@ -4736,7 +4749,9 @@ static const char *get_html_entity (unsigned int code)
       case 0x2666: return "&diams;";
       case 0x27E8: return "&lang;";
       case 0x27E9: return "&rang;";
-      default: return to_unicode(code);
+      default: return (static_cast<bool>(charset_encoding)
+                        ? to_utf8_string(code)
+                        : to_numerical_char_ref(code));
     }
   }
 }
@@ -5170,13 +5185,19 @@ void html_printer::writeHeadMetaStyle (void)
     fputs("<meta name=\"generator\" "
          "content=\"groff -Thtml, see www.gnu.org\">\n", stdout);
     fputs("<meta http-equiv=\"Content-Type\" "
-         "content=\"text/html; charset=US-ASCII\">\n", stdout);
+         "content=\"text/html; charset=", stdout);
+    fputs(static_cast<bool>(charset_encoding)
+           ? "UTF-8" : "US-ASCII", stdout);
+    fputs("\">\n", stdout);
     fputs("<meta name=\"Content-Style\" content=\"text/css\">\n",
          stdout);
     fputs("<style type=\"text/css\">\n", stdout);
   }
   else {
-    fputs("<?xml version=\"1.0\" encoding=\"us-ascii\"?>\n", stdout);
+    fputs("<?xml version=\"1.0\" encoding=\"", stdout);
+    fputs(static_cast<bool>(charset_encoding)
+           ? "UTF-8" : "us-ascii", stdout);
+    fputs("\"?>\n", stdout);
     fputs("<!DOCTYPE html PUBLIC \"-//W3C//"
          "DTD XHTML 1.1 plus MathML 2.0//EN\"\n", stdout);
     fputs(" \"http://www.w3.org/TR/MathML2/dtd/xhtml-math11-f.dtd\"\n",
@@ -5190,7 +5211,10 @@ void html_printer::writeHeadMetaStyle (void)
     fputs("<meta name=\"generator\" "
          "content=\"groff -Txhtml, see www.gnu.org\"/>\n", stdout);
     fputs("<meta http-equiv=\"Content-Type\" "
-         "content=\"text/html; charset=US-ASCII\"/>\n", stdout);
+         "content=\"text/html; charset=", stdout);
+    fputs(static_cast<bool>(charset_encoding)
+           ? "UTF-8" : "US-ASCII", stdout);
+    fputs("\"/>\n", stdout);
     fputs("<meta name=\"Content-Style\" content=\"text/css\"/>\n",
          stdout);
     fputs("<style type=\"text/css\">\n", stdout);
@@ -5551,8 +5575,10 @@ int main(int argc, char **argv)
     { NULL, 0, 0, 0 }
   };
   opterr = 0;
+  // TODO: Rename `U` option, which generally means "unsafe mode" in
+  // groff, to `u`.
   while ((c = getopt_long(argc, argv,
-         "a:bCdD:eF:g:Ghi:I:j:lno:prs:S:vVx:y", long_options, NULL))
+         "a:bCdD:eF:g:Ghi:I:j:lno:prs:S:U::vVx:y", long_options, NULL))
         != EOF)
     switch(c) {
     case 'a':
@@ -5621,6 +5647,22 @@ int main(int argc, char **argv)
     case 'S':
       split_level = atoi(optarg) + 1;
       break;
+    case 'U':
+      if (optarg) {
+       // TODO: This argument semantic scheme seems unergonomic to GBR;
+       // come up with an alternative.
+        if ((strcmp(optarg, "0") == 0 || strcmp(optarg, "-") == 0))
+          charset_encoding = CHARSET_ASCII;
+        else if ((strcmp(optarg, "1") == 0))
+          charset_encoding = CHARSET_MIXED;
+        else if (optarg && ((strcmp(optarg, "2") == 0)
+                            || strcmp(optarg, "+") == 0))
+          charset_encoding = CHARSET_UTF8;
+        else
+          charset_encoding = CHARSET_UTF8;
+      } else
+        charset_encoding = CHARSET_UTF8;
+      break;
     case 'v':
       printf("GNU post-grohtml (groff) version %s\n", Version_string);
       exit(0);
@@ -5664,7 +5706,7 @@ int main(int argc, char **argv)
 static void usage(FILE *stream)
 {
   fprintf(stream,
-"usage: %s [-bCGhlnrVy] [-F font-directory] [-j output-stem]"
+"usage: %s [-bCGhlnrUVy] [-F font-directory] [-j output-stem]"
 " [-s base-type-size] [-S heading-level] [-x html-dialect] [file ...]\n"
 "usage: %s {-v | --version}\n"
 "usage: %s --help\n",
diff --git a/src/preproc/html/pre-html.cpp b/src/preproc/html/pre-html.cpp
index 432e98d25..c11598fcc 100644
--- a/src/preproc/html/pre-html.cpp
+++ b/src/preproc/html/pre-html.cpp
@@ -1604,8 +1604,10 @@ static int scanArguments(int argc, char **argv)
     { 0 /* nullptr */, 0, 0, 0 }
   };
   opterr = 0;
+  // TODO: Rename `U` option, which generally means "unsafe mode" in
+  // groff, to `u`.
   while ((c = getopt_long(argc, argv,
-         "+a:bCdD:eF:g:Ghi:I:j:lno:prs:S:vVx:y", long_options,
+         "+a:bCdD:eF:g:Ghi:I:j:lno:prs:S:U::vVx:y", long_options,
          0 /* nullptr */))
         != EOF)
     switch(c) {
@@ -1677,6 +1679,9 @@ static int scanArguments(int argc, char **argv)
     case 'S':
       // handled by post-grohtml (set file split level)
       break;
+    case 'U':
+      // handled by post-grohtml (charset UTF-8)
+      break;
     case 'v':
       printf("GNU pre-grohtml (groff) version %s\n", Version_string);
       exit(EXIT_SUCCESS);



reply via email to

[Prev in Thread] Current Thread [Next in Thread]