[bug #62830] [PATCH] [grops] support CJK fonts encoded in UTF16

G. Branden Robinson Wed, 20 Nov 2024 20:50:18 -0800

Follow-up Comment #14, bug #62830 (group groff):

Things look pretty good with this.  I made some modest changes to the patch.
Here's an interdiff to show them clearly.



diff -u b/font/devdvi/KOG b/font/devdvi/KOG
--- b/font/devdvi/KOG
+++ b/font/devdvi/KOG
@@ -1,5 +1,5 @@
 #
-#  Korean, GoThic style
+#  Korean, Gothic style
 #     with upTeX tfm font
 #
 
diff -u b/font/devhtml/KOG b/font/devhtml/KOG
--- b/font/devhtml/KOG
+++ b/font/devhtml/KOG
@@ -1,5 +1,5 @@
 #
-#  Korean, GoThic style
+#  Korean, Gothic style
 #
 
 name KOG
diff -u b/font/devps/KOG b/font/devps/KOG
--- b/font/devps/KOG
+++ b/font/devps/KOG
@@ -1,5 +1,5 @@
 #
-#  Korean, GoThic style
+#  Korean, Gothic style
 #  Adobe-Korea1
 #
 
diff -u b/font/devutf8/KOG b/font/devutf8/KOG
--- b/font/devutf8/KOG
+++ b/font/devutf8/KOG
@@ -1,5 +1,5 @@
 #
-#  Korean, GoThic style
+#  Korean, Gothic style
 #
 
 name KOG
diff -u b/src/devices/grohtml/post-html.cpp
b/src/devices/grohtml/post-html.cpp
--- b/src/devices/grohtml/post-html.cpp
+++ b/src/devices/grohtml/post-html.cpp
@@ -95,13 +95,11 @@
                                             /* end of each page?
          */
 static int groff_sig = FALSE;               /* "This document was produced
using"       */
 html_dialect dialect = html4;               /* which html dialect should
grohtml output */
-#define PARTIAL 1
-#define FULL    2
-static int charset_utf8 = PARTIAL;          /* charset partially in "UTF-8"
or in       */
-                                            /* character entity references by
default.  */
-                                            /* If false then "US-ASCII".
          */
-                                            /* If FULL then all Unicode
characters are  */
-                                            /* written in UTF-8 and do not
use character*/
+static const int CHARSET_ASCII = 0;
+static const int CHARSET_MIXED = 1;
+static const int CHARSET_UTF8  = 2;
+static int charset_encoding = CHARSET_MIXED;/* The character set may be plain
ASCII,    */
+                                            /* pure UTF-8, or a mixture of
character    */
                                             /* entity references.
          */
 
 
@@ -1408,13 +1406,15 @@
 }
 
 /*
- *  to_numerical_char_ref - returns a numerical character reference of
unicode ch.
+ *  to_numerical_char_ref - returns a numerical character reference of
+ *                          unicode character code `ch`.
  */
 
 static char *to_numerical_char_ref (unsigned int ch)
 {
-  static char buf[16];
-
+  // Make static buffer large enough for a 64-bit `int` type in
+  // hexadecimal (8 bytes) plus '&#x;' plus null terminator.
+  static char buf[8 + 4 + 1];
   sprintf(buf, "&#x%X;", ch);
   return buf;
 }
@@ -4425,7 +4425,9 @@
       html_glyph = 0;
 
     if ((0 /* nullptr */ == html_glyph) && (code >= UNICODE_DESC_START))
-      html_glyph = charset_utf8 ? to_utf8_string(code) :
to_numerical_char_ref(code);
+      html_glyph = static_cast<bool>(charset_encoding)
+                    ? to_utf8_string(code)
+                    : to_numerical_char_ref(code);
   } else
     html_glyph = get_html_translation(sbuf_style.f, s);
 
@@ -4506,7 +4508,7 @@
       case 0x003E: return "&gt;";
       default: return 0;
     }
-  } else if (charset_utf8==FULL) {
+  } else if (CHARSET_UTF8 == charset_encoding) {
       return to_utf8_string(code);
   } else {
     switch (code) {
@@ -4747,7 +4749,9 @@
       case 0x2666: return "&diams;";
       case 0x27E8: return "&lang;";
       case 0x27E9: return "&rang;";
-      default: return (charset_utf8 ? to_utf8_string(code) :
to_numerical_char_ref(code));
+      default: return (static_cast<bool>(charset_encoding)
+                        ? to_utf8_string(code)
+                        : to_numerical_char_ref(code));
     }
   }
 }
@@ -5182,7 +5186,8 @@
          "content=\"groff -Thtml, see www.gnu.org\">\n", stdout);
     fputs("<meta http-equiv=\"Content-Type\" "
          "content=\"text/html; charset=", stdout);
-    fputs(charset_utf8 ? "UTF-8" : "US-ASCII", stdout);
+    fputs(static_cast<bool>(charset_encoding)
+           ? "UTF-8" : "US-ASCII", stdout);
     fputs("\">\n", stdout);
     fputs("<meta name=\"Content-Style\" content=\"text/css\">\n",
          stdout);
@@ -5190,7 +5195,8 @@
   }
   else {
     fputs("<?xml version=\"1.0\" encoding=\"", stdout);
-    fputs(charset_utf8 ? "UTF-8" : "us-ascii", stdout);
+    fputs(static_cast<bool>(charset_encoding)
+           ? "UTF-8" : "us-ascii", stdout);
     fputs("\"?>\n", stdout);
     fputs("<!DOCTYPE html PUBLIC \"-//W3C//"
          "DTD XHTML 1.1 plus MathML 2.0//EN\"\n", stdout);
@@ -5206,7 +5212,8 @@
          "content=\"groff -Txhtml, see www.gnu.org\"/>\n", stdout);
     fputs("<meta http-equiv=\"Content-Type\" "
          "content=\"text/html; charset=", stdout);
-    fputs(charset_utf8 ? "UTF-8" : "US-ASCII", stdout);
+    fputs(static_cast<bool>(charset_encoding)
+           ? "UTF-8" : "US-ASCII", stdout);
     fputs("\"/>\n", stdout);
     fputs("<meta name=\"Content-Style\" content=\"text/css\"/>\n",
          stdout);
@@ -5568,6 +5575,8 @@
     { NULL, 0, 0, 0 }
   };
   opterr = 0;
+  // TODO: Rename `U` option, which generally means "unsafe mode" in
+  // groff, to `u`.
   while ((c = getopt_long(argc, argv,
          "a:bCdD:eF:g:Ghi:I:j:lno:prs:S:U::vVx:y", long_options, NULL))
         != EOF)
@@ -5639,18 +5648,20 @@
       split_level = atoi(optarg) + 1;
       break;
     case 'U':
-      /* default: PARTIAL */
       if (optarg) {
+       // TODO: This argument semantic scheme seems unergonomic to GBR;
+       // come up with an alternative.
         if ((strcmp(optarg, "0") == 0 || strcmp(optarg, "-") == 0))
-          charset_utf8 = FALSE;
+          charset_encoding = CHARSET_ASCII;
         else if ((strcmp(optarg, "1") == 0))
-          charset_utf8 = PARTIAL;
-        else if (optarg && (strcmp(optarg, "2") == 0 || strcmp(optarg, "+")
== 0))
-          charset_utf8 = FULL;
+          charset_encoding = CHARSET_MIXED;
+        else if (optarg && ((strcmp(optarg, "2") == 0)
+                            || strcmp(optarg, "+") == 0))
+          charset_encoding = CHARSET_UTF8;
         else
-          charset_utf8 = FULL;
+          charset_encoding = CHARSET_UTF8;
       } else
-        charset_utf8 = FULL;
+        charset_encoding = CHARSET_UTF8;
       break;
     case 'v':
       printf("GNU post-grohtml (groff) version %s\n", Version_string);


Interdiff rejected one hunk because it depended on code style cleanups to
"ps.cpp" that I inserted into my working copy before applying your patches.


--- interdiff-1.Qm0i6k
+++ interdiff-1.Qm0i6k
@@ -219,7 +221,7 @@ ps_output &ps_output::put_string(const char *s, size_t n)
     else
       len += 4;
   }
-  if (len > n*2) {
+  if (len > n*2 || is_utf16be) {
     if (col + n*2 + 2 > max_line_length && n*2 + 2 <= max_line_length) {
       putc('\n', fp);
       col = 0;


I also wrote changelog entries for the patch.


commit a02d03dc271dc85315e9f439692b2b2bfe7dc596
Author: TANAKA Takuji <t...@t-lab.opal.ne.jp>
Date:   Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (1/6).
    
    * src/include/unicode.h (to_utf8_string): Declare new function.
    
    * src/libs/libgroff/unicode.cpp (to_utf8_string): New function converts
      input integer into UTF-8 sequence (or an HTML character entity in
      hexadecimal if the integer is out of range).

commit 0f5b9b63a7ee943e9c1f62d020ed9877e7bfef9b
Author: TANAKA Takuji <t...@t-lab.opal.ne.jp>
Date:   Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (2/6).
    
    * src/include/font.h (class font): Declare private member variable
      `wch`, a pointer to an existing list type `font_char_metric`.  Declare
      private member function `get_font_wchar_metric()` to access it.
    
    * src/libs/libgroff/font.cpp (struct font_char_metric): Add members
      `next` (a pointer to the struct's own type) and `end_code` of type
      `int`.
    
      (glyph_to_ucs_codepoint): New function returns UCS code point from a
      (non-composite) `glyph` object, or -1 if invalid.
    
      (font::font): Constructor initializes `wch` member variable to null
      pointer.
    
      (font::~font): Destructor frees storage allocated in `font::load()`
      for `special_device_coding` member of `wcp` struct, and that of `wcp`
      itself.
    
      (font::contains): If `glyph_to_ucs_codepoint()` returns a valid value
      for the glyph, populate its wide character metrics and return true.
    
      (font::get_font_wchar_metric): New function obtains font metrics of
      input character by Unicode code point.
    
      (font::get_width, font::get_height, font::get_depth)
      (font::get_italic_correction, font::get_left_italic_correction)
      (font::get_subscript_correction, font::get_character_type)
      (font::get_code, font::get_special_device_encoding): If
      `glyph_to_ucs_codepoint()` returns a valid value for the glyph,
      populate its wide character metrics and return the appropriate
      parameter based on them.
    
      (font::get_width): Add conditional guard when computing width for a
      glyph from a "Unicode font"; use the computation only if the device
      description file ("DESC") didn't declare "unscaled_charwidths".
    
      (font::load): Recognize new directive in font description files:
      "charset-range", which works like the existing "charset" directive
      except that the glyph descriptions use a `name` of the form
      "uFFFF..uFFFF" (where "FFFF" is a hexadecimal digit sequence), and
      apply the metrics identically to all glyphs in the designated range.
    
      (font::load): When processing glyph descriptions in "charset" section
      and the device has declared the "unicode" directive, stop scaling the
      width of the glyph by what `wcwidth()` returns for it.  (Does this fix
      Savannah #44018?)

commit 525f6215348305de608c9ac65fe299f3e2076e4c
Author: TANAKA Takuji <t...@t-lab.opal.ne.jp>
Date:   Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (3/6).
    
    * src/preproc/html/pre-html.cpp (scanArguments): Recognize but ignore
      new option `-U`, used by `grohtml` postprocessor.
    
    * src/devices/grohtml/post-html.cpp: Declare new constant integer
      objects `CHARSET_ASCII`, `CHARSET_MIXED`, and `CHARSET_UTF8` to
      configure representation of character entities in output.
    
      (main): New option `-U` takes argument configuring the means of
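      encoding character entities.  If the argument is `0` or `-`, select
      `CHARSET_ASCII`; if `1`, select `CHARSET_MIXED`, and if `2` or `+`,
      select `CHARSET_UTF8`, which is also selected if the argument is
      omitted or unrecognized.  If `-U` is not given at all, the initial
      setting, `CHARSET_MIXED`, remains in effect.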
    
      (to_unicode): Replace this function with...  (to_numerical_char_ref):
      ...this, which generates a hexadecimal HTML character entity.
    
      (html_printer::add_to_sbuf): Write out UTF-8 sequence if
      `charset_encoding` is not `CHARSET_ASCII`, otherwise a numerical
      character reference.
    
      (get_html_entity): Return UTF-8 sequence if `charset_encoding` is
      `CHARSET_UTF8`.  Otherwise, return UTF-8 sequence if `charset_encoding`
      is not `CHARSET_ASCII`, otherwise a numerical character reference.
    
      (html_printer::writeHeadMetaStyle): Describe document {XHTML: encoding
      and} content as UTF-8 if `charset_encoding` is not `CHARSET_ASCII`,
      otherwise as US-ASCII.

commit fcd0113a296a91f32bc9f61b93604daa80b41fb3
Author: TANAKA Takuji <t...@t-lab.opal.ne.jp>
Date:   Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (4/6).
    
    * src/devices/grops/ps.h:
    * src/devices/grops/ps.cpp: Include C99 "stdint.h" header for desired
      `uint16_t` data type.
    
      (class ps_output): Change type of `put_string` member function's first
      argument from `const char *` to `const uint16_t *`.  Add third
      argument of Boolean type, `is_utf16le`.
    
    * src/devices/grops/ps.cpp (ps_output::put_string): Adjust computations
      of `len` and `col` locals if the font in use is UTF-16LE-encoded, and
      write out 4-digit instead of 2-digit hexadecimal numeric literals when
      that is the case.
    
      (class ps_printer): Change type of `sbuf` member variable from `char`
      to `uint16_t`.  Change type of third argument to `set_subencoding`
      member function from `unsigned char *` to `uint16_t *`.
    
      (ps_printer::set_subencoding): Rename third argument from `codep` to
      `code`--it's no longer an indirect reference to a single `char`, but a
      2-element `uint16_t` array.  If the font's "internalname" directive
      contains the substring "-UTF16-", populate `code` argument with
      little-endian 16-bit value.
    
      (ps_printer::set_char): Declare `code` as above: a 2-element
      `uint16_t` array instead of an unsigned char.  Handle case of `code`
      using surrogate pairs (`code[1] > 0`).
    
      (ps_printer::flush_sbuf): Conditionalize form of output on font
      encoding.  Set the Boolean argument to `ps::put_string()` per the
      font's "internalname" directive matching the substring "-UTF16-".

commit d471e4fb84861d23f5c0b5a064db57896e544aaa
Author: TANAKA Takuji <t...@t-lab.opal.ne.jp>
Date:   Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (5/6).
    
    Ship font description files.  These are intended as abstractions of
    faces to permit consistent naming while permitting customization, just
    as with the 12 text typefaces supported across output devices for Latin
    scripts in groff (three families of four styles each).  These CJK font
    descriptions are not organized into groff font families, but are
    similar.
    
            CSH: Simplified Chinese, Hei style
            CSS: Simplified Chinese, Song style
            CTH: Traditional Chinese, Hei style
            CTS: Traditional Chinese, Song style
            JPG: Japanese, Gothic style
            JPM: Japanese, Mincho style
            KOG: Korean, Gothic style
            KOM: Korean, Mincho style
    
    * font/devdvi/CSH:
    * font/devdvi/CSS:
    * font/devdvi/CTH:
    * font/devdvi/CTS:
    * font/devdvi/JPG:
    * font/devdvi/JPM:
    * font/devdvi/KOG:
    * font/devdvi/KOM:
    * font/devhtml/CSH:
    * font/devhtml/CSS:
    * font/devhtml/CTH:
    * font/devhtml/CTS:
    * font/devhtml/JPG:
    * font/devhtml/JPM:
    * font/devhtml/KOG:
    * font/devhtml/KOM:
    * font/devps/CSH:
    * font/devps/CSS:
    * font/devps/CTH:
    * font/devps/CTS:
    * font/devps/JPG:
    * font/devps/JPM:
    * font/devps/KOG:
    * font/devps/KOM:
    * font/devutf8/CSH:
    * font/devutf8/CSS:
    * font/devutf8/CTH:
    * font/devutf8/CTS:
    * font/devutf8/JPG:
    * font/devutf8/JPM:
    * font/devutf8/KOG:
    * font/devutf8/KOM: Ship font descriptions.
    
    * font/devdvi/devdvi.am (DEVDVIFONTFILES):
    * font/devhtml/devhtml.am (DEVHTMLFONTS, DEVHTMLFONTFILES):
    * font/devps/devps.am (DEVPSFONTFILES):
    * font/devutf8/devutf8.am (DEVUTF8FONTS, DEVUTF8FONTFILES): Add them.
    
    Note: the test "contrib/hdtbl/examples/test-hdtbl.sh" fails at this
    commit.

commit 3cc1ed649c09f739b8df0aafb3a461f9b030082b
Author: G. Branden Robinson <g.branden.robin...@gmail.com>
Date:   Wed Nov 20 19:51:12 2024 -0600

    [hdtbl]: Update test expectations WRT new fonts.
    
    * contrib/hdtbl/examples/test-hdtbl.sh.in: Update test expectations to
      reflect addition of 8 font descriptions for CJK support.

commit 500a5c98fc8258de3b69b846304e56fafb196797 (HEAD -> master)
Author: TANAKA Takuji <t...@t-lab.opal.ne.jp>
Date:   Fri Dec 29 13:56:37 2023 +0000

    Support CJK fonts encoded in UTF-16 (6/6).
    
    * src/roff/groff/tests/dvi-device-smoke-test.sh:
    * src/roff/groff/tests/ps-device-smoke-test.sh: New tests exercise
      output drivers and their encodings of CJK characters.
    
    * src/roff/groff/groff.am (groff_TESTS): Run tests.
    
    Fixes <https://savannah.gnu.org/bugs/?62830>.


There is a little work left to do; see the TODO items in the interdiff, and I
also need to document the new font description file directive in
_groff_font_(5) and our Texinfo manual, and document the newly available font
names in _grodvi_(1), _grohtml_(1), _grops_(1), and _grotty_(1).

Thank you for your *immense* patience, Tanaka-sama.


    _______________________________________________________

Reply to this item at:

  <https://savannah.gnu.org/bugs/?62830>

_______________________________________________
Message sent via Savannah
https://savannah.gnu.org/

signature.asc
Description: PGP signature

[bug #62830] [PATCH] [grops] support CJK fonts encoded in UTF16

Reply via email to