gbranden pushed a commit to branch master
in repository groff.

commit 6008b6b7aa2920035e09d1dea44d262d30391195
Author: G. Branden Robinson <[email protected]>
AuthorDate: Thu Jan 18 12:45:57 2024 -0600

    [troff]: Diagnose bogus composite char escapes.
    
    [troff]: Diagnose bogus composite character escape sequences.  That is,
    when a composite character escape sequence like \[a ~] has a bogus
    modifier (as opposed to base) character, meaning one that has not been
    defined as the source _or_ destination of a `composite` request, warn
    about it.  For instance, \[a $] is nonsense, barring a request like
    `.composite $ \[uFF00]`, which would map `$`, when used as a modifier
    character in a composite special character escape sequence, to U+FF00,
    which would be a modifier form of the dollar sign in an alternate
    universe.
    
    * src/roff/troff/input.cpp (is_codepoint_composite): New function
      searches `composite_dictionary` for the presence of the given
      four-digit hexadecimal string as a key _or_ value.
    
    * src/roff/troff/input.h: Expose foregoing function to other translation
      units.
    
    * src/roff/troff/node.cpp (make_glyph_node): Check input `charinfo` for
      a Unicode code point sequence, and if it contains one, call
      `valid_unicode_code_sequence()` to check it for validity.  Then,
      iterate through each code point after the first {the base character},
      and call `is_codepoint_composite()` on it.  Diagnose invalid composite
      character and return null pointer if validation fails.
    
    Input:
    .nf
    \[A a~]
    \[A ~]
    \[u0041_0301]
    \[u0041_007E] \" should fail because 007E is explicitly spacing
    \[u0041_0041] \" same reason, more obviously
    \[u0041_0301_0301] \" should fail, would have a different meaning
    \[u0041_007E_0301] \" both problems above
    
    groff 1.23.0 and earlier:
    $ groff -T ps -z EXPERIMENTS/composite_character_construction.groff
    troff:...:5: warning: special character 'u0041_007E' not defined
    troff:...:6: warning: special character 'u0041_0041' not defined
    troff:...:7: warning: special character 'u0041_0301_0301' not defined
    troff:...:8: warning: special character 'u0041_007E_0301' not defined
    $ groff -Tutf8 -z EXPERIMENTS/composite_character_construction.groff
    [no output due to Savannah #65109]
    
    Now:
    $ ./build/test-groff -T ps -z EXPERIMENTS/composite_character_construction.groff
    troff:...:5: warning: special character 'u0041_007E' not defined
    troff:...:6: error: cannot format glyph: 'u0041_0041' is not a valid composite character
    troff:...:7: warning: special character 'u0041_0301_0301' not defined
    troff:...:8: warning: special character 'u0041_007E_0301' not defined
    $ ./build/test-groff -T utf8 -z EXPERIMENTS/composite_character_construction.groff
    troff:...:6: error: cannot format glyph: 'u0041_0041' is not a valid composite character
---
 ChangeLog                | 27 +++++++++++++++++++++++++++
 src/roff/troff/input.cpp | 32 ++++++++++++++++++++++++++++++++
 src/roff/troff/input.h   |  1 +
 src/roff/troff/node.cpp  | 24 +++++++++++++++++++++++-
 4 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index e500142e4..023054db6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,30 @@
+2024-01-18  G. Branden Robinson <[email protected]>
+
+       [troff]: Diagnose bogus composite character escape sequences.
+       That is, when a composite character escape sequence like \[a ~]
+       has a bogus modifier (as opposed to base) character, meaning one
+       that has not been defined as the source _or_ destination of a
+       `composite` request, warn about it.  For instance, \[a $] is
+       nonsense, barring a request like `.composite $ \[uFF00]`, which
+       would map `$`, when used as a modifier character in a composite
+       special character escape sequence, to U+FF00, which would be a
+       modifier form of the dollar sign in an alternate universe.
+
+       * src/roff/troff/input.cpp (is_codepoint_composite): New
+       function searches `composite_dictionary` for the presence of the
+       given four-digit hexadecimal string as a key _or_ value.
+
+       * src/roff/troff/input.h: Expose foregoing function to other
+       translation units.
+
+       * src/roff/troff/node.cpp (make_glyph_node): Check input
+       `charinfo` for a Unicode code point sequence, and if it contains
+       one, call `valid_unicode_code_sequence()` to check it for
+       validity.  Then, iterate through each code point after the first
+       {the base character}, and call `is_codepoint_composite()` on it.
+       Diagnose invalid composite character and return null pointer if
+       validation fails.
+
 2024-01-17  G. Branden Robinson <[email protected]>
 
        * src/roff/troff/input.cpp (map_composite_character): Stop
diff --git a/src/roff/troff/input.cpp b/src/roff/troff/input.cpp
index 0cbec2f4d..94a977e38 100644
--- a/src/roff/troff/input.cpp
+++ b/src/roff/troff/input.cpp
@@ -4225,6 +4225,38 @@ static symbol composite_glyph_name(symbol nm)
   return symbol(gl.contents());
 }
 
+// Does the hexadecimal four-character sequence `n` represent a code
+// point with a composite mapping?  Either the key or value component
+// of an entry in the composite dictionary qualifies.
+//
+// This is an O(n) search, but by default groff only defines 22
+// composite character mappings ("tmac/composite.tmac").  If this
+// becomes a performance problem, we will need another dictionary
+// mapping the unique values of `composite_dictionary` (which is not
+// one-to-one) to a Boolean.
+bool is_codepoint_composite(const char *n)
+{
+  bool result = false;
+  dictionary_iterator iter(composite_dictionary);
+  symbol key;
+  char *value;
+  while(iter.get(&key, reinterpret_cast<void **>(&value))) {
+    assert(!key.is_null());
+    assert(value != 0 /* nullptr */);
+    const char *k = key.contents();
+    if (strcmp(k, n) == 0) {
+      result = true;
+      break;
+    }
+    const char *v = reinterpret_cast<char *>(value);
+    if (strcmp(v, n) == 0) {
+      result = true;
+      break;
+    }
+  }
+  return result;
+}
+
 static void report_composite_characters()
 {
   dictionary_iterator iter(composite_dictionary);
diff --git a/src/roff/troff/input.h b/src/roff/troff/input.h
index e78124f92..179feabd3 100644
--- a/src/roff/troff/input.h
+++ b/src/roff/troff/input.h
@@ -112,6 +112,7 @@ const int INPUT_SOFT_HYPHEN= 0312;
 
 extern void do_glyph_color(symbol);
 extern void do_fill_color(symbol);
+extern bool is_codepoint_composite(const char *n);
 
 // Local Variables:
 // fill-column: 72
diff --git a/src/roff/troff/node.cpp b/src/roff/troff/node.cpp
index 719bb2f3a..c7f9116bd 100644
--- a/src/roff/troff/node.cpp
+++ b/src/roff/troff/node.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1989-2024 Free Software Foundation, Inc.
      Written by James Clark ([email protected])
 
 This file is part of groff.
@@ -36,6 +36,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 #include "charinfo.h"
 #include "input.h"
 #include "geometry.h"
+#include "unicode.h" // valid_unicode_code_sequence()
 
 #include "nonposix.h"
 
@@ -4910,6 +4911,27 @@ static node *make_glyph_node(charinfo *s, environment *env,
     error("cannot format glyph: no current font");
     return 0 /* nullptr */;
   }
+  const char *seq = valid_unicode_code_sequence(s->nm.contents());
+  if (seq != 0 /* nullptr */) {
+    // If it is a multi-character sequence like u1234_5678, every code
+    // point after the first must have (or be) a composite mapping.
+    char codepoint[5] = { 0, 0, 0, 0, 0};
+    bool is_composite_glyph_valid = true;
+    while ((seq = strchr(seq, '_')) != 0 /* nullptr */) {
+      seq++;
+      (void) strncpy(codepoint, seq, 4);
+      if (!is_codepoint_composite(codepoint)) {
+       is_composite_glyph_valid = false;
+       break;
+      }
+      seq += 4;
+    }
+    if (!is_composite_glyph_valid) {
+      error("cannot format glyph: '%1' is not a valid composite"
+           " character", s->nm.contents());
+      return 0 /* nullptr */;
+    }
+  }
   assert(fontno < font_table_size && font_table[fontno] != 0);
   int fn = fontno;
   bool found = font_table[fontno]->contains(s);

_______________________________________________
Groff-commit mailing list
[email protected]
https://lists.gnu.org/mailman/listinfo/groff-commit

Reply via email to