Package: tunnelx
Version: 20160713-3
Severity: normal
Tags: patch
Characters outside US-ASCII get corrupted when saving and loading
tunnel sketches. The reason is that each char is simply cast to byte
when saving, and sign-extended to char when loading. I've made a patch
which writes and reads XML numeric character references for characters
outside US-ASCII (the encoding named in the XML declaration). I tested with my
current survey which contains Gaelic names, and hand-edited the XML to
ensure that reading hexadecimal representations works (we always write
decimal). The patch is probably suitable for forwarding upstream.
Perhaps an alternative approach might be considered - open the output
file with UTF-8 encoding, and change the XML declaration to reflect
that.
Index: tunnelx-20160713/src/TNXML.java
===================================================================
--- tunnelx-20160713.orig/src/TNXML.java
+++ tunnelx-20160713/src/TNXML.java
@@ -726,7 +726,7 @@ class TNXML
/////////////////////////////////////////////
static char[] chconvCH = { (char)176, (char)246, (char)252, '<', '>', '"', '&', '\\', '\'', '\n', '\t', ' ' };
static char[] chconv = chconvCH; // allow for hacks (which vary chconvleng)
- static String[] chconvName = {"&deg;", "&ouml;", "&uuml;", "&lt;", "&gt;", "&quot;", "&amp;", "&backslash;", "&apostrophe;", "&newline;", "&tab;", "&space;" };
+ static String[] chconvName = {"deg", "ouml", "uuml", "lt", "gt", "quot", "amp", "backslash", "apostrophe", "newline", "tab", "space" };
static int chconvleng = chconvCH.length; // used for hacking out the space ones (this hack needs to be killed, or replaced with a flag)
static int chconvlengWSP = chconvCH.length - 4; // used for hacking out the space ones (this hack needs to be killed, or replaced with a flag)
/////////////////////////////////////////////
@@ -739,16 +739,23 @@ class TNXML
int j;
// there might be a regexp that would do this substitution directly, or use indexOf in a concatenated string of chconvCH
- for (j = 0; j < chconvleng; j++)
+ for (j = 3; j < chconvleng; j++) // start at '<' to allow deg, ouml, and uuml to use the general substitution below
{
if ((ch == chconvCH[j]) && (bAlsoSpace || (ch != ' ')))
{
- sb.append(chconvName[j]);
+ sb.append('&').append(chconvName[j]).append(';');
break;
}
}
- if (j == chconvleng)
- sb.append(ch);
+ if (j == chconvleng) {
+ // not found in table
+ if (' ' <= ch && ch <= 127)
+ // printable ASCII
+ sb.append(ch);
+ else
+ // general Unicode character
+ sb.append("&#").append((int)ch).append(";");
+ }
}
}
@@ -771,31 +778,34 @@ class TNXML
char ch = s.charAt(i);
if (ch == '&')
{
- int j;
- for (j = 0; j < chconvleng; j++)
- {
- if (s.regionMatches(i, chconvName[j], 0, chconvName[j].length()))
- {
- sb.append(chconvCH[j]);
- i += chconvName[j].length() - 1;
- //if (j < 2)
- // System.out.println(chconv[j] + " -- " + (int)chconv[j].toCharArray()[0]);
- break;
- }
- }
- if (j == chconvleng)
- {
- if (s.regionMatches(i, "&space;", 0, 7)) // back-compatible
- {
- sb.append(" ");
- i += 6;
- }
+ int refc = s.indexOf(';', i);
+ if (refc < 0)
+ TN.emitError("Missing reference close at " + s.substring(i, Math.max(i+15, s.length())));
+
+ if (s.charAt(++i) == '#') {
+ // A malformed numeric character reference will result in NumberFormatException
+ if (s.charAt(++i) == 'x')
+ // hexadecimal
+ sb.append((char)Integer.parseInt(s.substring(++i, refc), 16));
else
+ // decimal
+ sb.append((char)Integer.parseInt(s.substring(i, refc), 10));
+ } else {
+ String name = s.substring(i, refc);
+ int j;
+ for (j = 0; j < chconvleng; j++)
{
- System.out.println(s.substring(i));
- TN.emitError("unable to resolve & from pos " + i + " in string:" + s);
+ if (name.equals(chconvName[j]))
+ {
+ sb.append(chconvCH[j]);
+ break;
+ }
}
+ if (j == chconvleng)
+ TN.emitError("unable to resolve entity " + name);
}
+ // advance to the reference-close character (loop increment will skip it)
+ i = refc;
}
else
sb.append(ch);
-- System Information:
Debian Release: 9.0
APT prefers testing
APT policy: (900, 'testing'), (900, 'stable'), (400, 'unstable')
Architecture: amd64 (x86_64)
Foreign Architectures: i386, armel
Kernel: Linux 3.16.7-ckt2-balti (SMP w/8 CPU cores; PREEMPT)
Locale: LANG=en_GB.UTF-8, LC_CTYPE=en_GB.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: sysvinit (via /sbin/init)
Versions of packages tunnelx depends on:
ii default-jre [java8-runtime] 2:1.8-58
ii gcj-4.8-jre [java5-runtime] 4.8.5-4
ii jarwrapper 0.59
ii openjdk-8-jre [java8-runtime] 8u111-b14-3
tunnelx recommends no packages.
tunnelx suggests no packages.
-- no debconf information