Hi,

For a long time OpenBSD has been careful about filtering potentially-
hostile strings that were destined for logs or TTYs using strvis(3) and
friends. Unfortunately, these don't do a great job for UTF-8 strings
since they mangle anything that isn't basic ASCII (not even ISO-8859-1).

This shows up in ssh, where non-English speakers have complained for
years about their server banners being rendered as gobbledygook, so a
few years ago I wrote the patch below that used RFC3454 stringprep to
try to filter hostile characters (e.g. terminal control sequences) while
leaving benign Unicode characters untouched when the user's LC_CTYPE
indicated they wanted UTF-8 output.

The patch never got committed because I never had enough confidence in
my knowledge of Unicode to be sure I'd picked the right characters,
but now that OpenBSD seems to have settled on UTF-8 for non-LC_CTYPE=C
locales, I think it is time to revisit it.

My questions:

1) Is the approach correct? (I think so)

2) Are the tables correct? I'd like someone who knows more about Unicode
   than me (which is not much) to weigh in.

3) Would this be better off in libutil or libc?

4) If #3, should it be done in strvis(3) itself?

Comments appreciated.

-d

diff --git a/lib/Makefile b/lib/Makefile
index ed505b4..05cf8a0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -34,7 +34,8 @@ SRCS= ${LIB_SRCS} \
        smult_curve25519_ref.c \
        kexc25519.c kexc25519c.c kexc25519s.c \
        roaming_dummy.c \
-       chacha.c poly1305.c cipher-chachapoly.c ssh-ed25519.c hmac.c umac.c
+       chacha.c poly1305.c cipher-chachapoly.c ssh-ed25519.c hmac.c umac.c \
+       utf8_stringprep.c
 
 .if (${SSH1:L} == "yes")
 SRCS+= cipher-3des1.c cipher-bf1.c
diff --git a/misc.h b/misc.h
index 53d469b..e476f1d 100644
--- a/misc.h
+++ b/misc.h
@@ -133,4 +133,7 @@ char        *read_passphrase(const char *, int);
 int     ask_permission(const char *, ...) __attribute__((format(printf, 1, 2)));
 int     read_keyfile_line(FILE *, const char *, char *, size_t, u_long *);
 
+/* utf8_stringprep.c */
+int utf8_stringprep(const char *, char *, size_t);
+
 #endif /* _MISC_H */
diff --git a/sshconnect2.c b/sshconnect2.c
index 2b525ac..04120e7 100644
--- a/sshconnect2.c
+++ b/sshconnect2.c
@@ -39,6 +39,8 @@
 #include <pwd.h>
 #include <unistd.h>
 #include <vis.h>
+#include <locale.h>
+#include <langinfo.h>
 
 #include "xmalloc.h"
 #include "ssh.h"
@@ -455,21 +457,51 @@ input_userauth_error(int type, u_int32_t seq, void *ctxt)
        return 0;
 }
 
+/* Report (and cache) whether the LC_CTYPE codeset is UTF-8 */
+static int
+utf8_ok(void)
+{
+       static int cached = -1; /* -1 until the first call probes */
+
+       if (cached != -1)
+               return cached;
+
+       /* Adopt LC_CTYPE from the environment before querying it */
+       setlocale(LC_CTYPE, "");
+       cached = strcmp(nl_langinfo(CODESET), "UTF-8") == 0;
+       return cached;
+}
+
 /* ARGSUSED */
 int
 input_userauth_banner(int type, u_int32_t seq, void *ctxt)
 {
        char *msg, *raw, *lang;
-       u_int len;
+       u_int done, len;
 
        debug3("input_userauth_banner");
+
        raw = packet_get_string(&len);
        lang = packet_get_string(NULL);
        if (len > 0 && options.log_level >= SYSLOG_LEVEL_INFO) {
                if (len > 65536)
                        len = 65536;
                msg = xmalloc(len * 4 + 1); /* max expansion from strnvis() */
-               strnvis(msg, raw, len * 4 + 1, VIS_SAFE|VIS_OCTAL|VIS_NOSLASH);
+               done = 0;
+               if (utf8_ok()) {
+                       if (utf8_stringprep(raw, msg, len * 4 + 1) == 0)
+                               done = 1;
+                       else
+                               debug2("%s: UTF8 stringprep failed", __func__);
+               }
+               /*
+                * Fallback to strnvis if UTF8 display not supported or
+                * conversion failed.
+                */
+               if (!done) {
+                       strnvis(msg, raw, len * 4 + 1,
+                           VIS_SAFE|VIS_OCTAL|VIS_NOSLASH);
+               }
                fprintf(stderr, "%s", msg);
                free(msg);
        }
diff --git a/stringprep-tables.c b/stringprep-tables.c
new file mode 100644
index 0000000..c02facb
--- /dev/null
+++ b/stringprep-tables.c
@@ -0,0 +1,1321 @@
+/* Public domain.  */
+
+/* $OpenBSD$ */
+
+/*
+ * Tables for RFC3454 stringprep algorithm, updated with a table of allocated
+ * characters generated from Unicode.6.2's UnicodeData.txt
+ *
+ * Intended to be included directly from utf8_stringprep.c
+ */
+
+/* Unassigned code points in Unicode 6.2 (inclusive ranges, ascending) */
+static const struct u32_range unassigned[] = {
+       { 0x0378, 0x0379 },
+       { 0x037F, 0x0383 },
+       { 0x038B, 0x038B },
+       { 0x038D, 0x038D },
+       { 0x03A2, 0x03A2 },
+       { 0x0528, 0x0530 },
+       { 0x0557, 0x0558 },
+       { 0x0560, 0x0560 },
+       { 0x0588, 0x0588 },
+       { 0x058B, 0x058E },
+       { 0x0590, 0x0590 },
+       { 0x05C8, 0x05CF },
+       { 0x05EB, 0x05EF },
+       { 0x05F5, 0x05FF },
+       { 0x0605, 0x0605 },
+       { 0x061C, 0x061D },
+       { 0x070E, 0x070E },
+       { 0x074B, 0x074C },
+       { 0x07B2, 0x07BF },
+       { 0x07FB, 0x07FF },
+       { 0x082E, 0x082F },
+       { 0x083F, 0x083F },
+       { 0x085C, 0x085D },
+       { 0x085F, 0x089F },
+       { 0x08A1, 0x08A1 },
+       { 0x08AD, 0x08E3 },
+       { 0x08FF, 0x08FF },
+       { 0x0978, 0x0978 },
+       { 0x0980, 0x0980 },
+       { 0x0984, 0x0984 },
+       { 0x098D, 0x098E },
+       { 0x0991, 0x0992 },
+       { 0x09A9, 0x09A9 },
+       { 0x09B1, 0x09B1 },
+       { 0x09B3, 0x09B5 },
+       { 0x09BA, 0x09BB },
+       { 0x09C5, 0x09C6 },
+       { 0x09C9, 0x09CA },
+       { 0x09CF, 0x09D6 },
+       { 0x09D8, 0x09DB },
+       { 0x09DE, 0x09DE },
+       { 0x09E4, 0x09E5 },
+       { 0x09FC, 0x0A00 },
+       { 0x0A04, 0x0A04 },
+       { 0x0A0B, 0x0A0E },
+       { 0x0A11, 0x0A12 },
+       { 0x0A29, 0x0A29 },
+       { 0x0A31, 0x0A31 },
+       { 0x0A34, 0x0A34 },
+       { 0x0A37, 0x0A37 },
+       { 0x0A3A, 0x0A3B },
+       { 0x0A3D, 0x0A3D },
+       { 0x0A43, 0x0A46 },
+       { 0x0A49, 0x0A4A },
+       { 0x0A4E, 0x0A50 },
+       { 0x0A52, 0x0A58 },
+       { 0x0A5D, 0x0A5D },
+       { 0x0A5F, 0x0A65 },
+       { 0x0A76, 0x0A80 },
+       { 0x0A84, 0x0A84 },
+       { 0x0A8E, 0x0A8E },
+       { 0x0A92, 0x0A92 },
+       { 0x0AA9, 0x0AA9 },
+       { 0x0AB1, 0x0AB1 },
+       { 0x0AB4, 0x0AB4 },
+       { 0x0ABA, 0x0ABB },
+       { 0x0AC6, 0x0AC6 },
+       { 0x0ACA, 0x0ACA },
+       { 0x0ACE, 0x0ACF },
+       { 0x0AD1, 0x0ADF },
+       { 0x0AE4, 0x0AE5 },
+       { 0x0AF2, 0x0B00 },
+       { 0x0B04, 0x0B04 },
+       { 0x0B0D, 0x0B0E },
+       { 0x0B11, 0x0B12 },
+       { 0x0B29, 0x0B29 },
+       { 0x0B31, 0x0B31 },
+       { 0x0B34, 0x0B34 },
+       { 0x0B3A, 0x0B3B },
+       { 0x0B45, 0x0B46 },
+       { 0x0B49, 0x0B4A },
+       { 0x0B4E, 0x0B55 },
+       { 0x0B58, 0x0B5B },
+       { 0x0B5E, 0x0B5E },
+       { 0x0B64, 0x0B65 },
+       { 0x0B78, 0x0B81 },
+       { 0x0B84, 0x0B84 },
+       { 0x0B8B, 0x0B8D },
+       { 0x0B91, 0x0B91 },
+       { 0x0B96, 0x0B98 },
+       { 0x0B9B, 0x0B9B },
+       { 0x0B9D, 0x0B9D },
+       { 0x0BA0, 0x0BA2 },
+       { 0x0BA5, 0x0BA7 },
+       { 0x0BAB, 0x0BAD },
+       { 0x0BBA, 0x0BBD },
+       { 0x0BC3, 0x0BC5 },
+       { 0x0BC9, 0x0BC9 },
+       { 0x0BCE, 0x0BCF },
+       { 0x0BD1, 0x0BD6 },
+       { 0x0BD8, 0x0BE5 },
+       { 0x0BFB, 0x0C00 },
+       { 0x0C04, 0x0C04 },
+       { 0x0C0D, 0x0C0D },
+       { 0x0C11, 0x0C11 },
+       { 0x0C29, 0x0C29 },
+       { 0x0C34, 0x0C34 },
+       { 0x0C3A, 0x0C3C },
+       { 0x0C45, 0x0C45 },
+       { 0x0C49, 0x0C49 },
+       { 0x0C4E, 0x0C54 },
+       { 0x0C57, 0x0C57 },
+       { 0x0C5A, 0x0C5F },
+       { 0x0C64, 0x0C65 },
+       { 0x0C70, 0x0C77 },
+       { 0x0C80, 0x0C81 },
+       { 0x0C84, 0x0C84 },
+       { 0x0C8D, 0x0C8D },
+       { 0x0C91, 0x0C91 },
+       { 0x0CA9, 0x0CA9 },
+       { 0x0CB4, 0x0CB4 },
+       { 0x0CBA, 0x0CBB },
+       { 0x0CC5, 0x0CC5 },
+       { 0x0CC9, 0x0CC9 },
+       { 0x0CCE, 0x0CD4 },
+       { 0x0CD7, 0x0CDD },
+       { 0x0CDF, 0x0CDF },
+       { 0x0CE4, 0x0CE5 },
+       { 0x0CF0, 0x0CF0 },
+       { 0x0CF3, 0x0D01 },
+       { 0x0D04, 0x0D04 },
+       { 0x0D0D, 0x0D0D },
+       { 0x0D11, 0x0D11 },
+       { 0x0D3B, 0x0D3C },
+       { 0x0D45, 0x0D45 },
+       { 0x0D49, 0x0D49 },
+       { 0x0D4F, 0x0D56 },
+       { 0x0D58, 0x0D5F },
+       { 0x0D64, 0x0D65 },
+       { 0x0D76, 0x0D78 },
+       { 0x0D80, 0x0D81 },
+       { 0x0D84, 0x0D84 },
+       { 0x0D97, 0x0D99 },
+       { 0x0DB2, 0x0DB2 },
+       { 0x0DBC, 0x0DBC },
+       { 0x0DBE, 0x0DBF },
+       { 0x0DC7, 0x0DC9 },
+       { 0x0DCB, 0x0DCE },
+       { 0x0DD5, 0x0DD5 },
+       { 0x0DD7, 0x0DD7 },
+       { 0x0DE0, 0x0DF1 },
+       { 0x0DF5, 0x0E00 },
+       { 0x0E3B, 0x0E3E },
+       { 0x0E5C, 0x0E80 },
+       { 0x0E83, 0x0E83 },
+       { 0x0E85, 0x0E86 },
+       { 0x0E89, 0x0E89 },
+       { 0x0E8B, 0x0E8C },
+       { 0x0E8E, 0x0E93 },
+       { 0x0E98, 0x0E98 },
+       { 0x0EA0, 0x0EA0 },
+       { 0x0EA4, 0x0EA4 },
+       { 0x0EA6, 0x0EA6 },
+       { 0x0EA8, 0x0EA9 },
+       { 0x0EAC, 0x0EAC },
+       { 0x0EBA, 0x0EBA },
+       { 0x0EBE, 0x0EBF },
+       { 0x0EC5, 0x0EC5 },
+       { 0x0EC7, 0x0EC7 },
+       { 0x0ECE, 0x0ECF },
+       { 0x0EDA, 0x0EDB },
+       { 0x0EE0, 0x0EFF },
+       { 0x0F48, 0x0F48 },
+       { 0x0F6D, 0x0F70 },
+       { 0x0F98, 0x0F98 },
+       { 0x0FBD, 0x0FBD },
+       { 0x0FCD, 0x0FCD },
+       { 0x0FDB, 0x0FFF },
+       { 0x10C6, 0x10C6 },
+       { 0x10C8, 0x10CC },
+       { 0x10CE, 0x10CF },
+       { 0x1249, 0x1249 },
+       { 0x124E, 0x124F },
+       { 0x1257, 0x1257 },
+       { 0x1259, 0x1259 },
+       { 0x125E, 0x125F },
+       { 0x1289, 0x1289 },
+       { 0x128E, 0x128F },
+       { 0x12B1, 0x12B1 },
+       { 0x12B6, 0x12B7 },
+       { 0x12BF, 0x12BF },
+       { 0x12C1, 0x12C1 },
+       { 0x12C6, 0x12C7 },
+       { 0x12D7, 0x12D7 },
+       { 0x1311, 0x1311 },
+       { 0x1316, 0x1317 },
+       { 0x135B, 0x135C },
+       { 0x137D, 0x137F },
+       { 0x139A, 0x139F },
+       { 0x13F5, 0x13FF },
+       { 0x169D, 0x169F },
+       { 0x16F1, 0x16FF },
+       { 0x170D, 0x170D },
+       { 0x1715, 0x171F },
+       { 0x1737, 0x173F },
+       { 0x1754, 0x175F },
+       { 0x176D, 0x176D },
+       { 0x1771, 0x1771 },
+       { 0x1774, 0x177F },
+       { 0x17DE, 0x17DF },
+       { 0x17EA, 0x17EF },
+       { 0x17FA, 0x17FF },
+       { 0x180F, 0x180F },
+       { 0x181A, 0x181F },
+       { 0x1878, 0x187F },
+       { 0x18AB, 0x18AF },
+       { 0x18F6, 0x18FF },
+       { 0x191D, 0x191F },
+       { 0x192C, 0x192F },
+       { 0x193C, 0x193F },
+       { 0x1941, 0x1943 },
+       { 0x196E, 0x196F },
+       { 0x1975, 0x197F },
+       { 0x19AC, 0x19AF },
+       { 0x19CA, 0x19CF },
+       { 0x19DB, 0x19DD },
+       { 0x1A1C, 0x1A1D },
+       { 0x1A5F, 0x1A5F },
+       { 0x1A7D, 0x1A7E },
+       { 0x1A8A, 0x1A8F },
+       { 0x1A9A, 0x1A9F },
+       { 0x1AAE, 0x1AFF },
+       { 0x1B4C, 0x1B4F },
+       { 0x1B7D, 0x1B7F },
+       { 0x1BF4, 0x1BFB },
+       { 0x1C38, 0x1C3A },
+       { 0x1C4A, 0x1C4C },
+       { 0x1C80, 0x1CBF },
+       { 0x1CC8, 0x1CCF },
+       { 0x1CF7, 0x1CFF },
+       { 0x1DE7, 0x1DFB },
+       { 0x1F16, 0x1F17 },
+       { 0x1F1E, 0x1F1F },
+       { 0x1F46, 0x1F47 },
+       { 0x1F4E, 0x1F4F },
+       { 0x1F58, 0x1F58 },
+       { 0x1F5A, 0x1F5A },
+       { 0x1F5C, 0x1F5C },
+       { 0x1F5E, 0x1F5E },
+       { 0x1F7E, 0x1F7F },
+       { 0x1FB5, 0x1FB5 },
+       { 0x1FC5, 0x1FC5 },
+       { 0x1FD4, 0x1FD5 },
+       { 0x1FDC, 0x1FDC },
+       { 0x1FF0, 0x1FF1 },
+       { 0x1FF5, 0x1FF5 },
+       { 0x1FFF, 0x1FFF },
+       { 0x2065, 0x2069 },
+       { 0x2072, 0x2073 },
+       { 0x208F, 0x208F },
+       { 0x209D, 0x209F },
+       { 0x20BB, 0x20CF },
+       { 0x20F1, 0x20FF },
+       { 0x218A, 0x218F },
+       { 0x23F4, 0x23FF },
+       { 0x2427, 0x243F },
+       { 0x244B, 0x245F },
+       { 0x2700, 0x2700 },
+       { 0x2B4D, 0x2B4F },
+       { 0x2B5A, 0x2BFF },
+       { 0x2C2F, 0x2C2F },
+       { 0x2C5F, 0x2C5F },
+       { 0x2CF4, 0x2CF8 },
+       { 0x2D26, 0x2D26 },
+       { 0x2D28, 0x2D2C },
+       { 0x2D2E, 0x2D2F },
+       { 0x2D68, 0x2D6E },
+       { 0x2D71, 0x2D7E },
+       { 0x2D97, 0x2D9F },
+       { 0x2DA7, 0x2DA7 },
+       { 0x2DAF, 0x2DAF },
+       { 0x2DB7, 0x2DB7 },
+       { 0x2DBF, 0x2DBF },
+       { 0x2DC7, 0x2DC7 },
+       { 0x2DCF, 0x2DCF },
+       { 0x2DD7, 0x2DD7 },
+       { 0x2DDF, 0x2DDF },
+       { 0x2E3C, 0x2E7F },
+       { 0x2E9A, 0x2E9A },
+       { 0x2EF4, 0x2EFF },
+       { 0x2FD6, 0x2FEF },
+       { 0x2FFC, 0x2FFF },
+       { 0x3040, 0x3040 },
+       { 0x3097, 0x3098 },
+       { 0x3100, 0x3104 },
+       { 0x312E, 0x3130 },
+       { 0x318F, 0x318F },
+       { 0x31BB, 0x31BF },
+       { 0x31E4, 0x31EF },
+       { 0x321F, 0x321F },
+       { 0x32FF, 0x32FF },
+       { 0x4DB6, 0x4DBF },
+       /* U+9FA6..U+9FCB dropped: assigned (CJK Unified Ideographs end at U+9FCC) */
+       { 0x9FCD, 0x9FFF },
+       { 0xA48D, 0xA48F },
+       { 0xA4C7, 0xA4CF },
+       { 0xA62C, 0xA63F },
+       { 0xA698, 0xA69E },
+       { 0xA6F8, 0xA6FF },
+       { 0xA78F, 0xA78F },
+       { 0xA794, 0xA79F },
+       { 0xA7AB, 0xA7F7 },
+       { 0xA82C, 0xA82F },
+       { 0xA83A, 0xA83F },
+       { 0xA878, 0xA87F },
+       { 0xA8C5, 0xA8CD },
+       { 0xA8DA, 0xA8DF },
+       { 0xA8FC, 0xA8FF },
+       { 0xA954, 0xA95E },
+       { 0xA97D, 0xA97F },
+       { 0xA9CE, 0xA9CE },
+       { 0xA9DA, 0xA9DD },
+       { 0xA9E0, 0xA9FF },
+       { 0xAA37, 0xAA3F },
+       { 0xAA4E, 0xAA4F },
+       { 0xAA5A, 0xAA5B },
+       { 0xAA7C, 0xAA7F },
+       { 0xAAC3, 0xAADA },
+       { 0xAAF7, 0xAB00 },
+       { 0xAB07, 0xAB08 },
+       { 0xAB0F, 0xAB10 },
+       { 0xAB17, 0xAB1F },
+       { 0xAB27, 0xAB27 },
+       { 0xAB2F, 0xABBF },
+       { 0xABEE, 0xABEF },
+       { 0xABFA, 0xABFF },
+       { 0xD7A4, 0xD7AF },
+       { 0xD7C7, 0xD7CA },
+       { 0xD7FC, 0xD7FF },
+       { 0xFA6E, 0xFA6F },
+       { 0xFADA, 0xFAFF },
+       { 0xFB07, 0xFB12 },
+       { 0xFB18, 0xFB1C },
+       { 0xFB37, 0xFB37 },
+       { 0xFB3D, 0xFB3D },
+       { 0xFB3F, 0xFB3F },
+       { 0xFB42, 0xFB42 },
+       { 0xFB45, 0xFB45 },
+       { 0xFBC2, 0xFBD2 },
+       { 0xFD40, 0xFD4F },
+       { 0xFD90, 0xFD91 },
+       { 0xFDC8, 0xFDCF },
+       { 0xFDFE, 0xFDFF },
+       { 0xFE1A, 0xFE1F },
+       { 0xFE27, 0xFE2F },
+       { 0xFE53, 0xFE53 },
+       { 0xFE67, 0xFE67 },
+       { 0xFE6C, 0xFE6F },
+       { 0xFE75, 0xFE75 },
+       { 0xFEFD, 0xFEFE },
+       { 0xFF00, 0xFF00 },
+       { 0xFFBF, 0xFFC1 },
+       { 0xFFC8, 0xFFC9 },
+       { 0xFFD0, 0xFFD1 },
+       { 0xFFD8, 0xFFD9 },
+       { 0xFFDD, 0xFFDF },
+       { 0xFFE7, 0xFFE7 },
+       { 0xFFEF, 0xFFF8 },
+       { 0x1000C, 0x1000C },
+       { 0x10027, 0x10027 },
+       { 0x1003B, 0x1003B },
+       { 0x1003E, 0x1003E },
+       { 0x1004E, 0x1004F },
+       { 0x1005E, 0x1007F },
+       { 0x100FB, 0x100FF },
+       { 0x10103, 0x10106 },
+       { 0x10134, 0x10136 },
+       { 0x1018B, 0x1018F },
+       { 0x1019C, 0x101CF },
+       { 0x101FE, 0x1027F },
+       { 0x1029D, 0x1029F },
+       { 0x102D1, 0x102FF },
+       { 0x1031F, 0x1031F },
+       { 0x10324, 0x1032F },
+       { 0x1034B, 0x1037F },
+       { 0x1039E, 0x1039E },
+       { 0x103C4, 0x103C7 },
+       { 0x103D6, 0x103FF },
+       { 0x1049E, 0x1049F },
+       { 0x104AA, 0x107FF },
+       { 0x10806, 0x10807 },
+       { 0x10809, 0x10809 },
+       { 0x10836, 0x10836 },
+       { 0x10839, 0x1083B },
+       { 0x1083D, 0x1083E },
+       { 0x10856, 0x10856 },
+       { 0x10860, 0x108FF },
+       { 0x1091C, 0x1091E },
+       { 0x1093A, 0x1093E },
+       { 0x10940, 0x1097F },
+       { 0x109B8, 0x109BD },
+       { 0x109C0, 0x109FF },
+       { 0x10A04, 0x10A04 },
+       { 0x10A07, 0x10A0B },
+       { 0x10A14, 0x10A14 },
+       { 0x10A18, 0x10A18 },
+       { 0x10A34, 0x10A37 },
+       { 0x10A3B, 0x10A3E },
+       { 0x10A48, 0x10A4F },
+       { 0x10A59, 0x10A5F },
+       { 0x10A80, 0x10AFF },
+       { 0x10B36, 0x10B38 },
+       { 0x10B56, 0x10B57 },
+       { 0x10B73, 0x10B77 },
+       { 0x10B80, 0x10BFF },
+       { 0x10C49, 0x10E5F },
+       { 0x10E7F, 0x10FFF },
+       { 0x1104E, 0x11051 },
+       { 0x11070, 0x1107F },
+       { 0x110C2, 0x110CF },
+       { 0x110E9, 0x110EF },
+       { 0x110FA, 0x110FF },
+       { 0x11135, 0x11135 },
+       { 0x11144, 0x1117F },
+       { 0x111C9, 0x111CF },
+       { 0x111DA, 0x1167F },
+       { 0x116B8, 0x116BF },
+       { 0x116CA, 0x11FFF },
+       { 0x1236F, 0x123FF },
+       { 0x12463, 0x1246F },
+       { 0x12474, 0x12FFF },
+       { 0x1342F, 0x167FF },
+       { 0x16A39, 0x16EFF },
+       { 0x16F45, 0x16F4F },
+       { 0x16F7F, 0x16F8E },
+       { 0x16FA0, 0x1AFFF },
+       { 0x1B002, 0x1CFFF },
+       { 0x1D0F6, 0x1D0FF },
+       { 0x1D127, 0x1D128 },
+       { 0x1D1DE, 0x1D1FF },
+       { 0x1D246, 0x1D2FF },
+       { 0x1D357, 0x1D35F },
+       { 0x1D372, 0x1D3FF },
+       { 0x1D455, 0x1D455 },
+       { 0x1D49D, 0x1D49D },
+       { 0x1D4A0, 0x1D4A1 },
+       { 0x1D4A3, 0x1D4A4 },
+       { 0x1D4A7, 0x1D4A8 },
+       { 0x1D4AD, 0x1D4AD },
+       { 0x1D4BA, 0x1D4BA },
+       { 0x1D4BC, 0x1D4BC },
+       { 0x1D4C4, 0x1D4C4 },
+       { 0x1D506, 0x1D506 },
+       { 0x1D50B, 0x1D50C },
+       { 0x1D515, 0x1D515 },
+       { 0x1D51D, 0x1D51D },
+       { 0x1D53A, 0x1D53A },
+       { 0x1D53F, 0x1D53F },
+       { 0x1D545, 0x1D545 },
+       { 0x1D547, 0x1D549 },
+       { 0x1D551, 0x1D551 },
+       { 0x1D6A6, 0x1D6A7 },
+       { 0x1D7CC, 0x1D7CD },
+       { 0x1D800, 0x1EDFF },
+       { 0x1EE04, 0x1EE04 },
+       { 0x1EE20, 0x1EE20 },
+       { 0x1EE23, 0x1EE23 },
+       { 0x1EE25, 0x1EE26 },
+       { 0x1EE28, 0x1EE28 },
+       { 0x1EE33, 0x1EE33 },
+       { 0x1EE38, 0x1EE38 },
+       { 0x1EE3A, 0x1EE3A },
+       { 0x1EE3C, 0x1EE41 },
+       { 0x1EE43, 0x1EE46 },
+       { 0x1EE48, 0x1EE48 },
+       { 0x1EE4A, 0x1EE4A },
+       { 0x1EE4C, 0x1EE4C },
+       { 0x1EE50, 0x1EE50 },
+       { 0x1EE53, 0x1EE53 },
+       { 0x1EE55, 0x1EE56 },
+       { 0x1EE58, 0x1EE58 },
+       { 0x1EE5A, 0x1EE5A },
+       { 0x1EE5C, 0x1EE5C },
+       { 0x1EE5E, 0x1EE5E },
+       { 0x1EE60, 0x1EE60 },
+       { 0x1EE63, 0x1EE63 },
+       { 0x1EE65, 0x1EE66 },
+       { 0x1EE6B, 0x1EE6B },
+       { 0x1EE73, 0x1EE73 },
+       { 0x1EE78, 0x1EE78 },
+       { 0x1EE7D, 0x1EE7D },
+       { 0x1EE7F, 0x1EE7F },
+       { 0x1EE8A, 0x1EE8A },
+       { 0x1EE9C, 0x1EEA0 },
+       { 0x1EEA4, 0x1EEA4 },
+       { 0x1EEAA, 0x1EEAA },
+       { 0x1EEBC, 0x1EEEF },
+       { 0x1EEF2, 0x1EFFF },
+       { 0x1F02C, 0x1F02F },
+       { 0x1F094, 0x1F09F },
+       { 0x1F0AF, 0x1F0B0 },
+       { 0x1F0BF, 0x1F0C0 },
+       { 0x1F0D0, 0x1F0D0 },
+       { 0x1F0E0, 0x1F0FF },
+       { 0x1F10B, 0x1F10F },
+       { 0x1F12F, 0x1F12F },
+       { 0x1F16C, 0x1F16F },
+       { 0x1F19B, 0x1F1E5 },
+       { 0x1F203, 0x1F20F },
+       { 0x1F23B, 0x1F23F },
+       { 0x1F249, 0x1F24F },
+       { 0x1F252, 0x1F2FF },
+       { 0x1F321, 0x1F32F },
+       { 0x1F336, 0x1F336 },
+       { 0x1F37D, 0x1F37F },
+       { 0x1F394, 0x1F39F },
+       { 0x1F3C5, 0x1F3C5 },
+       { 0x1F3CB, 0x1F3DF },
+       { 0x1F3F1, 0x1F3FF },
+       { 0x1F43F, 0x1F43F },
+       { 0x1F441, 0x1F441 },
+       { 0x1F4F8, 0x1F4F8 },
+       { 0x1F4FD, 0x1F4FF },
+       { 0x1F53E, 0x1F53F },
+       { 0x1F544, 0x1F54F },
+       { 0x1F568, 0x1F5FA },
+       { 0x1F641, 0x1F644 },
+       { 0x1F650, 0x1F67F },
+       { 0x1F6C6, 0x1F6FF },
+       { 0x1F774, 0x1FFFD },
+       { 0x2A6D7, 0x2A6FF },
+       /* U+2A701..U+2B733 dropped: assigned (CJK Extension C, U+2A700..U+2B734) */
+       { 0x2B735, 0x2B73F },
+       /* U+2B741..U+2B81C dropped: assigned (CJK Extension D, U+2B740..U+2B81D) */
+       { 0x2B81E, 0x2F7FF },
+       { 0x2FA1E, 0x2FFFD },
+       { 0x30000, 0x3FFFD },
+       { 0x40000, 0x4FFFD },
+       { 0x50000, 0x5FFFD },
+       { 0x60000, 0x6FFFD },
+       { 0x70000, 0x7FFFD },
+       { 0x80000, 0x8FFFD },
+       { 0x90000, 0x9FFFD },
+       { 0xA0000, 0xAFFFD },
+       { 0xB0000, 0xBFFFD },
+       { 0xC0000, 0xCFFFD },
+       { 0xD0000, 0xDFFFD },
+       { 0xE0000, 0xE0000 },
+       { 0xE0002, 0xE001F },
+       { 0xE0080, 0xE00FF },
+       { 0xE01F0, 0xEFFFD },
+};
+
+/* RFC3454 Table B.1: code points "commonly mapped to nothing" */
+static const struct u32_range map_to_nothing[] = {
+       { 0x00AD, 0x00AD }, /* SOFT HYPHEN */
+       { 0x034F, 0x034F }, /* COMBINING GRAPHEME JOINER */
+       { 0x1806, 0x1806 }, /* MONGOLIAN TODO SOFT HYPHEN */
+       { 0x180B, 0x180D }, /* MONGOLIAN FREE VARIATION SELECTOR ONE..THREE */
+       { 0x200B, 0x200D }, /* ZERO WIDTH SPACE, NON-JOINER, JOINER */
+       { 0x2060, 0x2060 }, /* WORD JOINER */
+       { 0xFE00, 0xFE0F }, /* VARIATION SELECTOR-1..16 */
+       { 0xFEFF, 0xFEFF }, /* ZERO WIDTH NO-BREAK SPACE (BOM) */
+};
+
+/* Local: allow tab (0x09), LF (0x0a) and CR (0x0d) */
+static const struct u32_range whitelist[] = {
+       { 0x09, 0x09 },
+       { 0x0a, 0x0a },
+       { 0x0d, 0x0d },
+};
+
+/* RFC3454 appendix C prohibited-output tables C.2.1 through C.9 */
+static const struct u32_range prohibited[] = {
+       /* C.2.1 ASCII control characters */
+       { 0x0000, 0x001F }, /* NOTE(review): spans tab/LF/CR named by whitelist[] */
+       { 0x007F, 0x007F },
+       /* C.2.2 Non-ASCII control characters */
+       { 0x0080, 0x009F },
+       { 0x06DD, 0x06DD },
+       { 0x070F, 0x070F },
+       { 0x180E, 0x180E },
+       { 0x200C, 0x200C }, /* also present in map_to_nothing[] */
+       { 0x200D, 0x200D }, /* also present in map_to_nothing[] */
+       { 0x2028, 0x2028 },
+       { 0x2029, 0x2029 },
+       { 0x2060, 0x2060 }, /* also present in map_to_nothing[] */
+       { 0x2061, 0x2061 },
+       { 0x2062, 0x2062 },
+       { 0x2063, 0x2063 },
+       { 0x206A, 0x206F }, /* repeated one by one under C.8 below */
+       { 0xFEFF, 0xFEFF }, /* also present in map_to_nothing[] */
+       { 0xFFF9, 0xFFFC }, /* repeated one by one under C.6 below */
+       { 0x1D173, 0x1D17A },
+       /* C.3 Private use */
+       { 0xE000, 0xF8FF },
+       { 0xF0000, 0xFFFFD },
+       { 0x100000, 0x10FFFD },
+       /* C.4 Non-character code points */
+       { 0xFDD0, 0xFDEF },
+       { 0xFFFE, 0xFFFF },
+       { 0x1FFFE, 0x1FFFF },
+       { 0x2FFFE, 0x2FFFF },
+       { 0x3FFFE, 0x3FFFF },
+       { 0x4FFFE, 0x4FFFF },
+       { 0x5FFFE, 0x5FFFF },
+       { 0x6FFFE, 0x6FFFF },
+       { 0x7FFFE, 0x7FFFF },
+       { 0x8FFFE, 0x8FFFF },
+       { 0x9FFFE, 0x9FFFF },
+       { 0xAFFFE, 0xAFFFF },
+       { 0xBFFFE, 0xBFFFF },
+       { 0xCFFFE, 0xCFFFF },
+       { 0xDFFFE, 0xDFFFF },
+       { 0xEFFFE, 0xEFFFF },
+       { 0xFFFFE, 0xFFFFF },
+       { 0x10FFFE, 0x10FFFF },
+       /* C.5 Surrogate codes */
+       { 0xD800, 0xDFFF },
+       /* C.6 Inappropriate for plain text */
+       { 0xFFF9, 0xFFF9 },
+       { 0xFFFA, 0xFFFA },
+       { 0xFFFB, 0xFFFB },
+       { 0xFFFC, 0xFFFC },
+       { 0xFFFD, 0xFFFD },
+       /* C.7 Inappropriate for canonical representation */
+       { 0x2FF0, 0x2FFB },
+       /* C.8 Change display properties or are deprecated */
+       { 0x0340, 0x0340 },
+       { 0x0341, 0x0341 },
+       { 0x200E, 0x200E },
+       { 0x200F, 0x200F },
+       { 0x202A, 0x202A },
+       { 0x202B, 0x202B },
+       { 0x202C, 0x202C },
+       { 0x202D, 0x202D },
+       { 0x202E, 0x202E },
+       { 0x206A, 0x206A },
+       { 0x206B, 0x206B },
+       { 0x206C, 0x206C },
+       { 0x206D, 0x206D },
+       { 0x206E, 0x206E },
+       { 0x206F, 0x206F },
+       /* C.9 Tagging characters */
+       { 0xE0001, 0xE0001 },
+       { 0xE0020, 0xE007F },
+};
+
+/* Public domain.  */
+
+/* $OpenBSD$ */
+
+/*
+ * Tables for RFC3454 stringprep algorithm, updated with a table of allocated
+ * characters generated from Unicode.6.2's UnicodeData.txt
+ *
+ * Intended to be included directly from utf8_stringprep.c
+ */
+
+/* Unassigned characters in Unicode 6.2 */
+static const struct u32_range unassigned[] = {
+       { 0x0378, 0x0379 },
+       { 0x037F, 0x0383 },
+       { 0x038B, 0x038B },
+       { 0x038D, 0x038D },
+       { 0x03A2, 0x03A2 },
+       { 0x0528, 0x0530 },
+       { 0x0557, 0x0558 },
+       { 0x0560, 0x0560 },
+       { 0x0588, 0x0588 },
+       { 0x058B, 0x058E },
+       { 0x0590, 0x0590 },
+       { 0x05C8, 0x05CF },
+       { 0x05EB, 0x05EF },
+       { 0x05F5, 0x05FF },
+       { 0x0605, 0x0605 },
+       { 0x061C, 0x061D },
+       { 0x070E, 0x070E },
+       { 0x074B, 0x074C },
+       { 0x07B2, 0x07BF },
+       { 0x07FB, 0x07FF },
+       { 0x082E, 0x082F },
+       { 0x083F, 0x083F },
+       { 0x085C, 0x085D },
+       { 0x085F, 0x089F },
+       { 0x08A1, 0x08A1 },
+       { 0x08AD, 0x08E3 },
+       { 0x08FF, 0x08FF },
+       { 0x0978, 0x0978 },
+       { 0x0980, 0x0980 },
+       { 0x0984, 0x0984 },
+       { 0x098D, 0x098E },
+       { 0x0991, 0x0992 },
+       { 0x09A9, 0x09A9 },
+       { 0x09B1, 0x09B1 },
+       { 0x09B3, 0x09B5 },
+       { 0x09BA, 0x09BB },
+       { 0x09C5, 0x09C6 },
+       { 0x09C9, 0x09CA },
+       { 0x09CF, 0x09D6 },
+       { 0x09D8, 0x09DB },
+       { 0x09DE, 0x09DE },
+       { 0x09E4, 0x09E5 },
+       { 0x09FC, 0x0A00 },
+       { 0x0A04, 0x0A04 },
+       { 0x0A0B, 0x0A0E },
+       { 0x0A11, 0x0A12 },
+       { 0x0A29, 0x0A29 },
+       { 0x0A31, 0x0A31 },
+       { 0x0A34, 0x0A34 },
+       { 0x0A37, 0x0A37 },
+       { 0x0A3A, 0x0A3B },
+       { 0x0A3D, 0x0A3D },
+       { 0x0A43, 0x0A46 },
+       { 0x0A49, 0x0A4A },
+       { 0x0A4E, 0x0A50 },
+       { 0x0A52, 0x0A58 },
+       { 0x0A5D, 0x0A5D },
+       { 0x0A5F, 0x0A65 },
+       { 0x0A76, 0x0A80 },
+       { 0x0A84, 0x0A84 },
+       { 0x0A8E, 0x0A8E },
+       { 0x0A92, 0x0A92 },
+       { 0x0AA9, 0x0AA9 },
+       { 0x0AB1, 0x0AB1 },
+       { 0x0AB4, 0x0AB4 },
+       { 0x0ABA, 0x0ABB },
+       { 0x0AC6, 0x0AC6 },
+       { 0x0ACA, 0x0ACA },
+       { 0x0ACE, 0x0ACF },
+       { 0x0AD1, 0x0ADF },
+       { 0x0AE4, 0x0AE5 },
+       { 0x0AF2, 0x0B00 },
+       { 0x0B04, 0x0B04 },
+       { 0x0B0D, 0x0B0E },
+       { 0x0B11, 0x0B12 },
+       { 0x0B29, 0x0B29 },
+       { 0x0B31, 0x0B31 },
+       { 0x0B34, 0x0B34 },
+       { 0x0B3A, 0x0B3B },
+       { 0x0B45, 0x0B46 },
+       { 0x0B49, 0x0B4A },
+       { 0x0B4E, 0x0B55 },
+       { 0x0B58, 0x0B5B },
+       { 0x0B5E, 0x0B5E },
+       { 0x0B64, 0x0B65 },
+       { 0x0B78, 0x0B81 },
+       { 0x0B84, 0x0B84 },
+       { 0x0B8B, 0x0B8D },
+       { 0x0B91, 0x0B91 },
+       { 0x0B96, 0x0B98 },
+       { 0x0B9B, 0x0B9B },
+       { 0x0B9D, 0x0B9D },
+       { 0x0BA0, 0x0BA2 },
+       { 0x0BA5, 0x0BA7 },
+       { 0x0BAB, 0x0BAD },
+       { 0x0BBA, 0x0BBD },
+       { 0x0BC3, 0x0BC5 },
+       { 0x0BC9, 0x0BC9 },
+       { 0x0BCE, 0x0BCF },
+       { 0x0BD1, 0x0BD6 },
+       { 0x0BD8, 0x0BE5 },
+       { 0x0BFB, 0x0C00 },
+       { 0x0C04, 0x0C04 },
+       { 0x0C0D, 0x0C0D },
+       { 0x0C11, 0x0C11 },
+       { 0x0C29, 0x0C29 },
+       { 0x0C34, 0x0C34 },
+       { 0x0C3A, 0x0C3C },
+       { 0x0C45, 0x0C45 },
+       { 0x0C49, 0x0C49 },
+       { 0x0C4E, 0x0C54 },
+       { 0x0C57, 0x0C57 },
+       { 0x0C5A, 0x0C5F },
+       { 0x0C64, 0x0C65 },
+       { 0x0C70, 0x0C77 },
+       { 0x0C80, 0x0C81 },
+       { 0x0C84, 0x0C84 },
+       { 0x0C8D, 0x0C8D },
+       { 0x0C91, 0x0C91 },
+       { 0x0CA9, 0x0CA9 },
+       { 0x0CB4, 0x0CB4 },
+       { 0x0CBA, 0x0CBB },
+       { 0x0CC5, 0x0CC5 },
+       { 0x0CC9, 0x0CC9 },
+       { 0x0CCE, 0x0CD4 },
+       { 0x0CD7, 0x0CDD },
+       { 0x0CDF, 0x0CDF },
+       { 0x0CE4, 0x0CE5 },
+       { 0x0CF0, 0x0CF0 },
+       { 0x0CF3, 0x0D01 },
+       { 0x0D04, 0x0D04 },
+       { 0x0D0D, 0x0D0D },
+       { 0x0D11, 0x0D11 },
+       { 0x0D3B, 0x0D3C },
+       { 0x0D45, 0x0D45 },
+       { 0x0D49, 0x0D49 },
+       { 0x0D4F, 0x0D56 },
+       { 0x0D58, 0x0D5F },
+       { 0x0D64, 0x0D65 },
+       { 0x0D76, 0x0D78 },
+       { 0x0D80, 0x0D81 },
+       { 0x0D84, 0x0D84 },
+       { 0x0D97, 0x0D99 },
+       { 0x0DB2, 0x0DB2 },
+       { 0x0DBC, 0x0DBC },
+       { 0x0DBE, 0x0DBF },
+       { 0x0DC7, 0x0DC9 },
+       { 0x0DCB, 0x0DCE },
+       { 0x0DD5, 0x0DD5 },
+       { 0x0DD7, 0x0DD7 },
+       { 0x0DE0, 0x0DF1 },
+       { 0x0DF5, 0x0E00 },
+       { 0x0E3B, 0x0E3E },
+       { 0x0E5C, 0x0E80 },
+       { 0x0E83, 0x0E83 },
+       { 0x0E85, 0x0E86 },
+       { 0x0E89, 0x0E89 },
+       { 0x0E8B, 0x0E8C },
+       { 0x0E8E, 0x0E93 },
+       { 0x0E98, 0x0E98 },
+       { 0x0EA0, 0x0EA0 },
+       { 0x0EA4, 0x0EA4 },
+       { 0x0EA6, 0x0EA6 },
+       { 0x0EA8, 0x0EA9 },
+       { 0x0EAC, 0x0EAC },
+       { 0x0EBA, 0x0EBA },
+       { 0x0EBE, 0x0EBF },
+       { 0x0EC5, 0x0EC5 },
+       { 0x0EC7, 0x0EC7 },
+       { 0x0ECE, 0x0ECF },
+       { 0x0EDA, 0x0EDB },
+       { 0x0EE0, 0x0EFF },
+       { 0x0F48, 0x0F48 },
+       { 0x0F6D, 0x0F70 },
+       { 0x0F98, 0x0F98 },
+       { 0x0FBD, 0x0FBD },
+       { 0x0FCD, 0x0FCD },
+       { 0x0FDB, 0x0FFF },
+       { 0x10C6, 0x10C6 },
+       { 0x10C8, 0x10CC },
+       { 0x10CE, 0x10CF },
+       { 0x1249, 0x1249 },
+       { 0x124E, 0x124F },
+       { 0x1257, 0x1257 },
+       { 0x1259, 0x1259 },
+       { 0x125E, 0x125F },
+       { 0x1289, 0x1289 },
+       { 0x128E, 0x128F },
+       { 0x12B1, 0x12B1 },
+       { 0x12B6, 0x12B7 },
+       { 0x12BF, 0x12BF },
+       { 0x12C1, 0x12C1 },
+       { 0x12C6, 0x12C7 },
+       { 0x12D7, 0x12D7 },
+       { 0x1311, 0x1311 },
+       { 0x1316, 0x1317 },
+       { 0x135B, 0x135C },
+       { 0x137D, 0x137F },
+       { 0x139A, 0x139F },
+       { 0x13F5, 0x13FF },
+       { 0x169D, 0x169F },
+       { 0x16F1, 0x16FF },
+       { 0x170D, 0x170D },
+       { 0x1715, 0x171F },
+       { 0x1737, 0x173F },
+       { 0x1754, 0x175F },
+       { 0x176D, 0x176D },
+       { 0x1771, 0x1771 },
+       { 0x1774, 0x177F },
+       { 0x17DE, 0x17DF },
+       { 0x17EA, 0x17EF },
+       { 0x17FA, 0x17FF },
+       { 0x180F, 0x180F },
+       { 0x181A, 0x181F },
+       { 0x1878, 0x187F },
+       { 0x18AB, 0x18AF },
+       { 0x18F6, 0x18FF },
+       { 0x191D, 0x191F },
+       { 0x192C, 0x192F },
+       { 0x193C, 0x193F },
+       { 0x1941, 0x1943 },
+       { 0x196E, 0x196F },
+       { 0x1975, 0x197F },
+       { 0x19AC, 0x19AF },
+       { 0x19CA, 0x19CF },
+       { 0x19DB, 0x19DD },
+       { 0x1A1C, 0x1A1D },
+       { 0x1A5F, 0x1A5F },
+       { 0x1A7D, 0x1A7E },
+       { 0x1A8A, 0x1A8F },
+       { 0x1A9A, 0x1A9F },
+       { 0x1AAE, 0x1AFF },
+       { 0x1B4C, 0x1B4F },
+       { 0x1B7D, 0x1B7F },
+       { 0x1BF4, 0x1BFB },
+       { 0x1C38, 0x1C3A },
+       { 0x1C4A, 0x1C4C },
+       { 0x1C80, 0x1CBF },
+       { 0x1CC8, 0x1CCF },
+       { 0x1CF7, 0x1CFF },
+       { 0x1DE7, 0x1DFB },
+       { 0x1F16, 0x1F17 },
+       { 0x1F1E, 0x1F1F },
+       { 0x1F46, 0x1F47 },
+       { 0x1F4E, 0x1F4F },
+       { 0x1F58, 0x1F58 },
+       { 0x1F5A, 0x1F5A },
+       { 0x1F5C, 0x1F5C },
+       { 0x1F5E, 0x1F5E },
+       { 0x1F7E, 0x1F7F },
+       { 0x1FB5, 0x1FB5 },
+       { 0x1FC5, 0x1FC5 },
+       { 0x1FD4, 0x1FD5 },
+       { 0x1FDC, 0x1FDC },
+       { 0x1FF0, 0x1FF1 },
+       { 0x1FF5, 0x1FF5 },
+       { 0x1FFF, 0x1FFF },
+       { 0x2065, 0x2069 },
+       { 0x2072, 0x2073 },
+       { 0x208F, 0x208F },
+       { 0x209D, 0x209F },
+       { 0x20BB, 0x20CF },
+       { 0x20F1, 0x20FF },
+       { 0x218A, 0x218F },
+       { 0x23F4, 0x23FF },
+       { 0x2427, 0x243F },
+       { 0x244B, 0x245F },
+       { 0x2700, 0x2700 },
+       { 0x2B4D, 0x2B4F },
+       { 0x2B5A, 0x2BFF },
+       { 0x2C2F, 0x2C2F },
+       { 0x2C5F, 0x2C5F },
+       { 0x2CF4, 0x2CF8 },
+       { 0x2D26, 0x2D26 },
+       { 0x2D28, 0x2D2C },
+       { 0x2D2E, 0x2D2F },
+       { 0x2D68, 0x2D6E },
+       { 0x2D71, 0x2D7E },
+       { 0x2D97, 0x2D9F },
+       { 0x2DA7, 0x2DA7 },
+       { 0x2DAF, 0x2DAF },
+       { 0x2DB7, 0x2DB7 },
+       { 0x2DBF, 0x2DBF },
+       { 0x2DC7, 0x2DC7 },
+       { 0x2DCF, 0x2DCF },
+       { 0x2DD7, 0x2DD7 },
+       { 0x2DDF, 0x2DDF },
+       { 0x2E3C, 0x2E7F },
+       { 0x2E9A, 0x2E9A },
+       { 0x2EF4, 0x2EFF },
+       { 0x2FD6, 0x2FEF },
+       { 0x2FFC, 0x2FFF },
+       { 0x3040, 0x3040 },
+       { 0x3097, 0x3098 },
+       { 0x3100, 0x3104 },
+       { 0x312E, 0x3130 },
+       { 0x318F, 0x318F },
+       { 0x31BB, 0x31BF },
+       { 0x31E4, 0x31EF },
+       { 0x321F, 0x321F },
+       { 0x32FF, 0x32FF },
+       { 0x4DB6, 0x4DBF },
+       { 0x9FA6, 0x9FCB },
+       { 0x9FCD, 0x9FFF },
+       { 0xA48D, 0xA48F },
+       { 0xA4C7, 0xA4CF },
+       { 0xA62C, 0xA63F },
+       { 0xA698, 0xA69E },
+       { 0xA6F8, 0xA6FF },
+       { 0xA78F, 0xA78F },
+       { 0xA794, 0xA79F },
+       { 0xA7AB, 0xA7F7 },
+       { 0xA82C, 0xA82F },
+       { 0xA83A, 0xA83F },
+       { 0xA878, 0xA87F },
+       { 0xA8C5, 0xA8CD },
+       { 0xA8DA, 0xA8DF },
+       { 0xA8FC, 0xA8FF },
+       { 0xA954, 0xA95E },
+       { 0xA97D, 0xA97F },
+       { 0xA9CE, 0xA9CE },
+       { 0xA9DA, 0xA9DD },
+       { 0xA9E0, 0xA9FF },
+       { 0xAA37, 0xAA3F },
+       { 0xAA4E, 0xAA4F },
+       { 0xAA5A, 0xAA5B },
+       { 0xAA7C, 0xAA7F },
+       { 0xAAC3, 0xAADA },
+       { 0xAAF7, 0xAB00 },
+       { 0xAB07, 0xAB08 },
+       { 0xAB0F, 0xAB10 },
+       { 0xAB17, 0xAB1F },
+       { 0xAB27, 0xAB27 },
+       { 0xAB2F, 0xABBF },
+       { 0xABEE, 0xABEF },
+       { 0xABFA, 0xABFF },
+       { 0xD7A4, 0xD7AF },
+       { 0xD7C7, 0xD7CA },
+       { 0xD7FC, 0xD7FF },
+       { 0xFA6E, 0xFA6F },
+       { 0xFADA, 0xFAFF },
+       { 0xFB07, 0xFB12 },
+       { 0xFB18, 0xFB1C },
+       { 0xFB37, 0xFB37 },
+       { 0xFB3D, 0xFB3D },
+       { 0xFB3F, 0xFB3F },
+       { 0xFB42, 0xFB42 },
+       { 0xFB45, 0xFB45 },
+       { 0xFBC2, 0xFBD2 },
+       { 0xFD40, 0xFD4F },
+       { 0xFD90, 0xFD91 },
+       { 0xFDC8, 0xFDCF },
+       { 0xFDFE, 0xFDFF },
+       { 0xFE1A, 0xFE1F },
+       { 0xFE27, 0xFE2F },
+       { 0xFE53, 0xFE53 },
+       { 0xFE67, 0xFE67 },
+       { 0xFE6C, 0xFE6F },
+       { 0xFE75, 0xFE75 },
+       { 0xFEFD, 0xFEFE },
+       { 0xFF00, 0xFF00 },
+       { 0xFFBF, 0xFFC1 },
+       { 0xFFC8, 0xFFC9 },
+       { 0xFFD0, 0xFFD1 },
+       { 0xFFD8, 0xFFD9 },
+       { 0xFFDD, 0xFFDF },
+       { 0xFFE7, 0xFFE7 },
+       { 0xFFEF, 0xFFF8 },
+       { 0x1000C, 0x1000C },
+       { 0x10027, 0x10027 },
+       { 0x1003B, 0x1003B },
+       { 0x1003E, 0x1003E },
+       { 0x1004E, 0x1004F },
+       { 0x1005E, 0x1007F },
+       { 0x100FB, 0x100FF },
+       { 0x10103, 0x10106 },
+       { 0x10134, 0x10136 },
+       { 0x1018B, 0x1018F },
+       { 0x1019C, 0x101CF },
+       { 0x101FE, 0x1027F },
+       { 0x1029D, 0x1029F },
+       { 0x102D1, 0x102FF },
+       { 0x1031F, 0x1031F },
+       { 0x10324, 0x1032F },
+       { 0x1034B, 0x1037F },
+       { 0x1039E, 0x1039E },
+       { 0x103C4, 0x103C7 },
+       { 0x103D6, 0x103FF },
+       { 0x1049E, 0x1049F },
+       { 0x104AA, 0x107FF },
+       { 0x10806, 0x10807 },
+       { 0x10809, 0x10809 },
+       { 0x10836, 0x10836 },
+       { 0x10839, 0x1083B },
+       { 0x1083D, 0x1083E },
+       { 0x10856, 0x10856 },
+       { 0x10860, 0x108FF },
+       { 0x1091C, 0x1091E },
+       { 0x1093A, 0x1093E },
+       { 0x10940, 0x1097F },
+       { 0x109B8, 0x109BD },
+       { 0x109C0, 0x109FF },
+       { 0x10A04, 0x10A04 },
+       { 0x10A07, 0x10A0B },
+       { 0x10A14, 0x10A14 },
+       { 0x10A18, 0x10A18 },
+       { 0x10A34, 0x10A37 },
+       { 0x10A3B, 0x10A3E },
+       { 0x10A48, 0x10A4F },
+       { 0x10A59, 0x10A5F },
+       { 0x10A80, 0x10AFF },
+       { 0x10B36, 0x10B38 },
+       { 0x10B56, 0x10B57 },
+       { 0x10B73, 0x10B77 },
+       { 0x10B80, 0x10BFF },
+       { 0x10C49, 0x10E5F },
+       { 0x10E7F, 0x10FFF },
+       { 0x1104E, 0x11051 },
+       { 0x11070, 0x1107F },
+       { 0x110C2, 0x110CF },
+       { 0x110E9, 0x110EF },
+       { 0x110FA, 0x110FF },
+       { 0x11135, 0x11135 },
+       { 0x11144, 0x1117F },
+       { 0x111C9, 0x111CF },
+       { 0x111DA, 0x1167F },
+       { 0x116B8, 0x116BF },
+       { 0x116CA, 0x11FFF },
+       { 0x1236F, 0x123FF },
+       { 0x12463, 0x1246F },
+       { 0x12474, 0x12FFF },
+       { 0x1342F, 0x167FF },
+       { 0x16A39, 0x16EFF },
+       { 0x16F45, 0x16F4F },
+       { 0x16F7F, 0x16F8E },
+       { 0x16FA0, 0x1AFFF },
+       { 0x1B002, 0x1CFFF },
+       { 0x1D0F6, 0x1D0FF },
+       { 0x1D127, 0x1D128 },
+       { 0x1D1DE, 0x1D1FF },
+       { 0x1D246, 0x1D2FF },
+       { 0x1D357, 0x1D35F },
+       { 0x1D372, 0x1D3FF },
+       { 0x1D455, 0x1D455 },
+       { 0x1D49D, 0x1D49D },
+       { 0x1D4A0, 0x1D4A1 },
+       { 0x1D4A3, 0x1D4A4 },
+       { 0x1D4A7, 0x1D4A8 },
+       { 0x1D4AD, 0x1D4AD },
+       { 0x1D4BA, 0x1D4BA },
+       { 0x1D4BC, 0x1D4BC },
+       { 0x1D4C4, 0x1D4C4 },
+       { 0x1D506, 0x1D506 },
+       { 0x1D50B, 0x1D50C },
+       { 0x1D515, 0x1D515 },
+       { 0x1D51D, 0x1D51D },
+       { 0x1D53A, 0x1D53A },
+       { 0x1D53F, 0x1D53F },
+       { 0x1D545, 0x1D545 },
+       { 0x1D547, 0x1D549 },
+       { 0x1D551, 0x1D551 },
+       { 0x1D6A6, 0x1D6A7 },
+       { 0x1D7CC, 0x1D7CD },
+       { 0x1D800, 0x1EDFF },
+       { 0x1EE04, 0x1EE04 },
+       { 0x1EE20, 0x1EE20 },
+       { 0x1EE23, 0x1EE23 },
+       { 0x1EE25, 0x1EE26 },
+       { 0x1EE28, 0x1EE28 },
+       { 0x1EE33, 0x1EE33 },
+       { 0x1EE38, 0x1EE38 },
+       { 0x1EE3A, 0x1EE3A },
+       { 0x1EE3C, 0x1EE41 },
+       { 0x1EE43, 0x1EE46 },
+       { 0x1EE48, 0x1EE48 },
+       { 0x1EE4A, 0x1EE4A },
+       { 0x1EE4C, 0x1EE4C },
+       { 0x1EE50, 0x1EE50 },
+       { 0x1EE53, 0x1EE53 },
+       { 0x1EE55, 0x1EE56 },
+       { 0x1EE58, 0x1EE58 },
+       { 0x1EE5A, 0x1EE5A },
+       { 0x1EE5C, 0x1EE5C },
+       { 0x1EE5E, 0x1EE5E },
+       { 0x1EE60, 0x1EE60 },
+       { 0x1EE63, 0x1EE63 },
+       { 0x1EE65, 0x1EE66 },
+       { 0x1EE6B, 0x1EE6B },
+       { 0x1EE73, 0x1EE73 },
+       { 0x1EE78, 0x1EE78 },
+       { 0x1EE7D, 0x1EE7D },
+       { 0x1EE7F, 0x1EE7F },
+       { 0x1EE8A, 0x1EE8A },
+       { 0x1EE9C, 0x1EEA0 },
+       { 0x1EEA4, 0x1EEA4 },
+       { 0x1EEAA, 0x1EEAA },
+       { 0x1EEBC, 0x1EEEF },
+       { 0x1EEF2, 0x1EFFF },
+       { 0x1F02C, 0x1F02F },
+       { 0x1F094, 0x1F09F },
+       { 0x1F0AF, 0x1F0B0 },
+       { 0x1F0BF, 0x1F0C0 },
+       { 0x1F0D0, 0x1F0D0 },
+       { 0x1F0E0, 0x1F0FF },
+       { 0x1F10B, 0x1F10F },
+       { 0x1F12F, 0x1F12F },
+       { 0x1F16C, 0x1F16F },
+       { 0x1F19B, 0x1F1E5 },
+       { 0x1F203, 0x1F20F },
+       { 0x1F23B, 0x1F23F },
+       { 0x1F249, 0x1F24F },
+       { 0x1F252, 0x1F2FF },
+       { 0x1F321, 0x1F32F },
+       { 0x1F336, 0x1F336 },
+       { 0x1F37D, 0x1F37F },
+       { 0x1F394, 0x1F39F },
+       { 0x1F3C5, 0x1F3C5 },
+       { 0x1F3CB, 0x1F3DF },
+       { 0x1F3F1, 0x1F3FF },
+       { 0x1F43F, 0x1F43F },
+       { 0x1F441, 0x1F441 },
+       { 0x1F4F8, 0x1F4F8 },
+       { 0x1F4FD, 0x1F4FF },
+       { 0x1F53E, 0x1F53F },
+       { 0x1F544, 0x1F54F },
+       { 0x1F568, 0x1F5FA },
+       { 0x1F641, 0x1F644 },
+       { 0x1F650, 0x1F67F },
+       { 0x1F6C6, 0x1F6FF },
+       { 0x1F774, 0x1FFFD },
+       { 0x2A6D7, 0x2A6FF },
+       { 0x2A701, 0x2B733 },
+       { 0x2B735, 0x2B73F },
+       { 0x2B741, 0x2B81C },
+       { 0x2B81E, 0x2F7FF },
+       { 0x2FA1E, 0x2FFFD },
+       { 0x30000, 0x3FFFD },
+       { 0x40000, 0x4FFFD },
+       { 0x50000, 0x5FFFD },
+       { 0x60000, 0x6FFFD },
+       { 0x70000, 0x7FFFD },
+       { 0x80000, 0x8FFFD },
+       { 0x90000, 0x9FFFD },
+       { 0xA0000, 0xAFFFD },
+       { 0xB0000, 0xBFFFD },
+       { 0xC0000, 0xCFFFD },
+       { 0xD0000, 0xDFFFD },
+       { 0xE0000, 0xE0000 },
+       { 0xE0002, 0xE001F },
+       { 0xE0080, 0xE00FF },
+       { 0xE01F0, 0xEFFFD },
+};
+
+/*
+ * RFC3454 Table B.1: code points that are "commonly mapped to nothing".
+ * Each entry is an inclusive lo..hi range. utf8_stringprep() silently drops
+ * any code point found here, and does so before consulting the prohibited
+ * table.
+ */
+static const struct u32_range map_to_nothing[] = {
+       { 0x00AD, 0x00AD },     /* SOFT HYPHEN */
+       { 0x034F, 0x034F },     /* COMBINING GRAPHEME JOINER */
+       { 0x1806, 0x1806 },     /* MONGOLIAN TODO SOFT HYPHEN */
+       { 0x180B, 0x180D },     /* MONGOLIAN FREE VARIATION SELECTOR ONE..THREE */
+       { 0x200B, 0x200D },     /* ZERO WIDTH SPACE, NON-JOINER, JOINER */
+       { 0x2060, 0x2060 },     /* WORD JOINER */
+       { 0xFE00, 0xFE0F },     /* VARIATION SELECTOR-1..16 */
+       { 0xFEFF, 0xFEFF },     /* ZERO WIDTH NO-BREAK SPACE */
+};
+
+/*
+ * Local: allow tab, CR and LF.
+ * These fall inside the C.2.1 control character range of the prohibited
+ * table; utf8_stringprep() exempts anything listed here from that check.
+ * Entries are inclusive lo..hi ranges, so a single code point must carry
+ * the same value in both fields (an entry with lo > hi matches nothing).
+ */
+static const struct u32_range whitelist[] = {
+       { 0x09, 0x09 },         /* horizontal tab */
+       { 0x0a, 0x0a },         /* line feed */
+       { 0x0d, 0x0d },         /* carriage return */
+};
+
+/*
+ * RFC3454 Tables in appendix C
+ *
+ * Code points that cause utf8_stringprep() to fail, unless they also appear
+ * in the local whitelist. Each entry is an inclusive lo..hi range and the
+ * table is searched linearly, so ordering and overlap do not affect the
+ * result. Several ranges are listed more than once because the RFC tables
+ * overlap (e.g. 0xFFF9-0xFFFC is in both C.2.2 and C.6, and 0x206A-0x206F
+ * is in both C.2.2 and C.8).
+ *
+ * 0x200C, 0x200D, 0x2060 and 0xFEFF are also in map_to_nothing, which is
+ * checked first, so those entries are never reached through
+ * utf8_stringprep(). No C.1 entries appear in this table.
+ */
+static const struct u32_range prohibited[] = {
+       /* C.2.1 ASCII control characters */
+       { 0x0000, 0x001F },
+       { 0x007F, 0x007F },
+       /* C.2.2 Non-ASCII control characters */
+       { 0x0080, 0x009F },
+       { 0x06DD, 0x06DD },
+       { 0x070F, 0x070F },
+       { 0x180E, 0x180E },
+       { 0x200C, 0x200C },
+       { 0x200D, 0x200D },
+       { 0x2028, 0x2028 },
+       { 0x2029, 0x2029 },
+       { 0x2060, 0x2060 },
+       { 0x2061, 0x2061 },
+       { 0x2062, 0x2062 },
+       { 0x2063, 0x2063 },
+       { 0x206A, 0x206F },
+       { 0xFEFF, 0xFEFF },
+       { 0xFFF9, 0xFFFC },
+       { 0x1D173, 0x1D17A },
+       /* C.3 Private use */
+       { 0xE000, 0xF8FF },
+       { 0xF0000, 0xFFFFD },
+       { 0x100000, 0x10FFFD },
+       /* C.4 Non-character code points */
+       { 0xFDD0, 0xFDEF },
+       { 0xFFFE, 0xFFFF },
+       { 0x1FFFE, 0x1FFFF },
+       { 0x2FFFE, 0x2FFFF },
+       { 0x3FFFE, 0x3FFFF },
+       { 0x4FFFE, 0x4FFFF },
+       { 0x5FFFE, 0x5FFFF },
+       { 0x6FFFE, 0x6FFFF },
+       { 0x7FFFE, 0x7FFFF },
+       { 0x8FFFE, 0x8FFFF },
+       { 0x9FFFE, 0x9FFFF },
+       { 0xAFFFE, 0xAFFFF },
+       { 0xBFFFE, 0xBFFFF },
+       { 0xCFFFE, 0xCFFFF },
+       { 0xDFFFE, 0xDFFFF },
+       { 0xEFFFE, 0xEFFFF },
+       { 0xFFFFE, 0xFFFFF },
+       { 0x10FFFE, 0x10FFFF },
+       /* C.5 Surrogate codes */
+       { 0xD800, 0xDFFF },
+       /* C.6 Inappropriate for plain text */
+       { 0xFFF9, 0xFFF9 },
+       { 0xFFFA, 0xFFFA },
+       { 0xFFFB, 0xFFFB },
+       { 0xFFFC, 0xFFFC },
+       { 0xFFFD, 0xFFFD },
+       /* C.7 Inappropriate for canonical representation */
+       { 0x2FF0, 0x2FFB },
+       /* C.8 Change display properties or are deprecated */
+       { 0x0340, 0x0340 },
+       { 0x0341, 0x0341 },
+       { 0x200E, 0x200E },
+       { 0x200F, 0x200F },
+       { 0x202A, 0x202A },
+       { 0x202B, 0x202B },
+       { 0x202C, 0x202C },
+       { 0x202D, 0x202D },
+       { 0x202E, 0x202E },
+       { 0x206A, 0x206A },
+       { 0x206B, 0x206B },
+       { 0x206C, 0x206C },
+       { 0x206D, 0x206D },
+       { 0x206E, 0x206E },
+       { 0x206F, 0x206F },
+       /* C.9 Tagging characters */
+       { 0xE0001, 0xE0001 },
+       { 0xE0020, 0xE007F },
+};
diff --git a/utf8_stringprep.c b/utf8_stringprep.c
new file mode 100644
index 0000000..dcbd304
--- /dev/null
+++ b/utf8_stringprep.c
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2013 Damien Miller <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * This is a simple RFC3454 stringprep profile to sanitise UTF-8 strings
+ * from untrusted sources.
+ *
+ * It is intended to be used prior to display of untrusted strings only.
+ * It should not be used for logging because of bi-di ambiguity. It
+ * should also not be used in any case where lack of normalisation may
+ * cause problems.
+ *
+ * This profile uses the prohibition and mapping tables from RFC3454
+ * (listed below) but the unassigned character table has been updated to
+ * Unicode 6.2. It uses a local whitelist of whitespace characters (\n,
+ * \r and \t). Unicode normalisation and bi-di testing are not used.
+ *
+ * XXX: implement bi-di handling (needed for logs)
+ * XXX: implement KC normalisation (needed for passing to libs/syscalls)
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+
+#include "misc.h"
+
+/* A span of code points; a single code point is stored with lo == hi. */
+struct u32_range {
+       u_int32_t lo;   /* first code point in the span (inclusive) */
+       u_int32_t hi;   /* last code point in the span (inclusive) */
+};
+
+#include "stringprep-tables.c"
+
+/*
+ * Linear search of a range table. 'tlen' is the size of 'table' in bytes
+ * (callers pass sizeof(table)), not the number of entries.
+ * Returns 1 if 'c' lies within any inclusive lo..hi entry, otherwise 0.
+ */
+static int
+code_in_table(u_int32_t c, const struct u32_range *table, size_t tlen)
+{
+       const char *limit = (const char *)table + tlen;
+       const struct u32_range *r = table;
+
+       while ((const char *)r < limit) {
+               if (r->lo <= c && c <= r->hi)
+                       return 1;
+               r++;
+       }
+       return 0;
+}
+
+/*
+ * Decode the next valid UCS character from a UTF-8 string, skipping past bad
+ * codes. Returns the decoded character or 0 for end-of-string and updates
+ * nextc to point to the start of the next character (if any).
+ * had_error may be NULL; otherwise it is set to 1 if an invalid code was
+ * encountered (it is never reset to 0 here).
+ *
+ * The following are treated as invalid and skipped:
+ *  - continuation bytes with no header, five/six byte headers, 0xfe and 0xff
+ *  - sequences cut short by a non-continuation byte or by the end of the
+ *    string (decoding resumes at the byte that cut the sequence short)
+ *  - overlong forms, i.e. values that would fit in a shorter encoding
+ *  - values above U+10FFFF
+ * Surrogate values (U+D800-U+DFFF) are not filtered here; they are returned
+ * to the caller like any other three byte code.
+ */
+static u_int32_t
+decode_utf8(const char *in, const char **nextc, int *had_error)
+{
+       size_t i = 0, k, ncont;
+       u_int32_t c, e, min;
+
+       while (in[i] != '\0') {
+               e = (unsigned char)in[i];
+               if ((e & 0x80) == 0) { /* 7 bit code */
+                       *nextc = in + i + 1;
+                       return e;
+               }
+
+               /*
+                * Work out how many continuation bytes follow the header
+                * and the smallest value that legitimately needs this
+                * length; anything below 'min' is an overlong encoding.
+                */
+               if ((e & 0xe0) == 0xc0) { /* 11 bit code point */
+                       ncont = 1;
+                       min = 0x80;
+                       c = e & 0x1f;
+               } else if ((e & 0xf0) == 0xe0) { /* 16 bit code point */
+                       ncont = 2;
+                       min = 0x800;
+                       c = e & 0x0f;
+               } else if ((e & 0xf8) == 0xf0) { /* 21 bit code point */
+                       ncont = 3;
+                       min = 0x10000;
+                       c = e & 0x07;
+               } else {
+                       /*
+                        * A stray continuation byte, a five or six byte
+                        * header, or 0xfe/0xff. Skip it; any continuation
+                        * bytes that follow land here too and are eaten
+                        * one at a time.
+                        */
+                       if (had_error != NULL)
+                               *had_error = 1;
+                       i++;
+                       continue;
+               }
+
+               /*
+                * Gather continuation bytes. The terminating NUL is not a
+                * continuation byte, so this never reads past the end of
+                * the string.
+                */
+               for (k = 1; k <= ncont; k++) {
+                       e = (unsigned char)in[i + k];
+                       if ((e & 0xc0) != 0x80)
+                               break;
+                       c = (c << 6) | (e & 0x3f);
+               }
+               /* Either past the whole sequence or at the offending byte */
+               i += k;
+
+               /* Truncated, overlong, or beyond what RFC3629 permits */
+               if (k <= ncont || c < min || c > 0x10FFFF) {
+                       if (had_error != NULL)
+                               *had_error = 1;
+                       continue;
+               }
+               *nextc = in + i;
+               return c;
+       }
+       *nextc = in + i;
+       return 0;
+}
+
+/*
+ * Attempt to encode a UCS character as a UTF-8 sequence of one to four
+ * bytes written to 's', which has room for 'slen' bytes. No terminating
+ * NUL is written.
+ * Returns the number of bytes written, or -1 on error (insufficient space
+ * or a code point above U+10FFFF). Nothing is written on error.
+ */
+static int
+encode_utf8(u_int32_t c, char *s, size_t slen)
+{
+       size_t i, need;
+       unsigned char lead;
+
+       if (c < 0x80) {
+               /*
+                * Must fail rather than report one byte "used" when there
+                * is no room, otherwise the caller's offset runs past the
+                * end of its buffer.
+                */
+               if (slen < 1)
+                       return -1;
+               s[0] = (char)c;
+               return 1;
+       } else if (c < 0x800) {
+               need = 2;
+               lead = 0xc0;
+       } else if (c < 0x10000) {
+               need = 3;
+               lead = 0xe0;
+       } else if (c <= 0x10FFFF) {
+               need = 4;
+               lead = 0xf0;
+       } else {
+               /* Invalid code point > U+10FFFF */
+               return -1;
+       }
+       if (need > slen)
+               return -1;
+       /*
+        * Emit six bits per byte, most significant group first. The range
+        * checks above guarantee the bits that land in the header byte fit
+        * beneath its length marker.
+        */
+       for (i = 0; i < need; i++) {
+               s[i] = (char)((i == 0 ? lead : 0x80) |
+                   ((c >> (need - i - 1) * 6) & 0x3f));
+       }
+       return (int)need;
+}
+
+
+/*
+ * Sanitise a UTF-8 string using the RFC3454 stringprep tables.
+ *
+ * For each code point decoded from 'in':
+ *  - code points in map_to_nothing are dropped;
+ *  - code points in the prohibited table cause failure, unless they are
+ *    also in the local whitelist;
+ *  - code points in the unassigned table are replaced with U+FFFD.
+ * Invalid UTF-8 sequences in the input are skipped without error, since no
+ * error flag is requested from decode_utf8().
+ *
+ * 'out' receives the NUL-terminated result; 'olen' is its total size
+ * including the terminator.
+ * Returns 0 on success or -1 on failure (prohibited code or insufficient
+ * length in the output string). On failure 'out' is not NUL-terminated and
+ * its contents must not be used.
+ *
+ * NOTE(review): replacing an unassigned code point with U+FFFD (three
+ * bytes) could make the output longer than the input if the unassigned
+ * table holds code points below U+0800 - confirm before sizing 'out' from
+ * strlen(in) + 1.
+ */
+int
+utf8_stringprep(const char *in, char *out, size_t olen)
+{
+       int r;
+       size_t o = 0;
+       u_int32_t c;
+
+       if (olen < 1)
+               return -1;
+
+       while ((c = decode_utf8(in, &in, NULL)) != 0) {
+               /* Mapping */
+               if (code_in_table(c, map_to_nothing, sizeof(map_to_nothing)))
+                       continue;
+
+               /* Prohibited output */
+               if (code_in_table(c, prohibited, sizeof(prohibited)) &&
+                   !code_in_table(c, whitelist, sizeof(whitelist)))
+                       return -1;
+
+               /* Map unassigned code points to U+FFFD */
+               if (code_in_table(c, unassigned, sizeof(unassigned)))
+                       c = 0xFFFD;
+
+               /*
+                * Only the terminator's slot is left: fail here rather than
+                * depend on the encoder to notice a zero-length buffer, so
+                * that 'o' can never reach 'olen'.
+                */
+               if (o + 1 >= olen)
+                       return -1;
+
+               /* Encode the character, keeping one byte back for the NUL */
+               r = encode_utf8(c, out + o, olen - o - 1);
+               if (r < 0)
+                       return -1;
+               o += r;
+       }
+       out[o] = '\0';
+       return 0;
+}
+
+/*
+ * Copyright (c) 2013 Damien Miller <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * This is a simple RFC3454 stringprep profile to sanitise UTF-8 strings
+ * from untrusted sources.
+ *
+ * It is intended to be used prior to display of untrusted strings only.
+ * It should not be used for logging because of bi-di ambiguity. It
+ * should also not be used in any case where lack of normalisation may
+ * cause problems.
+ *
+ * This profile uses the prohibition and mapping tables from RFC3454
+ * (listed below) but the unassigned character table has been updated to
+ * Unicode 6.2. It uses a local whitelist of whitespace characters (\n,
+ * \r and \t). Unicode normalisation and bi-di testing are not used.
+ *
+ * XXX: implement bi-di handling (needed for logs)
+ * XXX: implement KC normalisation (needed for passing to libs/syscalls)
+ */
+
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+
+#include "misc.h"
+
+/* A span of code points; a single code point is stored with lo == hi. */
+struct u32_range {
+       u_int32_t lo;   /* first code point in the span (inclusive) */
+       u_int32_t hi;   /* last code point in the span (inclusive) */
+};
+
+#include "stringprep-tables.c"
+
+/*
+ * Linear search of a range table. 'tlen' is the size of 'table' in bytes
+ * (callers pass sizeof(table)), not the number of entries.
+ * Returns 1 if 'c' lies within any inclusive lo..hi entry, otherwise 0.
+ */
+static int
+code_in_table(u_int32_t c, const struct u32_range *table, size_t tlen)
+{
+       const char *limit = (const char *)table + tlen;
+       const struct u32_range *r = table;
+
+       while ((const char *)r < limit) {
+               if (r->lo <= c && c <= r->hi)
+                       return 1;
+               r++;
+       }
+       return 0;
+}
+
+/*
+ * Decode the next valid UCS character from a UTF-8 string, skipping past bad
+ * codes. Returns the decoded character or 0 for end-of-string and updates
+ * nextc to point to the start of the next character (if any).
+ * had_error may be NULL; otherwise it is set to 1 if an invalid code was
+ * encountered (it is never reset to 0 here).
+ *
+ * The following are treated as invalid and skipped:
+ *  - continuation bytes with no header, five/six byte headers, 0xfe and 0xff
+ *  - sequences cut short by a non-continuation byte or by the end of the
+ *    string (decoding resumes at the byte that cut the sequence short)
+ *  - overlong forms, i.e. values that would fit in a shorter encoding
+ *  - values above U+10FFFF
+ * Surrogate values (U+D800-U+DFFF) are not filtered here; they are returned
+ * to the caller like any other three byte code.
+ */
+static u_int32_t
+decode_utf8(const char *in, const char **nextc, int *had_error)
+{
+       size_t i = 0, k, ncont;
+       u_int32_t c, e, min;
+
+       while (in[i] != '\0') {
+               e = (unsigned char)in[i];
+               if ((e & 0x80) == 0) { /* 7 bit code */
+                       *nextc = in + i + 1;
+                       return e;
+               }
+
+               /*
+                * Work out how many continuation bytes follow the header
+                * and the smallest value that legitimately needs this
+                * length; anything below 'min' is an overlong encoding.
+                */
+               if ((e & 0xe0) == 0xc0) { /* 11 bit code point */
+                       ncont = 1;
+                       min = 0x80;
+                       c = e & 0x1f;
+               } else if ((e & 0xf0) == 0xe0) { /* 16 bit code point */
+                       ncont = 2;
+                       min = 0x800;
+                       c = e & 0x0f;
+               } else if ((e & 0xf8) == 0xf0) { /* 21 bit code point */
+                       ncont = 3;
+                       min = 0x10000;
+                       c = e & 0x07;
+               } else {
+                       /*
+                        * A stray continuation byte, a five or six byte
+                        * header, or 0xfe/0xff. Skip it; any continuation
+                        * bytes that follow land here too and are eaten
+                        * one at a time.
+                        */
+                       if (had_error != NULL)
+                               *had_error = 1;
+                       i++;
+                       continue;
+               }
+
+               /*
+                * Gather continuation bytes. The terminating NUL is not a
+                * continuation byte, so this never reads past the end of
+                * the string.
+                */
+               for (k = 1; k <= ncont; k++) {
+                       e = (unsigned char)in[i + k];
+                       if ((e & 0xc0) != 0x80)
+                               break;
+                       c = (c << 6) | (e & 0x3f);
+               }
+               /* Either past the whole sequence or at the offending byte */
+               i += k;
+
+               /* Truncated, overlong, or beyond what RFC3629 permits */
+               if (k <= ncont || c < min || c > 0x10FFFF) {
+                       if (had_error != NULL)
+                               *had_error = 1;
+                       continue;
+               }
+               *nextc = in + i;
+               return c;
+       }
+       *nextc = in + i;
+       return 0;
+}
+
+/*
+ * Attempt to encode a UCS character as a UTF-8 sequence of one to four
+ * bytes written to 's', which has room for 'slen' bytes. No terminating
+ * NUL is written.
+ * Returns the number of bytes written, or -1 on error (insufficient space
+ * or a code point above U+10FFFF). Nothing is written on error.
+ */
+static int
+encode_utf8(u_int32_t c, char *s, size_t slen)
+{
+       size_t i, need;
+       unsigned char lead;
+
+       if (c < 0x80) {
+               /*
+                * Must fail rather than report one byte "used" when there
+                * is no room, otherwise the caller's offset runs past the
+                * end of its buffer.
+                */
+               if (slen < 1)
+                       return -1;
+               s[0] = (char)c;
+               return 1;
+       } else if (c < 0x800) {
+               need = 2;
+               lead = 0xc0;
+       } else if (c < 0x10000) {
+               need = 3;
+               lead = 0xe0;
+       } else if (c <= 0x10FFFF) {
+               need = 4;
+               lead = 0xf0;
+       } else {
+               /* Invalid code point > U+10FFFF */
+               return -1;
+       }
+       if (need > slen)
+               return -1;
+       /*
+        * Emit six bits per byte, most significant group first. The range
+        * checks above guarantee the bits that land in the header byte fit
+        * beneath its length marker.
+        */
+       for (i = 0; i < need; i++) {
+               s[i] = (char)((i == 0 ? lead : 0x80) |
+                   ((c >> (need - i - 1) * 6) & 0x3f));
+       }
+       return (int)need;
+}
+
+
+/*
+ * Sanitise a UTF-8 string using the RFC3454 stringprep tables.
+ *
+ * For each code point decoded from 'in':
+ *  - code points in map_to_nothing are dropped;
+ *  - code points in the prohibited table cause failure, unless they are
+ *    also in the local whitelist;
+ *  - code points in the unassigned table are replaced with U+FFFD.
+ * Invalid UTF-8 sequences in the input are skipped without error, since no
+ * error flag is requested from decode_utf8().
+ *
+ * 'out' receives the NUL-terminated result; 'olen' is its total size
+ * including the terminator.
+ * Returns 0 on success or -1 on failure (prohibited code or insufficient
+ * length in the output string). On failure 'out' is not NUL-terminated and
+ * its contents must not be used.
+ *
+ * NOTE(review): replacing an unassigned code point with U+FFFD (three
+ * bytes) could make the output longer than the input if the unassigned
+ * table holds code points below U+0800 - confirm before sizing 'out' from
+ * strlen(in) + 1.
+ */
+int
+utf8_stringprep(const char *in, char *out, size_t olen)
+{
+       int r;
+       size_t o = 0;
+       u_int32_t c;
+
+       if (olen < 1)
+               return -1;
+
+       while ((c = decode_utf8(in, &in, NULL)) != 0) {
+               /* Mapping */
+               if (code_in_table(c, map_to_nothing, sizeof(map_to_nothing)))
+                       continue;
+
+               /* Prohibited output */
+               if (code_in_table(c, prohibited, sizeof(prohibited)) &&
+                   !code_in_table(c, whitelist, sizeof(whitelist)))
+                       return -1;
+
+               /* Map unassigned code points to U+FFFD */
+               if (code_in_table(c, unassigned, sizeof(unassigned)))
+                       c = 0xFFFD;
+
+               /*
+                * Only the terminator's slot is left: fail here rather than
+                * depend on the encoder to notice a zero-length buffer, so
+                * that 'o' can never reach 'olen'.
+                */
+               if (o + 1 >= olen)
+                       return -1;
+
+               /* Encode the character, keeping one byte back for the NUL */
+               r = encode_utf8(c, out + o, olen - o - 1);
+               if (r < 0)
+                       return -1;
+               o += r;
+       }
+       out[o] = '\0';
+       return 0;
+}

Reply via email to