On 10/25/23 06:27, Pádraig Brady wrote:
But thinking about it more we probably should allow lower case for base32.

Yes, I'm thinking the same.

While thinking (:-) I couldn't resist improving performance a bit by installing the attached. This also fixes an unlikely bug when isxdigit behavior is oddball.
From f4a59d453ebfa6dcaf2dd1a89a64c1fa05af7a85 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 25 Oct 2023 08:45:15 -0700
Subject: [PATCH 1/3] build: update gnulib submodule to latest

---
 gnulib | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnulib b/gnulib
index b8083ec2e..e0ae1a7f3 160000
--- a/gnulib
+++ b/gnulib
@@ -1 +1 @@
-Subproject commit b8083ec2eeb22c20d299d75aae06e465cf9e4494
+Subproject commit e0ae1a7f324d6b9462735273bc5a2848c712f883
-- 
2.41.0

From dcc1514d9ac12a98bf3067c206ce8e756c604719 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 25 Oct 2023 14:43:32 -0700
Subject: [PATCH 2/3] basenc: tweak checks to use unsigned char

This tends to generate better code, at least on x86-64,
because callers are just as fast and callees can avoid a conversion.
* src/basenc.c: The following renamings also change the arg type
from char to unsigned char.  All uses changed.
(isubase): Rename from isbase.
(isubase64url): Rename from isbase64url.
(isubase32hex): Rename from isbase32hex.
(isubase16): Rename from isbase16.
(isuz85): Rename from isz85.
(isubase2): Rename from isbase2.

2023-10-24  Paul Eggert  <egg...@cs.ucla.edu>

* src/basenc.c (struct base16_decode_context):
Simplify by storing -1 for missing nibbles.  All uses changed.
---
 src/basenc.c | 46 +++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/basenc.c b/src/basenc.c
index b47bd013f..850dd4cd9 100644
--- a/src/basenc.c
+++ b/src/basenc.c
@@ -216,7 +216,7 @@ static_assert (DEC_BLOCKSIZE % 40 == 0); /* Complete encoded blocks are used. */
 # define base_decode_context base32_decode_context
 # define base_decode_ctx_init base32_decode_ctx_init
 # define base_decode_ctx base32_decode_ctx
-# define isbase isbase32
+# define isubase isubase32
 #elif BASE_TYPE == 64
 # define BASE_LENGTH BASE64_LENGTH
 # define REQUIRED_PADDING base64_required_padding
@@ -232,7 +232,7 @@ static_assert (DEC_BLOCKSIZE % 12 == 0); /* Complete encoded blocks are used. */
 # define base_decode_context base64_decode_context
 # define base_decode_ctx_init base64_decode_ctx_init
 # define base_decode_ctx base64_decode_ctx
-# define isbase isbase64
+# define isubase isubase64
 #elif BASE_TYPE == 42
 
 
@@ -247,7 +247,7 @@ static_assert (DEC_BLOCKSIZE % 12 == 0); /* complete encoded blocks for base64*/
 
 static int (*base_length) (int i);
 static int (*required_padding) (int i);
-static bool (*isbase) (char ch);
+static bool (*isubase) (unsigned char ch);
 static void (*base_encode) (char const *restrict in, idx_t inlen,
                             char *restrict out, idx_t outlen);
 
@@ -350,10 +350,10 @@ base64url_encode (char const *restrict in, idx_t inlen,
 }
 
 static bool
-isbase64url (char ch)
+isubase64url (unsigned char ch)
 {
   return (ch == '-' || ch == '_'
-          || (ch != '+' && ch != '/' && isbase64 (ch)));
+          || (ch != '+' && ch != '/' && isubase64 (ch)));
 }
 
 static void
@@ -463,7 +463,7 @@ static const char base32_hex_to_norm[32 + 9] = {
 
 
 inline static bool
-isbase32hex (char ch)
+isubase32hex (unsigned char ch)
 {
   return ('0' <= ch && ch <= '9') || ('A' <= ch && ch <= 'V');
 }
@@ -502,8 +502,8 @@ base32hex_decode_ctx_wrapper (struct base_decode_context *ctx,
   char *p = ctx->inbuf;
   while (i--)
     {
-      if (isbase32hex (*in))
-        *p = base32_hex_to_norm[ (int)*in - 0x30];
+      if (isubase32hex (*in))
+        *p = base32_hex_to_norm[*in - 0x30];
       else
         *p = *in;
       ++p;
@@ -518,9 +518,9 @@ base32hex_decode_ctx_wrapper (struct base_decode_context *ctx,
 }
 
 static bool
-isbase16 (char ch)
+isubase16 (unsigned char ch)
 {
-  return isxdigit (to_uchar (ch));
+  return isxdigit (ch);
 }
 
 static int
@@ -614,7 +614,7 @@ z85_length (int len)
 }
 
 static bool
-isz85 (char ch)
+isuz85 (unsigned char ch)
 {
   return c_isalnum (ch) || strchr (".-:+=^!/*?&<>()[]{}@%$#", ch) != nullptr;
 }
@@ -796,7 +796,7 @@ z85_decode_ctx (struct base_decode_context *ctx,
 
 
 inline static bool
-isbase2 (char ch)
+isubase2 (unsigned char ch)
 {
   return ch == '0' || ch == '1';
 }
@@ -875,7 +875,7 @@ base2lsbf_decode_ctx (struct base_decode_context *ctx,
           continue;
         }
 
-      if (!isbase2 (*in))
+      if (!isubase2 (*in))
         return false;
 
       bool bit = (*in == '1');
@@ -919,7 +919,7 @@ base2msbf_decode_ctx (struct base_decode_context *ctx,
           continue;
         }
 
-      if (!isbase2 (*in))
+      if (!isubase2 (*in))
         return false;
 
       bool bit = (*in == '1');
@@ -1065,7 +1065,7 @@ do_decode (FILE *in, char const *infile, FILE *out, bool ignore_garbage)
             {
               for (idx_t i = 0; n > 0 && i < n;)
                 {
-                  if (isbase (inbuf[sum + i]) || inbuf[sum + i] == '=')
+                  if (isubase (inbuf[sum + i]) || inbuf[sum + i] == '=')
                     i++;
                   else
                     memmove (inbuf + sum + i, inbuf + sum + i + 1, --n - i);
@@ -1193,7 +1193,7 @@ main (int argc, char **argv)
     case BASE64_OPTION:
       base_length = base64_length_wrapper;
       required_padding = base64_required_padding;
-      isbase = isbase64;
+      isubase = isubase64;
       base_encode = base64_encode;
       base_decode_ctx_init = base64_decode_ctx_init_wrapper;
       base_decode_ctx = base64_decode_ctx_wrapper;
@@ -1202,7 +1202,7 @@ main (int argc, char **argv)
     case BASE64URL_OPTION:
       base_length = base64_length_wrapper;
       required_padding = base64_required_padding;
-      isbase = isbase64url;
+      isubase = isubase64url;
       base_encode = base64url_encode;
       base_decode_ctx_init = base64url_decode_ctx_init_wrapper;
       base_decode_ctx = base64url_decode_ctx_wrapper;
@@ -1211,7 +1211,7 @@ main (int argc, char **argv)
     case BASE32_OPTION:
       base_length = base32_length_wrapper;
       required_padding = base32_required_padding;
-      isbase = isbase32;
+      isubase = isubase32;
       base_encode = base32_encode;
       base_decode_ctx_init = base32_decode_ctx_init_wrapper;
       base_decode_ctx = base32_decode_ctx_wrapper;
@@ -1220,7 +1220,7 @@ main (int argc, char **argv)
     case BASE32HEX_OPTION:
       base_length = base32_length_wrapper;
       required_padding = base32_required_padding;
-      isbase = isbase32hex;
+      isubase = isubase32hex;
       base_encode = base32hex_encode;
       base_decode_ctx_init = base32hex_decode_ctx_init_wrapper;
       base_decode_ctx = base32hex_decode_ctx_wrapper;
@@ -1229,7 +1229,7 @@ main (int argc, char **argv)
     case BASE16_OPTION:
       base_length = base16_length;
       required_padding = no_required_padding;
-      isbase = isbase16;
+      isubase = isubase16;
       base_encode = base16_encode;
       base_decode_ctx_init = base16_decode_ctx_init;
       base_decode_ctx = base16_decode_ctx;
@@ -1238,7 +1238,7 @@ main (int argc, char **argv)
     case BASE2MSBF_OPTION:
       base_length = base2_length;
       required_padding = no_required_padding;
-      isbase = isbase2;
+      isubase = isubase2;
       base_encode = base2msbf_encode;
       base_decode_ctx_init = base2_decode_ctx_init;
       base_decode_ctx = base2msbf_decode_ctx;
@@ -1247,7 +1247,7 @@ main (int argc, char **argv)
     case BASE2LSBF_OPTION:
       base_length = base2_length;
       required_padding = no_required_padding;
-      isbase = isbase2;
+      isubase = isubase2;
       base_encode = base2lsbf_encode;
       base_decode_ctx_init = base2_decode_ctx_init;
       base_decode_ctx = base2lsbf_decode_ctx;
@@ -1256,7 +1256,7 @@ main (int argc, char **argv)
     case Z85_OPTION:
       base_length = z85_length;
       required_padding = no_required_padding;
-      isbase = isz85;
+      isubase = isuz85;
       base_encode = z85_encode;
       base_decode_ctx_init = z85_decode_ctx_init;
       base_decode_ctx = z85_decode_ctx;
-- 
2.41.0

From 60bd7bad9d2473d2bd0462a1cbe05144c35ad557 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Wed, 25 Oct 2023 15:09:04 -0700
Subject: [PATCH 3/3] basenc: fix unlikely locale issue; tune
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This sped up ‘basenc -d --base16’ by 60% on my old platform,
AMD Phenom II X4 910e, Fedora 38.
* src/basenc.c (struct base16_decode_context): Simplify by
omitting have_nibble.  ‘nibble’ is now negative if it’s missing.
All uses changed.
(B16): New macro, inspired by ../lib/base64.c.
(base16_to_int): New static var, likewise.
(isubase16): Reimplement using base16_to_int, since isxdigit is
not guaranteed to succeed on the chars we want when the locale is
oddball.
(base16_decode_ctx): Tune by using base16_to_int and by
---
 src/basenc.c | 144 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 118 insertions(+), 26 deletions(-)

diff --git a/src/basenc.c b/src/basenc.c
index 850dd4cd9..858e0cd5e 100644
--- a/src/basenc.c
+++ b/src/basenc.c
@@ -253,8 +253,8 @@ static void (*base_encode) (char const *restrict in, idx_t inlen,
 
 struct base16_decode_context
 {
-  char nibble;
-  bool have_nibble;
+  /* Either a 4-bit nibble, or negative if we have no nibble.  */
+  signed char nibble;
 };
 
 struct z85_decode_context
@@ -516,11 +516,105 @@ base32hex_decode_ctx_wrapper (struct base_decode_context *ctx,
 
   return b;
 }
+/* With this approach this file works independent of the charset used
+   (think EBCDIC).  However, it does assume that the characters in the
+   Base32 alphabet (A-Z2-7) are encoded in 0..255.  POSIX
+   1003.1-2001 require that char and unsigned char are 8-bit
+   quantities, though, taking care of that problem.  But this may be a
+   potential problem on non-POSIX C99 platforms.
+
+   IBM C V6 for AIX mishandles "#define B32(x) ...'x'...", so use "_"
+   as the formal parameter rather than "x".  */
+#define B16(_)                                  \
+  ((_) == '0' ? 0                               \
+   : (_) == '1' ? 1                             \
+   : (_) == '2' ? 2                             \
+   : (_) == '3' ? 3                             \
+   : (_) == '4' ? 4                             \
+   : (_) == '5' ? 5                             \
+   : (_) == '6' ? 6                             \
+   : (_) == '7' ? 7                             \
+   : (_) == '8' ? 8                             \
+   : (_) == '9' ? 9                             \
+   : (_) == 'A' || (_) == 'a' ? 10              \
+   : (_) == 'B' || (_) == 'b' ? 11              \
+   : (_) == 'C' || (_) == 'c' ? 12              \
+   : (_) == 'D' || (_) == 'd' ? 13              \
+   : (_) == 'E' || (_) == 'e' ? 14              \
+   : (_) == 'F' || (_) == 'f' ? 15              \
+   : -1)
+
+static signed char const base16_to_int[256] = {
+  B16 (0), B16 (1), B16 (2), B16 (3),
+  B16 (4), B16 (5), B16 (6), B16 (7),
+  B16 (8), B16 (9), B16 (10), B16 (11),
+  B16 (12), B16 (13), B16 (14), B16 (15),
+  B16 (16), B16 (17), B16 (18), B16 (19),
+  B16 (20), B16 (21), B16 (22), B16 (23),
+  B16 (24), B16 (25), B16 (26), B16 (27),
+  B16 (28), B16 (29), B16 (30), B16 (31),
+  B16 (32), B16 (33), B16 (34), B16 (35),
+  B16 (36), B16 (37), B16 (38), B16 (39),
+  B16 (40), B16 (41), B16 (42), B16 (43),
+  B16 (44), B16 (45), B16 (46), B16 (47),
+  B16 (48), B16 (49), B16 (50), B16 (51),
+  B16 (52), B16 (53), B16 (54), B16 (55),
+  B16 (56), B16 (57), B16 (58), B16 (59),
+  B16 (60), B16 (61), B16 (62), B16 (63),
+  B16 (32), B16 (65), B16 (66), B16 (67),
+  B16 (68), B16 (69), B16 (70), B16 (71),
+  B16 (72), B16 (73), B16 (74), B16 (75),
+  B16 (76), B16 (77), B16 (78), B16 (79),
+  B16 (80), B16 (81), B16 (82), B16 (83),
+  B16 (84), B16 (85), B16 (86), B16 (87),
+  B16 (88), B16 (89), B16 (90), B16 (91),
+  B16 (92), B16 (93), B16 (94), B16 (95),
+  B16 (96), B16 (97), B16 (98), B16 (99),
+  B16 (100), B16 (101), B16 (102), B16 (103),
+  B16 (104), B16 (105), B16 (106), B16 (107),
+  B16 (108), B16 (109), B16 (110), B16 (111),
+  B16 (112), B16 (113), B16 (114), B16 (115),
+  B16 (116), B16 (117), B16 (118), B16 (119),
+  B16 (120), B16 (121), B16 (122), B16 (123),
+  B16 (124), B16 (125), B16 (126), B16 (127),
+  B16 (128), B16 (129), B16 (130), B16 (131),
+  B16 (132), B16 (133), B16 (134), B16 (135),
+  B16 (136), B16 (137), B16 (138), B16 (139),
+  B16 (140), B16 (141), B16 (142), B16 (143),
+  B16 (144), B16 (145), B16 (146), B16 (147),
+  B16 (148), B16 (149), B16 (150), B16 (151),
+  B16 (152), B16 (153), B16 (154), B16 (155),
+  B16 (156), B16 (157), B16 (158), B16 (159),
+  B16 (160), B16 (161), B16 (162), B16 (163),
+  B16 (132), B16 (165), B16 (166), B16 (167),
+  B16 (168), B16 (169), B16 (170), B16 (171),
+  B16 (172), B16 (173), B16 (174), B16 (175),
+  B16 (176), B16 (177), B16 (178), B16 (179),
+  B16 (180), B16 (181), B16 (182), B16 (183),
+  B16 (184), B16 (185), B16 (186), B16 (187),
+  B16 (188), B16 (189), B16 (190), B16 (191),
+  B16 (192), B16 (193), B16 (194), B16 (195),
+  B16 (196), B16 (197), B16 (198), B16 (199),
+  B16 (200), B16 (201), B16 (202), B16 (203),
+  B16 (204), B16 (205), B16 (206), B16 (207),
+  B16 (208), B16 (209), B16 (210), B16 (211),
+  B16 (212), B16 (213), B16 (214), B16 (215),
+  B16 (216), B16 (217), B16 (218), B16 (219),
+  B16 (220), B16 (221), B16 (222), B16 (223),
+  B16 (224), B16 (225), B16 (226), B16 (227),
+  B16 (228), B16 (229), B16 (230), B16 (231),
+  B16 (232), B16 (233), B16 (234), B16 (235),
+  B16 (236), B16 (237), B16 (238), B16 (239),
+  B16 (240), B16 (241), B16 (242), B16 (243),
+  B16 (244), B16 (245), B16 (246), B16 (247),
+  B16 (248), B16 (249), B16 (250), B16 (251),
+  B16 (252), B16 (253), B16 (254), B16 (255)
+};
 
 static bool
 isubase16 (unsigned char ch)
 {
-  return isxdigit (ch);
+  return ch < sizeof base16_to_int && 0 <= base16_to_int[ch];
 }
 
 static int
@@ -550,7 +644,7 @@ static void
 base16_decode_ctx_init (struct base_decode_context *ctx)
 {
   init_inbuf (ctx);
-  ctx->ctx.base16.have_nibble = false;
+  ctx->ctx.base16.nibble = -1;
   ctx->i = 1;
 }
 
@@ -561,44 +655,42 @@ base16_decode_ctx (struct base_decode_context *ctx,
                    char *restrict out, idx_t *outlen)
 {
   bool ignore_lines = true;  /* for now, always ignore them */
-
-  *outlen = 0;
+  char *out0 = out;
+  signed char nibble = ctx->ctx.base16.nibble;
 
   /* inlen==0 is request to flush output.
      if there is a dangling high nibble - we are missing the low nibble,
      so return false - indicating an invalid input.  */
   if (inlen == 0)
-    return !ctx->ctx.base16.have_nibble;
+    {
+      *outlen = 0;
+      return nibble < 0;
+    }
 
   while (inlen--)
     {
-      if (ignore_lines && *in == '\n')
+      unsigned char c = *in++;
+      if (ignore_lines && c == '\n')
+        continue;
+
+      if (sizeof base16_to_int <= c || base16_to_int[c] < 0)
         {
-          ++in;
-          continue;
+          *outlen = out - out0;
+          return false; /* garbage - return false */
         }
 
-      int nib = c_toupper (*in++);
-      if ('0' <= nib && nib <= '9')
-        nib -= '0';
-      else if ('A' <= nib && nib <= 'F')
-        nib -= 'A' - 10;
+      if (nibble < 0)
+        nibble = base16_to_int[c];
       else
-        return false; /* garbage - return false */
-
-      if (ctx->ctx.base16.have_nibble)
         {
           /* have both nibbles, write octet */
-          *out++ = (ctx->ctx.base16.nibble << 4) + nib;
-          ++(*outlen);
+          *out++ = (nibble << 4) + base16_to_int[c];
+          nibble = -1;
         }
-      else
-        {
-          /* Store higher nibble until next one arrives */
-          ctx->ctx.base16.nibble = nib;
-        }
-      ctx->ctx.base16.have_nibble = !ctx->ctx.base16.have_nibble;
     }
+
+  ctx->ctx.base16.nibble = nibble;
+  *outlen = out - out0;
   return true;
 }
 
-- 
2.41.0

Reply via email to