Re: proposed performance tweaks to Gnulib mbchar module

Bruno Haible Tue, 04 Jul 2023 06:33:47 -0700

Hi Paul,

Paul Eggert wrote:
> Attached are two proposed performance tweaks I found by inspection. No 
> big deal of course.


Thanks. I committed the first one in your name, with a reference to the
precise ISO C section. Then I found that POSIX's "Portable character set"
goes beyond that, and thus applied the patch below. Your second patch is
then not needed any more.

https://pubs.opengroup.org/onlinepubs/9699919799.2018edition/basedefs/V1_chap06.html
section 6.1 says "The encoded values associated with the members of the
portable character set are each represented in a single byte.", and this
"portable character set" goes beyond what ISO C has.


2023-07-04  Bruno Haible  <br...@clisp.org>

        mbchar: Optimize all chars from the POSIX "portable character set".
        * lib/mbchar.h (is_basic): Include all the characters from the POSIX
        "portable character set".
        * lib/mbchar.c (is_basic_table): Likewise.
        * lib/mbiter.h (mbiter_multi_next): Update comment.
        * lib/mbuiter.h (mbuiter_multi_next): Likewise.
        * lib/mbfile.h (mbfile_multi_getc): Likewise.

diff --git a/lib/mbchar.c b/lib/mbchar.c
index 84b388bfee..63ff9c7a72 100644
--- a/lib/mbchar.c
+++ b/lib/mbchar.c
@@ -24,13 +24,12 @@
 
 #if IS_BASIC_ASCII
 
-/* Bit table of characters in the ISO C "basic character set",
-   plus the characters '@', '$', and '`' which
-   ISO C guarantees to be single-byte and in practice are safe
-   to treat as basic in the execution character set.  */
+/* Bit table of characters in the POSIX "portable character set", which
+   POSIX guarantees to be single-byte and in practice are safe to treat
+   like the ISO C "basic character set".  */
 const unsigned int is_basic_table [UCHAR_MAX / 32 + 1] =
 {
-  0x00001a00,           /* '\t' '\v' '\f' */
+  0x00003f81,           /* '\0' '\007' '\010' '\t' '\n' '\v' '\f' '\r' */
   0xffffffff,           /* ' '......'?' */
   0xffffffff,           /* '@' 'A'...'Z' '[' '\\' ']' '^' '_' */
   0x7fffffff            /* '`' 'a'...'z' '{' '|' '}' '~' */
diff --git a/lib/mbchar.h b/lib/mbchar.h
index dea1e462ee..36bae18276 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -272,35 +272,40 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
 }
 
 
-/* is_basic(c) tests whether the single-byte character c is in the
-   ISO C "basic character set" or is one of '@', '$', and '`' which
-   ISO C 23 § 5.2.1.1.(1) guarantees to be single-byte and in practice
-   are safe to treat as basic in the execution character set.
+/* is_basic(c) tests whether the single-byte character c is
+   - in the ISO C "basic character set" or is one of '@', '$', and '`'
+     which ISO C 23 § 5.2.1.1.(1) guarantees to be single-byte and in
+     practice are safe to treat as basic in the execution character set,
+     or
+   - in the POSIX "portable character set", which
+     <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap06.html>
+     equally guarantees to be single-byte.
    This is a convenience function, and is in this file only to share code
-   between mbiter_multi.h and mbfile_multi.h.  */
-#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('$' == 36) \
-    && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
-    && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
-    && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
-    && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
-    && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
-    && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
-    && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('@' == 64) && ('A' == 65) \
-    && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
-    && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
-    && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
-    && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
-    && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
-    && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
-    && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
-    && ('^' == 94) && ('_' == 95) && ('`' == 96) && ('a' == 97) && ('b' == 98) \
-    && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
-    && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
-    && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
-    && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
-    && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
-    && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
-    && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)
+   between mbiter.h, mbuiter.h, and mbfile.h.  */
+#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
+    && ('$' == 36) && ('%' == 37) && ('&' == 38) && ('\'' == 39) \
+    && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) \
+    && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) \
+    && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) \
+    && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) \
+    && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) \
+    && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) \
+    && ('@' == 64) && ('A' == 65) && ('B' == 66) && ('C' == 67) \
+    && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) \
+    && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) \
+    && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) \
+    && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) \
+    && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) \
+    && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) \
+    && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) \
+    && ('`' == 96) && ('a' == 97) && ('b' == 98) && ('c' == 99) \
+    && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) \
+    && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) \
+    && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) \
+    && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) \
+    && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) \
+    && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) \
+    && ('|' == 124) && ('}' == 125) && ('~' == 126)
 /* The character set is ISO-646, not EBCDIC. */
 # define IS_BASIC_ASCII 1
 
@@ -320,7 +325,9 @@ is_basic (char c)
 {
   switch (c)
     {
-    case '\t': case '\v': case '\f':
+    case '\0':
+    case '\007': case '\010':
+    case '\t': case '\n': case '\v': case '\f': case '\r':
     case ' ': case '!': case '"': case '#': case '$': case '%':
     case '&': case '\'': case '(': case ')': case '*':
     case '+': case ',': case '-': case '.': case '/':
diff --git a/lib/mbfile.h b/lib/mbfile.h
index 716ab3fc89..9a2532992e 100644
--- a/lib/mbfile.h
+++ b/lib/mbfile.h
@@ -108,9 +108,12 @@ mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf)
   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
   if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0]))
     {
-      /* These characters are part of the basic character set.  ISO C 99
-         guarantees that their wide character code is identical to their
-         char code.  The 32-bit wide character code is the same as well.  */
+      /* These characters are part of the POSIX portable character set.
+         For most of them, namely those in the ISO C basic character set,
+         ISO C 99 guarantees that their wide character code is identical to
+         their char code.  For the few other ones, this is the case as well,
+         in all locale encodings that are in use.  The 32-bit wide character
+         code is the same as well.  */
       mbc->wc = mbc->buf[0] = mbf->buf[0];
       mbc->wc_valid = true;
       mbc->ptr = &mbc->buf[0];
diff --git a/lib/mbiter.h b/lib/mbiter.h
index fadefe104b..bc88b4f3a2 100644
--- a/lib/mbiter.h
+++ b/lib/mbiter.h
@@ -124,9 +124,12 @@ mbiter_multi_next (struct mbiter_multi *iter)
   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
   if (is_basic (*iter->cur.ptr))
     {
-      /* These characters are part of the basic character set.  ISO C 99
-         guarantees that their wide character code is identical to their
-         char code.  */
+      /* These characters are part of the POSIX portable character set.
+         For most of them, namely those in the ISO C basic character set,
+         ISO C 99 guarantees that their wide character code is identical to
+         their char code.  For the few other ones, this is the case as well,
+         in all locale encodings that are in use.  The 32-bit wide character
+         code is the same as well.  */
       iter->cur.bytes = 1;
       iter->cur.wc = *iter->cur.ptr;
       iter->cur.wc_valid = true;
diff --git a/lib/mbuiter.h b/lib/mbuiter.h
index 954e11f635..93dec81603 100644
--- a/lib/mbuiter.h
+++ b/lib/mbuiter.h
@@ -132,9 +132,12 @@ mbuiter_multi_next (struct mbuiter_multi *iter)
   /* Handle most ASCII characters quickly, without calling mbrtowc().  */
   if (is_basic (*iter->cur.ptr))
     {
-      /* These characters are part of the basic character set.  ISO C 99
-         guarantees that their wide character code is identical to their
-         char code.  */
+      /* These characters are part of the POSIX portable character set.
+         For most of them, namely those in the ISO C basic character set,
+         ISO C 99 guarantees that their wide character code is identical to
+         their char code.  For the few other ones, this is the case as well,
+         in all locale encodings that are in use.  The 32-bit wide character
+         code is the same as well.  */
       iter->cur.bytes = 1;
       iter->cur.wc = *iter->cur.ptr;
       iter->cur.wc_valid = true;

Re: proposed performance tweaks to Gnulib mbchar module

Reply via email to