proposed performance tweaks to Gnulib mbchar module

Paul Eggert Mon, 03 Jul 2023 11:13:29 -0700

Attached are two proposed performance tweaks I found by inspection. No big deal of course.

From 775a34de03f0c4cc9a8a87e65030d19733301193 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 3 Jul 2023 10:54:36 -0700
Subject: [PATCH 1/2] mbchar: treat @, $, ` as basic


The C standard says that @, $, ` must have a single-byte
representation so they count as basic as far as multibyte
character processing goes.
* lib/mbchar.c (is_basic_table):
* lib/mbchar.h (is_basic):
* lib/mbswidth.c (mbsnwidth):
Treat @, $, ` as basic too.
---
 ChangeLog      |  9 +++++++++
 lib/mbchar.c   | 11 +++++++----
 lib/mbchar.h   | 16 +++++++++-------
 lib/mbswidth.c |  6 +++---
 4 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 7d44959df2..a9e5b09a17 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2023-07-03  Paul Eggert  <egg...@cs.ucla.edu>
 
+	mbchar: treat @, $, ` as basic
+	The C standard says that @, $, ` must have a single-byte
+	representation so they count as basic as far as multibyte
+	character processing goes.
+	* lib/mbchar.c (is_basic_table):
+	* lib/mbchar.h (is_basic):
+	* lib/mbswidth.c (mbsnwidth):
+	Treat @, $, ` as basic too.
+
 	mbrtoc32: document (size_t) -3 issue
 	* doc/posix-functions/mbrtoc32.texi (mbrtoc32):
 	Say (size_t) -3 might not be worth bothering about.
diff --git a/lib/mbchar.c b/lib/mbchar.c
index 7d5f72c9c5..84b388bfee 100644
--- a/lib/mbchar.c
+++ b/lib/mbchar.c
@@ -24,13 +24,16 @@
 
 #if IS_BASIC_ASCII
 
-/* Bit table of characters in the ISO C "basic character set".  */
+/* Bit table of characters in the ISO C "basic character set",
+   plus the characters '@', '$', and '`' which
+   ISO C guarantees to be single-byte and in practice are safe
+   to treat as basic in the execution character set.  */
 const unsigned int is_basic_table [UCHAR_MAX / 32 + 1] =
 {
   0x00001a00,           /* '\t' '\v' '\f' */
-  0xffffffef,           /* ' '...'#' '%'...'?' */
-  0xfffffffe,           /* 'A'...'Z' '[' '\\' ']' '^' '_' */
-  0x7ffffffe            /* 'a'...'z' '{' '|' '}' '~' */
+  0xffffffff,           /* ' '......'?' */
+  0xffffffff,           /* '@' 'A'...'Z' '[' '\\' ']' '^' '_' */
+  0x7fffffff            /* '`' 'a'...'z' '{' '|' '}' '~' */
   /* The remaining bits are 0.  */
 };
 
diff --git a/lib/mbchar.h b/lib/mbchar.h
index c183772cc6..ccbde17b8f 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -273,17 +273,19 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
 
 
 /* is_basic(c) tests whether the single-byte character c is in the
-   ISO C "basic character set".
+   ISO C "basic character set" or is one of '@', '$', and '`' which
+   ISO C guarantees to be single-byte and in practice are safe
+   to treat as basic in the execution character set.
    This is a convenience function, and is in this file only to share code
    between mbiter_multi.h and mbfile_multi.h.  */
-#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
+#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('$' == 36) \
     && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
     && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
     && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
     && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
     && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
     && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
-    && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
+    && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('@' == 64) && ('A' == 65) \
     && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
     && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
     && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
@@ -291,7 +293,7 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
     && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
     && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
     && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
-    && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
+    && ('^' == 94) && ('_' == 95) && ('`' == 96) && ('a' == 97) && ('b' == 98) \
     && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
     && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
     && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
@@ -319,20 +321,20 @@ is_basic (char c)
   switch (c)
     {
     case '\t': case '\v': case '\f':
-    case ' ': case '!': case '"': case '#': case '%':
+    case ' ': case '!': case '"': case '#': case '$': case '%':
     case '&': case '\'': case '(': case ')': case '*':
     case '+': case ',': case '-': case '.': case '/':
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
     case ':': case ';': case '<': case '=': case '>':
-    case '?':
+    case '?': case '@':
     case 'A': case 'B': case 'C': case 'D': case 'E':
     case 'F': case 'G': case 'H': case 'I': case 'J':
     case 'K': case 'L': case 'M': case 'N': case 'O':
     case 'P': case 'Q': case 'R': case 'S': case 'T':
     case 'U': case 'V': case 'W': case 'X': case 'Y':
     case 'Z':
-    case '[': case '\\': case ']': case '^': case '_':
+    case '[': case '\\': case ']': case '^': case '_': case '`':
     case 'a': case 'b': case 'c': case 'd': case 'e':
     case 'f': case 'g': case 'h': case 'i': case 'j':
     case 'k': case 'l': case 'm': case 'n': case 'o':
diff --git a/lib/mbswidth.c b/lib/mbswidth.c
index da2d8030f3..6b26c6a599 100644
--- a/lib/mbswidth.c
+++ b/lib/mbswidth.c
@@ -66,20 +66,20 @@ mbsnwidth (const char *string, size_t nbytes, int flags)
       while (p < plimit)
         switch (*p)
           {
-            case ' ': case '!': case '"': case '#': case '%':
+            case ' ': case '!': case '"': case '#': case '$': case '%':
             case '&': case '\'': case '(': case ')': case '*':
             case '+': case ',': case '-': case '.': case '/':
             case '0': case '1': case '2': case '3': case '4':
             case '5': case '6': case '7': case '8': case '9':
             case ':': case ';': case '<': case '=': case '>':
-            case '?':
+            case '?': case '@':
             case 'A': case 'B': case 'C': case 'D': case 'E':
             case 'F': case 'G': case 'H': case 'I': case 'J':
             case 'K': case 'L': case 'M': case 'N': case 'O':
             case 'P': case 'Q': case 'R': case 'S': case 'T':
             case 'U': case 'V': case 'W': case 'X': case 'Y':
             case 'Z':
-            case '[': case '\\': case ']': case '^': case '_':
+            case '[': case '\\': case ']': case '^': case '_': case '`':
             case 'a': case 'b': case 'c': case 'd': case 'e':
             case 'f': case 'g': case 'h': case 'i': case 'j':
             case 'k': case 'l': case 'm': case 'n': case 'o':
-- 
2.39.2

From 138a0d617e87438cb3689bcb40d284deeaea002a Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 3 Jul 2023 11:05:53 -0700
Subject: [PATCH 2/2] mbchar: treat '\0' as basic

In the initial state, '\0' stands for itself everywhere.
* lib/mbchar.c (is_basic_table):
* lib/mbchar.h (is_basic):
Treat '\0' as basic too.
---
 ChangeLog    | 6 ++++++
 lib/mbchar.c | 4 ++--
 lib/mbchar.h | 3 ++-
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a9e5b09a17..d59e270588 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2023-07-03  Paul Eggert  <egg...@cs.ucla.edu>
 
+	mbchar: treat '\0' as basic
+	In the initial state, '\0' stands for itself everywhere.
+	* lib/mbchar.c (is_basic_table):
+	* lib/mbchar.h (is_basic):
+	Treat '\0' as basic too.
+
 	mbchar: treat @, $, ` as basic
 	The C standard says that @, $, ` must have a single-byte
 	representation so they count as basic as far as multibyte
diff --git a/lib/mbchar.c b/lib/mbchar.c
index 84b388bfee..fcb8919cc2 100644
--- a/lib/mbchar.c
+++ b/lib/mbchar.c
@@ -25,12 +25,12 @@
 #if IS_BASIC_ASCII
 
 /* Bit table of characters in the ISO C "basic character set",
-   plus the characters '@', '$', and '`' which
+   plus the characters '\0', '@', '$', and '`' which
    ISO C guarantees to be single-byte and in practice are safe
    to treat as basic in the execution character set.  */
 const unsigned int is_basic_table [UCHAR_MAX / 32 + 1] =
 {
-  0x00001a00,           /* '\t' '\v' '\f' */
+  0x00001a01,           /* '\0' '\t' '\v' '\f' */
   0xffffffff,           /* ' '......'?' */
   0xffffffff,           /* '@' 'A'...'Z' '[' '\\' ']' '^' '_' */
   0x7fffffff            /* '`' 'a'...'z' '{' '|' '}' '~' */
diff --git a/lib/mbchar.h b/lib/mbchar.h
index ccbde17b8f..edf560a2b7 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -273,7 +273,7 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
 
 
 /* is_basic(c) tests whether the single-byte character c is in the
-   ISO C "basic character set" or is one of '@', '$', and '`' which
+   ISO C "basic character set" or is one of '\0', '@', '$', and '`' which
    ISO C guarantees to be single-byte and in practice are safe
    to treat as basic in the execution character set.
    This is a convenience function, and is in this file only to share code
@@ -320,6 +320,7 @@ is_basic (char c)
 {
   switch (c)
     {
+    case '\0':
     case '\t': case '\v': case '\f':
     case ' ': case '!': case '"': case '#': case '$': case '%':
     case '&': case '\'': case '(': case ')': case '*':
-- 
2.39.2

proposed performance tweaks to Gnulib mbchar module

Reply via email to