Attached are two proposed performance tweaks I found by inspection. No
big deal, of course.

From 775a34de03f0c4cc9a8a87e65030d19733301193 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 3 Jul 2023 10:54:36 -0700
Subject: [PATCH 1/2] mbchar: treat @, $, ` as basic
The C standard says that @, $, ` must have a single-byte
representation so they count as basic as far as multibyte
character processing goes.
* lib/mbchar.c (is_basic_table):
* lib/mbchar.h (is_basic):
* lib/mbswidth.c (mbsnwidth):
Treat @, $, ` as basic too.
---
ChangeLog | 9 +++++++++
lib/mbchar.c | 11 +++++++----
lib/mbchar.h | 16 +++++++++-------
lib/mbswidth.c | 6 +++---
4 files changed, 28 insertions(+), 14 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 7d44959df2..a9e5b09a17 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
2023-07-03 Paul Eggert <egg...@cs.ucla.edu>
+ mbchar: treat @, $, ` as basic
+ The C standard says that @, $, ` must have a single-byte
+ representation so they count as basic as far as multibyte
+ character processing goes.
+ * lib/mbchar.c (is_basic_table):
+ * lib/mbchar.h (is_basic):
+ * lib/mbswidth.c (mbsnwidth):
+ Treat @, $, ` as basic too.
+
mbrtoc32: document (size_t) -3 issue
* doc/posix-functions/mbrtoc32.texi (mbrtoc32):
Say (size_t) -3 might not be worth bothering about.
diff --git a/lib/mbchar.c b/lib/mbchar.c
index 7d5f72c9c5..84b388bfee 100644
--- a/lib/mbchar.c
+++ b/lib/mbchar.c
@@ -24,13 +24,16 @@
#if IS_BASIC_ASCII
-/* Bit table of characters in the ISO C "basic character set". */
+/* Bit table of characters in the ISO C "basic character set",
+ plus the characters '@', '$', and '`' which
+ ISO C guarantees to be single-byte and in practice are safe
+ to treat as basic in the execution character set. */
const unsigned int is_basic_table [UCHAR_MAX / 32 + 1] =
{
0x00001a00, /* '\t' '\v' '\f' */
- 0xffffffef, /* ' '...'#' '%'...'?' */
- 0xfffffffe, /* 'A'...'Z' '[' '\\' ']' '^' '_' */
- 0x7ffffffe /* 'a'...'z' '{' '|' '}' '~' */
+ 0xffffffff, /* ' '......'?' */
+ 0xffffffff, /* '@' 'A'...'Z' '[' '\\' ']' '^' '_' */
+ 0x7fffffff /* '`' 'a'...'z' '{' '|' '}' '~' */
/* The remaining bits are 0. */
};
diff --git a/lib/mbchar.h b/lib/mbchar.h
index c183772cc6..ccbde17b8f 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -273,17 +273,19 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
/* is_basic(c) tests whether the single-byte character c is in the
- ISO C "basic character set".
+ ISO C "basic character set" or is one of '@', '$', and '`' which
+ ISO C guarantees to be single-byte and in practice are safe
+ to treat as basic in the execution character set.
This is a convenience function, and is in this file only to share code
between mbiter_multi.h and mbfile_multi.h. */
-#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
+#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('$' == 36) \
&& ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
&& (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
&& ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
&& ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
&& ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
&& ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
- && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
+ && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('@' == 64) && ('A' == 65) \
&& ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
&& ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
&& ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
@@ -291,7 +293,7 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
&& ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
&& ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
&& ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
- && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
+ && ('^' == 94) && ('_' == 95) && ('`' == 96) && ('a' == 97) && ('b' == 98) \
&& ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
&& ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
&& ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
@@ -319,20 +321,20 @@ is_basic (char c)
switch (c)
{
case '\t': case '\v': case '\f':
- case ' ': case '!': case '"': case '#': case '%':
+ case ' ': case '!': case '"': case '#': case '$': case '%':
case '&': case '\'': case '(': case ')': case '*':
case '+': case ',': case '-': case '.': case '/':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
case ':': case ';': case '<': case '=': case '>':
- case '?':
+ case '?': case '@':
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
case 'P': case 'Q': case 'R': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
- case '[': case '\\': case ']': case '^': case '_':
+ case '[': case '\\': case ']': case '^': case '_': case '`':
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
diff --git a/lib/mbswidth.c b/lib/mbswidth.c
index da2d8030f3..6b26c6a599 100644
--- a/lib/mbswidth.c
+++ b/lib/mbswidth.c
@@ -66,20 +66,20 @@ mbsnwidth (const char *string, size_t nbytes, int flags)
while (p < plimit)
switch (*p)
{
- case ' ': case '!': case '"': case '#': case '%':
+ case ' ': case '!': case '"': case '#': case '$': case '%':
case '&': case '\'': case '(': case ')': case '*':
case '+': case ',': case '-': case '.': case '/':
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
case ':': case ';': case '<': case '=': case '>':
- case '?':
+ case '?': case '@':
case 'A': case 'B': case 'C': case 'D': case 'E':
case 'F': case 'G': case 'H': case 'I': case 'J':
case 'K': case 'L': case 'M': case 'N': case 'O':
case 'P': case 'Q': case 'R': case 'S': case 'T':
case 'U': case 'V': case 'W': case 'X': case 'Y':
case 'Z':
- case '[': case '\\': case ']': case '^': case '_':
+ case '[': case '\\': case ']': case '^': case '_': case '`':
case 'a': case 'b': case 'c': case 'd': case 'e':
case 'f': case 'g': case 'h': case 'i': case 'j':
case 'k': case 'l': case 'm': case 'n': case 'o':
--
2.39.2
From 138a0d617e87438cb3689bcb40d284deeaea002a Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 3 Jul 2023 11:05:53 -0700
Subject: [PATCH 2/2] mbchar: treat '\0' as basic
In the initial state, '\0' stands for itself everywhere.
* lib/mbchar.c (is_basic_table):
* lib/mbchar.h (is_basic):
Treat '\0' as basic too.
---
ChangeLog | 6 ++++++
lib/mbchar.c | 4 ++--
lib/mbchar.h | 3 ++-
3 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index a9e5b09a17..d59e270588 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
2023-07-03 Paul Eggert <egg...@cs.ucla.edu>
+ mbchar: treat '\0' as basic
+ In the initial state, '\0' stands for itself everywhere.
+ * lib/mbchar.c (is_basic_table):
+ * lib/mbchar.h (is_basic):
+ Treat '\0' as basic too.
+
mbchar: treat @, $, ` as basic
The C standard says that @, $, ` must have a single-byte
representation so they count as basic as far as multibyte
diff --git a/lib/mbchar.c b/lib/mbchar.c
index 84b388bfee..fcb8919cc2 100644
--- a/lib/mbchar.c
+++ b/lib/mbchar.c
@@ -25,12 +25,12 @@
#if IS_BASIC_ASCII
/* Bit table of characters in the ISO C "basic character set",
- plus the characters '@', '$', and '`' which
+ plus the characters '\0', '@', '$', and '`' which
ISO C guarantees to be single-byte and in practice are safe
to treat as basic in the execution character set. */
const unsigned int is_basic_table [UCHAR_MAX / 32 + 1] =
{
- 0x00001a00, /* '\t' '\v' '\f' */
+ 0x00001a01, /* '\0' '\t' '\v' '\f' */
0xffffffff, /* ' '......'?' */
0xffffffff, /* '@' 'A'...'Z' '[' '\\' ']' '^' '_' */
0x7fffffff /* '`' 'a'...'z' '{' '|' '}' '~' */
diff --git a/lib/mbchar.h b/lib/mbchar.h
index ccbde17b8f..edf560a2b7 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -273,7 +273,7 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
/* is_basic(c) tests whether the single-byte character c is in the
- ISO C "basic character set" or is one of '@', '$', and '`' which
+ ISO C "basic character set" or is one of '\0', '@', '$', and '`' which
ISO C guarantees to be single-byte and in practice are safe
to treat as basic in the execution character set.
This is a convenience function, and is in this file only to share code
@@ -320,6 +320,7 @@ is_basic (char c)
{
switch (c)
{
+ case '\0':
case '\t': case '\v': case '\f':
case ' ': case '!': case '"': case '#': case '$': case '%':
case '&': case '\'': case '(': case ')': case '*':
--
2.39.2