Until now, is_basic_table in mbchar.c marked all bytes in the range
0x20..0x7E as "basic", but only some of the control characters 0x00..0x1F.
Nowadays, however, all locale encodings map the range 0x00..0x1F to
U+0000..U+001F. The last encodings that did not have this property
were VISCII and TCVN5712-1, which I was able to eliminate from
localcharset.h in the previous patch.

This allows a small optimization: is_basic can become a simple range test.


2023-07-13  Bruno Haible  <br...@clisp.org>

        mbchar: Optimize is_basic.
        * lib/mbchar.h (is_basic_table): Remove declaration.
        (is_basic) [IS_BASIC_ASCII]: Define through a simple range test.
        * lib/mbchar.c (is_basic_table): Remove array.

diff --git a/lib/mbchar.c b/lib/mbchar.c
index 63ff9c7a72..af3c7934dc 100644
--- a/lib/mbchar.c
+++ b/lib/mbchar.c
@@ -21,19 +21,3 @@
 #include <limits.h>
 
 #include "mbchar.h"
-
-#if IS_BASIC_ASCII
-
-/* Bit table of characters in the POSIX "portable character set", which
-   POSIX guarantees to be single-byte and in practice are safe to treat
-   like the ISO C "basic character set".  */
-const unsigned int is_basic_table [UCHAR_MAX / 32 + 1] =
-{
-  0x00003f81,           /* '\0' '\007' '\010' '\t' '\n' '\v' '\f' '\r' */
-  0xffffffff,           /* ' '......'?' */
-  0xffffffff,           /* '@' 'A'...'Z' '[' '\\' ']' '^' '_' */
-  0x7fffffff            /* '`' 'a'...'z' '{' '|' '}' '~' */
-  /* The remaining bits are 0.  */
-};
-
-#endif /* IS_BASIC_ASCII */
diff --git a/lib/mbchar.h b/lib/mbchar.h
index 36bae18276..82c373f47e 100644
--- a/lib/mbchar.h
+++ b/lib/mbchar.h
@@ -309,14 +309,17 @@ mb_copy (mbchar_t *new_mbc, const mbchar_t *old_mbc)
 /* The character set is ISO-646, not EBCDIC. */
 # define IS_BASIC_ASCII 1
 
-extern const unsigned int is_basic_table[];
-
-MBCHAR_INLINE bool
-is_basic (char c)
-{
-  return (is_basic_table [(unsigned char) c >> 5] >> ((unsigned char) c & 31))
-         & 1;
-}
+/* All locale encodings (see localcharset.h) map the characters 0x00..0x7F
+   to U+0000..U+007F, like ASCII, except for
+     CP864      different mapping of '%'
+     SHIFT_JIS  different mappings of 0x5C, 0x7E
+     JOHAB      different mapping of 0x5C
+   However, these characters in the range 0x20..0x7E are in the ISO C
+   "basic character set" and in the POSIX "portable character set", which
+   ISO C and POSIX guarantee to be single-byte.  Thus, locales with these
+   encodings are not POSIX compliant.  And they are most likely not in use
+   any more (as of 2023).  */
+# define is_basic(c) ((unsigned char) (c) < 0x80)
 
 #else
 




Reply via email to