In most application areas, it is not a problem if strings cannot contain NUL bytes, and thus the C type 'char *' with its NUL terminator is perfectly usable.
In areas where strings with embedded NUL bytes need to be handled, the common approach is to use a 'char * data' pointer together with a 'size_t nbytes' size. This works fine in code that constructs or manipulates strings with embedded NUL bytes. But when it comes to *storing* them, for example in an array or as key or value of a hash table, one needs a type that combines these two fields: struct { size_t nbytes; char * data; }. I propose to add a module that adds such a type, together with elementary functions that work on it. Such a type was long known as a "string descriptor" in VMS. It's also known as basic_string_view<char> in C++, or as String in Java. The type that I'm proposing does not have a NUL byte appended to the data always and automatically, because I think it is more important to have a string_desc_substring function that does not cause memory allocation than to have a string_desc_c function (conversion to 'char *') that does not cause memory allocation. The type that I'm proposing does not have two distinct fields nbytes_used and nbytes_allocated. Such a type, e.g. [1], attempts to cover the use-case of accumulating a string as well. But - the Java experience with String vs. StringBuffer/StringBuilder shows that it is cleaner to separate the two use cases, - for the use-case of accumulating a string, C programmers have been using ad-hoc code with n_used and n_allocated for a long time; there is no need for anything else (except for lazy people who want C to be a scripting language). The type that I'm proposing also does not have fields for heap management, such as a 'bool heap' [2] or a reference count. That's because I think that - managing the allocated memory of a data structure is a different problem than that of representing a string, and it can be achieved with data outside the string descriptor, - such a field would make it wrong to simply assign a string descriptor to a variable. 
Please let me know what you think: Does this have a place in Gnulib? (Or should it stay in GNU gettext, where I need it for the Perl parser?) Bruno [1] https://github.com/websnarf/bstrlib/blob/master/bstrlib.txt [2] https://github.com/maxim2266/str
/* GNU gettext - internationalization aids
   Copyright (C) 2023 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <br...@clisp.org>, 2023.  */

#ifndef _STRING_DESC_H
#define _STRING_DESC_H 1

/* Get size_t, ptrdiff_t.  */
#include <stddef.h>

/* Get bool.  */
#include <stdbool.h>


#ifdef __cplusplus
extern "C" {
#endif


/* Type describing a string that may contain NUL bytes.
   It's merely a descriptor of an array of bytes: NBYTES is the number of
   bytes, DATA points to the first of them.  The bytes are not followed by
   a NUL terminator.  */
typedef struct string_desc_t string_desc_t;
struct string_desc_t
{
  size_t nbytes;
  char *data;
};

/* String descriptors can be passed and returned by value.
   Copying a descriptor copies only the two fields, not the bytes; both
   copies then describe the same storage.  */


/* ==== Side-effect-free operations on string descriptors ==== */

/* Return the length of the string S, in bytes.  */
extern size_t string_desc_length (string_desc_t s);

/* Return the byte at index I of string S.
   I must be < length(S).  */
extern char string_desc_char_at (string_desc_t s, size_t i);

/* Return a read-only view of the bytes of S.
   The result is not NUL-terminated; use it together with
   string_desc_length (S).  */
extern const char * string_desc_data (string_desc_t s);

/* Return true if S is the empty string.  */
extern bool string_desc_is_empty (string_desc_t s);

/* Return true if S starts with PREFIX.
   Every string starts with the empty string.  */
extern bool string_desc_startswith (string_desc_t s, string_desc_t prefix);

/* Return true if S ends with SUFFIX.
   Every string ends with the empty string.  */
extern bool string_desc_endswith (string_desc_t s, string_desc_t suffix);

/* Return > 0, == 0, or < 0 if A > B, A == B, A < B.
   This uses a lexicographic ordering, where the bytes are compared as
   'unsigned char'.  A proper prefix of a string sorts before that string.  */
extern int string_desc_cmp (string_desc_t a, string_desc_t b);

/* Return the index of the first occurrence of C in S,
   or -1 if there is none.  */
extern ptrdiff_t string_desc_index (string_desc_t s, char c);

/* Return the index of the last occurrence of C in S,
   or -1 if there is none.  */
extern ptrdiff_t string_desc_last_index (string_desc_t s, char c);

/* Return the index of the first occurrence of NEEDLE in HAYSTACK,
   or -1 if there is none.
   An empty NEEDLE is found at index 0.  */
extern ptrdiff_t string_desc_contains (string_desc_t haystack,
                                       string_desc_t needle);

/* Return a string that represents the C string S, of length strlen (S).
   The result shares its storage with S; no memory is allocated.  */
extern string_desc_t string_desc_from_c (const char *s);

/* Return the substring of S, starting at offset START and ending at offset
   END.  START must be <= END, and END must be <= length(S).
   The result is of length END - START.
   The result must not be freed (since its storage is part of the storage
   of S).  */
extern string_desc_t string_desc_substring (string_desc_t s,
                                            size_t start, size_t end);


/* ==== Memory-allocating operations on string descriptors ==== */

/* Return a string of length N, with uninitialized contents.  */
extern string_desc_t string_desc_new (size_t n);

/* Return a string of length N, at the given memory address.
   No memory is allocated; the result describes the N bytes at ADDR.  */
extern string_desc_t string_desc_new_addr (size_t n, char *addr);

/* Return a string of length N, filled with C.  */
extern string_desc_t string_desc_new_filled (size_t n, char c);

/* Return a copy of string S, in freshly allocated storage.  */
extern string_desc_t string_desc_copy (string_desc_t s);

/* Return the concatenation of N strings.  N must be > 0.
   Exactly N arguments of type string_desc_t must follow N.  */
extern string_desc_t string_desc_concat (size_t n, string_desc_t string1, ...);

/* Return a copy of string S, as a NUL-terminated C string.
   If S contains NUL bytes, the result appears truncated when used as a
   C string.  */
extern char * string_desc_c (string_desc_t s);


/* ==== Operations with side effects on string descriptors ==== */

/* Overwrite the byte at index I of string S with C.
   I must be < length(S).  */
extern void string_desc_set_char_at (string_desc_t s, size_t i, char c);

/* Fill part of S, starting at offset START and ending at offset END,
   with copies of C.
   START must be <= END, and END must be <= length(S).  */
extern void string_desc_fill (string_desc_t s, size_t start, size_t end,
                              char c);

/* Overwrite part of S with T, starting at offset START.
   START + length(T) must be <= length (S).  */
extern void string_desc_overwrite (string_desc_t s, size_t start,
                                   string_desc_t t);

/* Free S.
   This passes the data pointer of S to free(); use it only on strings whose
   storage was heap-allocated, not on substrings or on strings that share
   their storage with another object.  */
extern void string_desc_free (string_desc_t s);


#ifdef __cplusplus
}
#endif


#endif /* _STRING_DESC_H */
/* GNU gettext - internationalization aids Copyright (C) 2023 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. */ /* Written by Bruno Haible <br...@clisp.org>, 2023. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif /* Specification. */ #include "str-desc.h" #include <stdarg.h> #include <stdlib.h> #include <string.h> #include "xalloc.h" /* ==== Side-effect-free operations on string descriptors ==== */ size_t string_desc_length (string_desc_t s) { return s.nbytes; } char string_desc_char_at (string_desc_t s, size_t i) { if (!(i < s.nbytes)) /* Invalid argument. */ abort (); return s.data[i]; } const char * string_desc_data (string_desc_t s) { return s.data; } bool string_desc_is_empty (string_desc_t s) { return s.nbytes == 0; } bool string_desc_startswith (string_desc_t s, string_desc_t prefix) { return (s.nbytes >= prefix.nbytes && (prefix.nbytes == 0 || memcmp (s.data, prefix.data, prefix.nbytes) == 0)); } bool string_desc_endswith (string_desc_t s, string_desc_t suffix) { return (s.nbytes >= suffix.nbytes && (suffix.nbytes == 0 || memcmp (s.data + (s.nbytes - suffix.nbytes), suffix.data, suffix.nbytes) == 0)); } int string_desc_cmp (string_desc_t a, string_desc_t b) { if (a.nbytes > b.nbytes) { if (b.nbytes == 0) return 1; return (memcmp (a.data, b.data, b.nbytes) < 0 ? 
-1 : 1); } else if (a.nbytes < b.nbytes) { if (a.nbytes == 0) return -1; return (memcmp (a.data, b.data, a.nbytes) > 0 ? 1 : -1); } else /* a.nbytes == b.nbytes */ { if (a.nbytes == 0) return 0; return memcmp (a.data, b.data, a.nbytes); } } ptrdiff_t string_desc_index (string_desc_t s, char c) { if (s.nbytes > 0) { void *found = memchr (s.data, (unsigned char) c, s.nbytes); if (found != NULL) return (char *) found - s.data; } return -1; } ptrdiff_t string_desc_last_index (string_desc_t s, char c) { if (s.nbytes > 0) { void *found = memrchr (s.data, (unsigned char) c, s.nbytes); if (found != NULL) return (char *) found - s.data; } return -1; } ptrdiff_t string_desc_contains (string_desc_t haystack, string_desc_t needle) { if (needle.nbytes == 0) return 0; void *found = memmem (haystack.data, haystack.nbytes, needle.data, needle.nbytes); if (found != NULL) return (char *) found - haystack.data; else return -1; } string_desc_t string_desc_from_c (const char *s) { string_desc_t result; result.nbytes = strlen (s); result.data = (char *) s; return result; } string_desc_t string_desc_substring (string_desc_t s, size_t start, size_t end) { string_desc_t result; if (!(start <= end)) /* Invalid arguments. 
*/ abort (); result.nbytes = end - start; result.data = s.data + start; return result; } /* ==== Memory-allocating operations on string descriptors ==== */ string_desc_t string_desc_new (size_t n) { string_desc_t result; result.nbytes = n; if (n == 0) result.data = NULL; else result.data = (char *) xmalloc (n); return result; } string_desc_t string_desc_new_addr (size_t n, char *addr) { string_desc_t result; result.nbytes = n; if (n == 0) result.data = NULL; else result.data = addr; return result; } string_desc_t string_desc_new_filled (size_t n, char c) { string_desc_t result; result.nbytes = n; if (n == 0) result.data = NULL; else { result.data = (char *) xmalloc (n); memset (result.data, (unsigned char) c, n); } return result; } string_desc_t string_desc_copy (string_desc_t s) { string_desc_t result; size_t n = s.nbytes; result.nbytes = n; if (n == 0) result.data = NULL; else { result.data = (char *) xmalloc (n); memcpy (result.data, s.data, n); } return result; } string_desc_t string_desc_concat (size_t n, string_desc_t string1, ...) { if (n == 0) /* Invalid argument. 
*/ abort (); size_t total = 0; total += string1.nbytes; if (n > 1) { va_list other_strings; size_t i; va_start (other_strings, string1); for (i = --n; i > 0; i--) { string_desc_t arg = va_arg (other_strings, string_desc_t); total += arg.nbytes; } va_end (other_strings); } char *combined = (char *) xmalloc (total); size_t pos = 0; memcpy (combined, string1.data, string1.nbytes); pos += string1.nbytes; if (n > 1) { va_list other_strings; size_t i; va_start (other_strings, string1); for (i = --n; i > 0; i--) { string_desc_t arg = va_arg (other_strings, string_desc_t); if (arg.nbytes > 0) memcpy (combined + pos, arg.data, arg.nbytes); pos += arg.nbytes; } va_end (other_strings); } string_desc_t result; result.nbytes = total; result.data = combined; return result; } char * string_desc_c (string_desc_t s) { size_t n = s.nbytes; char *result = (char *) xmalloc (n + 1); if (n > 0) memcpy (result, s.data, n); result[n] = '\0'; return result; } /* ==== Operations with side effects on string descriptors ==== */ void string_desc_set_char_at (string_desc_t s, size_t i, char c) { if (!(i < s.nbytes)) /* Invalid argument. */ abort (); s.data[i] = c; } void string_desc_fill (string_desc_t s, size_t start, size_t end, char c) { if (!(start <= end)) /* Invalid arguments. */ abort (); if (start < end) memset (s.data + start, (unsigned char) c, end - start); } void string_desc_overwrite (string_desc_t s, size_t start, string_desc_t t) { if (!(start + t.nbytes <= s.nbytes)) /* Invalid arguments. */ abort (); if (t.nbytes > 0) memcpy (s.data + start, t.data, t.nbytes); } void string_desc_free (string_desc_t s) { free (s.data); }