In most application areas, it is not a problem if strings cannot contain NUL
bytes, and thus the C type 'char *' with its NUL terminator is perfectly usable.

In areas where strings with embedded NUL bytes need to be handled, the common
approach is to use a 'char * data' pointer together with a 'size_t nbytes'
size. This works fine in code that constructs or manipulates strings with
embedded NUL bytes. But when it comes to *storing* them, for example in an
array or as key or value of a hash table, one needs a type that combines these
two fields:

  struct
  {
    size_t nbytes;
    char * data;
  }

I propose to add a module that adds such a type, together with elementary
functions that work on them.

Such a type was long known as a "string descriptor" in VMS. It's also known
as basic_string_view<char> in C++, or as String in Java.

The type that I'm proposing does not have a NUL byte appended to the data
always and automatically, because I think it is more important to have a
string_desc_substring function that does not cause memory allocation,
than to have a string_desc_c function (conversion to 'char *') that does
not cause memory allocation.

The type that I'm proposing does not have two distinct fields
nbytes_used and nbytes_allocated. Such a type, e.g. [1], attempts to
cover the use-case of accumulating a string as well. But
  - The Java experience with String vs. StringBuffer/StringBuilder
    shows that it is cleaner to separate the two use cases.
  - For the use-case of accumulating a string, C programmers have been using
    ad-hoc code with n_used and n_allocated for a long time; there is
    no need for anything else (except for lazy people who want C to be
    a scripting language).

The type that I'm proposing also does not have fields for heap management,
such as a 'bool heap' [2] or a reference count. That's because I think that
  - managing the allocated memory of a data structure is a different
    problem than that of representing a string, and it can be achieved
    with data outside the string descriptor,
  - such a field would make it wrong to simply assign a string descriptor
    to a variable.

Please let me know what you think: Does this have a place in Gnulib? (Or
should it stay in GNU gettext, where I need it for the Perl parser?)

Bruno

[1] https://github.com/websnarf/bstrlib/blob/master/bstrlib.txt
[2] https://github.com/maxim2266/str
/* GNU gettext - internationalization aids
   Copyright (C) 2023 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <br...@clisp.org>, 2023.  */

#ifndef _STRING_DESC_H
#define _STRING_DESC_H 1

/* Get size_t, ptrdiff_t.  */
#include <stddef.h>

/* Get bool.  */
#include <stdbool.h>


#ifdef __cplusplus
extern "C" {
#endif


/* Type describing a string that may contain NUL bytes.
   It's merely a descriptor of an array of bytes: it records how many bytes
   there are and where they are stored, and holds no other state.
   The bytes are not necessarily followed by a NUL terminator.  */
typedef struct string_desc_t string_desc_t;
struct string_desc_t
{
  /* Number of bytes in the string.  */
  size_t nbytes;
  /* Address of the first byte.  May be NULL when nbytes is 0.  */
  char *data;
};

/* String descriptors can be passed and returned by value.  */


/* ==== Side-effect-free operations on string descriptors ==== */

/* Return the length of the string S, in bytes.  */
extern size_t string_desc_length (string_desc_t s);

/* Return the byte at index I of string S.
   I must be < length(S); otherwise the program is aborted.  */
extern char string_desc_char_at (string_desc_t s, size_t i);

/* Return a read-only view of the bytes of S.
   The result is in general not NUL-terminated, and it may be NULL if S is
   empty.  */
extern const char * string_desc_data (string_desc_t s);

/* Return true if S is the empty string, i.e. if its length is 0.  */
extern bool string_desc_is_empty (string_desc_t s);

/* Return true if S starts with PREFIX.
   Every string starts with the empty string.  */
extern bool string_desc_startswith (string_desc_t s, string_desc_t prefix);

/* Return true if S ends with SUFFIX.
   Every string ends with the empty string.  */
extern bool string_desc_endswith (string_desc_t s, string_desc_t suffix);

/* Return > 0, == 0, or < 0 if A > B, A == B, A < B.
   This uses a lexicographic ordering, where the bytes are compared as
   'unsigned char'.  A string that is a proper prefix of another string
   compares as smaller than that other string.  */
extern int string_desc_cmp (string_desc_t a, string_desc_t b);

/* Return the index of the first occurrence of C in S,
   or -1 if there is none.  */
extern ptrdiff_t string_desc_index (string_desc_t s, char c);

/* Return the index of the last occurrence of C in S,
   or -1 if there is none.  */
extern ptrdiff_t string_desc_last_index (string_desc_t s, char c);

/* Return the index of the first occurrence of NEEDLE in HAYSTACK,
   or -1 if there is none.
   An empty NEEDLE is found at index 0.  */
extern ptrdiff_t string_desc_contains (string_desc_t haystack, string_desc_t needle);

/* Return a string that represents the C string S, of length strlen (S).
   The bytes are not copied: the result shares its storage with S.
   The terminating NUL of S is not part of the result.  */
extern string_desc_t string_desc_from_c (const char *s);

/* Return the substring of S, starting at offset START and ending at offset END.
   START must be <= END (otherwise the program is aborted), and END must be
   <= length(S), so that the result lies within S.
   The result is of length END - START.
   The result must not be freed (since its storage is part of the storage
   of S).  */
extern string_desc_t string_desc_substring (string_desc_t s, size_t start, size_t end);


/* ==== Memory-allocating operations on string descriptors ==== */

/* Return a string of length N, with uninitialized contents.
   The storage is obtained through xmalloc, unless N is 0, in which case no
   storage is allocated.  */
extern string_desc_t string_desc_new (size_t n);

/* Return a string of length N, at the given memory address.
   No memory is allocated and no bytes are copied: the result refers to the
   N bytes at ADDR.  If N is 0, ADDR is ignored.  */
extern string_desc_t string_desc_new_addr (size_t n, char *addr);

/* Return a string of length N, filled with C.
   The storage is obtained through xmalloc, unless N is 0.  */
extern string_desc_t string_desc_new_filled (size_t n, char c);

/* Return a copy of string S.
   The copy has its own storage, obtained through xmalloc, unless S is
   empty.  */
extern string_desc_t string_desc_copy (string_desc_t s);

/* Return the concatenation of N strings.  N must be > 0 (otherwise the
   program is aborted).
   N counts STRING1 as well; the N - 1 arguments that follow STRING1 must
   all be of type string_desc_t.
   The result has its own storage, obtained through xmalloc.  */
extern string_desc_t string_desc_concat (size_t n, string_desc_t string1, ...);

/* Return a copy of string S, as a NUL-terminated C string.
   The result is obtained through xmalloc and occupies length(S) + 1 bytes.
   NUL bytes contained in S are copied as they are, so strlen of the result
   may be smaller than length(S).  */
extern char * string_desc_c (string_desc_t s);


/* ==== Operations with side effects on string descriptors ==== */

/* Overwrite the byte at index I of string S with C.
   I must be < length(S); otherwise the program is aborted.  */
extern void string_desc_set_char_at (string_desc_t s, size_t i, char c);

/* Fill part of S, starting at offset START and ending at offset END,
   with copies of C.
   START must be <= END (otherwise the program is aborted), and END must be
   <= length(S), so that only bytes of S are written.  */
extern void string_desc_fill (string_desc_t s, size_t start, size_t end, char c);

/* Overwrite part of S with T, starting at offset START.
   START + length(T) must be <= length (S); otherwise the program is
   aborted.  */
extern void string_desc_overwrite (string_desc_t s, size_t start, string_desc_t t);

/* Free S.
   This passes the storage of S to free(), therefore S must be a string
   whose storage was allocated for it (for example by string_desc_new or
   string_desc_copy), not a substring or a view of other memory.  */
extern void string_desc_free (string_desc_t s);


#ifdef __cplusplus
}
#endif


#endif /* _STRING_DESC_H */
/* GNU gettext - internationalization aids
   Copyright (C) 2023 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Written by Bruno Haible <br...@clisp.org>, 2023.  */

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

/* Specification.  */
#include "str-desc.h"

#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

#include "xalloc.h"


/* ==== Side-effect-free operations on string descriptors ==== */

/* Return the number of bytes described by S.  */
size_t
string_desc_length (string_desc_t s)
{
  size_t length = s.nbytes;

  return length;
}

/* Return the byte of S at index I.
   Abort if I is not a valid index into S.  */
char
string_desc_char_at (string_desc_t s, size_t i)
{
  const char *bytes = s.data;

  if (i >= s.nbytes)
    {
      /* Invalid argument.  */
      abort ();
    }

  return *(bytes + i);
}

/* Return the address of the bytes of S, as a pointer through which they
   cannot be modified.  */
const char *
string_desc_data (string_desc_t s)
{
  const char *bytes = s.data;

  return bytes;
}

/* Return true if S has no bytes at all.  */
bool
string_desc_is_empty (string_desc_t s)
{
  if (s.nbytes > 0)
    return false;
  return true;
}

/* Return true if the first length (PREFIX) bytes of S are the bytes of
   PREFIX.  */
bool
string_desc_startswith (string_desc_t s, string_desc_t prefix)
{
  size_t prefix_len = prefix.nbytes;

  /* A string cannot start with something longer than itself.  */
  if (s.nbytes < prefix_len)
    return false;

  /* The empty prefix matches; don't call memcmp with a size of 0.  */
  if (prefix_len == 0)
    return true;

  return memcmp (s.data, prefix.data, prefix_len) == 0;
}

/* Return true if the last length (SUFFIX) bytes of S are the bytes of
   SUFFIX.  */
bool
string_desc_endswith (string_desc_t s, string_desc_t suffix)
{
  size_t suffix_len = suffix.nbytes;

  /* A string cannot end with something longer than itself.  */
  if (s.nbytes < suffix_len)
    return false;

  /* The empty suffix matches; don't call memcmp with a size of 0.  */
  if (suffix_len == 0)
    return true;

  /* Compare against the tail of S that has the same length as SUFFIX.  */
  const char *tail = s.data + (s.nbytes - suffix_len);
  return memcmp (tail, suffix.data, suffix_len) == 0;
}

/* Compare A and B lexicographically, byte by byte as 'unsigned char'.
   Return > 0, == 0, or < 0 if A > B, A == B, A < B.  */
int
string_desc_cmp (string_desc_t a, string_desc_t b)
{
  /* Compare the part that both strings have.  memcmp is not called with a
     size of 0.  */
  size_t common = (a.nbytes < b.nbytes ? a.nbytes : b.nbytes);
  int order = (common > 0 ? memcmp (a.data, b.data, common) : 0);

  if (a.nbytes == b.nbytes)
    return order;

  /* The lengths differ.  When the common part is equal, the longer string
     is the greater one.  */
  if (a.nbytes > b.nbytes)
    return (order < 0 ? -1 : 1);
  return (order > 0 ? 1 : -1);
}

/* Return the offset of the first byte of S that is equal to C,
   or -1 if S does not contain C.  */
ptrdiff_t
string_desc_index (string_desc_t s, char c)
{
  void *hit;

  /* Nothing can be found in an empty string.  */
  if (s.nbytes == 0)
    return -1;

  hit = memchr (s.data, (unsigned char) c, s.nbytes);
  if (hit == NULL)
    return -1;

  return (char *) hit - s.data;
}

/* Return the offset of the last byte of S that is equal to C,
   or -1 if S does not contain C.  */
ptrdiff_t
string_desc_last_index (string_desc_t s, char c)
{
  void *hit;

  /* Nothing can be found in an empty string.  */
  if (s.nbytes == 0)
    return -1;

  hit = memrchr (s.data, (unsigned char) c, s.nbytes);
  if (hit == NULL)
    return -1;

  return (char *) hit - s.data;
}

/* Return the offset of the first occurrence of NEEDLE in HAYSTACK,
   or -1 if HAYSTACK does not contain NEEDLE.
   An empty NEEDLE is found at offset 0.  */
ptrdiff_t
string_desc_contains (string_desc_t haystack, string_desc_t needle)
{
  if (needle.nbytes == 0)
    return 0;

  /* A needle longer than the haystack cannot occur in it.  This test also
     covers the empty haystack, whose data pointer may be NULL and therefore
     must not be passed to memmem.  */
  if (haystack.nbytes < needle.nbytes)
    return -1;

  void *found =
    memmem (haystack.data, haystack.nbytes, needle.data, needle.nbytes);
  if (found == NULL)
    return -1;

  return (char *) found - haystack.data;
}

/* Return a descriptor for the bytes of the C string S, without its
   terminating NUL.  The bytes are not copied.  */
string_desc_t
string_desc_from_c (const char *s)
{
  string_desc_t view;
  size_t len = strlen (s);

  /* The descriptor type has no const variant, hence the cast.  */
  view.data = (char *) s;
  view.nbytes = len;

  return view;
}

/* Return the substring of S that starts at offset START and ends at offset
   END.  The result has length END - START and shares its storage with S; no
   bytes are copied.
   START must be <= END and END must be <= length (S); otherwise the program
   is aborted.  */
string_desc_t
string_desc_substring (string_desc_t s, size_t start, size_t end)
{
  string_desc_t result;

  /* Reject ranges that are reversed or that extend beyond the end of S: the
     result would describe bytes that do not belong to S.  */
  if (!(start <= end && end <= s.nbytes))
    /* Invalid arguments.  */
    abort ();

  result.nbytes = end - start;
  result.data = s.data + start;

  return result;
}


/* ==== Memory-allocating operations on string descriptors ==== */

/* Return a string of N bytes whose contents are uninitialized.
   For N == 0, nothing is allocated and the data pointer is NULL.  */
string_desc_t
string_desc_new (size_t n)
{
  string_desc_t fresh;

  fresh.data = (n > 0 ? (char *) xmalloc (n) : NULL);
  fresh.nbytes = n;

  return fresh;
}

/* Return a string that describes the N bytes at ADDR.  Nothing is allocated
   or copied.
   For N == 0, ADDR is ignored and the data pointer is NULL.  */
string_desc_t
string_desc_new_addr (size_t n, char *addr)
{
  string_desc_t view;

  view.data = (n > 0 ? addr : NULL);
  view.nbytes = n;

  return view;
}

/* Return a string of N bytes, each of which is C.
   For N == 0, nothing is allocated and the data pointer is NULL.  */
string_desc_t
string_desc_new_filled (size_t n, char c)
{
  string_desc_t fresh;
  char *buffer = NULL;

  if (n > 0)
    {
      buffer = (char *) xmalloc (n);
      memset (buffer, (unsigned char) c, n);
    }

  fresh.data = buffer;
  fresh.nbytes = n;

  return fresh;
}

/* Return a string with the same bytes as S, in storage of its own.
   For an empty S, nothing is allocated and the data pointer is NULL.  */
string_desc_t
string_desc_copy (string_desc_t s)
{
  string_desc_t duplicate;
  size_t len = s.nbytes;
  char *buffer = NULL;

  if (len > 0)
    {
      buffer = (char *) xmalloc (len);
      memcpy (buffer, s.data, len);
    }

  duplicate.data = buffer;
  duplicate.nbytes = len;

  return duplicate;
}

/* Return the concatenation of N strings: STRING1 followed by the N - 1
   further arguments, which must all be of type string_desc_t.
   N must be > 0; otherwise the program is aborted.
   The result is stored in memory obtained through xmalloc.  */
string_desc_t
string_desc_concat (size_t n, string_desc_t string1, ...)
{
  if (n == 0)
    /* Invalid argument.  */
    abort ();

  /* First pass over the arguments: determine the length of the result.
     N must stay unchanged here, because the second pass needs it too.  */
  size_t total = string1.nbytes;
  if (n > 1)
    {
      va_list other_strings;
      size_t i;

      va_start (other_strings, string1);
      for (i = n - 1; i > 0; i--)
        {
          string_desc_t arg = va_arg (other_strings, string_desc_t);
          total += arg.nbytes;
        }
      va_end (other_strings);
    }

  /* Second pass over the arguments: copy the bytes of each string.
     Empty strings are skipped, since their data pointer may be NULL.  */
  char *combined = (char *) xmalloc (total);
  size_t pos = 0;
  if (string1.nbytes > 0)
    memcpy (combined, string1.data, string1.nbytes);
  pos += string1.nbytes;
  if (n > 1)
    {
      va_list other_strings;
      size_t i;

      va_start (other_strings, string1);
      for (i = n - 1; i > 0; i--)
        {
          string_desc_t arg = va_arg (other_strings, string_desc_t);
          if (arg.nbytes > 0)
            memcpy (combined + pos, arg.data, arg.nbytes);
          pos += arg.nbytes;
        }
      va_end (other_strings);
    }

  string_desc_t result;
  result.nbytes = total;
  result.data = combined;

  return result;
}

/* Return a NUL-terminated copy of the bytes of S, in memory obtained
   through xmalloc.  NUL bytes contained in S are copied unchanged.  */
char *
string_desc_c (string_desc_t s)
{
  size_t len = s.nbytes;
  /* One extra byte for the terminating NUL.  */
  char *copy = (char *) xmalloc (len + 1);

  if (len != 0)
    memcpy (copy, s.data, len);
  *(copy + len) = '\0';

  return copy;
}


/* ==== Operations with side effects on string descriptors ==== */

/* Store C as the byte of S at index I.
   Abort if I is not a valid index into S.  */
void
string_desc_set_char_at (string_desc_t s, size_t i, char c)
{
  char *bytes = s.data;

  if (i >= s.nbytes)
    {
      /* Invalid argument.  */
      abort ();
    }

  *(bytes + i) = c;
}

/* Set the bytes of S from offset START (inclusive) to offset END
   (exclusive) to C.
   START must be <= END and END must be <= length (S); otherwise the program
   is aborted.  */
void
string_desc_fill (string_desc_t s, size_t start, size_t end, char c)
{
  /* Reject ranges that are reversed or that extend beyond the end of S: the
     memset below would write to bytes that do not belong to S.  */
  if (!(start <= end && end <= s.nbytes))
    /* Invalid arguments.  */
    abort ();

  if (start < end)
    memset (s.data + start, (unsigned char) c, end - start);
}

/* Replace the length (T) bytes of S that start at offset START with the
   bytes of T.
   START + length (T) must be <= length (S); otherwise the program is
   aborted.  */
void
string_desc_overwrite (string_desc_t s, size_t start, string_desc_t t)
{
  /* This is the test START + length (T) <= length (S), written so that it
     cannot be fooled by a wraparound of the sum.  */
  if (!(start <= s.nbytes && t.nbytes <= s.nbytes - start))
    /* Invalid arguments.  */
    abort ();

  /* T may share storage with S (for example when it is a substring of S),
     therefore use memmove, which permits overlapping regions.  */
  if (t.nbytes > 0)
    memmove (s.data + start, t.data, t.nbytes);
}

/* Release the storage of S by passing its data pointer to free().  */
void
string_desc_free (string_desc_t s)
{
  char *storage = s.data;

  free (storage);
}

Reply via email to