[dev] [sbase] [PATCH] Rewrite tr(1) in a sane way

FRIGN Fri, 09 Jan 2015 11:40:29 -0800

Hello fellow hackers,

the current tr(1)-implementation has really been giving me nightmares,
so I rewrote it.
Given POSIX really sucks in some areas, I went off the path at some
areas, but not in a way that it would break scripts.
Here's a comparison and you let me know what you prefer:


1) GNU coreutils:
-----------------

$ echo "Motörhead" | tr öo oö
M�to�rhead

What happens? coreutils-tr(1) doesn't support multibyte characters
and actually interprets ö as multiple single characters, which is
the reason why it obviously messes it up.

2) old sbase-tr:
----------------

$ echo "Motörhead" | ./tr öo oö
Mötorhead
$ echo "x    x" | ./tr -s " "
usage: ./tr [-d] [-c] set1 [set2]
$ wc -l tr.c
356 tr.c

Oh geez! You can't squeeze! Well, seems like I have to use coreutils now.

3) new tr:
----------

$ echo "Motörhead" | ./tr öo oö
Mötorhead
$ echo "x    x" | ./tr -s " "
x x
$ wc -l tr.c
243 tr.c

Works just fine!

Please test it and let me know what you think!

Cheers

FRIGN

-- 
FRIGN <d...@frign.de>

>From 2ff2c365fac5a0c0c0b6ee88cbbb4502a2dcf0a6 Mon Sep 17 00:00:00 2001
From: FRIGN <d...@frign.de>
Date: Fri, 9 Jan 2015 20:36:27 +0100
Subject: [PATCH] Rewrite tr(1) in a sane way

tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:

 - UTF-8: not allowed in POSIX, but in my opinion a must. This
          finally allows you to work with UTF-8 streams without
          problems or unexpected behaviour.
 - Equivalence classes: Left out, even GNU coreutils ignore them
                        and depending on LC_COLLATE, which sucks.
 - Character classes: No experiments or environment-variable-trickery.
                      Just plain definitions derived from the POSIX-
                      standard, working as expected.

I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
---
 Makefile |   1 +
 tr.c     | 487 ++++++++++++++++++++++++---------------------------------------
 utf.h    |   1 +
 3 files changed, 189 insertions(+), 300 deletions(-)

diff --git a/Makefile b/Makefile
index 14dc982..b4565bf 100644
--- a/Makefile
+++ b/Makefile
@@ -20,6 +20,7 @@ HDR =\
 
 LIBUTF = libutf.a
 LIBUTFSRC =\
+	libutf/chartorunearr.c\
 	libutf/readrune.c\
 	libutf/rune.c\
 	libutf/runetype.c\
diff --git a/tr.c b/tr.c
index b661048..cfd1c94 100644
--- a/tr.c
+++ b/tr.c
@@ -1,356 +1,243 @@
-/* See LICENSE file for copyright and license details. */
-#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <string.h>
-#include <wchar.h>
 
-#include "text.h"
+#include "utf.h"
 #include "util.h"
 
-static void
-usage(void)
-{
-	eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0);
-}
-
-static int dflag, cflag;
-static wchar_t mappings[0x110000];
+static int cflag = 0;
+static int dflag = 0;
+static int sflag = 0;
 
-struct wset_state {
-	char *s;               /* current character */
-	wchar_t rfirst, rlast; /* first and last in range */
-	wchar_t prev;          /* previous returned character */
-	int prev_was_range;    /* was the previous character part of a c-c range? */
+struct range {
+	Rune   start;
+	Rune   end;
+	size_t quant;
 };
 
-struct set_state {
-	char *s, rfirst, rlast, prev;
-	int prev_was_octal; /* was the previous returned character written in octal? */
+#define DIGIT "0-9"
+#define UPPER "A-Z"
+#define LOWER "a-z"
+#define PUNCT "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+#define ALNUM DIGIT UPPER LOWER
+
+struct class {
+	char  *name;
+	char  *str;
+} classes[] = {
+	{ "alnum",  ALNUM           },
+	{ "alpha",  UPPER LOWER     },
+	{ "blank",  " \t"           },
+	{ "cntrl",  "\000-\037\177" },
+	{ "digit",  DIGIT           },
+	{ "graph",  ALNUM PUNCT     },
+	{ "lower",  LOWER           },
+	{ "print",  ALNUM PUNCT " " },
+	{ "punct",  PUNCT           },
+	{ "space",  "\t\n\v\f\r"    },
+	{ "upper",  UPPER           },
+	{ "xdigit", DIGIT "A-Fa-f"  },
 };
 
-static void
-set_state_defaults(struct set_state *s)
-{
-	s->rfirst = 1;
-	s->rlast = 0;
-	s->prev_was_octal = 1;
-}
+struct range *set1 = NULL;
+size_t set1ranges  = 0;
+struct range *set2 = NULL;
+size_t set2ranges  = 0;
 
-static void
-wset_state_defaults(struct wset_state *s)
+static size_t
+rangelen(struct range r)
 {
-	s->rfirst = 1;
-	s->rlast = 0;
-	s->prev_was_range = 1;
+	return (r.end - r.start + 1) * r.quant;
 }
 
-/* sets *s to the char that was intended to be written.
- * returns how many bytes the s pointer has to advance to skip the
- * escape sequence if it was an octal, always zero otherwise. */
-static int
-resolve_escape(char *s)
+static size_t
+setlen(struct range *set, size_t setranges)
 {
 	int i;
-	unsigned char c;
-
-	switch (*s) {
-	case 'n':
-		*s = '\n';
-		return 0;
-	case 't':
-		*s = '\t';
-		return 0;
-	case 'r':
-		*s = '\r';
-		return 0;
-	case 'f':
-		*s = '\f';
-		return 0;
-	case 'a':
-		*s = '\a';
-		return 0;
-	case 'b':
-		*s = '\b';
-		return 0;
-	case 'v':
-		*s = '\v';
-		return 0;
-	case '\\':
-		*s = '\\';
-		return 0;
-	case '\0':
-		eprintf("stray '\\' at end of input:");
-	default: ;
-	}
+	size_t len = 0;
 
-	if(*s < '0' || *s > '7')
-		eprintf("invalid character after '\\':");
-	for(i = 0, c = 0; s[i] >= '0' && s[i] <= '7' && i < 3; i++) {
-		c <<= 3;
-		c += s[i]-'0';
-	}
-	if(*s > '3' && i == 3)
-		eprintf("octal byte cannot be bigger than 377:");
-	*s = c;
-	return i;
-}
-
-#define embtowc(a, b) mbtowc(a, b, 4)
-
-static int
-xmbtowc(wchar_t *unicodep, const char *s)
-{
-	int rv;
+	for (i = 0; i < setranges; i++)
+		len += rangelen(set[i]);
 
-	rv = embtowc(unicodep, s);
-	if (rv < 0)
-			eprintf("mbtowc: invalid input sequence:");
-	return rv;
+	return len;
 }
 
 static int
-has_octal_escapes(const char *s)
-{
-	while (*s)
-		if (*s++ == '\\' && *s >= '0' && *s <= '7')
-			return 1;
-	return 0;
-}
-
-static char
-get_next_char(struct set_state *s)
-{
-	char c;
-	int nchars;
-
-start:
-	if (s->rfirst <= s->rlast) {
-		c = s->rfirst;
-		s->rfirst++;
-		return c;
-	}
-
-	if (*s->s == '-' && !s->prev_was_octal) {
-		s->s++;
-		if (!*s->s)
-			return '-';
-		if (*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
-			goto char_is_octal;
-		s->rlast = *(s->s)++;
-		if (!s->rlast)
-			return '\0';
-		s->prev_was_octal = 1;
-		s->rfirst = ++(s->prev);
-		goto start;
-	}
-	if (*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
-		goto char_is_octal;
-
-	s->prev_was_octal = 0;
-	c = *(s->s)++;
-	s->prev = c;
-	return c;
-
-char_is_octal:
-	s->prev_was_octal = 1;
-	c = *s->s;
-	s->s += nchars;
-	return c;
-}
-
-static wchar_t
-get_next_wchar(struct wset_state *s)
+rstrmatch(Rune *r, char *s, size_t n)
 {
-start:
-	if (s->rfirst <= s->rlast) {
-		s->prev = s->rfirst;
-		s->rfirst++;
-		return s->prev;
-	}
-
-	if (*s->s == '-' && !s->prev_was_range) {
-		s->s++;
-		if (!*s->s)
-			return '-';
-		if (*s->s == '\\')
-			resolve_escape(++(s->s));
-		s->s += xmbtowc(&s->rlast, s->s);
-		if (!s->rlast)
-			return '\0';
-		s->rfirst = ++(s->prev);
-		s->prev_was_range = 1;
-		goto start;
-	}
+	size_t i;
 
-	if (*s->s == '\\')
-		resolve_escape(++(s->s));
-	s->s += xmbtowc(&s->prev, s->s);
-	s->prev_was_range = 0;
-	return s->prev;
+	for (i = 0; i < n; i++)
+		if (r[i] != s[i])
+			return 0;
+	return 1;
 }
 
-static int
-is_mapping_wide(const char *set1, const char *set2)
+static size_t
+makeset(char *str, struct range **set)
 {
-	struct set_state ss1, ss2;
-	struct wset_state wss1, wss2;
-	wchar_t wc1, wc2, last_wc2;
-
-	if (has_octal_escapes(set1)) {
-		set_state_defaults(&ss1);
-		ss1.s = (char *) set1;
-		if (set2) {
-			set_state_defaults(&ss2);
-			ss2.s = (char *) set2;
-			/* if the character returned is from an octal triplet, it might be null
-			 * and still need to continue */
-			while ((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal ) {
-				if (!(wc2 = (unsigned char) get_next_char(&ss2)))
-					wc2 = last_wc2;
-				mappings[wc1] = wc2;
-				last_wc2 = wc2;
-			}
-		} else {
-			while ((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal)
-				mappings[wc1] = 1;
-		}
-		return 0;
-	} else {
-		wset_state_defaults(&wss1);
-		wss1.s = (char *) set1;
-		if (set2) {
-			wset_state_defaults(&wss2);
-			wss2.s = (char *) set2;
-			while ((wc1 = get_next_wchar(&wss1))) {
-				if (!(wc2 = get_next_wchar(&wss2)))
-					wc2 = last_wc2;
-				mappings[wc1] = wc2;
-				last_wc2 = wc2;
+	Rune  *rstr;
+	size_t len, i, j, m, n;
+	size_t q, setranges;
+	int    factor, base;
+
+reset:
+	setranges = 0;
+
+	/* rstr defines at most len ranges */
+	len = chartorunearr(str, &rstr);
+	*set = emalloc(len * sizeof(**set));
+
+	/* todo: allow expressions */
+	for (i = 0; i < len; i++) {
+		if (rstr[i] == '[') {
+			j = i;
+nextbrack:
+			if (j == len)
+				goto literal;
+			for (m = j; m < len; m++)
+				if (rstr[m] == ']') {
+					j = m;
+					break;
+				}
+			if (j == i)
+				goto literal;
+
+			/* CLASSES [=EQUIV=] (skip) */
+			if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
+				i = j;
+				continue;
 			}
-		} else {
-			while ((wc1 = get_next_wchar(&wss1)))
-				mappings[wc1] = 1;
-		}
-		return 1;
-	}
-	return 0; /* unreachable */
-}
-
-static void
-wmap_null(char *in, ssize_t nbytes)
-{
-	char *s;
-	wchar_t rune;
-	int parsed_bytes = 0;
 
-	s = in;
-	while (nbytes) {
-		parsed_bytes = embtowc(&rune, s);
-		if (parsed_bytes < 0) {
-			rune = *s;
-			parsed_bytes = 1;
-		}
-		if (((!mappings[rune])&1) ^ cflag)
-			putwchar(rune);
-		s += parsed_bytes;
-		nbytes -= parsed_bytes;
-	}
-}
+			/* CLASSES [:CLASS:] */
+			if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
+				for (n = 0; n < LEN(classes); n++) {
+					if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
+						str = classes[n].str;
+						goto reset;
+					}
+				}
+				eprintf("Invalid character class\n");
+			}
 
-static void
-wmap_set(char *in, ssize_t nbytes)
-{
-	char *s;
-	wchar_t rune;
-	int parsed_bytes = 0;
+			/* REPEAT  [_*n] (only allowed in set2) */
+			if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
+				/* check if right side of '*' is a number */
+				q = 0;
+				factor = 1;
+				base = (rstr[i + 3] == '0') ? 8 : 10;
+				for (n = j - 1; n > i + 2; n--) {
+					if (rstr[n] < '0' && rstr[n] > '9') {
+						n = 0;
+						break;
+					}
+					q += (rstr[n] - '0') * factor;
+					factor *= base;
+				}
+
+				if (n == 0) {
+					j = m + 1;
+					goto nextbrack;
+				}
+				(*set)[setranges].start = rstr[i + 1];
+				(*set)[setranges].end   = rstr[i + 1];
+				(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
+				setranges++;
+				i = j;
+				continue;
+			}
 
-	s = in;
-	while (nbytes) {
-		parsed_bytes = embtowc(&rune, s);
-		if (parsed_bytes < 0) {
-			rune = *s;
-			parsed_bytes = 1;
+			j = m + 1;
+			goto nextbrack;
 		}
-		if (!mappings[rune])
-			putwchar(rune);
-		else
-			putwchar(mappings[rune]);
-		nbytes -= parsed_bytes;
-		s += parsed_bytes;
+literal:
+		/* RANGES [_-__-_], _-__-_ */
+		/* LITERALS _______ */
+		(*set)[setranges].start = rstr[i];
+
+		if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
+			i += 2;
+		(*set)[setranges].end = rstr[i];
+		(*set)[setranges].quant = 1;
+		setranges++;
 	}
-}
 
-static void
-map_null(char *in, ssize_t nbytes)
-{
-	char *s;
-
-	for (s = in; nbytes; s++, nbytes--)
-		if (((!mappings[(unsigned char)*s])&1) ^ cflag)
-			putchar(*s);
+	free(rstr);
+	return setranges;
 }
 
 static void
-map_set(char *in, ssize_t nbytes)
+usage(void)
 {
-	char *s;
-
-	for (s = in; nbytes; s++, nbytes--)
-		if (!mappings[(unsigned char)*s])
-			putchar(*s);
-		else
-			putchar(mappings[(unsigned char)*s]);
+	eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
 }
 
 int
 main(int argc, char *argv[])
 {
-	char *buf = NULL;
-	size_t size = 0;
-	ssize_t nbytes;
-	void (*mapfunc)(char*, ssize_t);
-
-	setlocale(LC_ALL, "");
-	dflag = cflag = 0;
+	Rune r = 0, lastrune = 0;
+	int i, m;
+	size_t off1, off2;
 
 	ARGBEGIN {
+	case 'c':
+	case 'C':
+		cflag = 1;
+		break;
 	case 'd':
 		dflag = 1;
 		break;
-	case 'c':
-		cflag = 1;
+	case 's':
+		sflag = 1;
 		break;
 	default:
 		usage();
 	} ARGEND;
 
-	if (argc == 0)
+	if (argc < 1 || argc > 2 || (argc == 1 && dflag == sflag))
 		usage();
+	set1ranges = makeset(argv[0], &set1);
+	if (argc == 2)
+		set2ranges = makeset(argv[1], &set2);
+read:
+	if (!readrune("<stdin>", stdin, &r))
+		return 0;
+	off1 = off2 = 0;
+	for (i = 0; i < set1ranges; i++) {
+		if (set1[i].start <= r && r <= set1[i].end) {
+			if (dflag && !cflag)
+				goto read;
+			if (sflag) {
+				if (r == lastrune)
+					goto read;
+				else
+					goto write;
+			}
+			for (m = 0; m < i; m++)
+				off1 += rangelen(set1[m]);
+			off1 += r - set1[m].start;
+			if (off1 > setlen(set2, set2ranges) - 1) {
+				r = set2[set2ranges - 1].end;
+				goto write;
+			}
+			for (m = 0; m < set2ranges; m++) {
+				if (off2 + rangelen(set2[m]) > off1) {
+					m++;
+					break;
+				}
+				off2 += rangelen(set2[m]);
+			}
+			m--;
+			r = set2[m].start + (off1 - off2) / set2[m].quant;
 
-	if (dflag) {
-		if (argc != 1)
-			usage();
-		if (is_mapping_wide(argv[0], NULL))
-			mapfunc = wmap_null;
-		else
-			mapfunc = map_null;
-	} else if (cflag) {
-		usage();
-	} else if (argc == 2) {
-		if (is_mapping_wide(argv[0], argv[1]))
-			mapfunc = wmap_set;
-		else
-			mapfunc = map_set;
-	} else {
-		usage();
+			goto write;
+		}
 	}
-
-	while ((nbytes = getline(&buf, &size, stdin)) != -1)
-		mapfunc(buf, nbytes);
-	free(buf);
-	if (ferror(stdin))
-		eprintf("<stdin>: read error:");
-
-	return 0;
+	if (dflag && cflag)
+		goto read;
+	if (dflag && sflag && r == lastrune)
+		goto read;
+write:
+	lastrune = r;
+	writerune("<stdout>", stdout, &r);
+	goto read;
 }
diff --git a/utf.h b/utf.h
index 26b9a6f..96cc361 100644
--- a/utf.h
+++ b/utf.h
@@ -51,3 +51,4 @@ int isdigitrune(Rune);
 
 int readrune(const char *, FILE *, Rune *);
 void writerune(const char *, FILE *, Rune *);
+int chartorunearr(const char*, Rune **);
-- 
1.8.5.5

[dev] [sbase] [PATCH] Rewrite tr(1) in a sane way

Reply via email to