Module Name:    src
Committed By:   christos
Date:           Thu Aug  1 06:22:52 UTC 2019

Modified Files:
        src/external/historical/nawk/dist: b.c

Log Message:
PR/54424: Martijn Dekker: awk: broken character classes in UTF-8 locale:
only the first matches
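Pick up some of the fixes from upstream:
        - POSIX parenthesis matching (an unmatched ')' is treated as a literal)
        - recognize the \v and \a escape sequences
        - add FATAL handling for some internal-error cases
        - initialize the whole character range (up to UCHAR_MAX) for
          character classes.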


To generate a diff of this commit:
cvs rdiff -u -r1.7 -r1.8 src/external/historical/nawk/dist/b.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/external/historical/nawk/dist/b.c
diff -u src/external/historical/nawk/dist/b.c:1.7 src/external/historical/nawk/dist/b.c:1.8
--- src/external/historical/nawk/dist/b.c:1.7	Thu Aug  1 02:14:45 2019
+++ src/external/historical/nawk/dist/b.c	Thu Aug  1 02:22:52 2019
@@ -31,6 +31,7 @@ THIS SOFTWARE.
 #define	DEBUG
 
 #include <ctype.h>
+#include <limits.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
@@ -333,6 +334,10 @@ int quoted(const uschar **pp)	/* pick up
 		c = '\r';
 	else if (c == 'b')
 		c = '\b';
+	else if (c == 'v')
+		c = '\v';
+	else if (c == 'a')
+		c = '\a';
 	else if (c == '\\')
 		c = '\\';
 	else if (c == 'x') {	/* hexadecimal goo follows */
@@ -978,6 +983,7 @@ static int repeat(const uschar *reptok, 
 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
 		if (firstnum < 2) {
 			/* 0 or 1: should be handled before you get here */
+			FATAL("internal error");
 		} else {
 			return replace_repeat(reptok, reptoklen, atom, atomlen,
 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
@@ -998,6 +1004,7 @@ static int repeat(const uschar *reptok, 
 		return replace_repeat(reptok, reptoklen, atom, atomlen,
 					firstnum, secondnum, REPEAT_WITH_Q);
 	} else {	/* Error - shouldn't be here (n>m) */
+		FATAL("internal error");
 	}
 	return 0;
 }
@@ -1013,6 +1020,7 @@ int relex(void)		/* lexical analyzer for
 	int i;
 	int num, m, commafound, digitfound;
 	const uschar *startreptok;
+	static int parens = 0;
 
 rescan:
 	starttok = prestr;
@@ -1026,9 +1034,18 @@ rescan:
 	case '\0': prestr--; return '\0';
 	case '^':
 	case '$':
+		return c;
 	case '(':
+		parens++;
+ 		return c;
 	case ')':
-		return c;
+		if (parens) {
+			parens--;
+			return c;
+		}
+		/* unmatched close parenthesis; per POSIX, treat as literal */
+		rlxval = c;
+		return CHAR;
 	case '\\':
 		rlxval = quoted(&prestr);
 		return CHAR;
@@ -1064,7 +1081,15 @@ rescan:
 				if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
 				    prestr[2 + cc->cc_namelen] == ']') {
 					prestr += cc->cc_namelen + 3;
-					for (i = 1; i < NCHARS; i++) {
+					/*
+					 * BUG: We begin at 1, instead of 0, since we
+					 * would otherwise prematurely terminate the
+					 * string for classes like [[:cntrl:]]. This
+					 * means that we can't match the NUL character,
+					 * not without first adapting the entire
+					 * program to track each string's length.
+					 */
+					for (i = 1; i <= UCHAR_MAX; i++) {
 						if (!adjbuf(&buf, &bufsz, bp-buf+1, 100, &bp, "relex2"))
 						    FATAL("out of space for reg expr %.10s...", lastre);
 						if (cc->cc_func(i)) {

Reply via email to