curdeius created this revision.

This patch is motivated by bug #34739 
(https://bugs.llvm.org/show_bug.cgi?id=34739).
Clang-format misbehaves because the escape sequence '\t' in a regex is treated 
as a literal 't'.
This patch handles escaped characters inside square brackets as well (so that 
somebody can write `R"(^[\[\]\t])"` as a regex pattern).


https://reviews.llvm.org/D38357

Files:
  lib/Support/regcomp.c
  unittests/Support/RegexTest.cpp

Index: unittests/Support/RegexTest.cpp
===================================================================
--- unittests/Support/RegexTest.cpp
+++ unittests/Support/RegexTest.cpp
@@ -61,6 +61,76 @@
   EXPECT_TRUE(r5.match(String));
 }
 
+TEST_F(RegexTest, Tabulators) {
+  Regex r1("^\\t+$");
+  EXPECT_TRUE(r1.match("\t"));
+  EXPECT_TRUE(r1.match("\t\t\t"));
+  EXPECT_FALSE(r1.match(""));
+  EXPECT_FALSE(r1.match(" "));
+  EXPECT_FALSE(r1.match(" \t "));
+
+  Regex r2("^(\\t| )+$");
+  EXPECT_TRUE(r2.match("\t"));
+  EXPECT_TRUE(r2.match("\t\t\t"));
+  EXPECT_FALSE(r2.match(""));
+  EXPECT_TRUE(r2.match(" "));
+  EXPECT_TRUE(r2.match(" \t "));
+
+  Regex r3("^[\\t ]+$");
+  EXPECT_TRUE(r3.match("\t"));
+  EXPECT_TRUE(r3.match("\t\t\t"));
+  EXPECT_FALSE(r3.match(""));
+  EXPECT_TRUE(r3.match(" "));
+  EXPECT_TRUE(r3.match(" \t "));
+}
+
+TEST_F(RegexTest, EscapedBackslash) {
+  Regex r1("^\\\\+$");
+  EXPECT_TRUE(r1.match("\\"));
+  EXPECT_TRUE(r1.match("\\\\\\"));
+  EXPECT_FALSE(r1.match(""));
+  EXPECT_FALSE(r1.match(" "));
+  EXPECT_FALSE(r1.match(" \\ "));
+
+  Regex r2("^(\\\\| )+$");
+  EXPECT_TRUE(r2.match("\\"));
+  EXPECT_TRUE(r2.match("\\\\\\"));
+  EXPECT_FALSE(r2.match(""));
+  EXPECT_TRUE(r2.match(" "));
+  EXPECT_TRUE(r2.match(" \\ "));
+
+  Regex r3("^[\\\\ ]+$");
+  EXPECT_TRUE(r3.match("\\"));
+  EXPECT_TRUE(r3.match("\\\\\\"));
+  EXPECT_FALSE(r3.match(""));
+  EXPECT_TRUE(r3.match(" "));
+  EXPECT_TRUE(r3.match(" \\ "));
+}
+
+TEST_F(RegexTest, EscapedOrdinaryCharacters) {
+  Regex r1("^\\X+$");
+  EXPECT_TRUE(r1.match("X"));
+  EXPECT_TRUE(r1.match("XXX"));
+  EXPECT_FALSE(r1.match(""));
+  EXPECT_FALSE(r1.match(" "));
+  EXPECT_FALSE(r1.match(" X "));
+
+  Regex r2("^(\\X| )+$");
+  EXPECT_TRUE(r2.match("X"));
+  EXPECT_TRUE(r2.match("XXX"));
+  EXPECT_FALSE(r2.match(""));
+  EXPECT_TRUE(r2.match(" "));
+  EXPECT_TRUE(r2.match(" X "));
+
+  Regex r3("^[\\X ]+$");
+  EXPECT_TRUE(r3.match("X"));
+  EXPECT_TRUE(r3.match("XXX"));
+  EXPECT_FALSE(r3.match(""));
+  EXPECT_TRUE(r3.match(" "));
+  EXPECT_TRUE(r3.match(" X "));
+}
+
+
 TEST_F(RegexTest, Backreferences) {
   Regex r1("([a-z]+)_\\1");
   SmallVector<StringRef, 4> Matches;
@@ -100,7 +170,7 @@
 
   EXPECT_EQ("aber", Regex("[0-9]+").sub("\\", "a1234ber", &Error));
   EXPECT_EQ(Error, "replacement string contained trailing backslash");
-  
+
   // Backreferences
   EXPECT_EQ("aa1234bber", Regex("a[0-9]+b").sub("a\\0b", "a1234ber", &Error));
   EXPECT_EQ("", Error);
Index: lib/Support/regcomp.c
===================================================================
--- lib/Support/regcomp.c
+++ lib/Support/regcomp.c
@@ -403,10 +403,17 @@
 			EMIT(O_BACK, backrefnum);
 			p->g->backrefs = 1;
 		} else {
-			/* Other chars are simply themselves when escaped with a backslash.
-			 */
-			ordinary(p, c);
-		}
+            switch (c) {
+            case 't':
+              ordinary(p, '\t');
+              break;
+            default:
+              /* Other chars are simply themselves when escaped with a
+               * backslash. */
+              ordinary(p, c);
+              break;
+            }
+        }
 		break;
 	case '{':		/* okay as ordinary except if digit follows */
 		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
@@ -787,7 +794,19 @@
 	default:		/* symbol, ordinary character, or range */
 /* xxx revision needed for multichar stuff */
 		start = p_b_symbol(p);
-		if (SEE('-') && MORE2() && PEEK2() != ']') {
+        if ((start == '\\')) {
+          /* escape */
+          REQUIRE(MORE(), REG_EESCAPE);
+          c = GETNEXT();
+          switch (c) {
+          case 't':
+            start = finish = '\t';
+            break;
+          default:
+            start = finish = c;
+            break;
+          }
+		} else if (SEE('-') && MORE2() && PEEK2() != ']') {
 			/* range */
 			NEXT();
 			if (EAT('-'))
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to