[PATCH] D129706: [NFCI][clang-tidy] Reimplement GlobList without relying on Regex.

Nathan James via Phabricator via cfe-commits Wed, 13 Jul 2022 16:44:54 -0700

njames93 created this revision.
njames93 added reviewers: alexfh, aaron.ballman.
Herald added subscribers: carlosgalvezp, xazax.hun.
Herald added a project: All.
njames93 requested review of this revision.
Herald added a project: clang-tools-extra.
Herald added a subscriber: cfe-commits.


Regex is a little cumbersome for the limited syntax GlobList supports and its 
implementation is a bit of a hack.
This implementation supports exactly what we require and nothing more.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D129706

Files:
  clang-tools-extra/clang-tidy/GlobList.cpp
  clang-tools-extra/clang-tidy/GlobList.h
  clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp

Index: clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
===================================================================
--- clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
+++ clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
@@ -16,6 +16,13 @@
   EXPECT_FALSE(Filter.contains("aaa"));
 }
 
+TYPED_TEST(GlobListTest, NotEmpty) {
+  TypeParam Filter("*,-");
+
+  EXPECT_FALSE(Filter.contains(""));
+  EXPECT_TRUE(Filter.contains("aaa"));
+}
+
 TYPED_TEST(GlobListTest, Nothing) {
   TypeParam Filter("-*");
 
@@ -57,6 +64,42 @@
   EXPECT_FALSE(Filter.contains("bbbb"));
 }
 
+TYPED_TEST(GlobListTest, PreAndPostFixPattern) {
+  TypeParam Filter("a*b");
+
+  EXPECT_TRUE(Filter.contains("ab"));
+  EXPECT_TRUE(Filter.contains("aMiddleb"));
+  EXPECT_FALSE(Filter.contains("aba"));
+  EXPECT_FALSE(Filter.contains("bab"));
+}
+
+TYPED_TEST(GlobListTest, DoubleWildcard) {
+  {
+    TypeParam Filter("a**b");
+
+    EXPECT_TRUE(Filter.contains("ab"));
+    EXPECT_TRUE(Filter.contains("aMiddleb"));
+    EXPECT_FALSE(Filter.contains("aba"));
+    EXPECT_FALSE(Filter.contains("bab"));
+  }
+  {
+    TypeParam Filter("a**");
+
+    EXPECT_TRUE(Filter.contains("ab"));
+    EXPECT_TRUE(Filter.contains("aMiddleb"));
+    EXPECT_TRUE(Filter.contains("aba"));
+    EXPECT_FALSE(Filter.contains("bab"));
+  }
+  {
+    TypeParam Filter("**b");
+
+    EXPECT_TRUE(Filter.contains("ab"));
+    EXPECT_TRUE(Filter.contains("aMiddleb"));
+    EXPECT_FALSE(Filter.contains("aba"));
+    EXPECT_TRUE(Filter.contains("bab"));
+  }
+}
+
 TYPED_TEST(GlobListTest, PatternPriority) {
   // The last glob that matches the string decides whether that string is
   // included or excluded.
@@ -107,6 +150,9 @@
 TYPED_TEST(GlobListTest, NewlineCharactersAsSeparators) {
   TypeParam Filter("a*  \n b,\n-c*,dd");
 
+  // FIXME: Define this behaviour, There are 2 seperator characters next to each
+  // other (,\n), Technically that should be represented as containing an empty
+  // item.
   EXPECT_FALSE(Filter.contains(""));
   EXPECT_TRUE(Filter.contains("aaa"));
   EXPECT_TRUE(Filter.contains("b"));
@@ -117,5 +163,14 @@
   EXPECT_FALSE(Filter.contains("ddd"));
 }
 
+TYPED_TEST(GlobListTest, IgnoreNegativeMatches) {
+  TypeParam Filter("a,-a,b*,-b*,-*", false);
+
+  EXPECT_FALSE(Filter.contains(""));
+  EXPECT_TRUE(Filter.contains("a"));
+  EXPECT_TRUE(Filter.contains("b"));
+  EXPECT_TRUE(Filter.contains("bar"));
+}
+
 } // namespace tidy
 } // namespace clang
Index: clang-tools-extra/clang-tidy/GlobList.h
===================================================================
--- clang-tools-extra/clang-tidy/GlobList.h
+++ clang-tools-extra/clang-tidy/GlobList.h
@@ -10,10 +10,10 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GLOBLIST_H
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Regex.h"
+#include <climits>
+#include <vector>
 
 namespace clang {
 namespace tidy {
@@ -25,7 +25,9 @@
 /// them in the order of appearance in the list.
 class GlobList {
 public:
-  virtual ~GlobList() = default;
+  virtual ~GlobList();
+
+  GlobList(const GlobList &) = delete;
 
   /// \p Globs is a comma-separated list of globs (only the '*' metacharacter is
   /// supported) with an optional '-' prefix to denote exclusion.
@@ -41,12 +43,32 @@
   /// matching glob's Positive flag.
   virtual bool contains(StringRef S) const;
 
+  bool isEquivalentTo(const GlobList &Other);
+
 private:
-  struct GlobListItem {
-    bool IsPositive;
-    llvm::Regex Regex;
+  static constexpr size_t ItemBits = 15;
+  // Pretty arbitrary choice here, but should
+  static constexpr size_t NumInlineElements = 24;
+  struct GlobStartInfo {
+    uint16_t Items : ItemBits;
+    uint16_t IsPositive : 1;
+  };
+  union DataItem {
+    GlobStartInfo StartInfo;
+    uint16_t Size;
+  };
+  /// We traverse this vector backwards, and the first item we encounter is a \c
+  /// GlobStart object. The next \c GlobStart.Items items in here will be a raw
+  /// uint16_t. After that, if there are more items, the next will be another \c
+  /// GlobStart object.
+  std::vector<DataItem> Data;
+  union {
+    char Inline[NumInlineElements];
+    char *Ptr;
   };
-  SmallVector<GlobListItem, 0> Items;
+  size_t RawStringSize : sizeof(size_t) * CHAR_BIT - 1;
+  // The fallback value if we don't match anything
+  size_t UnmatchedResult : 1;
 };
 
 /// A \p GlobList that caches search results, so that search is performed only
Index: clang-tools-extra/clang-tidy/GlobList.cpp
===================================================================
--- clang-tools-extra/clang-tidy/GlobList.cpp
+++ clang-tools-extra/clang-tidy/GlobList.cpp
@@ -7,61 +7,174 @@
 //===----------------------------------------------------------------------===//
 
 #include "GlobList.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include <numeric>
 
 namespace clang {
 namespace tidy {
 
-// Returns true if GlobList starts with the negative indicator ('-'), removes it
-// from the GlobList.
-static bool consumeNegativeIndicator(StringRef &GlobList) {
-  GlobList = GlobList.trim();
-  if (GlobList.startswith("-")) {
-    GlobList = GlobList.substr(1);
-    return true;
+static size_t getMaxAllocSize(StringRef Str) {
+  size_t Count = 0;
+  for (char C : Str) {
+    switch (C) {
+    case '\n': // Seperator character.
+    case ',':  // Seperator character.
+    case '*':  // Wildcards aren't stored.
+      continue;
+    default:
+      ++Count;
+    }
   }
-  return false;
+  return Count;
 }
 
-// Converts first glob from the comma-separated list of globs to Regex and
-// removes it and the trailing comma from the GlobList.
-static llvm::Regex consumeGlob(StringRef &GlobList) {
-  StringRef UntrimmedGlob = GlobList.substr(0, GlobList.find_first_of(",\n"));
-  StringRef Glob = UntrimmedGlob.trim();
-  GlobList = GlobList.substr(UntrimmedGlob.size() + 1);
-  SmallString<128> RegexText("^");
-  StringRef MetaChars("()^$|*+?.[]\\{}");
-  for (char C : Glob) {
-    if (C == '*')
-      RegexText.push_back('.');
-    else if (MetaChars.contains(C))
-      RegexText.push_back('\\');
-    RegexText.push_back(C);
+GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */)
+    : RawStringSize(0), UnmatchedResult(false) {
+  // FIXME: trimming the newline character here is suspicious as its recognised
+  // as a seperator.
+  Globs = Globs.trim(" \t\r\n\f\v");
+
+  //  Upperbound for the allocated size
+  size_t MaxAllocSize = getMaxAllocSize(Globs);
+  char *Storage = Inline;
+
+  while (true) {
+    GlobStartInfo S;
+    size_t GlobStart = Globs.find_last_of(",\n") + 1;
+    StringRef Trimmed = Globs.substr(GlobStart).ltrim(" \t\v\f\r");
+    S.IsPositive = !Trimmed.consume_front("-");
+    if (S.IsPositive || KeepNegativeGlobs) {
+      // The match-all glob means anything before it won't ever be read.
+
+      if (Trimmed == "*") {
+        // Don't actually stored this glob, We'll stop checking once we reach
+        // this item, so instead, just track whether it was a match or not.
+        UnmatchedResult = S.IsPositive;
+        return;
+      }
+      auto StartIndex = Data.size();
+      Data.emplace_back();
+      if (!Trimmed.empty()) {
+        // FIXME: This wouldn't support escaping the wildcard (\*), but that's
+        // not a known usecase for GlobList in clang-tidy.
+        for (auto Item : llvm::split(Trimmed, "*")) {
+          if (Item.empty()) {
+            // Eat ** in a glob.
+            if (Data.size() - StartIndex > 1 && Data.back().Size == 0)
+              continue;
+          } else {
+            assert(Item.size() < (1 << ItemBits) && "Index out of bounds");
+            // Lazily allocate out of line storage if needed.
+            if (RawStringSize <= NumInlineElements &&
+                RawStringSize + Item.size() > NumInlineElements) {
+              char *NewPtr = new char[MaxAllocSize];
+              ::memcpy(NewPtr, Inline, RawStringSize);
+              Ptr = NewPtr;
+              Storage = Ptr;
+            }
+            assert(RawStringSize + Item.size() <= MaxAllocSize);
+            llvm::copy(Item, &Storage[RawStringSize]);
+            RawStringSize += Item.size();
+          }
+          Data.emplace_back();
+          Data.back().Size = Item.size();
+        }
+      }
+      auto ItemCount = Data.size() - StartIndex - 1;
+      assert(ItemCount < (1 << ItemBits) && "Too many items");
+      S.Items = ItemCount;
+      // The starting node goes at the end of the items, because we traverse the
+      // glob list in reverse order.
+      Data[StartIndex].StartInfo = S;
+    }
+    if (GlobStart == 0)
+      return;
+    // Bit of a hack this, Basically let ,\n not be parsed as an empty string.
+    // This behaviour needs defining.
+    auto NextOffset = Globs.find_last_not_of(
+        Globs[GlobStart - 1] == ',' ? " \t\r\n\f\v" : " \t\r\n\f\v,",
+        GlobStart - 1);
+    if (NextOffset == StringRef::npos)
+      return;
+    Globs = Globs.substr(0, NextOffset + 1);
   }
-  RegexText.push_back('$');
-  return llvm::Regex(RegexText);
 }
 
-GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */) {
-  Items.reserve(Globs.count(',') + Globs.count('\n') + 1);
-  do {
-    GlobListItem Item;
-    Item.IsPositive = !consumeNegativeIndicator(Globs);
-    Item.Regex = consumeGlob(Globs);
-    if (Item.IsPositive || KeepNegativeGlobs)
-      Items.push_back(std::move(Item));
-  } while (!Globs.empty());
+GlobList::~GlobList() {
+  if (RawStringSize > NumInlineElements)
+    delete[] Ptr;
+}
+
+static bool matchGlobList(StringRef Source, llvm::ArrayRef<uint16_t> Offsets,
+                          StringRef Match) {
+  assert(!Offsets.empty());
+  // If we only had one item originally, then we are working with an exact match
+  // with no wild cards.
+  if (Offsets.size() == 1) {
+    assert(Source.size() == Offsets.front());
+    return Match == Source;
+  }
+
+  if (!Match.consume_front(Source.take_front(Offsets.front())))
+    return false;
+  Source = Source.drop_front(Offsets.front());
+
+  for (auto Offset : Offsets.drop_front().drop_back()) {
+    auto SearchItem = Source.take_front(Offset);
+    Source = Source.drop_front(Offset);
+    auto Idx = Match.find(SearchItem);
+    if (Idx == StringRef::npos)
+      return false;
+    Match = Match.drop_front(Idx + Offset);
+  }
+
+  assert(Source.size() == Offsets.back());
+  // For the last item, we know previously there was a wildcard, so we only need
+  // to check for a suffix match.
+  return Match.endswith(Source);
 }
 
 bool GlobList::contains(StringRef S) const {
-  // Iterating the container backwards as the last match determins if S is in
-  // the list.
-  for (const GlobListItem &Item : llvm::reverse(Items)) {
-    if (Item.Regex.match(S))
-      return Item.IsPositive;
+  StringRef Source(((RawStringSize > NumInlineElements)
+                        ? static_cast<const char *>(Ptr)
+                        : Inline),
+                   RawStringSize);
+  auto Items = llvm::makeArrayRef(Data);
+  while (!Items.empty()) {
+    GlobStartInfo StartNode = Items.front().StartInfo;
+    Items = Items.drop_front();
+    assert(Items.size() >= StartNode.Items);
+    if (StartNode.Items == 0) {
+      if (S.empty())
+        return StartNode.IsPositive;
+      continue;
+    }
+    auto GlobItems = Items.take_front(StartNode.Items);
+    Items = Items.drop_front(StartNode.Items);
+    auto Offsets = llvm::makeArrayRef(&GlobItems.front().Size, StartNode.Items);
+    auto Size = std::accumulate(Offsets.begin(), Offsets.end(), 0U);
+    if (matchGlobList(Source.take_front(Size), Offsets, S))
+      return StartNode.IsPositive;
+    Source = Source.drop_front(Size);
   }
-  return false;
+  return UnmatchedResult;
+}
+
+bool GlobList::isEquivalentTo(const GlobList &Other) {
+  // Check the cheap to determine lengths first.
+  if (RawStringSize != Other.RawStringSize)
+    return false;
+  if (Data.size() != Other.Data.size())
+    return false;
+  // Then check the elements.
+  if (memcmp(Data.data(), Other.Data.data(), Data.size() * sizeof(Data[0])) !=
+      0)
+    return false;
+  if (RawStringSize > NumInlineElements)
+    return strncmp(Ptr, Other.Ptr, RawStringSize) == 0;
+  return strncmp(Inline, Other.Inline, RawStringSize) == 0;
 }
 
 bool CachedGlobList::contains(StringRef S) const {

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D129706: [NFCI][clang-tidy] Reimplement GlobList without relying on Regex.

Reply via email to