[PATCH] D129706: [NFCI][clang-tidy] Reimplement GlobList without relying on Regex.

Nathan James via Phabricator via cfe-commits Mon, 18 Jul 2022 05:23:35 -0700

njames93 updated this revision to Diff 445464.
njames93 added a comment.

Enable escaping the leading `-` at the start of a glob to match strings 
starting with a `-`.



Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D129706/new/

https://reviews.llvm.org/D129706

Files:
  clang-tools-extra/clang-tidy/GlobList.cpp
  clang-tools-extra/clang-tidy/GlobList.h
  clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp

Index: clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
===================================================================
--- clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
+++ clang-tools-extra/unittests/clang-tidy/GlobListTest.cpp
@@ -16,6 +16,13 @@
   EXPECT_FALSE(Filter.contains("aaa"));
 }
 
+TYPED_TEST(GlobListTest, NotEmpty) {
+  TypeParam Filter("*,-");
+
+  EXPECT_FALSE(Filter.contains(""));
+  EXPECT_TRUE(Filter.contains("aaa"));
+}
+
 TYPED_TEST(GlobListTest, Nothing) {
   TypeParam Filter("-*");
 
@@ -57,6 +64,51 @@
   EXPECT_FALSE(Filter.contains("bbbb"));
 }
 
+TYPED_TEST(GlobListTest, PreAndPostFixPattern) {
+  TypeParam Filter("a*b");
+
+  EXPECT_TRUE(Filter.contains("ab"));
+  EXPECT_TRUE(Filter.contains("aMiddleb"));
+  EXPECT_FALSE(Filter.contains("aba"));
+  EXPECT_FALSE(Filter.contains("bab"));
+}
+
+TYPED_TEST(GlobListTest, DoubleWildcard) {
+  {
+    TypeParam Filter("a**b");
+
+    EXPECT_TRUE(Filter.isEquivalentTo("a*b"));
+    EXPECT_TRUE(Filter.isEquivalentTo("a***b"));
+
+    EXPECT_TRUE(Filter.contains("ab"));
+    EXPECT_TRUE(Filter.contains("aMiddleb"));
+    EXPECT_FALSE(Filter.contains("aba"));
+    EXPECT_FALSE(Filter.contains("bab"));
+  }
+  {
+    TypeParam Filter("a**");
+
+    EXPECT_TRUE(Filter.isEquivalentTo("a*"));
+    EXPECT_TRUE(Filter.isEquivalentTo("a***"));
+
+    EXPECT_TRUE(Filter.contains("ab"));
+    EXPECT_TRUE(Filter.contains("aMiddleb"));
+    EXPECT_TRUE(Filter.contains("aba"));
+    EXPECT_FALSE(Filter.contains("bab"));
+  }
+  {
+    TypeParam Filter("**b");
+
+    EXPECT_TRUE(Filter.isEquivalentTo("*b"));
+    EXPECT_TRUE(Filter.isEquivalentTo("***b"));
+
+    EXPECT_TRUE(Filter.contains("ab"));
+    EXPECT_TRUE(Filter.contains("aMiddleb"));
+    EXPECT_FALSE(Filter.contains("aba"));
+    EXPECT_TRUE(Filter.contains("bab"));
+  }
+}
+
 TYPED_TEST(GlobListTest, PatternPriority) {
   // The last glob that matches the string decides whether that string is
   // included or excluded.
@@ -107,6 +159,9 @@
 TYPED_TEST(GlobListTest, NewlineCharactersAsSeparators) {
   TypeParam Filter("a*  \n b,\n-c*,dd");
 
+  // FIXME: Define this behaviour, There are 2 seperator characters next to each
+  // other (,\n), Technically that should be represented as containing an empty
+  // item.
   EXPECT_FALSE(Filter.contains(""));
   EXPECT_TRUE(Filter.contains("aaa"));
   EXPECT_TRUE(Filter.contains("b"));
@@ -117,5 +172,51 @@
   EXPECT_FALSE(Filter.contains("ddd"));
 }
 
+TYPED_TEST(GlobListTest, IgnoreNegativeMatches) {
+  TypeParam Filter("a,-a,b*,-b*,-*", false);
+
+  EXPECT_FALSE(Filter.contains(""));
+  EXPECT_TRUE(Filter.contains("a"));
+  EXPECT_TRUE(Filter.contains("b"));
+  EXPECT_TRUE(Filter.contains("bar"));
+}
+
+TYPED_TEST(GlobListTest, EscapeNegatives) {
+  TypeParam Filter("\\-a,-a");
+
+  EXPECT_TRUE(Filter.contains("-a"));
+  EXPECT_FALSE(Filter.contains("a"));
+}
+
+void checkCanonicalize(StringRef Glob, StringRef Expected) {
+  GlobList Orig(Glob);
+  std::string Canonicalized;
+  llvm::raw_string_ostream Str(Canonicalized);
+  Orig.print(Str);
+  Str.flush();
+  EXPECT_EQ(Canonicalized, Expected)
+      << "While canonicalizing glob: \"" << Glob << "\"";
+  EXPECT_TRUE(Orig.isEquivalentTo(Expected))
+      << "While canonicalizing glob: \"" << Glob << "\"";
+  GlobList Canonical(Canonicalized);
+  EXPECT_TRUE(Orig.isEquivalentTo(Canonical))
+      << "While canonicalizing glob: \"" << Glob << "\"";
+  EXPECT_TRUE(Canonical.isEquivalentTo(Glob))
+      << "While canonicalizing glob: \"" << Glob << "\"";
+}
+
+TEST(GlobListTest, Printing) {
+  // A positive wildcard is always printed with any previous items removed
+  // checkCanonicalize(" *", "*");
+  // checkCanonicalize(" * ,   -AAA ", "*,-AAA");
+  checkCanonicalize("Unused , *, -A*", "*,-A*");
+  // A negative wildcard is removed if there is nothing else after it.
+  checkCanonicalize("-* ", "-*");
+  checkCanonicalize("-*, AAA  ", "AAA");
+  // The canonical representation uses commas as the seperators instead of
+  // newlines.
+  checkCanonicalize("Unused\n-*\nRest\n*Others", "Rest,*Others");
+}
+
 } // namespace tidy
 } // namespace clang
Index: clang-tools-extra/clang-tidy/GlobList.h
===================================================================
--- clang-tools-extra/clang-tidy/GlobList.h
+++ clang-tools-extra/clang-tidy/GlobList.h
@@ -10,10 +10,10 @@
 #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_GLOBLIST_H
 
 #include "clang/Basic/LLVM.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Regex.h"
+#include <climits>
+#include <vector>
 
 namespace clang {
 namespace tidy {
@@ -25,7 +25,9 @@
 /// them in the order of appearance in the list.
 class GlobList {
 public:
-  virtual ~GlobList() = default;
+  virtual ~GlobList();
+
+  GlobList(const GlobList &) = delete;
 
   /// \p Globs is a comma-separated list of globs (only the '*' metacharacter is
   /// supported) with an optional '-' prefix to denote exclusion.
@@ -41,12 +43,43 @@
   /// matching glob's Positive flag.
   virtual bool contains(StringRef S) const;
 
+  bool isEquivalentTo(const GlobList &Other) const;
+
+  bool isEquivalentTo(StringRef S, bool KeepNegativeGlobs = true) const;
+
+  void print(raw_ostream &OS) const;
+
 private:
-  struct GlobListItem {
-    bool IsPositive;
-    llvm::Regex Regex;
+  static constexpr size_t ItemBits = 15;
+  // Pretty arbitrary choice here.
+  static constexpr size_t NumInlineElements = 24;
+  struct GlobStartInfo {
+    uint16_t Items : ItemBits;
+    uint16_t IsPositive : 1;
+  };
+
+  bool isInlineStorage() const;
+
+  char *getStorage();
+
+  const char *getStorage() const;
+
+  union DataItem {
+    GlobStartInfo StartInfo;
+    uint16_t Size;
+  };
+
+  /// First item we encounter is a \c GlobStartInfo object. The next \c
+  /// StartInfo.Items items in here will be a raw uint16_t. After that, if there
+  /// are more items, the next will be another \c GlobStartInfo object.
+  std::vector<DataItem> Data;
+  union {
+    char Inline[NumInlineElements];
+    char *Ptr;
   };
-  SmallVector<GlobListItem, 0> Items;
+  size_t RawStringSize : sizeof(size_t) * CHAR_BIT - 1;
+  // The fallback value if we don't match anything.
+  size_t UnmatchedResult : 1;
 };
 
 /// A \p GlobList that caches search results, so that search is performed only
Index: clang-tools-extra/clang-tidy/GlobList.cpp
===================================================================
--- clang-tools-extra/clang-tidy/GlobList.cpp
+++ clang-tools-extra/clang-tidy/GlobList.cpp
@@ -7,63 +7,298 @@
 //===----------------------------------------------------------------------===//
 
 #include "GlobList.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+#include <limits>
+#include <numeric>
 
 namespace clang {
 namespace tidy {
 
-// Returns true if GlobList starts with the negative indicator ('-'), removes it
-// from the GlobList.
-static bool consumeNegativeIndicator(StringRef &GlobList) {
-  GlobList = GlobList.trim();
-  if (GlobList.startswith("-")) {
-    GlobList = GlobList.substr(1);
-    return true;
+static size_t getMaxAllocSize(StringRef Str) {
+  size_t Count = 0;
+  for (char C : Str) {
+    switch (C) {
+    case '\n': // Seperator character.
+    case ',':  // Seperator character.
+    case '*':  // Wildcards aren't stored.
+      continue;
+    default:
+      ++Count;
+    }
   }
-  return false;
+  return Count;
 }
 
-// Converts first glob from the comma-separated list of globs to Regex and
-// removes it and the trailing comma from the GlobList.
-static llvm::Regex consumeGlob(StringRef &GlobList) {
-  StringRef UntrimmedGlob = GlobList.substr(0, GlobList.find_first_of(",\n"));
-  StringRef Glob = UntrimmedGlob.trim();
-  GlobList = GlobList.substr(UntrimmedGlob.size() + 1);
-  SmallString<128> RegexText("^");
-  StringRef MetaChars("()^$|*+?.[]\\{}");
-  for (char C : Glob) {
-    if (C == '*')
-      RegexText.push_back('.');
-    else if (MetaChars.contains(C))
-      RegexText.push_back('\\');
-    RegexText.push_back(C);
+static std::pair<StringRef, bool> splitGlob(StringRef &Glob) {
+  size_t GlobStart = Glob.find_last_of(",\n") + 1;
+  StringRef Trimmed = Glob.substr(GlobStart).ltrim(" \t\v\f\r");
+  if (GlobStart != 0) {
+    // Bit of a hack this, Basically let ',\n' not be parsed as an empty string.
+    // This behaviour needs defining.
+    size_t NextOffset = Glob.find_last_not_of(
+        Glob[GlobStart - 1] == ',' ? " \t\r\n\f\v" : " \t\r\n\f\v,",
+        GlobStart - 1);
+
+    if (NextOffset != StringRef::npos) {
+      Glob = Glob.substr(0, NextOffset + 1);
+      return {Trimmed, true};
+    }
   }
-  RegexText.push_back('$');
-  return llvm::Regex(RegexText);
+  return {Trimmed, false};
+}
+
+static void initialGlobTrim(StringRef &Globs) {
+  // FIXME: trimming the newline character here is suspicious as its recognised
+  // as a seperator.
+  Globs = Globs.trim(" \t\r\n\f\v");
 }
 
-GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */) {
-  Items.reserve(Globs.count(',') + Globs.count('\n') + 1);
+GlobList::GlobList(StringRef Globs, bool KeepNegativeGlobs /* =true */)
+    : RawStringSize(0), UnmatchedResult(false) {
+  initialGlobTrim(Globs);
+  //  Upperbound for the allocated size.
+  size_t MaxAllocSize = getMaxAllocSize(Globs);
+  char *Storage = Inline;
+  auto AddSizedItem = [this](size_t Size) {
+    assert(Size < std::numeric_limits<uint16_t>::max() && "Chunk too large");
+    Data.emplace_back();
+    Data.back().Size = Size;
+  };
+  bool HasMoreItems;
+  StringRef Trimmed;
   do {
-    GlobListItem Item;
-    Item.IsPositive = !consumeNegativeIndicator(Globs);
-    Item.Regex = consumeGlob(Globs);
-    if (Item.IsPositive || KeepNegativeGlobs)
-      Items.push_back(std::move(Item));
-  } while (!Globs.empty());
+    GlobStartInfo S;
+    std::tie(Trimmed, HasMoreItems) = splitGlob(Globs);
+    S.IsPositive = !Trimmed.consume_front("-");
+    // Let '\\' be used to escape the '-' used to denote a negative match.
+    if (Trimmed.size() > 2 && Trimmed.take_front(2) == "\\-")
+      Trimmed = Trimmed.drop_front();
+    if (S.IsPositive || KeepNegativeGlobs) {
+      // The match-all glob means anything before it won't ever be read.
+      // Therefore we can stop building the glob if we reach this item.
+      // We also don't need to actually store this glob as we can just update
+      // the default result if no items match.
+      if (Trimmed == "*") {
+        UnmatchedResult = S.IsPositive;
+        return;
+      }
+      size_t StartIndex = Data.size();
+      Data.emplace_back();
+      if (Trimmed.consume_front("*"))
+        AddSizedItem(0);
+      const bool LastEmpty = Trimmed.consume_back("*");
+      // FIXME: This wouldn't support escaping the wildcard (\*), but that's
+      // not a known usecase for GlobList in clang-tidy.
+      for (StringRef Item : llvm::split(Trimmed, "*")) {
+        // Don't store empty strings here as they can only exist with sucessive
+        // wildcards(**)
+        if (Item.empty())
+          continue;
+        assert(RawStringSize + Item.size() <= MaxAllocSize);
+        // Lazily allocate out of line storage if needed.
+        if (isInlineStorage() &&
+            RawStringSize + Item.size() > NumInlineElements) {
+          // We cant store the Pointer directly in Ptr as it would clobber the
+          // Inline storage.
+          char *NewPtr = new char[MaxAllocSize];
+          ::memcpy(NewPtr, Inline, RawStringSize);
+          Ptr = NewPtr;
+          Storage = Ptr;
+        }
+        llvm::copy(Item, &Storage[RawStringSize]);
+        RawStringSize += Item.size();
+        AddSizedItem(Item.size());
+      }
+      if (LastEmpty)
+        AddSizedItem(0);
+      size_t ItemCount = Data.size() - StartIndex - 1;
+      assert(ItemCount < (1 << ItemBits) && "Too many items");
+      S.Items = ItemCount;
+      // The starting node goes at the end of the items, because we traverse
+      // the glob list in reverse order.
+      Data[StartIndex].StartInfo = S;
+    }
+  } while (HasMoreItems);
 }
 
-bool GlobList::contains(StringRef S) const {
-  // Iterating the container backwards as the last match determins if S is in
-  // the list.
-  for (const GlobListItem &Item : llvm::reverse(Items)) {
-    if (Item.Regex.match(S))
-      return Item.IsPositive;
+bool GlobList::isEquivalentTo(StringRef Globs, bool KeepNegativeGlobs) const {
+  initialGlobTrim(Globs);
+  bool HasMoreItems;
+  StringRef Trimmed;
+  auto Begin = Data.begin();
+  auto End = Data.end();
+  size_t CurSize = 0;
+  bool Error = false;
+  const char *Storage = getStorage();
+  do {
+    GlobStartInfo S;
+    std::tie(Trimmed, HasMoreItems) = splitGlob(Globs);
+    S.IsPositive = !Trimmed.consume_front("-");
+    if (S.IsPositive || KeepNegativeGlobs) {
+      // The match-all glob means anything before it won't ever be read.
+      if (Trimmed == "*") {
+        Error = UnmatchedResult != S.IsPositive;
+        break;
+      }
+      if (Begin == End || S.IsPositive != Begin->StartInfo.IsPositive)
+        return false;
+      size_t Items = 0;
+      size_t ExpectedItems = Begin->StartInfo.Items;
+      ++Begin;
+      auto EatItem = [&Items, ExpectedItems, &Begin](size_t Size = 0) {
+        return ++Items <= ExpectedItems && (Begin++)->Size == Size;
+      };
+
+      if (Trimmed.consume_front("*") && !EatItem())
+        return false;
+
+      const bool LastEmpty = Trimmed.consume_back("*");
+      for (llvm::StringRef Item : llvm::split(Trimmed, "*")) {
+        if (Item.empty())
+          continue;
+        assert(Item.size() + CurSize <= RawStringSize);
+        if (!EatItem(Item.size()) ||
+            memcmp(Item.data(), &Storage[CurSize], Item.size()) != 0)
+          return false;
+        CurSize += Item.size();
+      }
+      if ((LastEmpty && !EatItem()) || Items != ExpectedItems)
+        return false;
+    }
+  } while (HasMoreItems);
+  if (!Error && Begin == End) {
+    assert(CurSize == RawStringSize);
+    return true;
   }
   return false;
 }
 
+GlobList::~GlobList() {
+  if (!isInlineStorage())
+    delete[] Ptr;
+}
+
+static bool matchGlobList(StringRef Source, llvm::ArrayRef<uint16_t> Offsets,
+                          StringRef Match) {
+  assert(!Offsets.empty());
+  // If we only had one item originally, then we are working with an exact match
+  // with no wild cards.
+  if (Offsets.size() == 1) {
+    assert(Source.size() == Offsets.front());
+    return Match == Source;
+  }
+
+  if (!Match.consume_front(Source.take_front(Offsets.front())))
+    return false;
+  Source = Source.drop_front(Offsets.front());
+
+  for (uint16_t Offset : Offsets.drop_front().drop_back()) {
+    StringRef SearchItem = Source.take_front(Offset);
+    Source = Source.drop_front(Offset);
+    size_t Idx = Match.find(SearchItem);
+    if (Idx == StringRef::npos)
+      return false;
+    Match = Match.drop_front(Idx + Offset);
+  }
+
+  assert(Source.size() == Offsets.back());
+  // For the last item, we know previously there was a wildcard, so we only need
+  // to check for a suffix match.
+  return Match.endswith(Source);
+}
+
+bool GlobList::contains(StringRef S) const {
+  StringRef Source(getStorage(), RawStringSize);
+  ArrayRef<DataItem> Items(Data);
+  while (!Items.empty()) {
+    const GlobStartInfo &StartNode = Items.front().StartInfo;
+    Items = Items.drop_front();
+    assert(Items.size() >= StartNode.Items);
+    if (StartNode.Items == 0) {
+      if (S.empty())
+        return StartNode.IsPositive;
+      continue;
+    }
+    ArrayRef<DataItem> GlobItems = Items.take_front(StartNode.Items);
+    Items = Items.drop_front(StartNode.Items);
+    ArrayRef<uint16_t> Offsets =
+        llvm::makeArrayRef(&GlobItems.front().Size, StartNode.Items);
+    size_t Size = std::accumulate(Offsets.begin(), Offsets.end(), size_t(0U));
+    if (matchGlobList(Source.take_front(Size), Offsets, S))
+      return StartNode.IsPositive;
+    Source = Source.drop_front(Size);
+  }
+  return UnmatchedResult;
+}
+
+bool GlobList::isEquivalentTo(const GlobList &Other) const {
+  // Check the cheap to determine lengths first.
+  if (std::make_tuple(RawStringSize, UnmatchedResult, Data.size()) !=
+      std::make_tuple(Other.RawStringSize, Other.UnmatchedResult,
+                      Other.Data.size()))
+    return false;
+  // Then check the elements, These are POD types so memcmp is safe for this.
+  if (memcmp(Data.data(), Other.Data.data(), Data.size() * sizeof(Data[0])) !=
+      0)
+    return false;
+  return strncmp(getStorage(), Other.getStorage(), RawStringSize) == 0;
+}
+
+void GlobList::print(raw_ostream &OS) const {
+  StringRef Source(getStorage(), RawStringSize);
+  // For ease of matching, we store the globs in reverse order to what they are
+  // written in. Therefore when we print the glob, we need to reverse the order
+  // they are stored in.
+  llvm::SmallVector<std::pair<const DataItem *, size_t>> RevPrint;
+  ArrayRef<DataItem> Items(Data);
+  size_t Cur = 0;
+  while (!Items.empty()) {
+    const DataItem *Start = Items.begin();
+    RevPrint.emplace_back(Start, Cur);
+    ArrayRef<uint16_t> Offsets(&Start[1].Size, Start->StartInfo.Items);
+    Items = Items.drop_front(1 + Start->StartInfo.Items);
+    Cur += std::accumulate(Offsets.begin(), Offsets.end(), 0U);
+  }
+  if (UnmatchedResult)
+    OS << "*";
+  else if (RevPrint.empty())
+    OS << "-*";
+  bool Sep = UnmatchedResult;
+  for (const auto &Item : llvm::reverse(RevPrint)) {
+    if (Sep)
+      OS << ',';
+    Sep = true;
+    if (!Item.first->StartInfo.IsPositive)
+      OS << '-';
+    ArrayRef<uint16_t> Offsets(&Item.first[1].Size,
+                               Item.first->StartInfo.Items);
+    if (Offsets.empty())
+      continue;
+    OS << Source.substr(Item.second, Offsets.front());
+    size_t NextOff = Item.second + Offsets.front();
+    for (uint16_t Item : Offsets.drop_front()) {
+      OS << '*' << Source.substr(NextOff, Item);
+      NextOff += Item;
+    }
+  }
+}
+
+bool GlobList::isInlineStorage() const {
+  return RawStringSize <= NumInlineElements;
+}
+
+char *GlobList::getStorage() { return isInlineStorage() ? Inline : Ptr; }
+
+const char *GlobList::getStorage() const {
+  return const_cast<GlobList *>(this)->getStorage();
+}
+
 bool CachedGlobList::contains(StringRef S) const {
   auto Entry = Cache.try_emplace(S);
   bool &Value = Entry.first->getValue();

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D129706: [NFCI][clang-tidy] Reimplement GlobList without relying on Regex.

Reply via email to