sammccall created this revision.
sammccall added a reviewer: ilya-biryukov.
Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay, 
ioeric.

The FALSE iterator will be used in a followup patch to fix a logic bug in Dex
(currently, tokens that don't have posting lists in the index are simply dropped
from the query, changing semantics).

It can usually be optimized away, so added the following opmitizations:

- simplify booleans inside AND/OR
- replace effectively-empty AND/OR with booleans
- flatten nested AND/ORs

While working on this, found a bug in the AND iterator: its constructor sync()
assumes that ReachedEnd is set if applicable, but the constructor never sets it.
This crashes if a non-first iterator is nonempty.


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D52789

Files:
  clangd/index/dex/Iterator.cpp
  clangd/index/dex/Iterator.h
  unittests/clangd/DexTests.cpp

Index: unittests/clangd/DexTests.cpp
===================================================================
--- unittests/clangd/DexTests.cpp
+++ unittests/clangd/DexTests.cpp
@@ -109,6 +109,18 @@
   EXPECT_TRUE(And->reachedEnd());
 }
 
+TEST(DexIterators, AndEmpty) {
+  Corpus C{10000};
+  const PostingList L1({1});
+  const PostingList L2({2});
+  // These iterators are empty, but the optimizer can't tell.
+  auto Empty1 = C.intersect(L1.iterator(), L2.iterator());
+  auto Empty2 = C.intersect(L1.iterator(), L2.iterator());
+  // And syncs iterators on construction, and used to fail on empty children.
+  auto And = C.intersect(std::move(Empty1), std::move(Empty2));
+  EXPECT_TRUE(And->reachedEnd());
+}
+
 TEST(DexIterators, OrTwoLists) {
   Corpus C{10000};
   const PostingList L0({0, 5, 7, 10, 42, 320, 9000});
@@ -270,18 +282,8 @@
 }
 
 TEST(DexIterators, True) {
-  Corpus C{0};
-  auto TrueIterator = C.all();
-  EXPECT_TRUE(TrueIterator->reachedEnd());
-  EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre());
-
-  C = Corpus{7};
-  const PostingList L0({1, 2, 5, 7});
-  TrueIterator = C.all();
-  EXPECT_THAT(TrueIterator->peek(), 0);
-  auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator));
-  EXPECT_FALSE(AndIterator->reachedEnd());
-  EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5));
+  EXPECT_TRUE(Corpus{0}.all()->reachedEnd());
+  EXPECT_THAT(consumeIDs(*Corpus{4}.all()), ElementsAre(0, 1, 2, 3));
 }
 
 TEST(DexIterators, Boost) {
@@ -313,6 +315,38 @@
   EXPECT_THAT(ElementBoost, 3);
 }
 
+TEST(DexIterators, Optimizations) {
+  Corpus C{5};
+  const PostingList L1({1});
+  const PostingList L2({2});
+  const PostingList L3({3});
+
+  // empty and/or yield true/false
+  EXPECT_EQ(llvm::to_string(*C.intersect()), "true");
+  EXPECT_EQ(llvm::to_string(*C.unionOf()), "false");
+
+  // true/false inside and/or short-circuit
+  EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.all())), "[1]");
+  EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.none())), "false");
+  // Not optimized to avoid breaking boosts.
+  EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.all())),
+            "(| [1] true)");
+  EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.none())), "[1]");
+
+  // and/or nested inside and/or are flattened
+  EXPECT_EQ(llvm::to_string(*C.intersect(
+                L1.iterator(), C.intersect(L1.iterator(), L1.iterator()))),
+            "(& [1] [1] [1])");
+  EXPECT_EQ(llvm::to_string(*C.unionOf(
+                L1.iterator(), C.unionOf(L2.iterator(), L3.iterator()))),
+            "(| [1] [2] [3])");
+
+  // optimizations combine over multiple levels
+  EXPECT_EQ(llvm::to_string(*C.intersect(
+                C.intersect(L1.iterator(), C.intersect()), C.unionOf(C.all()))),
+            "[1]");
+}
+
 //===----------------------------------------------------------------------===//
 // Search token tests.
 //===----------------------------------------------------------------------===//
Index: clangd/index/dex/Iterator.h
===================================================================
--- clangd/index/dex/Iterator.h
+++ clangd/index/dex/Iterator.h
@@ -98,7 +98,15 @@
     return Iterator.dump(OS);
   }
 
+  /// Inspect iterator type, used internally for optimizing query trees.
+  enum class Kind { And, Or, True, False, Other };
+  Kind kind() const { return MyKind; }
+
+protected:
+  Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {}
+
 private:
+  Kind MyKind;
   virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0;
 };
 
@@ -150,6 +158,9 @@
   /// containing all items in range [0, Size) in an efficient manner.
   std::unique_ptr<Iterator> all() const;
 
+  /// Returns FALSE Iterator which iterates over no documents.
+  std::unique_ptr<Iterator> none() const;
+
   /// Returns BOOST iterator which multiplies the score of each item by given
   /// factor. Boosting can be used as a computationally inexpensive filtering.
   /// Users can return significantly more items using consumeAndBoost() and
Index: clangd/index/dex/Iterator.cpp
===================================================================
--- clangd/index/dex/Iterator.cpp
+++ clangd/index/dex/Iterator.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Iterator.h"
+#include "llvm/Support/Casting.h"
 #include <algorithm>
 #include <cassert>
 #include <numeric>
@@ -26,9 +27,11 @@
 class AndIterator : public Iterator {
 public:
   explicit AndIterator(std::vector<std::unique_ptr<Iterator>> AllChildren)
-      : Children(std::move(AllChildren)) {
+      : Iterator(Kind::And), Children(std::move(AllChildren)) {
     assert(!Children.empty() && "AND iterator should have at least one child.");
     // Establish invariants.
+    for (const auto &Child : Children)
+      ReachedEnd |= Child->reachedEnd();
     sync();
     // When children are sorted by the estimateSize(), sync() calls are more
     // effective. Each sync() starts with the first child and makes sure all
@@ -122,6 +125,7 @@
   /// update the field, rather than traversing the whole subtree in each
   /// reachedEnd() call.
   bool ReachedEnd = false;
+  friend Corpus; // For optimizations.
 };
 
 /// Implements Iterator over the union of other iterators.
@@ -133,7 +137,7 @@
 class OrIterator : public Iterator {
 public:
   explicit OrIterator(std::vector<std::unique_ptr<Iterator>> AllChildren)
-      : Children(std::move(AllChildren)) {
+      : Iterator(Kind::Or), Children(std::move(AllChildren)) {
     assert(!Children.empty() && "OR iterator should have at least one child.");
   }
 
@@ -208,14 +212,15 @@
 
   // FIXME(kbobyrev): Would storing Children in min-heap be faster?
   std::vector<std::unique_ptr<Iterator>> Children;
+  friend Corpus; // For optimizations.
 };
 
 /// TrueIterator handles PostingLists which contain all items of the index. It
 /// stores size of the virtual posting list, and all operations are performed
 /// in O(1).
 class TrueIterator : public Iterator {
 public:
-  explicit TrueIterator(DocID Size) : Size(Size) {}
+  explicit TrueIterator(DocID Size) : Iterator(Kind::True), Size(Size) {}
 
   bool reachedEnd() const override { return Index >= Size; }
 
@@ -251,6 +256,29 @@
   DocID Size;
 };
 
+/// FalseIterator yields no results.
+class FalseIterator : public Iterator {
+public:
+  FalseIterator() : Iterator(Kind::False) {}
+  bool reachedEnd() const override { return true; }
+  void advance() override { assert(false); }
+  void advanceTo(DocID ID) override { assert(false); }
+  DocID peek() const override {
+    assert(false);
+    return 0;
+  }
+  float consume() override {
+    assert(false);
+    return 1;
+  }
+  size_t estimateSize() const override { return 0; }
+
+private:
+  llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override {
+    return OS << "false";
+  }
+};
+
 /// Boost iterator is a wrapper around its child which multiplies scores of
 /// each retrieved item by a given factor.
 class BoostIterator : public Iterator {
@@ -331,29 +359,83 @@
 
 std::unique_ptr<Iterator>
 Corpus::intersect(std::vector<std::unique_ptr<Iterator>> Children) const {
-  // If there is exactly one child, pull it one level up: AND(Child) -> Child.
-  return Children.size() == 1 ? std::move(Children.front())
-                              : llvm::make_unique<AndIterator>(move(Children));
+  std::vector<std::unique_ptr<Iterator>> RealChildren;
+  for (auto &Child : Children)
+    switch (Child->kind()) {
+    case Iterator::Kind::True:
+      break; // No effect, drop the iterator.
+    case Iterator::Kind::False:
+      return std::move(Child); // Intersection is empty.
+    case Iterator::Kind::And: {
+      // Inline nested AND into parent AND.
+      auto &NewChildren = static_cast<AndIterator *>(Child.get())->Children;
+      std::move(NewChildren.begin(), NewChildren.end(),
+                std::back_inserter(RealChildren));
+      break;
+    }
+    default:
+      RealChildren.push_back(std::move(Child));
+    }
+  switch (RealChildren.size()) {
+  case 0:
+    return all();
+  case 1:
+    return std::move(RealChildren.front());
+  default:
+    return llvm::make_unique<AndIterator>(move(RealChildren));
+  }
 }
 
 std::unique_ptr<Iterator>
 Corpus::unionOf(std::vector<std::unique_ptr<Iterator>> Children) const {
-  // If there is exactly one child, pull it one level up: OR(Child) -> Child.
-  return Children.size() == 1 ? std::move(Children.front())
-                              : llvm::make_unique<OrIterator>(move(Children));
+  std::vector<std::unique_ptr<Iterator>> RealChildren;
+  for (auto &Child : Children)
+    switch (Child->kind()) {
+    case Iterator::Kind::False:
+      break; // No effect, drop the iterator.
+    case Iterator::Kind::Or: {
+      // Inline nested OR into parent OR.
+      auto &NewChildren = static_cast<OrIterator *>(Child.get())->Children;
+      std::move(NewChildren.begin(), NewChildren.end(),
+                std::back_inserter(RealChildren));
+      break;
+    }
+    case Iterator::Kind::True:
+      // Don't return all(), which would discard sibling boosts.
+    default:
+      RealChildren.push_back(std::move(Child));
+    }
+  switch (RealChildren.size()) {
+  case 0:
+    return none();
+  case 1:
+    return std::move(RealChildren.front());
+  default:
+    return llvm::make_unique<OrIterator>(move(RealChildren));
+  }
 }
 
 std::unique_ptr<Iterator> Corpus::all() const {
   return llvm::make_unique<TrueIterator>(Size);
 }
 
+std::unique_ptr<Iterator> Corpus::none() const {
+  return llvm::make_unique<FalseIterator>();
+}
+
 std::unique_ptr<Iterator> Corpus::boost(std::unique_ptr<Iterator> Child,
                                         float Factor) const {
+  if (Factor == 1)
+    return Child;
+  if (Child->kind() == Iterator::Kind::False)
+    return Child;
   return llvm::make_unique<BoostIterator>(move(Child), Factor);
 }
 
 std::unique_ptr<Iterator> Corpus::limit(std::unique_ptr<Iterator> Child,
                                         size_t Limit) const {
+  if (Child->kind() == Iterator::Kind::False)
+    return Child;
   return llvm::make_unique<LimitIterator>(move(Child), Limit);
 }
 
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to