sammccall created this revision. sammccall added a reviewer: ilya-biryukov. Herald added subscribers: cfe-commits, kadircet, arphaman, jkorous, MaskRay, ioeric.
The bug being fixed: when a posting list doesn't exist in the index, it was previously just dropped from the query rather than being treated as empty. Now that we have the FALSE iterator, we can use it instead. The query tree logic previously had a bunch of special cases to detect whether subtrees are empty. Now we just naively build the whole tree, and rely on the query optimizations to drop the trivial parts. Finally, there was a bug in trigram generation: the empty query would generate a single trigram "$$$" instead of no trigrams. This had no effect (there was no posting list, so the other bug cancelled it out). But we now have to fix this bug too. Repository: rCTE Clang Tools Extra https://reviews.llvm.org/D52796 Files: clangd/index/dex/Dex.cpp clangd/index/dex/Dex.h clangd/index/dex/Trigram.cpp unittests/clangd/DexTests.cpp
Index: unittests/clangd/DexTests.cpp =================================================================== --- unittests/clangd/DexTests.cpp +++ unittests/clangd/DexTests.cpp @@ -411,6 +411,7 @@ EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); + EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({})); EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"})); EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"})); EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"})); @@ -588,6 +589,14 @@ EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc")); } +TEST(DexTest, UnknownPostingList) { + // Regression test: we used to ignore unknown scopes and accept any symbol. + auto I = Dex::build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes); + FuzzyFindRequest Req; + Req.Scopes = {"ns2::"}; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre()); +} + TEST(DexTest, Lookup) { auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); Index: clangd/index/dex/Trigram.cpp =================================================================== --- clangd/index/dex/Trigram.cpp +++ clangd/index/dex/Trigram.cpp @@ -124,6 +124,9 @@ // If the number of symbols which can form fuzzy matching trigram is not // sufficient, generate a single incomplete trigram for query. if (ValidSymbolsCount < 3) { + if (LowercaseQuery.empty()) + return {}; + // TODO(sammccall): is the incomplete trigram for *invalid* chars useful? std::string Chars = LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size())); Chars.append(3 - Chars.size(), END_MARKER); Index: clangd/index/dex/Dex.h =================================================================== --- clangd/index/dex/Dex.h +++ clangd/index/dex/Dex.h @@ -88,6 +88,7 @@ private: void buildIndex(); + std::unique_ptr<Iterator> iterator(const Token &Tok) const; /// Stores symbols sorted in the descending order of symbol quality.. std::vector<const Symbol *> Symbols; Index: clangd/index/dex/Dex.cpp =================================================================== --- clangd/index/dex/Dex.cpp +++ clangd/index/dex/Dex.cpp @@ -53,7 +53,7 @@ } // Constructs BOOST iterators for Path Proximities. -std::vector<std::unique_ptr<Iterator>> createFileProximityIterators( +std::unique_ptr<Iterator> createFileProximityIterator( llvm::ArrayRef<std::string> ProximityPaths, llvm::ArrayRef<std::string> URISchemes, const llvm::DenseMap<Token, PostingList> &InvertedIndex, @@ -96,7 +96,8 @@ It->second.iterator(&It->first), PathProximitySignals.evaluate())); } } - return BoostingIterators; + BoostingIterators.push_back(Corpus.all()); + return Corpus.unionOf(std::move(BoostingIterators)); } } // namespace @@ -138,6 +139,12 @@ {TokenToPostingList.first, PostingList(TokenToPostingList.second)}); } +std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const { + auto It = InvertedIndex.find(Tok); + return It == InvertedIndex.end() ? Corpus.none() + : It->second.iterator(&It->first); +} + /// Constructs iterators over tokens extracted from the query and exhausts it /// while applying Callback to each symbol in the order of decreasing quality /// of the matched symbols. @@ -149,64 +156,40 @@ FuzzyMatcher Filter(Req.Query); bool More = false; - std::vector<std::unique_ptr<Iterator>> TopLevelChildren; + std::vector<std::unique_ptr<Iterator>> Criteria; const auto TrigramTokens = generateQueryTrigrams(Req.Query); // Generate query trigrams and construct AND iterator over all query // trigrams. std::vector<std::unique_ptr<Iterator>> TrigramIterators; - for (const auto &Trigram : TrigramTokens) { - const auto It = InvertedIndex.find(Trigram); - if (It != InvertedIndex.end()) - TrigramIterators.push_back(It->second.iterator(&It->first)); - } - if (!TrigramIterators.empty()) - TopLevelChildren.push_back(Corpus.intersect(move(TrigramIterators))); + for (const auto &Trigram : TrigramTokens) + TrigramIterators.push_back(iterator(Trigram)); + Criteria.push_back(Corpus.intersect(move(TrigramIterators))); // Generate scope tokens for search query. std::vector<std::unique_ptr<Iterator>> ScopeIterators; - for (const auto &Scope : Req.Scopes) { - Token Tok(Token::Kind::Scope, Scope); - const auto It = InvertedIndex.find(Tok); - if (It != InvertedIndex.end()) - ScopeIterators.push_back(It->second.iterator(&It->first)); - } - if (Req.AnyScope) + for (const auto &Scope : Req.Scopes) + ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope))); + if (Req.AnyScope || /*legacy*/ Req.Scopes.empty()) ScopeIterators.push_back( Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2)); + Criteria.push_back(Corpus.unionOf(move(ScopeIterators))); - // Add OR iterator for scopes if there are any Scope Iterators. - if (!ScopeIterators.empty()) - TopLevelChildren.push_back(Corpus.unionOf(move(ScopeIterators))); - - // Add proximity paths boosting. - auto BoostingIterators = createFileProximityIterators( - Req.ProximityPaths, URISchemes, InvertedIndex, Corpus); - // Boosting iterators do not actually filter symbols. In order to preserve - // the validity of resulting query, TRUE iterator should be added along - // BOOSTs. - if (!BoostingIterators.empty()) { - BoostingIterators.push_back(Corpus.all()); - TopLevelChildren.push_back(Corpus.unionOf(move(BoostingIterators))); - } + // Add proximity paths boosting (all symbols, some boosted). + Criteria.push_back(createFileProximityIterator(Req.ProximityPaths, URISchemes, + InvertedIndex, Corpus)); if (Req.RestrictForCodeCompletion) - TopLevelChildren.push_back( - InvertedIndex.find(RestrictedForCodeCompletion) - ->second.iterator(&RestrictedForCodeCompletion)); + Criteria.push_back(iterator(RestrictedForCodeCompletion)); // Use TRUE iterator if both trigrams and scopes from the query are not // present in the symbol index. - auto QueryIterator = TopLevelChildren.empty() - ? Corpus.all() - : Corpus.intersect(move(TopLevelChildren)); + auto Root = Corpus.intersect(move(Criteria)); // Retrieve more items than it was requested: some of the items with high // final score might not be retrieved otherwise. - // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as - // using 100x of the requested number might not be good in practice, e.g. - // when the requested number of items is small. - auto Root = Req.Limit ? Corpus.limit(move(QueryIterator), *Req.Limit * 100) - : move(QueryIterator); + // FIXME(kbobyrev): Tune this ratio. + if (Req.Limit) + Root = Corpus.limit(move(Root), *Req.Limit * 100); SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root)); vlog("Dex query tree: {0}", *Root);
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits