Author: omtcyfz Date: Wed Jul 25 03:34:57 2018 New Revision: 337901 URL: http://llvm.org/viewvc/llvm-project?rev=337901&view=rev Log: [clangd] Introduce Dex symbol index search tokens
This patch introduces the core building block of the next-generation Clangd symbol index - Dex. Search tokens are the keys in the inverted index and represent a characteristic of a specific symbol: examples of search token types (Token Namespaces) are * Trigrams - these are essential for unqualified symbol name fuzzy search * Scopes for filtering the symbols by the namespace * Paths, e.g. these can be used to uprank symbols defined close to the edited file This patch outlines the generic for such token namespaces, but only implements trigram generation. The intuition behind trigram generation algorithm is that each extracted trigram is a valid sequence for Fuzzy Matcher jumps, proposed implementation utilize existing FuzzyMatcher API for segmentation and trigram extraction. However, trigrams generation algorithm for the query string is different from the previous one: it simply yields sequences of 3 consecutive lowercased valid characters (letters, digits). Dex RFC in the mailing list: http://lists.llvm.org/pipermail/clangd-dev/2018-July/000022.html The trigram generation techniques are described in detail in the proposal: https://docs.google.com/document/d/1C-A6PGT6TynyaX4PXyExNMiGmJ2jL1UwV91Kyx11gOI/edit#heading=h.903u1zon9nkj Reviewers: sammccall, ioeric, ilya-biryukovA Subscribers: cfe-commits, klimek, mgorny, MaskRay, jkorous, arphaman Differential Revision: https://reviews.llvm.org/D49591 Added: clang-tools-extra/trunk/clangd/index/dex/ clang-tools-extra/trunk/clangd/index/dex/Token.h clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp clang-tools-extra/trunk/clangd/index/dex/Trigram.h clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp Modified: clang-tools-extra/trunk/clangd/CMakeLists.txt clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt Modified: clang-tools-extra/trunk/clangd/CMakeLists.txt URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/CMakeLists.txt?rev=337901&r1=337900&r2=337901&view=diff ============================================================================== --- clang-tools-extra/trunk/clangd/CMakeLists.txt (original) +++ clang-tools-extra/trunk/clangd/CMakeLists.txt Wed Jul 25 03:34:57 2018 @@ -34,6 +34,7 @@ add_clang_library(clangDaemon TUScheduler.cpp URI.cpp XRefs.cpp + index/CanonicalIncludes.cpp index/FileIndex.cpp index/Index.cpp @@ -42,6 +43,8 @@ add_clang_library(clangDaemon index/SymbolCollector.cpp index/SymbolYAML.cpp + index/dex/Trigram.cpp + LINK_LIBS clangAST clangASTMatchers Added: clang-tools-extra/trunk/clangd/index/dex/Token.h URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Token.h?rev=337901&view=auto ============================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Token.h (added) +++ clang-tools-extra/trunk/clangd/index/dex/Token.h Wed Jul 25 03:34:57 2018 @@ -0,0 +1,112 @@ +//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Token objects represent a characteristic of a symbol, which can be used to +// perform efficient search. Tokens are keys for inverted index which are mapped +// to the corresponding posting lists. +// +// The symbol std::cout might have the tokens: +// * Scope "std::" +// * Trigram "cou" +// * Trigram "out" +// * Type "std::ostream" +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/raw_ostream.h" + +#include <string> +#include <vector> + +namespace clang { +namespace clangd { +namespace dex { + +/// A Token represents an attribute of a symbol, such as a particular trigram +/// present in the name (used for fuzzy search). +/// +/// Tokens can be used to perform more sophisticated search queries by +/// constructing complex iterator trees. +struct Token { + /// Kind specifies Token type which defines semantics for the internal + /// representation. Each Kind has different representation stored in Data + /// field. + enum class Kind { + /// Represents trigram used for fuzzy search of unqualified symbol names. + /// + /// Data contains 3 bytes with trigram contents. + Trigram, + /// Scope primitives, e.g. "symbol belongs to namespace foo::bar". + /// + /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global + /// scope). + Scope, + /// Internal Token type for invalid/special tokens, e.g. empty tokens for + /// llvm::DenseMap. + Sentinel, + /// FIXME(kbobyrev): Add other Token Kinds + /// * Path with full or relative path to the directory in which symbol is + /// defined + /// * Type with qualified type name or its USR + }; + + Token(Kind TokenKind, llvm::StringRef Data) + : Data(Data), TokenKind(TokenKind) {} + + bool operator==(const Token &Other) const { + return TokenKind == Other.TokenKind && Data == Other.Data; + } + + /// Representation which is unique among Token with the same Kind. + std::string Data; + Kind TokenKind; + + friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) { + return OS << T.Data; + } + +private: + friend llvm::hash_code hash_value(const Token &Token) { + return llvm::hash_combine(static_cast<int>(Token.TokenKind), Token.Data); + } +}; + +} // namespace dex +} // namespace clangd +} // namespace clang + +namespace llvm { + +// Support Tokens as DenseMap keys. +template <> struct DenseMapInfo<clang::clangd::dex::Token> { + static inline clang::clangd::dex::Token getEmptyKey() { + return {clang::clangd::dex::Token::Kind::Sentinel, "EmptyKey"}; + } + + static inline clang::clangd::dex::Token getTombstoneKey() { + return {clang::clangd::dex::Token::Kind::Sentinel, "TombstoneKey"}; + } + + static unsigned getHashValue(const clang::clangd::dex::Token &Tag) { + return hash_value(Tag); + } + + static bool isEqual(const clang::clangd::dex::Token &LHS, + const clang::clangd::dex::Token &RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif Added: clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp?rev=337901&view=auto ============================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp (added) +++ clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp Wed Jul 25 03:34:57 2018 @@ -0,0 +1,132 @@ +//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Trigram.h" +#include "../../FuzzyMatch.h" +#include "Token.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringExtras.h" + +#include <cctype> +#include <queue> +#include <string> + +using namespace llvm; + +namespace clang { +namespace clangd { +namespace dex { + +// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would +// be generating unigrams and bigrams here, too. This would prevent symbol index +// from applying fuzzy matching on a tremendous number of symbols and allow +// supplementary retrieval for short queries. +// +// Short names (total segment length <3 characters) are currently ignored. +std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) { + // Apply fuzzy matching text segmentation. + std::vector<CharRole> Roles(Identifier.size()); + calculateRoles(Identifier, + llvm::makeMutableArrayRef(Roles.data(), Identifier.size())); + + std::string LowercaseIdentifier = Identifier.lower(); + + // For each character, store indices of the characters to which fuzzy matching + // algorithm can jump. There are 3 possible variants: + // + // * Next Tail - next character from the same segment + // * Next Head - front character of the next segment + // * Skip-1-Next Head - front character of the skip-1-next segment + // + // Next stores tuples of three indices in the presented order, if a variant is + // not available then 0 is stored. + std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size()); + unsigned NextTail = 0, NextHead = 0, NextNextHead = 0; + for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { + Next[I] = {{NextTail, NextHead, NextNextHead}}; + NextTail = Roles[I] == Tail ? I : 0; + if (Roles[I] == Head) { + NextNextHead = NextHead; + NextHead = I; + } + } + + DenseSet<Token> UniqueTrigrams; + std::array<char, 4> Chars; + for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { + // Skip delimiters. + if (Roles[I] != Head && Roles[I] != Tail) + continue; + for (const unsigned J : Next[I]) { + if (!J) + continue; + for (const unsigned K : Next[J]) { + if (!K) + continue; + Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], + LowercaseIdentifier[K], 0}}; + auto Trigram = Token(Token::Kind::Trigram, Chars.data()); + // Push unique trigrams to the result. + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + } + } + } + } + + std::vector<Token> Result; + for (const auto &Trigram : UniqueTrigrams) + Result.push_back(Trigram); + + return Result; +} + +// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short +// inputs (total segment length <3 characters). +std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) { + // Apply fuzzy matching text segmentation. + std::vector<CharRole> Roles(Query.size()); + calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + + std::string LowercaseQuery = Query.lower(); + + DenseSet<Token> UniqueTrigrams; + std::deque<char> Chars; + + for (size_t I = 0; I < LowercaseQuery.size(); ++I) { + // If current symbol is delimiter, just skip it. + if (Roles[I] != Head && Roles[I] != Tail) + continue; + + Chars.push_back(LowercaseQuery[I]); + + if (Chars.size() > 3) + Chars.pop_front(); + if (Chars.size() == 3) { + auto Trigram = + Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + // Push unique trigrams to the result. + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + } + } + } + + std::vector<Token> Result; + for (const auto &Trigram : UniqueTrigrams) + Result.push_back(Trigram); + + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Added: clang-tools-extra/trunk/clangd/index/dex/Trigram.h URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clangd/index/dex/Trigram.h?rev=337901&view=auto ============================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Trigram.h (added) +++ clang-tools-extra/trunk/clangd/index/dex/Trigram.h Wed Jul 25 03:34:57 2018 @@ -0,0 +1,62 @@ +//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Trigrams are attributes of the symbol unqualified name used to effectively +// extract symbols which can be fuzzy-matched given user query from the inverted +// index. To match query with the extracted set of trigrams Q, the set of +// generated trigrams T for identifier (unqualified symbol name) should contain +// all items of Q, i.e. Q â T. +// +// Trigram sets extracted from unqualified name and from query are different: +// the set of query trigrams only contains consecutive sequences of three +// characters (which is only a subset of all trigrams generated for an +// identifier). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H + +#include "Token.h" + +#include <string> + +namespace clang { +namespace clangd { +namespace dex { + +/// Returns list of unique fuzzy-search trigrams from unqualified symbol. +/// +/// First, given Identifier (unqualified symbol name) is segmented using +/// FuzzyMatch API and lowercased. After segmentation, the following technique +/// is applied for generating trigrams: for each letter or digit in the input +/// string the algorithms looks for the possible next and skip-1-next symbols +/// which can be jumped to during fuzzy matching. Each combination of such three +/// symbols is inserted into the result. +/// +/// Trigrams can start at any character in the input. Then we can choose to move +/// to the next character, move to the start of the next segment, or skip over a +/// segment. +/// +/// Note: the returned list of trigrams does not have duplicates, if any trigram +/// belongs to more than one class it is only inserted once. +std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier); + +/// Returns list of unique fuzzy-search trigrams given a query. +/// +/// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then, +/// the simplest trigrams - sequences of three consecutive letters and digits +/// are extracted and returned after deduplication. +std::vector<Token> generateQueryTrigrams(llvm::StringRef Query); + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif Modified: clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt?rev=337901&r1=337900&r2=337901&view=diff ============================================================================== --- clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt (original) +++ clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt Wed Jul 25 03:34:57 2018 @@ -15,9 +15,10 @@ add_extra_unittest(ClangdTests CodeCompleteTests.cpp CodeCompletionStringsTests.cpp ContextTests.cpp + DexIndexTests.cpp DraftStoreTests.cpp - FileIndexTests.cpp FileDistanceTests.cpp + FileIndexTests.cpp FindSymbolsTests.cpp FuzzyMatchTests.cpp GlobalCompilationDatabaseTests.cpp @@ -27,11 +28,11 @@ add_extra_unittest(ClangdTests SourceCodeTests.cpp SymbolCollectorTests.cpp SyncAPI.cpp + TUSchedulerTests.cpp TestFS.cpp TestTU.cpp ThreadingTests.cpp TraceTests.cpp - TUSchedulerTests.cpp URITests.cpp XRefsTests.cpp ) Added: clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp?rev=337901&view=auto ============================================================================== --- clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp (added) +++ clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp Wed Jul 25 03:34:57 2018 @@ -0,0 +1,96 @@ +//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "index/dex/Token.h" +#include "index/dex/Trigram.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include <string> +#include <vector> + +namespace clang { +namespace clangd { +namespace dex { + +testing::Matcher<std::vector<Token>> +trigramsAre(std::initializer_list<std::string> Trigrams) { + std::vector<Token> Tokens; + for (const auto &Symbols : Trigrams) { + Tokens.push_back(Token(Token::Kind::Trigram, Symbols)); + } + return testing::UnorderedElementsAreArray(Tokens); +} + +TEST(DexIndexTrigrams, IdentifierTrigrams) { + EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"})); + + EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({})); + + EXPECT_THAT(generateIdentifierTrigrams("clangd"), + trigramsAre({"cla", "lan", "ang", "ngd"})); + + EXPECT_THAT(generateIdentifierTrigrams("abc_def"), + trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"})); + + EXPECT_THAT( + generateIdentifierTrigrams("a_b_c_d_e_"), + trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + + EXPECT_THAT( + generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", + "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); + + EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), + trigramsAre({"tud", "tde", "ude", "dec", "ecl"})); + + EXPECT_THAT(generateIdentifierTrigrams("IsOK"), + trigramsAre({"iso", "iok", "sok"})); + + EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"), + trigramsAre({ + "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd", + "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk", + "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg", + "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", + "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", + })); +} + +TEST(DexIndexTrigrams, QueryTrigrams) { + EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); + + EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({})); + + EXPECT_THAT(generateQueryTrigrams("clangd"), + trigramsAre({"cla", "lan", "ang", "ngd"})); + + EXPECT_THAT(generateQueryTrigrams("abc_def"), + trigramsAre({"abc", "bcd", "cde", "def"})); + + EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"), + trigramsAre({"abc", "bcd", "cde"})); + + EXPECT_THAT(generateQueryTrigrams("unique_ptr"), + trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"})); + + EXPECT_THAT(generateQueryTrigrams("TUDecl"), + trigramsAre({"tud", "ude", "dec", "ecl"})); + + EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"})); + + EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"), + trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi", + "hij", "ijk", "jkl", "klm"})); +} + +} // namespace dex +} // namespace clangd +} // namespace clang _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits