kbobyrev updated this revision to Diff 157196.
kbobyrev marked 7 inline comments as done.
kbobyrev added a comment.

Address last round of comments.


https://reviews.llvm.org/D49591

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/index/dex/Trigram.h
  clang-tools-extra/unittests/clangd/CMakeLists.txt
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -0,0 +1,96 @@
+//===-- DexIndexTests.cpp  ----------------------------*- C++ -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/dex/Token.h"
+#include "index/dex/Trigram.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+testing::Matcher<std::vector<Token>>
+trigramsAre(std::initializer_list<std::string> Trigrams) {
+  std::vector<Token> Tokens;
+  for (const auto &Symbols : Trigrams) {
+    Tokens.push_back(Token(Token::Kind::Trigram, Symbols));
+  }
+  return testing::UnorderedElementsAreArray(Tokens);
+}
+
+TEST(DexIndexTrigrams, IdentifierTrigrams) {
+  EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("clangd"),
+              trigramsAre({"cla", "lan", "ang", "ngd"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
+              trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+
+  EXPECT_THAT(
+      generateIdentifierTrigrams("a_b_c_d_e_"),
+      trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+  EXPECT_THAT(
+      generateIdentifierTrigrams("unique_ptr"),
+      trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
+                   "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
+              trigramsAre({"tud", "tde", "ude", "dec", "ecl"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
+              trigramsAre({"iso", "iok", "sok"}));
+
+  EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"),
+              trigramsAre({
+                  "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
+                  "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
+                  "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
+                  "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
+                  "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
+              }));
+}
+
+TEST(DexIndexTrigrams, QueryTrigrams) {
+  EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
+
+  EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({}));
+
+  EXPECT_THAT(generateQueryTrigrams("clangd"),
+              trigramsAre({"cla", "lan", "ang", "ngd"}));
+
+  EXPECT_THAT(generateQueryTrigrams("abc_def"),
+              trigramsAre({"abc", "bcd", "cde", "def"}));
+
+  EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"),
+              trigramsAre({"abc", "bcd", "cde"}));
+
+  EXPECT_THAT(generateQueryTrigrams("unique_ptr"),
+              trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"}));
+
+  EXPECT_THAT(generateQueryTrigrams("TUDecl"),
+              trigramsAre({"tud", "ude", "dec", "ecl"}));
+
+  EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"}));
+
+  EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"),
+              trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi",
+                           "hij", "ijk", "jkl", "klm"}));
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/unittests/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/unittests/clangd/CMakeLists.txt
+++ clang-tools-extra/unittests/clangd/CMakeLists.txt
@@ -15,9 +15,10 @@
   CodeCompleteTests.cpp
   CodeCompletionStringsTests.cpp
   ContextTests.cpp
+  DexIndexTests.cpp
   DraftStoreTests.cpp
-  FileIndexTests.cpp
   FileDistanceTests.cpp
+  FileIndexTests.cpp
   FindSymbolsTests.cpp
   FuzzyMatchTests.cpp
   GlobalCompilationDatabaseTests.cpp
@@ -27,11 +28,11 @@
   SourceCodeTests.cpp
   SymbolCollectorTests.cpp
   SyncAPI.cpp
+  TUSchedulerTests.cpp
   TestFS.cpp
   TestTU.cpp
   ThreadingTests.cpp
   TraceTests.cpp
-  TUSchedulerTests.cpp
   URITests.cpp
   XRefsTests.cpp
   )
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -0,0 +1,62 @@
+//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Trigrams are attributes of the symbol unqualified name used to effectively
+// extract symbols which can be fuzzy-matched given user query from the inverted
+// index. To match query with the extracted set of trigrams Q, the set of
+// generated trigrams T for identifier (unqualified symbol name) should contain
+// all items of Q, i.e. Q ⊆ T.
+//
+// Trigram sets extracted from unqualified name and from query are different:
+// the set of query trigrams only contains consecutive sequences of three
+// characters (which is only a subset of all trigrams generated for an
+// identifier).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+
+#include "Token.h"
+
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// First, given Identifier (unqualified symbol name) is segmented using
+/// FuzzyMatch API and lowercased. After segmentation, the following technique
+/// is applied for generating trigrams: for each letter or digit in the input
+/// string the algorithms looks for the possible next and skip-1-next symbols
+/// which can be jumped to during fuzzy matching. Each combination of such three
+/// symbols is inserted into the result.
+///
+/// Trigrams can start at any character in the input. Then we can choose to move
+/// to the next character, move to the start of the next segment, or skip over a
+/// segment.
+///
+/// Note: the returned list of trigrams does not have duplicates, if any trigram
+/// belongs to more than one class it is only inserted once.
+std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
+
+/// Returns list of unique fuzzy-search trigrams given a query.
+///
+/// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then,
+/// the simplest trigrams - sequences of three consecutive letters and digits
+/// are extracted and returned after deduplication.
+std::vector<Token> generateQueryTrigrams(llvm::StringRef Query);
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -0,0 +1,132 @@
+//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Trigram.h"
+#include "../../FuzzyMatch.h"
+#include "Token.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+
+#include <cctype>
+#include <queue>
+#include <string>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
+// be generating unigrams and bigrams here, too. This would prevent symbol index
+// from applying fuzzy matching on a tremendous number of symbols and allow
+// supplementary retrieval for short queries.
+//
+// Short names (total segment length <3 characters) are currently ignored.
+std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
+  // Apply fuzzy matching text segmentation.
+  std::vector<CharRole> Roles(Identifier.size());
+  calculateRoles(Identifier,
+                 llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
+
+  std::string LowercaseIdentifier = Identifier.lower();
+
+  // For each character, store indices of the characters to which fuzzy matching
+  // algorithm can jump. There are 3 possible variants:
+  //
+  // * Next Tail - next character from the same segment
+  // * Next Head - front character of the next segment
+  // * Skip-1-Next Head - front character of the skip-1-next segment
+  //
+  // Next stores tuples of three indices in the presented order, if a variant is
+  // not available then 0 is stored.
+  std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
+  unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
+    Next[I] = {{NextTail, NextHead, NextNextHead}};
+    NextTail = Roles[I] == Tail ? I : 0;
+    if (Roles[I] == Head) {
+      NextNextHead = NextHead;
+      NextHead = I;
+    }
+  }
+
+  DenseSet<Token> UniqueTrigrams;
+  std::array<char, 4> Chars;
+  for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
+    // Skip delimiters.
+    if (Roles[I] != Head && Roles[I] != Tail)
+      continue;
+    for (const unsigned J : Next[I]) {
+      if (!J)
+        continue;
+      for (const unsigned K : Next[J]) {
+        if (!K)
+          continue;
+        Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J],
+                  LowercaseIdentifier[K], 0}};
+        auto Trigram = Token(Token::Kind::Trigram, Chars.data());
+        // Push unique trigrams to the result.
+        if (!UniqueTrigrams.count(Trigram)) {
+          UniqueTrigrams.insert(Trigram);
+        }
+      }
+    }
+  }
+
+  std::vector<Token> Result;
+  for (const auto &Trigram : UniqueTrigrams)
+    Result.push_back(Trigram);
+
+  return Result;
+}
+
+// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
+// inputs (total segment length <3 characters).
+std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
+  // Apply fuzzy matching text segmentation.
+  std::vector<CharRole> Roles(Query.size());
+  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+
+  std::string LowercaseQuery = Query.lower();
+
+  DenseSet<Token> UniqueTrigrams;
+  std::deque<char> Chars;
+
+  for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
+    // If current symbol is delimiter, just skip it.
+    if (Roles[I] != Head && Roles[I] != Tail)
+      continue;
+
+    Chars.push_back(LowercaseQuery[I]);
+
+    if (Chars.size() > 3)
+      Chars.pop_front();
+    if (Chars.size() == 3) {
+      auto Trigram =
+          Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+      // Push unique trigrams to the result.
+      if (!UniqueTrigrams.count(Trigram)) {
+        UniqueTrigrams.insert(Trigram);
+      }
+    }
+  }
+
+  std::vector<Token> Result;
+  for (const auto &Trigram : UniqueTrigrams)
+    Result.push_back(Trigram);
+
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -0,0 +1,112 @@
+//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Token objects represent a characteristic of a symbol, which can be used to
+// perform efficient search. Tokens are keys for inverted index which are mapped
+// to the corresponding posting lists.
+//
+// The symbol std::cout might have the tokens:
+// * Scope "std::"
+// * Trigram "cou"
+// * Trigram "out"
+// * Type "std::ostream"
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// A Token represents an attribute of a symbol, such as a particular trigram
+/// present in the name (used for fuzzy search).
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+struct Token {
+  /// Kind specifies Token type which defines semantics for the internal
+  /// representation. Each Kind has different representation stored in Data
+  /// field.
+  enum class Kind {
+    /// Represents trigram used for fuzzy search of unqualified symbol names.
+    ///
+    /// Data contains 3 bytes with trigram contents.
+    Trigram,
+    /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
+    ///
+    /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global
+    /// scope).
+    Scope,
+    /// Internal Token type for invalid/special tokens, e.g. empty tokens for
+    /// llvm::DenseMap.
+    Sentinel,
+    /// FIXME(kbobyrev): Add other Token Kinds
+    /// * Path with full or relative path to the directory in which symbol is
+    ///   defined
+    /// * Type with qualified type name or its USR
+  };
+
+  Token(Kind TokenKind, llvm::StringRef Data)
+      : Data(Data), TokenKind(TokenKind) {}
+
+  bool operator==(const Token &Other) const {
+    return TokenKind == Other.TokenKind && Data == Other.Data;
+  }
+
+  /// Representation which is unique among Token with the same Kind.
+  std::string Data;
+  Kind TokenKind;
+
+  friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) {
+    return OS << T.Data;
+  }
+
+private:
+  friend llvm::hash_code hash_value(const Token &Token) {
+    return llvm::hash_combine(static_cast<int>(Token.TokenKind), Token.Data);
+  }
+};
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support Tokens as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::dex::Token> {
+  static inline clang::clangd::dex::Token getEmptyKey() {
+    return {clang::clangd::dex::Token::Kind::Sentinel, "EmptyKey"};
+  }
+
+  static inline clang::clangd::dex::Token getTombstoneKey() {
+    return {clang::clangd::dex::Token::Kind::Sentinel, "TombstoneKey"};
+  }
+
+  static unsigned getHashValue(const clang::clangd::dex::Token &Tag) {
+    return hash_value(Tag);
+  }
+
+  static bool isEqual(const clang::clangd::dex::Token &LHS,
+                      const clang::clangd::dex::Token &RHS) {
+    return LHS == RHS;
+  }
+};
+
+} // namespace llvm
+
+#endif
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -34,14 +34,17 @@
   TUScheduler.cpp
   URI.cpp
   XRefs.cpp
+
   index/CanonicalIncludes.cpp
   index/FileIndex.cpp
   index/Index.cpp
   index/MemIndex.cpp
   index/Merge.cpp
   index/SymbolCollector.cpp
   index/SymbolYAML.cpp
 
+  index/dex/Trigram.cpp
+
   LINK_LIBS
   clangAST
   clangASTMatchers
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to