kbobyrev updated this revision to Diff 156774.
kbobyrev added a comment.

Wrong diff, should be correct now.


https://reviews.llvm.org/D49591

Files:
  clang-tools-extra/clangd/CMakeLists.txt
  clang-tools-extra/clangd/index/dex/Token.cpp
  clang-tools-extra/clangd/index/dex/Token.h
  clang-tools-extra/clangd/index/dex/Trigram.cpp
  clang-tools-extra/clangd/index/dex/Trigram.h
  clang-tools-extra/unittests/clangd/CMakeLists.txt
  clang-tools-extra/unittests/clangd/DexIndexTests.cpp

Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp
@@ -0,0 +1,92 @@
+//===-- DexIndexTests.cpp  ----------------------------*- C++ -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/dex/Token.h"
+#include "index/dex/Trigram.h"
+#include "gtest/gtest.h"
+
+#include <string>
+#include <vector>
+
+using std::string;
+using std::vector;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+vector<Token> getTrigrams(std::initializer_list<string> Trigrams) {
+  vector<Token> Result;
+  for (const auto &Symbols : Trigrams) {
+    Result.push_back(Token(Token::Kind::Trigram, Symbols));
+  }
+  return Result;
+}
+
+TEST(DexIndexTrigrams, IdentifierTrigrams) {
+  EXPECT_EQ(generateIdentifierTrigrams("X86"), getTrigrams({"x86"}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("nl"), getTrigrams({}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("clangd"),
+            getTrigrams({"cla", "lan", "ang", "ngd"}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("abc_def"),
+            getTrigrams({"abc", "abd", "ade", "bcd", "bde", "cde", "def"}));
+
+  EXPECT_EQ(
+      generateIdentifierTrigrams("a_b_c_d_e_"),
+      getTrigrams({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("unique_ptr"),
+            getTrigrams({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp",
+                         "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("TUDecl"),
+            getTrigrams({"tud", "tde", "ude", "dec", "ecl"}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("IsOK"),
+            getTrigrams({"iso", "iok", "sok"}));
+
+  EXPECT_EQ(generateIdentifierTrigrams("abc_defGhij__klm"),
+            getTrigrams({
+                "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd",
+                "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk",
+                "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg",
+                "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
+                "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm",
+            }));
+}
+
+TEST(DexIndexTrigrams, QueryTrigrams) {
+  EXPECT_EQ(generateQueryTrigrams("X86"), getTrigrams({"x86"}));
+
+  EXPECT_EQ(generateQueryTrigrams("nl"), getTrigrams({}));
+
+  EXPECT_EQ(generateQueryTrigrams("clangd"),
+            getTrigrams({"cla", "lan", "ang", "ngd"}));
+
+  EXPECT_EQ(generateQueryTrigrams("abc_def"), getTrigrams({"abc", "def"}));
+
+  EXPECT_EQ(generateQueryTrigrams("a_b_c_d_e_"), getTrigrams({}));
+
+  EXPECT_EQ(generateQueryTrigrams("unique_ptr"),
+            getTrigrams({"uni", "niq", "iqu", "que", "ptr"}));
+
+  EXPECT_EQ(generateQueryTrigrams("TUDecl"), getTrigrams({"dec", "ecl"}));
+
+  EXPECT_EQ(generateQueryTrigrams("IsOK"), getTrigrams({}));
+
+  EXPECT_EQ(generateQueryTrigrams("abc_defGhij__klm"),
+            getTrigrams({"abc", "def", "ghi", "hij", "klm"}));
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/unittests/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/unittests/clangd/CMakeLists.txt
+++ clang-tools-extra/unittests/clangd/CMakeLists.txt
@@ -15,9 +15,10 @@
   CodeCompleteTests.cpp
   CodeCompletionStringsTests.cpp
   ContextTests.cpp
+  DexIndexTests.cpp
   DraftStoreTests.cpp
-  FileIndexTests.cpp
   FileDistanceTests.cpp
+  FileIndexTests.cpp
   FindSymbolsTests.cpp
   FuzzyMatchTests.cpp
   GlobalCompilationDatabaseTests.cpp
@@ -27,11 +28,11 @@
   SourceCodeTests.cpp
   SymbolCollectorTests.cpp
   SyncAPI.cpp
+  TUSchedulerTests.cpp
   TestFS.cpp
   TestTU.cpp
   ThreadingTests.cpp
   TraceTests.cpp
-  TUSchedulerTests.cpp
   URITests.cpp
   XRefsTests.cpp
   )
Index: clang-tools-extra/clangd/index/dex/Trigram.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Trigram.h
@@ -0,0 +1,62 @@
+//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Trigrams are attributes of the symbol unqualified name used to effectively
+// extract symbols which can be fuzzy-matched given user query from the inverted
+// index. To match query with the extracted set of trigrams Q, the set of
+// generated trigrams T for identifier (unqualified symbol name) should contain
+// all items of Q, i.e. Q ⊆ T.
+//
+// Trigram sets extracted from unqualified name and from query are different:
+// the set of query trigrams only contains consecutive sequences of three
+// characters (which is only a subset of all trigrams generated for an
+// identifier).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H
+
+#include "Token.h"
+
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// First, given Identifier (unqualified symbol name) is segmented using
+/// calculateRoles and downcasted to lowercase. After segmentation, the
+/// following technique is applied for generating trigrams: for each letter or
+/// digit in the input string the algorithms looks for the possible next and
+/// skip-1-next symbols which can be jumped to during fuzzy matching. Each
+/// combination of such three symbols is inserted into the result.
+///
+/// Trigrams can start at any character in the input. Then we can choose to move
+/// to the next character, move to the start of the next segment, or skip over a
+/// segment.
+///
+/// Note: the returned list of trigrams does not have duplicates, if any trigram
+/// belongs to more than one class it is only inserted once.
+std::vector<Token> generateIdentifierTrigrams(std::string Identifier);
+
+/// Returns list of unique fuzzy-search trigrams given a query.
+///
+/// Query is segmented using clangd::calculateRoles and downcasted to lowercase.
+/// Then, the simplest trigrams - sequences of three consecutive from the same
+/// segment are extracted and returned after deduplication.
+std::vector<Token> generateQueryTrigrams(std::string Query);
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+#endif
Index: clang-tools-extra/clangd/index/dex/Trigram.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Trigram.cpp
@@ -0,0 +1,130 @@
+//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Trigram.h"
+#include "../../FuzzyMatch.h"
+#include "Token.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+
+#include <cctype>
+#include <queue>
+#include <string>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would
+// be generating unigrams and bigrams here, too. This would prevent symbol index
+// from applying fuzzy matching on a tremendous number of symbols and allow
+// supplementary retrieval for short queries.
+//
+// Short names (1 segment with <3 characters) are currently ignored.
+std::vector<Token> generateIdentifierTrigrams(std::string Identifier) {
+  // Apply fuzzy matching text segmentation.
+  std::vector<CharRole> Roles(Identifier.size());
+  calculateRoles(Identifier,
+                 llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
+  std::transform(begin(Identifier), end(Identifier), begin(Identifier),
+                 ::tolower);
+
+  // For each character, store indices of the characters to which fuzzy matching
+  // algorithm can jump. There are 3 possible variants:
+  //
+  // * Next Tail - next character from the same segment
+  // * Next Head - front character of the next segment
+  // * Skip-1-Next Head - front character of the skip-1-next segment
+  //
+  // Next stores tuples of three indices in the presented order, if a variant is
+  // not available then 0 is stored.
+  std::vector<std::array<unsigned, 3>> Next(Identifier.size());
+  unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
+  for (int I = Identifier.size() - 1; I >= 0; --I) {
+    Next[I] = {{NextTail, NextHead, NextNextHead}};
+    NextTail = Roles[I] == Tail ? I : 0;
+    if (Roles[I] == Head) {
+      NextNextHead = NextHead;
+      NextHead = I;
+    }
+  }
+
+  DenseSet<Token> UniqueTrigrams;
+  std::vector<Token> Result;
+  std::array<char, 4> Chars;
+  for (size_t I = 0; I < Identifier.size(); ++I) {
+    // Skip delimiters.
+    if (Roles[I] != Head && Roles[I] != Tail)
+      continue;
+    for (const unsigned J : Next[I]) {
+      if (!J)
+        continue;
+      for (const unsigned K : Next[J]) {
+        if (!K)
+          continue;
+        Chars = {{Identifier[I], Identifier[J], Identifier[K], 0}};
+        auto Trigram = Token(Token::Kind::Trigram, Chars.data());
+        // Push unique trigrams to the result.
+        if (!UniqueTrigrams.count(Trigram)) {
+          UniqueTrigrams.insert(Trigram);
+          Result.push_back(Trigram);
+        }
+      }
+    }
+  }
+
+  return Result;
+}
+
+// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short
+// inputs (1 segment with <3 characters).
+std::vector<Token> generateQueryTrigrams(std::string Query) {
+  // Apply fuzzy matching text segmentation.
+  std::vector<CharRole> Roles(Query.size());
+  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
+  std::transform(begin(Query), end(Query), begin(Query), ::tolower);
+
+  DenseSet<Token> UniqueTrigrams;
+  std::vector<Token> Result;
+  std::deque<char> Chars;
+
+  for (size_t I = 0; I < Query.size(); ++I) {
+    // If current symbol is delimiter, just skip it.
+    if (Roles[I] != Head && Roles[I] != Tail)
+      continue;
+
+    // New segment starts with Head: flush Chars.
+    if (Roles[I] == Head)
+      while (!Chars.empty())
+        Chars.pop_front();
+
+    Chars.push_back(Query[I]);
+
+    if (Chars.size() > 3)
+      Chars.pop_front();
+    if (Chars.size() == 3) {
+      auto Trigram =
+          Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)));
+      // Push unique trigrams to the result.
+      if (!UniqueTrigrams.count(Trigram)) {
+        UniqueTrigrams.insert(Trigram);
+        Result.push_back(Trigram);
+      }
+    }
+  }
+
+  return Result;
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/index/dex/Token.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.h
@@ -0,0 +1,108 @@
+//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Token objects represent a characteristic of a symbol, which can be used to
+// perform efficient search. Tokens are keys for inverted index which are mapped
+// to the corresponding posting lists.
+//
+// The symbol std::cout might have the tokens:
+// * Scope "std::"
+// * Trigram "cou"
+// * Trigram "out"
+// * Type "std::ostream"
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <string>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+/// A Token represents an attribute of a symbol, such as a particular trigram
+/// present in the name (used for fuzzy search).
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+class Token {
+public:
+  /// Kind specifies Token type which defines semantics for the internal
+  /// representation. Each Kind has different representation stored in Data
+  /// field.
+  enum class Kind {
+    /// Represents trigram used for fuzzy search of unqualified symbol names.
+    ///
+    /// Data contains 3 bytes with trigram contents.
+    Trigram,
+    /// Scope primitives, e.g. "symbol belongs to namespace foo::bar".
+    ///
+    /// Data stroes full scope name is stored, e.g. "foo::bar::baz::" or ""
+    /// (global scope).
+    Scope,
+    /// FIXME(kbobyrev): Add other Token Kinds
+    /// * Path with full or relative path to the directory in which symbol is
+    ///   defined
+    /// * Type with qualified type name or its USR
+  };
+
+  Token(Kind TokenKind, llvm::StringRef Data);
+
+  bool operator==(const Token &Other) const;
+
+  llvm::StringRef data() const;
+
+  const Kind &kind() const;
+
+private:
+  friend llvm::hash_code hash_value(const Token &Token);
+
+  /// Representation which is unique among Token with the same Kind.
+  std::string Data;
+  Kind TokenKind;
+};
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support Tokens as DenseMap keys.
+template <> struct DenseMapInfo<clang::clangd::dex::Token> {
+  static inline clang::clangd::dex::Token getEmptyKey() {
+    static clang::clangd::dex::Token EmptyKey(
+        clang::clangd::dex::Token::Kind::Scope, "EmptyKey");
+    return EmptyKey;
+  }
+
+  static inline clang::clangd::dex::Token getTombstoneKey() {
+    static clang::clangd::dex::Token TombstoneKey(
+        clang::clangd::dex::Token::Kind::Scope, "TombstoneKey");
+    return TombstoneKey;
+  }
+
+  static unsigned getHashValue(const clang::clangd::dex::Token &Tag) {
+    return hash_value(Tag);
+  }
+
+  static bool isEqual(const clang::clangd::dex::Token &LHS,
+                      const clang::clangd::dex::Token &RHS) {
+    return LHS == RHS;
+  }
+};
+
+} // namespace llvm
+
+#endif
Index: clang-tools-extra/clangd/index/dex/Token.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/dex/Token.cpp
@@ -0,0 +1,38 @@
+//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Token.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Twine.h"
+
+#include <cctype>
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace dex {
+
+Token::Token(Kind TokenKind, llvm::StringRef Data)
+    : Data(Data), TokenKind(TokenKind) {}
+
+bool Token::operator==(const Token &Other) const {
+  return TokenKind == Other.TokenKind && Data == Other.Data;
+}
+
+llvm::StringRef Token::data() const { return Data; }
+
+const Token::Kind &Token::kind() const { return TokenKind; }
+
+llvm::hash_code hash_value(const Token &Token) {
+  return llvm::hash_combine(static_cast<int>(Token.TokenKind), Token.Data);
+}
+
+} // namespace dex
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -34,14 +34,18 @@
   TUScheduler.cpp
   URI.cpp
   XRefs.cpp
+
   index/CanonicalIncludes.cpp
   index/FileIndex.cpp
   index/Index.cpp
   index/MemIndex.cpp
   index/Merge.cpp
   index/SymbolCollector.cpp
   index/SymbolYAML.cpp
 
+  index/dex/Token.cpp
+  index/dex/Trigram.cpp
+
   LINK_LIBS
   clangAST
   clangASTMatchers
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to