sammccall created this revision.
sammccall added a reviewer: ioeric.
Herald added subscribers: cfe-commits, kadircet, arphaman, mgrang, jkorous, 
ilya-biryukov.

1. Instead of a$$ for a short-query trigram, just use a
2. Generate more short-query trigrams, e.g. "AbcDefGhi" now yields "d" and 
"ag". This is effectively required by LSP, having "ag" not match but "agh" 
match will lead to glitches due to client-side filtering.
3. Drop leading-punctuation short-query trigrams. Nice idea, but current 
implementation is broken (competes with non-punctuation short query trigrams)


Repository:
  rCTE Clang Tools Extra

https://reviews.llvm.org/D52808

Files:
  clangd/index/dex/Trigram.cpp
  clangd/index/dex/Trigram.h
  clangd/indexer/IndexerMain.cpp
  unittests/clangd/DexTests.cpp

Index: unittests/clangd/DexTests.cpp
===================================================================
--- unittests/clangd/DexTests.cpp
+++ unittests/clangd/DexTests.cpp
@@ -333,53 +333,61 @@
 
 TEST(DexTrigrams, IdentifierTrigrams) {
   EXPECT_THAT(generateIdentifierTrigrams("X86"),
-              trigramsAre({"x86", "x$$", "x8$"}));
+              trigramsAre({"x86", "x", "x8"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"}));
+  EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"}));
 
-  EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"}));
+  EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("clangd"),
-              trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"}));
+              trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
-              trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde",
-                           "def", "ab$", "ad$"}));
+              trigramsAre({"a", "d", "abc", "abd", "ade", "bcd", "bde", "cde",
+                           "def", "ab", "ad", "de"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
-              trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace",
-                           "bcd", "bce", "bde", "cde", "ab$"}));
+              trigramsAre({"a",   "b",   "c",   "d",   "e",   "ab",  "ac",
+                           "bc",  "bd",  "cd",  "ce",  "de",  "abc", "abd",
+                           "acd", "ace", "bcd", "bce", "bde", "cde"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
-              trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt",
+              trigramsAre({"u", "p", "uni", "unp", "upt", "niq", "nip", "npt",
                            "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep",
-                           "ept", "ptr", "un$", "up$"}));
+                           "ept", "ptr", "un", "up", "pt"}));
 
-  EXPECT_THAT(
-      generateIdentifierTrigrams("TUDecl"),
-      trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$", "td$"}));
+  EXPECT_THAT(generateIdentifierTrigrams("TUDecl"),
+              trigramsAre({"t", "d", "tud", "tde", "ude", "dec", "ecl", "tu",
+                           "td", "de"}));
 
   EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
-              trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$"}));
-
+              trigramsAre({"i", "o", "iso", "iok", "sok", "is", "io", "ok"}));
+
+  auto X = generateIdentifierTrigrams("abc_defGhij__klm");
+  llvm::sort(X, [&](const Token &X, const Token &Y) {
+    return X.Data < Y.Data;
+  });
+  for (const auto & E : X)
+    llvm::errs() << E << "\n";
   EXPECT_THAT(
       generateIdentifierTrigrams("abc_defGhij__klm"),
-      trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh",
-                   "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk",
-                   "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek",
-                   "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl",
-                   "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik",
-                   "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"}));
+      trigramsAre(
+          {"a",   "d",   "g",   "k",   "abc", "abd", "abg", "ade", "adg", "adk",
+           "agh", "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde",
+           "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl",
+           "efg", "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk",
+           "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", "ab",  "ad",
+           "ag",  "de",  "dg",  "dk",  "gh",  "gk",  "kl"}));
 }
 
 TEST(DexTrigrams, QueryTrigrams) {
-  EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
-  EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
+  EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c"}));
+  EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl"}));
   EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));
 
-  EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"}));
-  EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
-  EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
+  EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({}));
+  EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({}));
+  EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));
 
   EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));
 
Index: clangd/indexer/IndexerMain.cpp
===================================================================
--- clangd/indexer/IndexerMain.cpp
+++ clangd/indexer/IndexerMain.cpp
@@ -66,7 +66,7 @@
                                        "human-readable YAML format"),
                             clEnumValN(IndexFileFormat::RIFF, "binary",
                                        "binary RIFF format")),
-           llvm::cl::init(IndexFileFormat::YAML));
+           llvm::cl::init(IndexFileFormat::RIFF));
 
 /// Responsible for aggregating symbols from each processed file and producing
 /// the final results. All methods in this class must be thread-safe,
@@ -83,15 +83,15 @@
 };
 
 class SymbolIndexActionFactory : public tooling::FrontendActionFactory {
-public:
+ public:
   SymbolIndexActionFactory(SymbolsConsumer &Consumer) : Consumer(Consumer) {}
 
   clang::FrontendAction *create() override {
     auto CollectorOpts = SymbolCollector::Options();
     CollectorOpts.FallbackDir = AssumedHeaderDir;
     return createStaticIndexingAction(
-               CollectorOpts,
-               [&](SymbolSlab S) { Consumer.consumeSymbols(std::move(S)); })
+        CollectorOpts,
+        [&](SymbolSlab S) { Consumer.consumeSymbols(std::move(S)); })
         .release();
   }
 
@@ -102,7 +102,7 @@
 /// end. Useful for running on MapReduce infrastructure to avoid keeping symbols
 /// from multiple files in memory.
 class ToolExecutorConsumer : public SymbolsConsumer {
-public:
+ public:
   ToolExecutorConsumer(ToolExecutor &Executor) : Executor(Executor) {}
 
   void consumeSymbols(SymbolSlab Symbols) override {
Index: clangd/index/dex/Trigram.h
===================================================================
--- clangd/index/dex/Trigram.h
+++ clangd/index/dex/Trigram.h
@@ -33,26 +33,18 @@
 namespace dex {
 
 /// Returns list of unique fuzzy-search trigrams from unqualified symbol.
+/// The trigrams give the 3-character query substrings this symbol can match.
 ///
-/// First, given Identifier (unqualified symbol name) is segmented using
-/// FuzzyMatch API and lowercased. After segmentation, the following technique
-/// is applied for generating trigrams: for each letter or digit in the input
-/// string the algorithms looks for the possible next and skip-1-next characters
-/// which can be jumped to during fuzzy matching. Each combination of such three
-/// characters is inserted into the result.
-///
+/// The symbol's name is broken into segments, e.g. "FooBar" has two segments.
 /// Trigrams can start at any character in the input. Then we can choose to move
-/// to the next character, move to the start of the next segment, or skip over a
-/// segment.
+/// to the next character, move to the start of the next segment, or stop.
+/// Short trigrams (length 1-2) are used for short queries.
 ///
-/// This also generates incomplete trigrams for short query scenarios:
-///  * Empty trigram: "$$$".
-///  * Unigram: the first character of the identifier.
-///  * Bigrams: a 2-char prefix of the identifier and a bigram of the first two
-///    HEAD characters (if they exist).
-//
-/// Note: the returned list of trigrams does not have duplicates, if any trigram
-/// belongs to more than one class it is only inserted once.
+/// For "FooBar" we get the following trigrams:
+///  {f, b, fo, fb, foo, fob, fba, oob, oba, bar}.
+///
+/// Trigrams are lowercase, as trigram matching is case-insensitive.
+/// Trigrams in the returned list are deduplicated.
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);
 
 /// Returns list of unique fuzzy-search trigrams given a query.
Index: clangd/index/dex/Trigram.cpp
===================================================================
--- clangd/index/dex/Trigram.cpp
+++ clangd/index/dex/Trigram.cpp
@@ -23,11 +23,6 @@
 namespace clangd {
 namespace dex {
 
-/// This is used to mark unigrams and bigrams and distinct them from complete
-/// trigrams. Since '$' is not present in valid identifier names, it is safe to
-/// use it as the special symbol.
-static const char END_MARKER = '$';
-
 std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Identifier.size());
@@ -47,111 +42,68 @@
   // not available then 0 is stored.
   std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
   unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
-  // Store two first HEAD characters in the identifier (if present).
-  std::deque<char> TwoHeads;
   for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
     Next[I] = {{NextTail, NextHead, NextNextHead}};
     NextTail = Roles[I] == Tail ? I : 0;
     if (Roles[I] == Head) {
       NextNextHead = NextHead;
       NextHead = I;
-      TwoHeads.push_front(LowercaseIdentifier[I]);
-      if (TwoHeads.size() > 2)
-        TwoHeads.pop_back();
     }
   }
 
   DenseSet<Token> UniqueTrigrams;
 
-  auto add = [&](std::string Chars) {
+  auto Add = [&](std::string Chars) {
     UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
   };
 
-  if (TwoHeads.size() == 2)
-    add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}});
-
-  if (!LowercaseIdentifier.empty())
-    add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}});
-
-  if (LowercaseIdentifier.size() >= 2)
-    add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}});
-
-  if (LowercaseIdentifier.size() >= 3)
-    add({{LowercaseIdentifier[0], LowercaseIdentifier[1],
-          LowercaseIdentifier[2]}});
-
   // Iterate through valid seqneces of three characters Fuzzy Matcher can
   // process.
   for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
     // Skip delimiters.
     if (Roles[I] != Head && Roles[I] != Tail)
       continue;
+    if (Roles[I] == Head)
+      Add({LowercaseIdentifier[I]});
     for (const unsigned J : Next[I]) {
       if (J == 0)
         continue;
+      if (Roles[I] == Head)
+        Add({LowercaseIdentifier[I], LowercaseIdentifier[J]});
       for (const unsigned K : Next[J]) {
         if (K == 0)
           continue;
-        add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
+        Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
               LowercaseIdentifier[K]}});
       }
     }
   }
 
-  std::vector<Token> Result;
-  for (const auto &Trigram : UniqueTrigrams)
-    Result.push_back(Trigram);
-
-  return Result;
+  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
 }
 
 std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
   // Apply fuzzy matching text segmentation.
   std::vector<CharRole> Roles(Query.size());
   calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
 
-  // Additional pass is necessary to count valid identifier characters.
-  // Depending on that, this function might return incomplete trigram.
-  unsigned ValidSymbolsCount = 0;
-  for (const auto Role : Roles)
-    if (Role == Head || Role == Tail)
-      ++ValidSymbolsCount;
-
   std::string LowercaseQuery = Query.lower();
-
   DenseSet<Token> UniqueTrigrams;
-
-  // If the number of symbols which can form fuzzy matching trigram is not
-  // sufficient, generate a single incomplete trigram for query.
-  if (ValidSymbolsCount < 3) {
-    std::string Chars =
-        LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size()));
-    Chars.append(3 - Chars.size(), END_MARKER);
-    UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
-  } else {
-    std::deque<char> Chars;
-    for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
-      // If current symbol is delimiter, just skip it.
-      if (Roles[I] != Head && Roles[I] != Tail)
-        continue;
-
-      Chars.push_back(LowercaseQuery[I]);
-
-      if (Chars.size() > 3)
-        Chars.pop_front();
-
-      if (Chars.size() == 3) {
-        UniqueTrigrams.insert(
-            Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))));
-      }
-    }
+  std::string Chars;
+  for (unsigned I = 0; I < Query.size(); ++I) {
+    if (Roles[I] != Head && Roles[I] != Tail)
+      continue; // Skip delimiters.
+    Chars.push_back(LowercaseQuery[I]);
+    if (Chars.size() > 3)
+      Chars.erase(Chars.begin());
+    if (Chars.size() == 3)
+      UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
   }
 
-  std::vector<Token> Result;
-  for (const auto &Trigram : UniqueTrigrams)
-    Result.push_back(Trigram);
-
-  return Result;
+  // If it's a short query, emit a single short trigram.
+  if (Chars.size() < 3 && !Chars.empty())
+    return {Token(Token::Kind::Trigram, Chars)};
+  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
 }
 
 } // namespace dex
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to