[PATCH] D70594: [clangd] Implement range patching heuristics for cross-file rename.

Haojian Wu via Phabricator via cfe-commits Wed, 04 Dec 2019 05:58:22 -0800

hokein updated this revision to Diff 232101.
hokein marked 3 inline comments as done.
hokein added a comment.


address comments

- re-define the concept of a near miss
- add metric for evaluate how good a near miss is


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D70594/new/

https://reviews.llvm.org/D70594

Files:
  clang-tools-extra/clangd/refactor/Rename.cpp
  clang-tools-extra/clangd/refactor/Rename.h
  clang-tools-extra/clangd/unittests/RenameTests.cpp

Index: clang-tools-extra/clangd/unittests/RenameTests.cpp
===================================================================
--- clang-tools-extra/clangd/unittests/RenameTests.cpp
+++ clang-tools-extra/clangd/unittests/RenameTests.cpp
@@ -702,6 +702,222 @@
             expectedResult(Code, expectedResult(T, "abc")));
 }
 
+TEST(RangePatchingHeuristic, MappedRange) {
+  // Ranges in IndexedCode indicate the indexed occurrences;
+  // ranges in DraftCode indicate the expected mapped result, empty indicates
+  // we expect no matched result found.
+  struct {
+    llvm::StringRef IndexedCode;
+    llvm::StringRef DraftCode;
+  } Tests[] = {
+    {
+      // no lexed occurrences.
+      R"cpp(int [[x]] = 0)cpp",
+      R"cpp()cpp",
+    },
+    {
+      // both line and column are changed, not a near miss.
+      R"cpp(
+        int [[x]] = 0;
+      )cpp",
+      R"cpp(
+
+         int x = 0;
+      )cpp",
+    },
+    {
+      // subset.
+      R"cpp(
+        int [[x]] = 0;
+      )cpp",
+      R"cpp(
+        int [[x]] = 0;
+        {int x = 0; }
+      )cpp",
+    },
+    {
+      // column shift
+      R"cpp(int [[x]] = 0; void foo(int x);)cpp",
+      R"cpp(double [[x]] = 0; void foo(double x);)cpp",
+    },
+    {
+      // insert a line
+      R"cpp(
+        int [[x]] = 0;
+        void foo(int x);
+      )cpp",
+      R"cpp(
+        // new line insert;
+        int [[x]] = 0;
+        void foo(int x);
+      )cpp",
+    },
+    {
+      // insert a line, x refers to different symbols
+      R"cpp(
+        void foo(int [[x]]) {
+                [[x]] = 2;
+          { int x = 0; }
+                  [[x]] = 3;
+          int y = [[x]];
+          { int x = 0; }
+        }
+      )cpp",
+      R"cpp(
+        // insert a new line
+        void foo(int [[x]]) {
+                [[x]] = 2;
+          { int x = 0; }
+                  [[x]] = 3;
+          int y = [[x]];
+          { int x = 0; }
+        }
+      )cpp",
+    },
+    {
+      // non-distinct result (have two best results), not a near miss.
+      R"cpp(
+        int [[x]](int a);
+        int a = [[x]](1);
+        int b = [[x]](2);
+      )cpp",
+      R"cpp(
+        int x();
+        int x(int a);
+        int a = x(1);
+        int b = x(2);
+      )cpp",
+    },
+  };
+  LangOptions LangOpts;
+  LangOpts.CPlusPlus = true;
+  for (const auto &T : Tests) {
+    auto IndexedOccurrences = Annotations(T.IndexedCode).ranges();
+    Annotations Draft(T.DraftCode);
+    auto ActualRanges =
+        patchRanges(Draft.code(), "x", IndexedOccurrences, LangOpts);
+    if (!Draft.ranges().empty()) {
+      EXPECT_TRUE((bool)ActualRanges) << "patchRange returned an error: "
+                                 << llvm::toString(ActualRanges.takeError());
+      EXPECT_THAT(Draft.ranges(),
+                  testing::UnorderedElementsAreArray(*ActualRanges))
+          << T.DraftCode;
+    } else {
+      EXPECT_FALSE(ActualRanges)
+          << "expected an error: " << T.DraftCode;
+      llvm::consumeError(ActualRanges.takeError());
+    }
+  }
+}
+
+TEST(RangePatchingHeuristic, EditCost) {
+  struct {
+    llvm::StringRef RangeCode;
+    size_t ExpectedCost;
+  } Tests[] = {
+    {
+      R"(
+        $idx[[]]$lex[[]] // diff: 0
+      )",
+      0,
+    },
+    {
+      // line offset shift.
+      R"(
+        $idx[[]]
+        $lex[[]] // line diff: +1
+                       $idx[[]]
+                       $lex[[]] // line diff: +1
+        $idx[[]]
+        $lex[[]] // line diff: +1
+
+          $idx[[]]
+
+          $lex[[]] // line diff: +2
+      )",
+      1 + 1
+    },
+    {
+       R"(
+        $idx[[]]
+        $lex[[]] // line diff: +1
+                       $idx[[]]
+
+                       $lex[[]] // line diff: +2
+        $idx[[]]
+
+
+        $lex[[]] // line diff: +3
+      )",
+      1 + 1 + 1
+    },
+    {
+       R"(
+        $idx[[]]
+
+
+        $lex[[]] // line diff: +3
+                       $idx[[]]
+
+                       $lex[[]] // line diff: +2
+        $idx[[]]
+        $lex[[]] // line diff: +1
+      )",
+      3 + 1 + 1
+    },
+    {
+      R"(
+        $idx[[]]
+        $lex[[]] // line diff: +1
+                       $lex[[]] // line diff: -2
+
+                       $idx[[]]
+        $idx[[]]
+
+
+        $lex[[]] // line diff: +3
+      )",
+      1 + 3 + 5
+    },
+    // line & column diff.
+    {
+      R"(
+                       $idx[[]] $lex[[]] // column diff: +1
+        $idx[[]]$lex[[]] // diff: 0
+      )",
+      1
+    },
+    {
+      R"(
+        $idx[[]]
+        $lex[[]] // diff: +1
+                       $idx[[]] $lex[[]] // column diff: +1
+        $idx[[]]$lex[[]] // diff: 0
+      )",
+      1 + 1 + 1
+    },
+    // column offset shift.
+    {
+      R"(
+        $idx[[]] $lex[[]] // column diff: +1
+      )",
+      1
+    },
+    {
+      R"(
+        // column diffs: +1, +2, +3
+        $idx[[]] $lex[[]] $idx[[]]  $lex[[]] $idx[[]]   $lex[[]]
+      )",
+      1 + 1 + 1,
+    },
+  };
+  for (const auto &T : Tests) {
+    Annotations C(T.RangeCode);
+    EXPECT_EQ(editCost(C.ranges("idx"), C.ranges("lex")), T.ExpectedCost)
+        << T.RangeCode;
+  }
+}
+
 } // namespace
 } // namespace clangd
 } // namespace clang
Index: clang-tools-extra/clangd/refactor/Rename.h
===================================================================
--- clang-tools-extra/clangd/refactor/Rename.h
+++ clang-tools-extra/clangd/refactor/Rename.h
@@ -12,6 +12,7 @@
 #include "Path.h"
 #include "Protocol.h"
 #include "SourceCode.h"
+#include "clang/Basic/LangOptions.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "llvm/Support/Error.h"
 
@@ -55,6 +56,52 @@
                                      std::vector<Range> Occurrences,
                                      llvm::StringRef NewName);
 
+/// Gets "authoritative" occurrences to perform the rename.
+///
+/// Index is not always up-to-update. Applying stale indexed occurrences leads
+/// to bad rename result, so this API helps to mitigate the stale-index issue.
+/// It determines the indexed occurrences are good enough, and heuristically
+/// repairs the indexed occurrences if necessary.
+///
+/// Details:
+///  - lex the draft code to get all rename candidates, this yields a superset
+//     of candidates.
+///  - apply range patching heuristics to generate "authoritative" occurrences,
+///    cases we consider:
+///      (a) index returns a subset of candidates, we use the indexed results.
+///        - fully equal, we are sure the index is up-to-date
+///        - proper subset, index is correct in most cases? there may be false
+///          positives (e.g. candidates got appended), but rename is still safe
+///      (b) index returns non-candidate results, we use the best near miss.
+///
+/// The API assumes that IndexedOccurrences contains only named occcurrences (
+/// each occurrence has the same length).
+llvm::Expected<std::vector<Range>> patchRanges(llvm::StringRef DraftCode,
+                                               llvm::StringRef Identifier,
+                                               std::vector<Range> Indexed,
+                                               const LangOptions &LangOpts);
+
+/// Evaluates how good the mapping result is. 0 indicates a perfect match.
+///
+/// It uses the sum of the implied edit sizes between successive diffs, only
+/// simple edits are considered:
+///   - insert/remove a line (change line offset)
+///   - insert/remove a character on an existing line (change the column offset)
+///
+/// Example I, total result is 1 + 1 = 2.
+///   diff[0]: line + 1 <- insert a line before edit 0.
+///   diff[1]: line + 1
+///   diff[2]: line + 1
+///   diff[3]: line + 2 <- insert a line before edits 2 and 3.
+///
+/// Example II, total result is 1 + 1 + 1 = 3.
+///   diff[0]: line + 1  <- insert a line before edit 0.
+///   diff[1]: column + 1 <- remove a line between edits 0 and 1, and insert a
+///   character on edit 1.
+///
+/// REQUIRED: Indexed and Mapped are sorted, and have the same size.
+size_t editCost(ArrayRef<Range> Indexed, ArrayRef<Range> Mapped);
+
 } // namespace clangd
 } // namespace clang
 
Index: clang-tools-extra/clangd/refactor/Rename.cpp
===================================================================
--- clang-tools-extra/clangd/refactor/Rename.cpp
+++ clang-tools-extra/clangd/refactor/Rename.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
+#include <algorithm>
 
 namespace clang {
 namespace clangd {
@@ -355,9 +356,19 @@
       elog("Fail to read file content: {0}", AffectedFileCode.takeError());
       continue;
     }
-    auto RenameEdit =
-        buildRenameEdit(FilePath, *AffectedFileCode,
-                        std::move(FileAndOccurrences.second), NewName);
+    auto RenameCandidates =
+        patchRanges(*AffectedFileCode, RenameDecl.getNameAsString(),
+                    std::move(FileAndOccurrences.second),
+                    RenameDecl.getASTContext().getLangOpts());
+    if (!RenameCandidates) {
+      return llvm::make_error<llvm::StringError>(
+          llvm::formatv("Index results don't match the content of file {0} "
+                        "(the index may be stale), details: {0}",
+                        FilePath, llvm::toString(RenameCandidates.takeError())),
+          llvm::inconvertibleErrorCode());
+    }
+    auto RenameEdit = buildRenameEdit(FilePath, *AffectedFileCode,
+                                      *RenameCandidates, NewName);
     if (!RenameEdit) {
       return llvm::make_error<llvm::StringError>(
           llvm::formatv("fail to build rename edit for file {0}: {1}", FilePath,
@@ -370,6 +381,142 @@
   return Results;
 }
 
+bool isLineOrColumnEqual(const Position &LHS, const Position &RHS) {
+  return LHS.line == RHS.line || LHS.character == RHS.character;
+}
+
+// Describe a single line-based/column edit diff.
+struct Diff {
+  enum Type {
+    Line,
+    Column,
+  };
+  Type DType;
+  int Offset = 0;
+  const Range *IndexedRange = nullptr;
+  const Range *LexedRange = nullptr;
+};
+Diff computeDiff(const Range &LHS, const Range &RHS) {
+  assert(isLineOrColumnEqual(LHS.start, RHS.start));
+  if (LHS.start.line != RHS.start.line)
+    return Diff{Diff::Line, LHS.start.line - RHS.start.line, &LHS, &RHS};
+  return Diff{Diff::Column, LHS.start.character - RHS.start.character, &LHS,
+              &RHS};
+}
+
+// Calculate the quality of the give Batch.
+size_t batchEditSize(ArrayRef<Diff> Batch) {
+  assert(!Batch.empty());
+  size_t TotalCost = abs(Batch.front().Offset);
+  for (size_t I = 1; I < Batch.size(); ++I)
+    TotalCost += abs(Batch[I].Offset - Batch[I - 1].Offset);
+  return TotalCost;
+}
+
+// Performs a DFS to enumerate all possible near-miss matches.
+// It finds the locations where the indexed occurrences are now spelled in
+// Lexed occurrences, a near miss is defined as:
+//   - a near miss maps all of the **name** occurrences from the index onto a
+//     *subset* of lexed occurrences (we allow a single name refers to more
+//     than one symbol)
+//   - all indexed occurrences must be mapped, and Result must be distinct and
+//     preseve order (only support detecting simple edits to ensure a
+//     robust mapping)
+//   - each indexed -> lexed occurrences mapping correspondence may change the
+//     *line* or *column*, but not both (increases chance of a robust mapping)
+class NearMissFinder {
+public:
+  NearMissFinder(ArrayRef<Range> Indexed, ArrayRef<Range> Lexed)
+      : Indexed(Indexed), Lexed(Lexed) {
+    assert(Indexed.size() <= Lexed.size());
+    MatchedIndex.resize(Indexed.size());
+  }
+
+  using MatchedCallback = llvm::function_ref<void(std::vector<Range> Matched)>;
+  void find(MatchedCallback CB) {
+    MatchedCB = CB;
+    enumerate(0, 0, 0);
+  }
+
+private:
+  void enumerate(size_t NextMatched, size_t NextLexed, size_t IterCount) {
+    if (IterCount > KMaxIterations) {
+      vlog("Reach out the maximum iteration limit, stopped.");
+      return;
+    }
+
+    if (NextMatched >= Indexed.size()) {
+      // find a match, emit it.
+      std::vector<Range> Mapped;
+      for (int Pos : MatchedIndex)
+        Mapped.push_back(Lexed[Pos]);
+      return MatchedCB(std::move(Mapped));
+    }
+
+    for (size_t I = NextLexed; I < Lexed.size(); ++I) {
+      if (Visited.count(I))
+        continue;
+      if (isLineOrColumnEqual(Indexed[NextMatched].start, Lexed[I].start)) {
+        Visited.insert(I);
+        MatchedIndex[NextMatched] = I;
+        enumerate(NextMatched + 1, I + 1, IterCount + 1);
+        Visited.erase(I); // reset the state.
+      }
+    }
+  }
+  static constexpr size_t KMaxIterations = 10000;
+  ArrayRef<Range> Indexed; // indexed occurrences.
+  ArrayRef<Range> Lexed;   // lexed occurrences
+  MatchedCallback MatchedCB;
+
+  // Corresponding to the Indexed, values are the indices into Lexed array.
+  std::vector<int> MatchedIndex;
+  // Whether the elements of Lexed array visited.
+  llvm::DenseSet<int> Visited;
+};
+
+/// Returns the best near miss for the Indexed occurrences.
+/// REQUIRED: Indexed and Lexed are sorted.
+llvm::Expected<std::vector<Range>> getMappedRanges(ArrayRef<Range> Indexed,
+                                                   ArrayRef<Range> Lexed) {
+  assert(!Indexed.empty());
+  assert(std::is_sorted(Indexed.begin(), Indexed.end()));
+  assert(std::is_sorted(Lexed.begin(), Lexed.end()));
+
+  if (Indexed.size() > Lexed.size())
+    return llvm::make_error<llvm::StringError>(
+        llvm::inconvertibleErrorCode(),
+        llvm::formatv("The number of lexed occurrences is less than indexed "
+                      "occurrences"));
+  // Fast check for the special subset case.
+  if (std::includes(Indexed.begin(), Indexed.end(), Lexed.begin(), Lexed.end()))
+    return std::vector<Range>{Indexed.begin(), Indexed.end()};
+
+  std::vector<Range> Best;
+  size_t BestCost = std::numeric_limits<size_t>::max();
+  bool HasMultiple = 0;
+  NearMissFinder FindNearMiss(Indexed, Lexed);
+  FindNearMiss.find([&](std::vector<Range> Mapped) {
+    size_t MCost = editCost(Indexed, Mapped);
+    if (MCost < BestCost) {
+      BestCost = MCost;
+      Best = std::move(Mapped);
+      HasMultiple = false; // reset
+      return;
+    }
+    if (MCost == BestCost)
+      HasMultiple = true;
+  });
+  if (HasMultiple)
+    return llvm::make_error<llvm::StringError>(
+        llvm::inconvertibleErrorCode(),
+        llvm::formatv("The best near miss is not distinct"));
+  if (Best.empty())
+    return llvm::make_error<llvm::StringError>(
+        llvm::inconvertibleErrorCode(),
+        llvm::formatv("Didn't found a near miss"));
+  return Best;
+}
 } // namespace
 
 llvm::Expected<FileEdits> rename(const RenameInputs &RInputs) {
@@ -505,5 +652,49 @@
   return Edit(InitialCode, std::move(RenameEdit));
 }
 
+llvm::Expected<std::vector<Range>> patchRanges(llvm::StringRef DraftCode,
+                                               llvm::StringRef Identifier,
+                                               std::vector<Range> Indexed,
+                                               const LangOptions &LangOpts) {
+  assert(!Indexed.empty());
+  std::vector<Range> Lexed =
+      collectIdentifierRanges(Identifier, DraftCode, LangOpts);
+  llvm::sort(Indexed);
+  llvm::sort(Lexed);
+  return getMappedRanges(Indexed, Lexed);
+}
+
+size_t editCost(ArrayRef<Range> Indexed, ArrayRef<Range> Mapped) {
+  assert(Indexed.size() == Mapped.size());
+  assert(std::is_sorted(Indexed.begin(), Indexed.end()));
+  assert(std::is_sorted(Mapped.begin(), Mapped.end()));
+
+  std::vector<Diff> Diffs;
+  for (size_t I = 0; I < Indexed.size(); ++I)
+    Diffs.push_back(computeDiff(Indexed[I], Mapped[I]));
+  auto DiffsRef = ArrayRef<Diff>(Diffs);
+  size_t SumCost = 0;
+  while (!DiffsRef.empty()) {
+    auto DType = DiffsRef.front().DType;
+
+    // Group all successive diffs: all line diffs, or all column diffs on a
+    // same line.
+    auto Batch = DiffsRef.take_while([&](const Diff &D) {
+      if (DType == Diff::Line)
+        return D.DType == Diff::Line;
+      return DiffsRef.front().IndexedRange->start.line ==
+             D.IndexedRange->start.line;
+    });
+    assert(Batch.size());
+    SumCost += batchEditSize(Batch);
+    DiffsRef = DiffsRef.drop_front(Batch.size());
+
+    if (!DiffsRef.empty() && DType == Diff::Line)
+      // pay back the cost for the next column-based batch.
+      SumCost += abs(Batch.back().Offset);
+  }
+  return SumCost;
+}
+
 } // namespace clangd
 } // namespace clang

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D70594: [clangd] Implement range patching heuristics for cross-file rename.

Reply via email to