johannes updated this revision to Diff 103320.
johannes added a comment.

- Fix a bug in getSimilarity()
- Change terminology: `label` -> `value`
- Define SNodeId: it can no longer be implicitly constructed from an int, but
it can still be implicitly converted to int. This still feels a bit weird.
- Fix some issues with SNodeId
- Rewrite the computation of leftmost descendants in subtrees.


Index: tools/clang-diff/ClangDiff.cpp
--- /dev/null
+++ tools/clang-diff/ClangDiff.cpp
@@ -0,0 +1,110 @@
+//===- ClangDiff.cpp - compare source files by AST nodes ------*- C++ -*- -===//
+//                     The LLVM Compiler Infrastructure
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// This file implements a tool for syntax tree based comparison using
+// Tooling/ASTDiff.
+#include "clang/Tooling/ASTDiff/ASTDiff.h"
+#include "clang/Tooling/CommonOptionsParser.h"
+#include "clang/Tooling/Tooling.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+using namespace clang;
+using namespace tooling;
+static cl::OptionCategory ClangDiffCategory("clang-diff options");
+static cl::opt<bool>
+    DumpAST("ast-dump",
+            cl::desc("Print the internal representation of the AST as JSON."),
+            cl::init(false), cl::cat(ClangDiffCategory));
+// Command-line switch -no-compilation-database (off by default); read by
+// getAST() to decide whether a compilation database should be auto-detected.
+static cl::opt<bool> NoCompilationDatabase(
+    "no-compilation-database",
+    cl::desc(
+        "Do not attempt to load build settings from a compilation database"),
+    cl::init(false), cl::cat(ClangDiffCategory));
+static cl::opt<std::string> SourcePath(cl::Positional, cl::desc("<source>"),
+                                       cl::Required,
+                                       cl::cat(ClangDiffCategory));
+static cl::opt<std::string> DestinationPath(cl::Positional,
+                                            cl::desc("<destination>"),
+                                            cl::Optional,
+                                            cl::cat(ClangDiffCategory));
+static std::unique_ptr<ASTUnit> getAST(const StringRef Filename) {
+  std::string ErrorMessage;
+  std::unique_ptr<CompilationDatabase> Compilations;
+  if (!NoCompilationDatabase)
+    Compilations =
+        CompilationDatabase::autoDetectFromSource(Filename, ErrorMessage);
+  if (!Compilations) {
+    if (!NoCompilationDatabase)
+      llvm::errs()
+          << "Error while trying to load a compilation database, running "
+             "without flags.\n"
+          << ErrorMessage;
+    Compilations.reset(
+        new FixedCompilationDatabase(".", std::vector<std::string>()));
+  }
+  std::array<std::string, 1> Files = {{Filename}};
+  ClangTool Tool(*Compilations, Files);
+  std::vector<std::unique_ptr<ASTUnit>> ASTs;
+  Tool.buildASTs(ASTs);
+  if (ASTs.size() != Files.size())
+    return nullptr;
+  return std::move(ASTs[0]);
+int main(int argc, const char **argv) {
+  cl::HideUnrelatedOptions(ClangDiffCategory);
+  if (!cl::ParseCommandLineOptions(argc, argv)) {
+    cl::PrintOptionValues();
+    return 1;
+  }
+  if (DumpAST) {
+    if (!DestinationPath.empty()) {
+      llvm::errs() << "Error: Please specify exactly one filename.\n";
+      return 1;
+    }
+    std::unique_ptr<ASTUnit> AST = getAST(SourcePath);
+    if (!AST)
+      return 1;
+    clang::diff::TreeRoot Tree(AST->getASTContext());
+    Tree.printAsJson();
+    return 0;
+  }
+  if (DestinationPath.empty()) {
+    llvm::errs() << "Error: Exactly two paths are required.\n";
+    return 1;
+  }
+  std::unique_ptr<ASTUnit> Src = getAST(SourcePath);
+  std::unique_ptr<ASTUnit> Dst = getAST(DestinationPath);
+  if (!Src || !Dst)
+    return 1;
+  diff::TreeRoot T1(Src->getASTContext());
+  diff::TreeRoot T2(Dst->getASTContext());
+  diff::ASTDiff DiffTool(T1, T2);
+  diff::Mapping M = DiffTool.computeMapping();
+  M.printMapping();
+  auto Changes = DiffTool.computeChanges(M);
+  for (const auto &C : Changes)
+    DiffTool.printChange(C);
+  return 0;
Index: tools/clang-diff/CMakeLists.txt
--- /dev/null
+++ tools/clang-diff/CMakeLists.txt
@@ -0,0 +1,13 @@
+  Support
+  )
+  ClangDiff.cpp
+  )
+  clangFrontend
+  clangTooling
+  clangToolingASTDiff
+  )
Index: tools/CMakeLists.txt
--- tools/CMakeLists.txt
+++ tools/CMakeLists.txt
@@ -2,6 +2,7 @@
Index: test/Tooling/clang-diff-basic.cpp
--- /dev/null
+++ test/Tooling/clang-diff-basic.cpp
@@ -0,0 +1,61 @@
+// RUN: %clang_cc1 -E %s > %T/src.cpp
+// RUN: %clang_cc1 -E %s > %T/dst.cpp -DDEST
+// RUN: clang-diff -no-compilation-database %T/src.cpp %T/dst.cpp | FileCheck %s
+#ifndef DEST
+namespace src {
+  const char str[] = "the string";
+  ;
+  ;
+  ;
+  ;
+int on = 1 * 2 * 3 * 4;
+int b = on * 2;
+class X {
+  const char *foo(int i) {
+    if (i == 0)
+      return "Foo!";
+    return 0;
+  }
+  X(){};
+  int id(int i) { return i; }
+// CHECK: Match NamespaceDecl: src{{.*}} to NamespaceDecl: src
+namespace src {
+  ;
+  ;
+  ;
+  ;
+// CHECK-NOT: Match NamespaceDecl: src{{.*}} to NamespaceDecl: dst
+namespace dst {
+// CHECK: Match VarDecl: on(int){{.*}} to VarDecl: one(double)
+// CHECK: Update VarDecl: on(int){{.*}} to one(double)
+double one = 1 * 2 * 55;
+// CHECK: Update DeclRefExpr
+int b = one * 2;
+class X {
+  const char *foo(int i) {
+    if (i == 0)
+      return "Bar";
+    // CHECK: Insert IfStmt{{.*}} into IfStmt
+    // CHECK: Insert BinaryOperator: =={{.*}} into IfStmt
+    else if (i == -1)
+      return "Foo!";
+    return 0;
+  }
+  // CHECK: Delete AccessSpecDecl: public
+  X(){};
+  // CHECK: Delete CXXMethodDecl
Index: lib/Tooling/CMakeLists.txt
--- lib/Tooling/CMakeLists.txt
+++ lib/Tooling/CMakeLists.txt
@@ -5,6 +5,7 @@
Index: lib/Tooling/ASTDiff/CMakeLists.txt
--- /dev/null
+++ lib/Tooling/ASTDiff/CMakeLists.txt
@@ -0,0 +1,11 @@
+  Support
+  )
+  ASTDiff.cpp
+  clangBasic
+  clangAST
+  clangLex
+  )
Index: lib/Tooling/ASTDiff/ASTDiff.cpp
--- /dev/null
+++ lib/Tooling/ASTDiff/ASTDiff.cpp
@@ -0,0 +1,776 @@
+//===- ASTDiff.cpp - AST differencing implementation-----------*- C++ -*- -===//
+//                     The LLVM Compiler Infrastructure
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// This file contains definitions for the AST differencing interface.
+#include "clang/Tooling/ASTDiff/ASTDiff.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Lex/Lexer.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <limits>
+#include <memory>
+#include <unordered_set>
+using namespace llvm;
+using namespace clang;
+namespace clang {
+namespace diff {
+namespace {
+/// Counts the number of nodes that will be compared.
+struct NodeCountVisitor : public RecursiveASTVisitor<NodeCountVisitor> {
+  int Count = 0;
+  const TreeRoot &Root;
+  NodeCountVisitor(const TreeRoot &Root) : Root(Root) {}
+  /// Counts \p D and descends into it, unless the tree discards the node, in
+  /// which case neither it nor its subtree is counted. Always returns true so
+  /// the surrounding traversal keeps going.
+  bool TraverseDecl(Decl *D) {
+    if (!Root.discardNode(D)) {
+      ++Count;
+      RecursiveASTVisitor<NodeCountVisitor>::TraverseDecl(D);
+    }
+    return true;
+  }
+  /// Counts \p S and descends into it, unless the tree discards the node, in
+  /// which case neither it nor its subtree is counted. Always returns true so
+  /// the surrounding traversal keeps going.
+  bool TraverseStmt(Stmt *S) {
+    if (!Root.discardNode(S)) {
+      ++Count;
+      RecursiveASTVisitor<NodeCountVisitor>::TraverseStmt(S);
+    }
+    return true;
+  }
+  bool TraverseType(QualType T) { return true; }
+} // end anonymous namespace
+namespace {
+// Sets Parent, Depth, Height, Children, RightMostDescendant and the AST node
+// for each node, and records the leaves of the tree.
+struct PreorderVisitor : public RecursiveASTVisitor<PreorderVisitor> {
+  int Id = 0, Depth = 0;
+  NodeId Parent = InvalidNodeId;
+  TreeRoot &Root;
+  PreorderVisitor(TreeRoot &Root) : Root(Root) {}
+  /// Registers \p ASTNode as the next node in preorder: assigns it the id
+  /// \c Id, records its parent, depth and AST node, and appends it to its
+  /// parent's child list. Afterwards the visitor state (Parent, Id, Depth) is
+  /// advanced so that nodes visited next become children of this node.
+  ///
+  /// \returns the pair (id of this node, id of the parent that was current
+  /// before the call); PostTraverse uses it to restore the state.
+  template <class T> std::tuple<NodeId, NodeId> PreTraverse(T *ASTNode) {
+    NodeId MyId = Id;
+    Node &N = Root.getMutableNode(MyId);
+    N.Parent = Parent;
+    N.Depth = Depth;
+    N.ASTNode = ast_type_traits::DynTypedNode::create(*ASTNode);
+    assert(!N.ASTNode.getNodeKind().isNone() &&
+           "Expected nodes to have a valid kind.");
+    if (Parent != InvalidNodeId) {
+      Node &P = Root.getMutableNode(Parent);
+      P.Children.push_back(MyId);
+    }
+    Parent = MyId;
+    ++Id;
+    ++Depth;
+    // Use make_tuple instead of a braced list: copy-list-initialization of a
+    // std::tuple is rejected by standard libraries whose tuple constructor is
+    // explicit. N.Parent still holds the previous parent stored above.
+    return std::make_tuple(MyId, N.Parent);
+  }
+  /// Finishes the visit of a node after its whole subtree has been traversed.
+  /// \p State is the (node id, previous parent id) pair returned by
+  /// PreTraverse. Restores Parent and Depth, then fills in the node's
+  /// RightMostDescendant and Height; leaf nodes are appended to Root.Leaves.
+  void PostTraverse(std::tuple<NodeId, NodeId> State) {
+    NodeId MyId, PreviousParent;
+    std::tie(MyId, PreviousParent) = State;
+    assert(MyId != InvalidNodeId && "Expecting to only traverse valid nodes.");
+    Parent = PreviousParent;
+    --Depth;
+    Node &N = Root.getMutableNode(MyId);
+    // Id is the preorder id the *next* visited node will receive, so the last
+    // node that belongs to this subtree is Id - 1 (MyId itself for a leaf).
+    // Storing Id here would make the subtree appear one node too large.
+    N.RightMostDescendant = Id - 1;
+    assert(N.RightMostDescendant >= MyId &&
+           "Rightmost descendant must not precede the node itself.");
+    if (N.isLeaf())
+      Root.Leaves.push_back(MyId);
+    // Height counts nodes on the longest downward path, so a leaf has 1.
+    N.Height = 1;
+    for (NodeId Child : N.Children)
+      N.Height = std::max(N.Height, 1 + Root.getNode(Child).Height);
+  }
+  /// Visits \p D unless the tree discards it: the node is registered before
+  /// its children are traversed and finalized afterwards. Always returns true
+  /// so the surrounding traversal keeps going.
+  bool TraverseDecl(Decl *D) {
+    if (!Root.discardNode(D)) {
+      const std::tuple<NodeId, NodeId> Saved = PreTraverse(D);
+      RecursiveASTVisitor<PreorderVisitor>::TraverseDecl(D);
+      PostTraverse(Saved);
+    }
+    return true;
+  }
+  /// Visits \p S unless the tree discards it: the node is registered before
+  /// its children are traversed and finalized afterwards. Always returns true
+  /// so the surrounding traversal keeps going.
+  bool TraverseStmt(Stmt *S) {
+    if (!Root.discardNode(S)) {
+      const std::tuple<NodeId, NodeId> Saved = PreTraverse(S);
+      RecursiveASTVisitor<PreorderVisitor>::TraverseStmt(S);
+      PostTraverse(Saved);
+    }
+    return true;
+  }
+  bool TraverseType(QualType T) { return true; }
+} // end anonymous namespace
+TreeRoot::TreeRoot(ASTContext &AST) : AST(AST) {
+  auto *TUD = AST.getTranslationUnitDecl();
+  // Run the above visitors to initialize the tree.
+  NodeCountVisitor NodeCounter(*this);
+  NodeCounter.TraverseDecl(TUD);
+  setSize(NodeCounter.Count);
+  PreorderVisitor PreorderWalker(*this);
+  PreorderWalker.TraverseDecl(TUD);
+  setLeftMostDescendant();
+  int PostorderId = 0;
+  PostorderIds.resize(Nodes.size());
+  std::function<void(NodeId)> PostorderTraverse = [&](NodeId Id) {
+    for (NodeId Child : getNode(Id).Children)
+      PostorderTraverse(Child);
+    PostorderIds[Id] = PostorderId;
+    ++PostorderId;
+  };
+  PostorderTraverse(root());
+template <class T> bool TreeRoot::discardNode(T *N) const {
+  if (!N)
+    return true;
+  SourceLocation SLoc = N->getLocStart();
+  const SourceManager &SrcMgr = AST.getSourceManager();
+  return SLoc.isValid() && SrcMgr.isInSystemHeader(SLoc);
+void TreeRoot::setLeftMostDescendant() {
+  for (NodeId Leaf : Leaves) {
+    getMutableNode(Leaf).LeftMostDescendant = Leaf;
+    NodeId Parent, Cur = Leaf;
+    while ((Parent = getNode(Cur).Parent) != InvalidNodeId &&
+           getNode(Parent).Children[0] == Cur) {
+      Cur = Parent;
+      getMutableNode(Cur).LeftMostDescendant = Leaf;
+    }
+  }
+int TreeRoot::getSubtreePostorder(std::vector<NodeId> &Ids, NodeId Root) const {
+  int Leaves = 0;
+  std::function<void(NodeId)> Traverse = [&](NodeId Id) {
+    const Node &N = getNode(Id);
+    for (NodeId Child : N.Children)
+      Traverse(Child);
+    if (N.isLeaf())
+      ++Leaves;
+    Ids.push_back(Id);
+  };
+  Traverse(Root);
+  return Leaves;
+std::vector<NodeId> TreeRoot::getSubtreeBfs(NodeId Id) const {
+  std::vector<NodeId> Ids;
+  size_t Expanded = 0;
+  Ids.push_back(Id);
+  while (Expanded < Ids.size())
+    for (NodeId Child : getNode(Ids[Expanded++]).Children)
+      Ids.push_back(Child);
+  return Ids;
+int TreeRoot::getNumberOfDescendants(NodeId Id) const {
+  return getNode(Id).RightMostDescendant - Id + 1;
+std::string TreeRoot::getValue(NodeId Id) const {
+  const Node &N = getNode(Id);
+  const ast_type_traits::DynTypedNode &DTN = N.ASTNode;
+  if (auto *X = DTN.get<BinaryOperator>())
+    return X->getOpcodeStr();
+  if (auto *X = DTN.get<AccessSpecDecl>()) {
+    CharSourceRange Range(X->getSourceRange(), false);
+    return Lexer::getSourceText(Range, AST.getSourceManager(),
+                                AST.getLangOpts());
+  }
+  if (auto *X = DTN.get<IntegerLiteral>()) {
+    SmallString<256> Str;
+    X->getValue().toString(Str, /*Radix=*/10, /*Signed=*/false);
+    return Str.str();
+  }
+  if (auto *X = DTN.get<StringLiteral>())
+    return X->getString();
+  if (auto *X = DTN.get<ValueDecl>())
+    return X->getNameAsString() + "(" + X->getType().getAsString() + ")";
+  if (auto *X = DTN.get<DeclStmt>())
+    return "";
+  if (auto *X = DTN.get<TranslationUnitDecl>())
+    return "";
+  std::string Value;
+  if (auto *X = DTN.get<DeclRefExpr>()) {
+    if (X->hasQualifier() && X->getQualifier()->getAsIdentifier())
+      Value += std::string(X->getQualifier()->getAsIdentifier()->getName());
+    Value += X->getDecl()->getNameAsString();
+    return Value;
+  }
+  if (auto *X = DTN.get<NamedDecl>())
+    Value += X->getNameAsString() + ";";
+  if (auto *X = DTN.get<TypedefNameDecl>())
+    return Value + X->getUnderlyingType().getAsString() + ";";
+  if (auto *X = DTN.get<NamespaceDecl>())
+    return Value;
+  if (auto *X = DTN.get<TypeDecl>())
+    if (X->getTypeForDecl())
+      Value +=
+          X->getTypeForDecl()->getCanonicalTypeInternal().getAsString() + ";";
+  if (auto *X = DTN.get<Decl>())
+    return Value;
+  if (auto *X = DTN.get<Stmt>())
+    return Value;
+  if (auto *X = DTN.get<QualType>())
+    llvm_unreachable("Types are not included.\n");
+  llvm_unreachable("Fatal: unhandled AST node.\n");
+void TreeRoot::printTree(raw_ostream &OS, NodeId Id) const {
+  if (Id == InvalidNodeId)
+    Id = root();
+  const Node &N = getNode(Id);
+  for (int I = 0; I < N.Depth; ++I)
+    OS << " ";
+  OS << showNode(Id) << "\n";
+  for (NodeId Child : N.Children)
+    printTree(OS, Child);
+/// Returns a one-line description of node \p Id in the form
+/// "<type label>[: <value>](<postorder id>)"; the ": <value>" part is left
+/// out when the node's value is empty. Returns "None" for InvalidNodeId.
+std::string TreeRoot::showNode(NodeId Id) const {
+  if (Id == InvalidNodeId)
+    return "None";
+  // Compute the value once; getValue() builds a fresh string on every call.
+  const std::string Value = getValue(Id);
+  std::string ValueString;
+  if (!Value.empty())
+    ValueString = formatv(": {0}", Value);
+  return formatv("{0}{1}({2})", getNode(Id).getTypeLabel(), ValueString,
+                 PostorderIds[Id]);
+/// Writes the subtree rooted at \p Id to \p OS as a single JSON object with
+/// the keys "type", "value" (omitted when the node's value is empty) and
+/// "children" (an array of the child nodes, printed recursively).
+/// NOTE(review): the value is written verbatim, without JSON escaping.
+void TreeRoot::printNodeAsJson(raw_ostream &OS, NodeId Id) const {
+  // Bind by reference: copying the Node would also copy its child list at
+  // every level of the recursion.
+  const Node &N = getNode(Id);
+  // Compute the value once; getValue() builds a fresh string on every call.
+  const std::string Value = getValue(Id);
+  std::string ValueProperty;
+  if (!Value.empty())
+    ValueProperty = formatv(R"(,"value":"{0}")", Value);
+  OS << formatv(R"({"type":"{0}"{1},"children":[)", N.getTypeLabel(),
+                ValueProperty);
+  if (N.Children.size() > 0) {
+    printNodeAsJson(OS, N.Children[0]);
+    for (size_t I = 1, E = N.Children.size(); I < E; ++I) {
+      OS << ",";
+      printNodeAsJson(OS, N.Children[I]);
+    }
+  }
+  OS << "]}";
+void TreeRoot::printAsJson(raw_ostream &OS) const {
+  OS << R"({"root":)";
+  printNodeAsJson(OS, root());
+  OS << "}\n";
+Mapping::Mapping(const TreeRoot &T1, const TreeRoot &T2) : T1(T1), T2(T2) {
+  // Maximum possible size after patching one tree.
+  int Size = T1.getSize() + T2.getSize();
+  SrcToDst = llvm::make_unique<NodeId[]>(Size);
+  DstToSrc = llvm::make_unique<NodeId[]>(Size);
+  // set everything to InvalidNodeId == -1
+  memset(SrcToDst.get(), InvalidNodeId, Size * sizeof(NodeId));
+  memset(DstToSrc.get(), InvalidNodeId, Size * sizeof(NodeId));
+void Mapping::link(NodeId Src, NodeId Dst) {
+  SrcToDst[Src] = Dst;
+  DstToSrc[Dst] = Src;
+void Mapping::printMapping(raw_ostream &OS) const {
+  for (NodeId Id1 = 0, Id2, E = T1.getSize(); Id1 < E; ++Id1)
+    if ((Id2 = getDst(Id1)) != InvalidNodeId)
+      OS << formatv("Match {0} to {1}\n", T1.showNode(Id1), T2.showNode(Id2));
+void Mapping::printMapping() const { printMapping(llvm::outs()); }
+/// Identifies a node in a subtree by its postorder offset, starting at 1.
+struct SNodeId {
+  int Id;
+  explicit SNodeId(int Id) : Id(Id){};
+  explicit SNodeId() : Id(InvalidNodeId){};
+  SNodeId &operator++() { return ++this->Id, *this; }
+  SNodeId &operator--() { return --this->Id, *this; }
+  SNodeId operator+(int Other) const { return SNodeId(this->Id + Other); }
+  operator int() const { return Id; }
+class Subtree {
+  /// The parent tree.
+  const TreeRoot &Tree;
+  /// Maps SNodeIds to original ids.
+  std::vector<NodeId> RootIds;
+  /// Maps subtree nodes to their leftmost descendants within the subtree.
+  std::vector<SNodeId> LeftMostDescendants;
+  std::vector<SNodeId> KeyRoots;
+  Subtree(const TreeRoot &Tree, NodeId SubtreeRoot) : Tree(Tree) {
+    int Leaves = Tree.getSubtreePostorder(RootIds, SubtreeRoot);
+    setLeftMostDescendants();
+    computeKeyRoots(Leaves);
+  }
+  int getSizeS() const { return RootIds.size(); }
+  NodeId getIdInRoot(SNodeId Id) const {
+    assert(Id > 0 && Id <= getSizeS() && "Invalid subtree node index.");
+    return RootIds[Id - 1];
+  }
+  const Node &getNodeS(SNodeId Id) const {
+    return Tree.getNode(getIdInRoot(Id));
+  }
+  const std::string getValueS(SNodeId Id) const {
+    return Tree.getValue(getIdInRoot(Id));
+  }
+  SNodeId getLeftMostDescendant(SNodeId Id) const {
+    assert(Id > 0 && Id <= getSizeS() && "Invalid subtree node index.");
+    return LeftMostDescendants[Id - 1];
+  }
+  // Returns the postorder index of the leftmost descendant in the subtree.
+  NodeId getPostorderOffset() const {
+    return Tree.PostorderIds[getIdInRoot(SNodeId(1))];
+  }
+  /// Fills LeftMostDescendants for every node of the subtree. The entry for
+  /// the node with SNodeId I + 1 is the postorder index of its leftmost
+  /// descendant relative to getPostorderOffset(), i.e. a zero-based offset
+  /// (the first node of the subtree maps to 0).
+  void setLeftMostDescendants() {
+    LeftMostDescendants.resize(getSizeS());
+    for (int I = 0; I < getSizeS(); ++I) {
+      SNodeId SI(I + 1);
+      NodeId IdRoot = getIdInRoot(SI);
+      // IdRoot is only read by the assertion below; this keeps builds in
+      // which assert() expands to nothing free of unused-variable warnings.
+      (void)IdRoot;
+      const Node &N = getNodeS(SI);
+      assert(I == Tree.PostorderIds[IdRoot] - getPostorderOffset() &&
+             "Postorder traversal in subtree should correspond to traversal in "
+             "the root tree by a constant offset.");
+      LeftMostDescendants[I] = SNodeId(Tree.PostorderIds[N.LeftMostDescendant] -
+                                       getPostorderOffset());
+    }
+  }
+  /// Collects the key roots of the subtree into KeyRoots, which is sized to
+  /// \p Leaves. Nodes are scanned from the highest SNodeId downwards; the
+  /// first node seen for each distinct leftmost descendant is a key root.
+  /// Slots are filled from the back so KeyRoots ends up in ascending order.
+  void computeKeyRoots(int Leaves) {
+    KeyRoots.resize(Leaves);
+    std::unordered_set<int> Visited;
+    int K = Leaves - 1;
+    for (SNodeId I(getSizeS()); I > 0; --I) {
+      // insert() reports whether this leftmost descendant was new.
+      const bool FirstSeen = Visited.insert(getLeftMostDescendant(I)).second;
+      if (!FirstSeen)
+        continue;
+      assert(K >= 0 && "K should be non-negative");
+      KeyRoots[K] = I;
+      --K;
+    }
+  }
+// Computes an optimal mapping between two trees.
+class ZsMatcher {
+  Subtree S1;
+  Subtree S2;
+  std::unique_ptr<std::unique_ptr<double[]>[]> TreeDist, ForestDist;
+  ZsMatcher(const TreeRoot &T1, const TreeRoot &T2, NodeId Id1, NodeId Id2)
+      : S1(T1, Id1), S2(T2, Id2) {
+    TreeDist =
+        llvm::make_unique<std::unique_ptr<double[]>[]>(S1.getSizeS() + 1);
+    ForestDist =
+        llvm::make_unique<std::unique_ptr<double[]>[]>(S1.getSizeS() + 1);
+    for (int I = 0, E = S1.getSizeS() + 1; I < E; ++I) {
+      TreeDist[I] = llvm::make_unique<double[]>(S2.getSizeS() + 1);
+      ForestDist[I] = llvm::make_unique<double[]>(S2.getSizeS() + 1);
+    }
+  }
+  std::vector<std::pair<NodeId, NodeId>> getMatchingNodes() {
+    std::vector<std::pair<NodeId, NodeId>> Matches;
+    std::vector<std::pair<SNodeId, SNodeId>> TreePairs;
+    computeTreeDist();
+    bool RootNodePair = true;
+    TreePairs.emplace_back(S1.getSizeS(), S2.getSizeS());
+    while (!TreePairs.empty()) {
+      SNodeId LastRow, LastCol, FirstRow, FirstCol, Row, Col;
+      std::tie(LastRow, LastCol) = TreePairs.back();
+      TreePairs.pop_back();
+      if (!RootNodePair) {
+        computeForestDist(LastRow, LastCol);
+      }
+      RootNodePair = false;
+      FirstRow = S1.getLeftMostDescendant(LastRow);
+      FirstCol = S2.getLeftMostDescendant(LastCol);
+      Row = LastRow;
+      Col = LastCol;
+      while (Row > FirstRow || Col > FirstCol) {
+        if (Row > FirstRow &&
+            ForestDist[Row - 1][Col] + 1 == ForestDist[Row][Col]) {
+          --Row;
+        } else if (Col > FirstCol &&
+                   ForestDist[Row][Col - 1] + 1 == ForestDist[Row][Col]) {
+          --Col;
+        } else {
+          SNodeId LMD1 = S1.getLeftMostDescendant(Row);
+          SNodeId LMD2 = S2.getLeftMostDescendant(Col);
+          if (LMD1 == S1.getLeftMostDescendant(LastRow) &&
+              LMD2 == S2.getLeftMostDescendant(LastCol)) {
+            assert(S1.getNodeS(Row).hasSameType(S2.getNodeS(Col)) &&
+                   "Must not match nodes of different kind.");
+            Matches.emplace_back(S1.getIdInRoot(Row), S2.getIdInRoot(Col));
+            --Row;
+            --Col;
+          } else {
+            TreePairs.emplace_back(Row, Col);
+            Row = LMD1;
+            Col = LMD2;
+          }
+        }
+      }
+    }
+    return Matches;
+  }
+  /// Simple cost model for edit actions.
+  /// The values range between 0 and 1, or infinity if this edit action should
+  /// always be avoided.
+  /// These costs could be modified to better model the estimated cost of /
+  /// inserting / deleting the current node.
+  static constexpr double DeletionCost = 1;
+  static constexpr double InsertionCost = 1;
+  /// Cost of turning node \p Id1 of S1 into node \p Id2 of S2: the largest
+  /// finite double when the node types differ, 0 when types and values are
+  /// equal, and 1 when only the values differ.
+  double getUpdateCost(SNodeId Id1, SNodeId Id2) {
+    const bool SameKind = S1.getNodeS(Id1).hasSameType(S2.getNodeS(Id2));
+    if (!SameKind)
+      return std::numeric_limits<double>::max();
+    // TODO Use string editing distance instead
+    const bool SameValue = S1.getValueS(Id1) == S2.getValueS(Id2);
+    return SameValue ? 0 : 1;
+  }
+  void computeTreeDist() {
+    for (SNodeId Id1 : S1.KeyRoots)
+      for (SNodeId Id2 : S2.KeyRoots)
+        computeForestDist(Id1, Id2);
+  }
+  void computeForestDist(SNodeId Id1, SNodeId Id2) {
+    assert(Id1 > 0 && Id2 > 0 && "Expecting offsets greater than 0.");
+    SNodeId LMD1 = S1.getLeftMostDescendant(Id1);
+    SNodeId LMD2 = S2.getLeftMostDescendant(Id2);
+    ForestDist[LMD1][LMD2] = 0;
+    for (SNodeId D1 = LMD1 + 1; D1 <= Id1; ++D1) {
+      ForestDist[D1][LMD2] = ForestDist[D1 - 1][LMD2] + DeletionCost;
+      for (SNodeId D2 = LMD2 + 1; D2 <= Id2; ++D2) {
+        ForestDist[LMD1][D2] = ForestDist[LMD1][D2 - 1] + InsertionCost;
+        SNodeId DLMD1 = S1.getLeftMostDescendant(D1);
+        SNodeId DLMD2 = S2.getLeftMostDescendant(D2);
+        if (DLMD1 == LMD1 && DLMD2 == LMD2) {
+          double UpdateCost = getUpdateCost(D1, D2);
+          ForestDist[D1][D2] =
+              std::min(std::min(ForestDist[D1 - 1][D2] + DeletionCost,
+                                ForestDist[D1][D2 - 1] + InsertionCost),
+                       ForestDist[D1 - 1][D2 - 1] + UpdateCost);
+          TreeDist[D1][D2] = ForestDist[D1][D2];
+        } else {
+          ForestDist[D1][D2] =
+              std::min(std::min(ForestDist[D1 - 1][D2] + DeletionCost,
+                                ForestDist[D1][D2 - 1] + InsertionCost),
+                       ForestDist[DLMD1][DLMD2] + TreeDist[D1][D2]);
+        }
+      }
+    }
+  }
+namespace {
+// Compares nodes by their height.
+struct HeightLess {
+  const TreeRoot &Tree;
+  HeightLess(const TreeRoot &Tree) : Tree(Tree) {}
+  bool operator()(NodeId Id1, NodeId Id2) const {
+    return Tree.getNode(Id1).Height < Tree.getNode(Id2).Height;
+  }
+} // end anonymous namespace
+// Priority queue for nodes, sorted descendingly by their height.
+class PriorityList {
+  const TreeRoot &Tree;
+  HeightLess Comparator;
+  std::vector<NodeId> Container;
+  PriorityQueue<NodeId, std::vector<NodeId>, HeightLess> List;
+  PriorityList(const TreeRoot &Tree)
+      : Tree(Tree), Comparator(Tree), List(Comparator, Container) {}
+  void push(NodeId id) { List.push(id); }
+  std::vector<NodeId> pop() {
+    int Max = peekMax();
+    std::vector<NodeId> Result;
+    if (Max == 0)
+      return Result;
+    while (peekMax() == Max) {
+      Result.push_back(;
+      List.pop();
+    }
+    // TODO this is here to get a stable output, not a good heuristic
+    std::sort(Result.begin(), Result.end());
+    return Result;
+  }
+  int peekMax() const {
+    if (List.empty())
+      return 0;
+    return Tree.getNode(;
+  }
+  void open(NodeId Id) {
+    for (NodeId Child : Tree.getNode(Id).Children)
+      push(Child);
+  }
+bool ASTDiff::isomorphic(NodeId Id1, NodeId Id2) const {
+  const Node &N1 = T1.getNode(Id1);
+  const Node &N2 = T2.getNode(Id2);
+  if (!N1.hasSameType(N2) || N1.Children.size() != N2.Children.size() ||
+      T1.getValue(Id1) != T2.getValue(Id2))
+    return false;
+  for (size_t Id = 0, E = N1.Children.size(); Id < E; ++Id)
+    if (!isomorphic(N1.Children[Id], N2.Children[Id]))
+      return false;
+  return true;
+bool ASTDiff::isMappingAllowed(const Mapping &M, NodeId Id1, NodeId Id2) const {
+  const Node &N1 = T1.getNode(Id1);
+  const Node &N2 = T2.getNode(Id2);
+  bool AnyMapped = M.hasSrc(Id1) || M.hasDst(Id2);
+  bool SameType = N1.hasSameType(N2);
+  NodeId P1 = N1.Parent;
+  NodeId P2 = N2.Parent;
+  bool ParentsSameType = (P1 == InvalidNodeId && P2 == InvalidNodeId) ||
+                         (P1 != InvalidNodeId && P2 != InvalidNodeId &&
+                          T1.getNode(P1).hasSameType(T2.getNode(P2)));
+  return !AnyMapped && SameType && ParentsSameType;
+void ASTDiff::addIsomorphicSubTrees(Mapping &M, NodeId Id1, NodeId Id2) const {
+  assert(isomorphic(Id1, Id2) && "Can only be called on isomorphic subtrees.");
+, Id2);
+  const Node &N1 = T1.getNode(Id1);
+  const Node &N2 = T2.getNode(Id2);
+  for (size_t Id = 0, E = N1.Children.size(); Id < E; ++Id)
+    addIsomorphicSubTrees(M, N1.Children[Id], N2.Children[Id]);
+void ASTDiff::addOptimalMapping(Mapping &M, NodeId Id1, NodeId Id2) const {
+  if (std::max(T1.getNumberOfDescendants(Id1),
+               T2.getNumberOfDescendants(Id2)) >= MaxSize)
+    return;
+  ZsMatcher Matcher(T1, T2, Id1, Id2);
+  std::vector<std::pair<NodeId, NodeId>> R = Matcher.getMatchingNodes();
+  for (const auto Tuple : R) {
+    NodeId Src = Tuple.first;
+    NodeId Dst = Tuple.second;
+    if (isMappingAllowed(M, Src, Dst))
+, Dst);
+  }
+double ASTDiff::getSimilarity(const Mapping &M, NodeId Id1, NodeId Id2) const {
+  if (Id1 == InvalidNodeId || Id2 == InvalidNodeId)
+    return 0.0;
+  int CommonDescendants = 0;
+  const Node &N1 = T1.getNode(Id1);
+  for (NodeId Id = Id1 + 1; Id <= N1.RightMostDescendant; ++Id)
+    CommonDescendants += int(M.hasSrc(Id));
+  return 2.0 * CommonDescendants /
+         (T1.getNumberOfDescendants(Id1) + T2.getNumberOfDescendants(Id2));
+NodeId ASTDiff::findCandidate(const Mapping &M, NodeId Id1) const {
+  NodeId Candidate = InvalidNodeId;
+  double MaxSimilarity = 0.0;
+  const Node &N1 = T1.getNode(Id1);
+  for (NodeId Id2 = 0, E = T2.getSize(); Id2 < E; ++Id2) {
+    const Node &N2 = T2.getNode(Id2);
+    if (!N1.hasSameType(N2))
+      continue;
+    if (M.hasDst(Id2))
+      continue;
+    double Similarity = getSimilarity(M, Id1, Id2);
+    if (Similarity > MaxSimilarity) {
+      MaxSimilarity = Similarity;
+      Candidate = Id2;
+    }
+  }
+  return Candidate;
+void ASTDiff::matchBottomUp(Mapping &M) const {
+  std::vector<NodeId> Postorder;
+  T1.getSubtreePostorder(Postorder, T1.root());
+  for (NodeId Id1 : Postorder) {
+    if (Id1 == T1.root()) {
+, T2.root());
+      addOptimalMapping(M, T1.root(), T2.root());
+      break;
+    }
+    const Node &N1 = T1.getNode(Id1);
+    bool Matched = M.hasSrc(Id1);
+    bool MatchedChildren =
+        std::any_of(N1.Children.begin(), N1.Children.end(),
+                    [&](NodeId Child) { return M.hasSrc(Child); });
+    if (Matched || !MatchedChildren)
+      continue;
+    NodeId Id2 = findCandidate(M, Id1);
+    if (Id2 == InvalidNodeId || !isMappingAllowed(M, Id1, Id2) ||
+        getSimilarity(M, Id1, Id2) < MinDice)
+      continue;
+, Id2);
+    addOptimalMapping(M, Id1, Id2);
+  }
+Mapping ASTDiff::matchTopDown() const {
+  PriorityList L1(T1);
+  PriorityList L2(T2);
+  Mapping M(T1, T2);
+  L1.push(T1.root());
+  L2.push(T2.root());
+  int Max1, Max2;
+  while (std::min(Max1 = L1.peekMax(), Max2 = L2.peekMax()) > MinHeight) {
+    if (Max1 > Max2) {
+      for (NodeId Id : L1.pop())
+      continue;
+    }
+    if (Max2 > Max1) {
+      for (NodeId Id : L2.pop())
+      continue;
+    }
+    std::vector<NodeId> H1, H2;
+    H1 = L1.pop();
+    H2 = L2.pop();
+    for (NodeId Id1 : H1) {
+      for (NodeId Id2 : H2)
+        if (isomorphic(Id1, Id2) && isMappingAllowed(M, Id1, Id2))
+          addIsomorphicSubTrees(M, Id1, Id2);
+    }
+    for (NodeId Id1 : H1) {
+      if (!M.hasSrc(Id1))
+    }
+    for (NodeId Id2 : H2) {
+      if (!M.hasDst(Id2))
+    }
+  }
+  return M;
+Mapping ASTDiff::computeMapping() const {
+  Mapping M = matchTopDown();
+  matchBottomUp(M);
+  return M;
+std::vector<Change> ASTDiff::computeChanges(Mapping &M) {
+  std::vector<Change> Changes;
+  for (NodeId Id2 : T2.getSubtreeBfs(T2.root())) {
+    const Node &N2 = T2.getNode(Id2);
+    NodeId Id1 = M.getSrc(Id2);
+    if (Id1 != InvalidNodeId) {
+      assert(T1.getNode(Id1).hasSameType(N2) &&
+             "Matched nodes with different kinds.");
+      if (T1.getValue(Id1) != T2.getValue(Id2)) {
+        Changes.emplace_back(Update, Id1, Id2, /*UNUSED Position=*/0);
+      }
+      continue;
+    }
+    NodeId P2 = N2.Parent;
+    NodeId P1 = M.getSrc(P2);
+    assert(P1 != InvalidNodeId);
+    Node &Parent1 = T1.getMutableNode(P1);
+    const Node &Parent2 = T2.getNode(P2);
+    auto &Siblings1 = Parent1.Children;
+    const auto &Siblings2 = Parent2.Children;
+    size_t Position;
+    for (Position = 0; Position < Siblings2.size(); ++Position)
+      if (Siblings2[Position] == Id2 || Position >= Siblings1.size())
+        break;
+    Changes.emplace_back(Insert, Id2, P2, Position);
+    Node PatchNode;
+    PatchNode.Parent = P1;
+    PatchNode.LeftMostDescendant = N2.LeftMostDescendant;
+    PatchNode.RightMostDescendant = N2.RightMostDescendant;
+    PatchNode.Depth = N2.Depth;
+    PatchNode.ASTNode = N2.ASTNode;
+    // TODO update Depth if needed
+    NodeId PatchNodeId = T1.getSize();
+    // TODO maybe choose a different data structure for Children.
+    Siblings1.insert(Siblings1.begin() + Position, PatchNodeId);
+    T1.addNode(PatchNode);
+, Id2);
+  }
+  for (NodeId Id1 = 0; Id1 < T1.getSize(); ++Id1) {
+    NodeId Id2 = M.getDst(Id1);
+    if (Id2 == InvalidNodeId)
+      Changes.emplace_back(Delete, Id1, Id2, /*UNUSED Position=*/0);
+  }
+  return Changes;
+void ASTDiff::printChange(raw_ostream &OS, const Change &Chg) const {
+  ChangeKind Kind;
+  NodeId Id1, Id2;
+  size_t Position;
+  std::tie(Kind, Id1, Id2, Position) = Chg;
+  std::string S;
+  switch (Kind) {
+  case Delete:
+    S = formatv("Delete {0}", T1.showNode(Id1));
+    break;
+  case Update:
+    S = formatv("Update {0} to {1}", T1.showNode(Id1), T2.getValue(Id2));
+    break;
+  case Insert:
+    S = formatv("Insert {0} into {1} at {2}", T2.showNode(Id1),
+                T2.showNode(Id2), Position);
+    break;
+  case Move:
+    llvm_unreachable("TODO");
+    break;
+  };
+  OS << S << "\n";
+} // end namespace diff
+} // end namespace clang
Index: include/clang/Tooling/ASTDiff/ASTDiff.h
--- /dev/null
+++ include/clang/Tooling/ASTDiff/ASTDiff.h
@@ -0,0 +1,183 @@
+//===- ASTDiff.h - AST differencing API -----------------------*- C++ -*- -===//
+//                     The LLVM Compiler Infrastructure
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// This file specifies an interface that can be used to compare C++ syntax
+// trees.
+// We use the gumtree algorithm which combines a heuristic top-down search that
+// is able to match large subtrees that are equivalent, with an optimal
+// algorithm to match small subtrees.
+#include "clang/AST/ASTTypeTraits.h"
+namespace clang {
+namespace diff {
+/// Within a tree, this identifies a node by its preorder offset.
+using NodeId = int;
+/// Sentinel for "no node"; it is negative, so it never passes isValidNodeId.
+const NodeId InvalidNodeId = -1;
+/// A Clang AST node together with its parent, children and tree metrics.
+struct Node {
+  NodeId Parent, LeftMostDescendant, RightMostDescendant;
+  int Depth, Height;
+  ast_type_traits::DynTypedNode ASTNode;
+  // Ids of the child nodes. Maybe there is a better way to store them.
+  SmallVector<NodeId, 4> Children;
+  ast_type_traits::ASTNodeKind getType() const { return ASTNode.getNodeKind(); }
+  bool hasSameType(const Node &Other) const {
+    return getType().isSame(Other.getType()) ||
+           (getType().isNone() && Other.getType().isNone()); // both kind-less
+  }
+  const StringRef getTypeLabel() const { return getType().asStringRef(); }
+  bool isLeaf() const { return Children.empty(); }
+/// Represents the AST of a TranslationUnit as a flat vector of Nodes.
+class TreeRoot {
+  /// Nodes in preorder; a NodeId is used as an index into this vector.
+  std::vector<Node> Nodes;
+  void setSize(size_t Size) { Nodes.resize(Size); }
+  void setLeftMostDescendant();
+  ASTContext &AST;
+  std::vector<NodeId> Leaves;
+  // Maps preorder indices to postorder ones.
+  std::vector<int> PostorderIds;
+  TreeRoot(ASTContext &AST);
+  int getSize() const { return Nodes.size(); }
+  NodeId root() const { return 0; } // preorder puts the root first
+  const Node &getNode(NodeId Id) const { return Nodes[Id]; }
+  Node &getMutableNode(NodeId Id) { return Nodes[Id]; }
+  bool isValidNodeId(NodeId Id) const { return Id >= 0 && Id < getSize(); }
+  void addNode(Node &N) { Nodes.push_back(N); } // appends a copy of N
+  template <class T> bool discardNode(T *N) const;
+  int getSubtreePostorder(std::vector<NodeId> &Ids, NodeId Root) const;
+  std::vector<NodeId> getSubtreeBfs(NodeId Id) const;
+  int getNumberOfDescendants(NodeId Id) const;
+  /// Returns the value of the node.
+  std::string getValue(NodeId Id) const;
+  /// Return the node as "<type>[: <value>](<postorder-id>)"
+  std::string showNode(NodeId Id) const;
+  void printTree(NodeId Id = InvalidNodeId) const {
+    printTree(llvm::outs(), Id);
+  }
+  void printTree(raw_ostream &OS, NodeId Id = InvalidNodeId) const;
+  void printAsJson() const { printAsJson(llvm::outs()); }
+  void printAsJson(raw_ostream &OS) const;
+  void printNodeAsJson(raw_ostream &OS, NodeId Id) const;
+/// Maps nodes of the left tree to ones on the right, and vice versa.
+/// Supports fast insertion and lookup; unmapped nodes yield InvalidNodeId.
+class Mapping {
+  std::unique_ptr<NodeId[]> SrcToDst, DstToSrc; // both indexed by NodeId
+  const TreeRoot &T1, &T2;
+  Mapping(const TreeRoot &T1, const TreeRoot &T2);
+  void link(NodeId Src, NodeId Dst);
+  NodeId getDst(NodeId Src) const { return SrcToDst[Src]; }
+  NodeId getSrc(NodeId Dst) const { return DstToSrc[Dst]; }
+  bool hasSrc(NodeId Src) const { return SrcToDst[Src] != InvalidNodeId; }
+  bool hasDst(NodeId Dst) const { return DstToSrc[Dst] != InvalidNodeId; }
+  void printMapping() const;
+  void printMapping(raw_ostream &OS) const;
+/// (Update, src, dst, _  ): update the value of node src to match dst.
+/// (Insert, src, dst, pos): insert src as child of dst at offset pos.
+/// (Delete, src, _,   _  ): delete node src.
+/// (Move,   src, dst, pos): move src to a child of dst at offset pos.
+enum ChangeKind { Delete, Update, Insert, Move }; // printing Move is a TODO
+using Change = std::tuple<ChangeKind, NodeId, NodeId, size_t>;
+void runDiff(ASTContext &AST1, ASTContext &AST2);
+class ASTDiff { // computes a node mapping and an edit script from T1 to T2
+  TreeRoot &T1, &T2; // T1 is the tree being converted into T2
+  // Returns true if the two subtrees are identical.
+  bool isomorphic(NodeId Id1, NodeId Id2) const;
+  // TODO This is too restrictive, we want to allow multiple mapping candidates
+  // for nodes and resolve the ambiguity later.
+  bool isMappingAllowed(const Mapping &M, NodeId Id1, NodeId Id2) const;
+  // Adds all corresponding subtrees of the two nodes to the mapping.
+  // The two nodes must be isomorphic.
+  void addIsomorphicSubTrees(Mapping &M, NodeId Id1, NodeId Id2) const;
+  // Uses an optimal albeit slow algorithm to compute a mapping between two
+  // subtrees, but only if both have fewer nodes than MaxSize.
+  void addOptimalMapping(Mapping &M, NodeId Id1, NodeId Id2) const;
+  // Computes the ratio of common descendants between the two nodes.
+  // Descendants are only considered to be equal when they are mapped in M.
+  double getSimilarity(const Mapping &M, NodeId Id1, NodeId Id2) const;
+  // Returns the node that has the highest degree of similarity.
+  NodeId findCandidate(const Mapping &M, NodeId Id1) const;
+  // Tries to match any yet unmapped nodes, in a bottom-up fashion.
+  void matchBottomUp(Mapping &M) const;
+  // Returns a mapping of isomorphic subtrees.
+  Mapping matchTopDown() const;
+  // During top-down matching, only consider nodes of at least this height.
+  int MinHeight = 2;
+  // During bottom-up matching, match only nodes with at least this value as
+  // the ratio of their common descendants.
+  double MinDice = 0.2;
+  // Whenever two subtrees are matched in the bottom-up phase, the optimal
+  // mapping is computed, unless the size of either subtree exceeds this.
+  int MaxSize = 100;
+  ASTDiff(TreeRoot &T1, TreeRoot &T2) : T1(T1), T2(T2) {}
+  // Matches nodes one-by-one based on their similarity.
+  Mapping computeMapping() const;
+  // Finds an edit script that converts T1 to T2; adds inserted nodes to T1.
+  std::vector<Change> computeChanges(Mapping &M);
+  // Prints an edit action, to llvm::outs() unless a stream is given.
+  void printChange(const Change &Chg) const { printChange(llvm::outs(), Chg); }
+  void printChange(raw_ostream &OS, const Change &Chg) const;
+} // end namespace diff
+} // end namespace clang
cfe-commits mailing list

Reply via email to