Re: [PATCH] D20795: Added basic capabilities to detect source code clones.

Raphael Isemann via cfe-commits Wed, 13 Jul 2016 13:39:53 -0700

teemperor updated this revision to Diff 63851.
teemperor marked 18 inline comments as done.
teemperor added a comment.


- Checker is now in the alpha package and hidden.
- MinGroupComplexity is now a parameter for the checker.
- StmtData is now called CloneSignature.
- Replaced the std::map inside CloneDetector with a vector and a small cache in 
the HashVisitor.
- Moved code from AST to Analysis.
- Fixed multiple other smaller problems pointed out in the review. (Thanks, 
Vassil, Anna and Artem!)


http://reviews.llvm.org/D20795

Files:
  include/clang/Analysis/CloneDetection.h
  include/clang/StaticAnalyzer/Checkers/Checkers.td
  lib/Analysis/CMakeLists.txt
  lib/Analysis/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CMakeLists.txt
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp
  test/Analysis/copypaste/test-min-max.cpp
  test/Analysis/copypaste/test-sub-sequences.cpp

Index: test/Analysis/copypaste/test-sub-sequences.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-sub-sequences.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+// We test if sub-sequences can match with normal sequences containing only
+// a single statement.
+
+void log2(int a);
+void log();
+
+int max(int a, int b) {
+  log2(a);
+  log(); // expected-warning{{Detected code clone.}}
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+  return a + b;
+}
Index: test/Analysis/copypaste/test-min-max.cpp
===================================================================
--- /dev/null
+++ test/Analysis/copypaste/test-min-max.cpp
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -analyze -std=c++11 -analyzer-checker=alpha.clone.CloneChecker -verify %s
+
+void log();
+
+int max(int a, int b) { // expected-warning{{Detected code clone.}}
+  log();
+  if (a > b)
+    return a;
+  return b;
+}
+
+int maxClone(int x, int y) { // expected-note{{Related code clone is here.}}
+  log();
+  if (x > y)
+    return x;
+  return y;
+}
+
+// False positives below. These clones should not be reported.
+
+// FIXME: Detect different binary operator kinds.
+int min1(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (a < b)
+    return a;
+  return b;
+}
+
+// FIXME: Detect different variable patterns.
+int min2(int a, int b) { // expected-note{{Related code clone is here.}}
+  log();
+  if (b > a)
+    return a;
+  return b;
+}
+
+// Functions below are not clones and should not be reported.
+
+int foo(int a, int b) { // no-warning
+  return a + b;
+}
Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- /dev/null
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -0,0 +1,80 @@
+//===--- CloneChecker.cpp - Clone detection checker -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// CloneChecker is a checker that reports clones in the current translation
+/// unit.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ClangSACheckers.h"
+#include "clang/Analysis/CloneDetection.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/CheckerManager.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+class CloneChecker : public Checker<check::EndOfTranslationUnit> {
+
+public:
+  void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                 AnalysisManager &Mgr, BugReporter &BR) const;
+};
+} // end anonymous namespace
+
+void CloneChecker::checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
+                                             AnalysisManager &Mgr,
+                                             BugReporter &BR) const {
+
+  int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
+      "MinimumCloneComplexity", 10, this);
+
+  assert(MinComplexity >= 0);
+
+  SourceManager &SM = BR.getSourceManager();
+
+  CloneDetector CloneDetector;
+  CloneDetector.analyzeTranslationUnit(const_cast<TranslationUnitDecl *>(TU));
+
+  std::vector<CloneDetector::CloneGroup> CloneGroups;
+  CloneDetector.findClones(CloneGroups, MinComplexity);
+
+  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+
+  unsigned WarnID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Warning,
+                                               "Detected code clone.");
+
+  unsigned NoteID = DiagEngine.getCustomDiagID(DiagnosticsEngine::Note,
+                                               "Related code clone is here.");
+
+  for (CloneDetector::CloneGroup &Group : CloneGroups) {
+    std::sort(Group.begin(), Group.end(), [&SM](const StmtSequence &LHS,
+                                                const StmtSequence &RHS) {
+      return SM.isBeforeInTranslationUnit(LHS.getStartLoc(),
+                                          RHS.getStartLoc()) &&
+             SM.isBeforeInTranslationUnit(LHS.getEndLoc(), RHS.getEndLoc());
+    });
+    DiagEngine.Report(Group.front().getStartLoc(), WarnID);
+    for (unsigned J = 1; J < Group.size(); ++J) {
+      DiagEngine.Report(Group[J].getStartLoc(), NoteID);
+    }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Register CloneChecker
+//===----------------------------------------------------------------------===//
+
+void ento::registerCloneChecker(CheckerManager &Mgr) {
+  Mgr.registerChecker<CloneChecker>();
+}
Index: lib/StaticAnalyzer/Checkers/CMakeLists.txt
===================================================================
--- lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -22,6 +22,7 @@
   CheckerDocumentation.cpp
   ChrootChecker.cpp
   ClangCheckers.cpp
+  CloneChecker.cpp
   DeadStoresChecker.cpp
   DebugCheckers.cpp
   DereferenceChecker.cpp
Index: lib/Analysis/CloneDetection.cpp
===================================================================
--- /dev/null
+++ lib/Analysis/CloneDetection.cpp
@@ -0,0 +1,433 @@
+//===--- CloneDetection.cpp - Finds code clones in an AST -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+///  This file implements classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/CloneDetection.h"
+
+#include "clang/AST/ASTContext.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Stmt.h"
+
+using namespace clang;
+
+StmtSequence::StmtSequence(CompoundStmt const *Stmt, ASTContext &Context,
+                           unsigned StartIndex, unsigned EndIndex)
+    : S(Stmt), Context(&Context), StartIndex(StartIndex), EndIndex(EndIndex) {
+  assert(Stmt && "Stmt must not be a nullptr");
+  assert(StartIndex < EndIndex && "Given array should not be empty");
+  assert(EndIndex <= Stmt->size() && "Given array too big for this Stmt");
+}
+
+StmtSequence::StmtSequence(Stmt const *Stmt, ASTContext &Context)
+    : S(Stmt), Context(&Context), StartIndex(0), EndIndex(0) {}
+
+StmtSequence::StmtSequence()
+    : S(nullptr), Context(nullptr), StartIndex(0), EndIndex(0) {}
+
+bool StmtSequence::contains(const StmtSequence &Other) const {
+  // If both sequences reside in different translation units, they can never
+  // contain each other.
+  if (Context != Other.Context)
+    return false;
+
+  const SourceManager &SM = Context->getSourceManager();
+
+  // Otherwise check if the start and end locations of the current sequence
+  // surround the other sequence.
+  bool StartIsInBounds =
+      SM.isBeforeInTranslationUnit(getStartLoc(), Other.getStartLoc()) ||
+      getStartLoc() == Other.getStartLoc();
+  if (!StartIsInBounds)
+    return false;
+
+  bool EndIsInBounds =
+      SM.isBeforeInTranslationUnit(Other.getEndLoc(), getEndLoc()) ||
+      Other.getEndLoc() == getEndLoc();
+  return EndIsInBounds;
+}
+
+StmtSequence::iterator StmtSequence::begin() const {
+  if (!holdsSequence()) {
+    return &S;
+  }
+  auto CS = cast<CompoundStmt>(S);
+  return CS->body_begin() + StartIndex;
+}
+
+StmtSequence::iterator StmtSequence::end() const {
+  if (!holdsSequence()) {
+    return &S + 1;
+  }
+  auto CS = cast<CompoundStmt>(S);
+  return CS->body_begin() + EndIndex;
+}
+
+SourceLocation StmtSequence::getStartLoc() const {
+  return front()->getLocStart();
+}
+
+SourceLocation StmtSequence::getEndLoc() const { return back()->getLocEnd(); }
+
+namespace {
+/// \brief A cache for temporarily storing a small amount of CloneSignatures.
+///
+/// This cache should be manually cleaned from CloneSignatures that are
+/// no longer needed by calling \p remove or \p removeChildren . Otherwise the
+/// cache has to allocate memory to store the increased amount of signatures.
+/// However, the lookup time for the most recently added CloneSignatures won't
+/// deteriorate even if the cache is not properly cleaned.
+class CloneSignatureCache {
+  llvm::SmallVector<CloneDetector::CloneSignature, 16> Cache;
+
+public:
+  /// \brief Adds the Signature into the cache.
+  void add(CloneDetector::CloneSignature const &Signature) {
+    Cache.push_back(Signature);
+  }
+
+  /// \brief Removes the CloneSignature that describes the given Sequence.
+  void remove(StmtSequence const &Sequence) {
+    Cache.erase(
+        std::remove_if(
+            Cache.begin(), Cache.end(),
+            [&Sequence](CloneDetector::CloneSignature const &Signature) {
+              return Signature.Sequence == Sequence;
+            }),
+        Cache.end());
+  }
+
+  /// \brief Removes the associated signature for each child of the given Stmt.
+  /// \param Parent The given Stmt.
+  /// \param Context The ASTContext that contains Parent.
+  void removeChildren(Stmt const *Parent, ASTContext &Context) {
+    for (Stmt const *Child : Parent->children()) {
+      StmtSequence ChildSequence(Child, Context);
+      remove(ChildSequence);
+    }
+  }
+
+  /// \brief Retrieves the CloneSignature that describes the given Sequence.
+  CloneDetector::CloneSignature get(StmtSequence const &S) const {
+    // This cache promises good lookup time for recently added CloneSignatures
+    // even if not properly cleaned. As the most recently added CloneSignature
+    // is at the end, we start searching from back of the cache to the front to
+    // keep that promise.
+    for (auto I = Cache.rbegin(); I != Cache.rend(); ++I) {
+      if (I->Sequence == S) {
+        return *I;
+      }
+    }
+    llvm_unreachable("Couldn't find CloneSignature for StmtSequence");
+  }
+};
+} // end anonymous namespace
+
+namespace {
+/// Traverses the AST and calculates hash values and complexity values for all
+/// statements. The results are stored in a CloneDetector object.
+///
+/// This calculation happens in linear time and each statement is only visited a
+/// fixed amount of times during this process. The hash value for a statement is
+/// first calculated from the data stored directly in the statement object.
+/// Afterwards, the hash values of the children are calculated into the
+/// computed hash value.
+class HashVisitor : public RecursiveASTVisitor<HashVisitor> {
+
+  /// \brief Defines a placeholder hash value for missing child statements.
+  ///
+  /// The value of this constant should be a integer that is unlikely to be
+  /// generated by normally hashing to prevent unintended hash value collisions.
+  static const unsigned NullChildHashValue = 313;
+
+  CloneDetector &CD;
+  ASTContext &Context;
+
+  /// \brief Stores the signatures of visited nodes with an unvisited parent.
+  ///
+  /// This cache is necessary as the signature of a node is partly computed
+  /// from the data of its children. It's the reponsibility of the child node to
+  /// insert its own signature into the cache and the parents responsibility to
+  /// remove the signatures of its children.
+  CloneSignatureCache SignatureCache;
+
+public:
+  explicit HashVisitor(CloneDetector &CD, ASTContext &Context)
+      : CD(CD), Context(Context) {}
+
+  /// \brief Overwritten function that this visitor will traverse in postorder.
+  ///
+  /// We need to traverse postorder over the AST for our algorithm
+  /// to work as each parent expects that its children were already processed.
+  bool shouldTraversePostOrder() const { return true; }
+
+  bool VisitStmt(Stmt *S);
+
+  // Some statements have no parent that could remove their CloneSignatures from
+  // the cache. We manually clean such orphaned clones in the Visit* methods
+  // below. It's not a critical error if a certain declaration isn't handled
+  // here and it only slighly affects the memory consumption during traversal.
+
+  bool VisitFunctionDecl(FunctionDecl *D) {
+    if (D->hasBody())
+      SignatureCache.removeChildren(D->getBody(), Context);
+    return true;
+  }
+  bool VisitVarDecl(VarDecl *D) {
+    if (D->hasInit())
+      SignatureCache.removeChildren(D->getInit(), Context);
+    return true;
+  }
+  bool VisitParmVarDecl(ParmVarDecl *D) {
+    if (D->hasDefaultArg())
+      SignatureCache.removeChildren(D->getDefaultArg(), Context);
+    return true;
+  }
+  bool VisitCXXCtorInitializer(CXXCtorInitializer *D) {
+    SignatureCache.removeChildren(D->getInit(), Context);
+    return true;
+  }
+  bool VisitFieldDecl(FieldDecl *D) {
+    if (D->hasInClassInitializer())
+      SignatureCache.removeChildren(D->getInClassInitializer(), Context);
+    return true;
+  }
+
+private:
+  void HandleChildHashes(llvm::FoldingSetNodeID &Hash, unsigned &Complexity,
+                         Stmt::const_child_iterator Iterator, unsigned StartPos,
+                         unsigned Length) {
+    auto StartIterator = Iterator;
+    for (unsigned i = 0; i < StartPos; ++i)
+      ++StartIterator;
+    auto EndIterator = StartIterator;
+    for (unsigned i = 0; i < Length; ++i)
+      ++EndIterator;
+    Stmt::const_child_range Children(StartIterator, EndIterator);
+    HandleChildHashes(Hash, Complexity, Children);
+  }
+
+  /// Iterates over the specified Stmt array and computes the hash values of all
+  /// statements into the given hash value. Also, the complexity of all
+  /// statements is added to the given complexity value.
+  void HandleChildHashes(llvm::FoldingSetNodeID &Hash, unsigned &Complexity,
+                         Stmt::const_child_range Children) {
+    // Iterate over each child in the sub-sequence.
+    for (Stmt const *Child : Children) {
+      if (Child == nullptr) {
+        ++Complexity;
+        // We use an placeholder hash value for null children.
+        Hash.AddInteger(NullChildHashValue);
+      } else {
+        auto Signature = SignatureCache.get(StmtSequence(Child, Context));
+        Complexity += Signature.Complexity;
+        Hash.AddInteger(Signature.Hash);
+      }
+    }
+  }
+
+  /// \brief Creates and saves the CloneSignatures for all sub-sequences in the
+  /// given
+  /// CompoundStmt.
+  ///
+  /// A sub-sequence is a continuous sequence of statements inside the child
+  /// array of the CompoundStmt.
+  void SaveAllSubSequences(CompoundStmt const *CS);
+};
+} // end anonymous namespace
+
+bool HashVisitor::VisitStmt(Stmt *S) {
+  StmtSequence CurrentStmt(S, Context);
+  llvm::FoldingSetNodeID Hash;
+  unsigned Complexity = 1;
+
+  // The only relevant data for now is the class of the statement, so we
+  // calculate the hash class into the current hash value.
+  // TODO: Handle statement class specific data.
+  Hash.AddInteger(S->getStmtClass());
+
+  // Incorporate the hash values of all child Stmts into the current hash value.
+  HandleChildHashes(Hash, Complexity, S->children());
+
+  CloneDetector::CloneSignature Signature(CurrentStmt, Hash.ComputeHash(),
+                                          Complexity);
+  // Add the signature to the cache that the parent can use it.
+  SignatureCache.add(Signature);
+
+  // Save the signature for the current sequence in the CloneDetector object.
+  CD.add(Signature);
+
+  // If the currently hashed statement is a CompoundStmt, we also need to handle
+  // the sub-sequences inside the CompoundStmt.
+  if (auto CS = dyn_cast<CompoundStmt>(S))
+    SaveAllSubSequences(CS);
+
+  // We no longer need the signatures of the children, so we remove them
+  // from the cache.
+  SignatureCache.removeChildren(S, Context);
+  return true;
+}
+
+void HashVisitor::SaveAllSubSequences(CompoundStmt const *CS) {
+  // The length of the sub-sequence. We don't need to hash sequences with the
+  // length 1 as they are already handled when we hashed the children. We also
+  // don't need to hash the sub-sequence that contains the whole body as this
+  // would be equal to the hash of the CompoundStmt itself.
+  for (unsigned Length = 2; Length < CS->size(); ++Length) {
+    // The start index in the body of the CompoundStmt.
+    // We increase the position and rehash until the end of the sub-sequence
+    // reaches the end of the CompoundStmt body.
+    for (unsigned Pos = 0; Pos <= CS->size() - Length; ++Pos) {
+      llvm::FoldingSetNodeID SubHash;
+      unsigned SubComplexity = 1;
+      StmtSequence Sequence(CS, Context, Pos, Pos + Length);
+
+      // We consider a sub-sequence to be CompoundStmt that only contains
+      // the children in the sub-sequence. This is necessary for matching
+      // sub-sequences with full CompoundStmts.
+      SubHash.AddInteger(CS->getStmtClass());
+
+      HandleChildHashes(SubHash, SubComplexity, CS->child_begin(), Pos, Length);
+
+      CloneDetector::CloneSignature SubData(Sequence, SubHash.ComputeHash(),
+                                            SubComplexity);
+      CD.add(SubData);
+    }
+  }
+}
+
+void CloneDetector::analyzeTranslationUnit(TranslationUnitDecl *TU) {
+  HashVisitor visitor(*this, TU->getASTContext());
+  visitor.TraverseDecl(TU);
+}
+
+void CloneDetector::add(const CloneSignature &Data) {
+  Signatures.push_back(Data);
+}
+
+namespace {
+/// \brief Returns true if and only if \p Stmt contains at least one other
+/// sequence in the \p Group.
+bool StmtContainsAnyInGroup(StmtSequence &Stmt,
+                            CloneDetector::CloneGroup &Group) {
+  for (StmtSequence &GroupStmt : Group) {
+    if (Stmt.contains(GroupStmt))
+      return true;
+  }
+  return false;
+}
+
+/// \brief Returns true if and only if all sequences in \p OtherGroup are
+/// contained by a sequence in \p Group.
+bool GroupContains(CloneDetector::CloneGroup &Group,
+                   CloneDetector::CloneGroup &OtherGroup) {
+  // We have less sequences in the current group than we have in the other,
+  // so we will never fulfill the requirement for returning true. This is only
+  // possible because we know that a sequence in Group can contain at most
+  // one sequence in OtherGroup.
+  if (Group.size() < OtherGroup.size())
+    return false;
+
+  for (StmtSequence &Stmt : Group) {
+    if (!StmtContainsAnyInGroup(Stmt, OtherGroup))
+      return false;
+  }
+  return true;
+}
+} // end anonymous namespace
+
+void CloneDetector::findClones(std::vector<CloneGroup> &Result,
+                               unsigned MinGroupComplexity) {
+  // For creating hash groups, we need to find CloneSignatures with the same
+  // hash value, so we first sort all CloneSignatures by their hash value. Then
+  // we search for sequences of CloneSignatures with the same hash value.
+  std::sort(Signatures.begin(), Signatures.end(),
+            [](const CloneSignature &A, const CloneSignature &B) {
+              return A.Hash < B.Hash;
+            });
+
+  // Check for each CloneSignature if its successor has the same hash value.
+  // We don't check the last CloneSignature as it has no successor.
+  for (unsigned i = 0; i < Signatures.size() - 1; ++i) {
+    CloneSignature const &Current = Signatures[i];
+    CloneSignature const &Next = Signatures[i + 1];
+
+    if (Current.Hash != Next.Hash)
+      continue;
+
+    // It's likely that we just found an sequence of CloneSignatures that
+    // represent a CloneGroup, so we create a new group and start checking and
+    // adding the CloneSignatures in this sequence.
+    CloneGroup Group;
+
+    for (; i < Signatures.size(); ++i) {
+      CloneSignature const &Signature = Signatures[i];
+
+      // A different hash value means we have reached the end of the sequence.
+      if (Current.Hash != Signature.Hash) {
+        // The current Signature could be the start of a new CloneGroup. So we
+        // decrement i so that we visit it again in the outer loop.
+        // Note: i can never be 0 at this point because we are just comparing
+        // the hash of the Current CloneSignature with itself in the 'if' above.
+        assert(i != 0);
+        --i;
+        break;
+      }
+
+      // Skip CloneSignatures that won't pass the complexity requirement.
+      if (Signature.Complexity < MinGroupComplexity)
+        continue;
+
+      Group.push_back(Signature.Sequence);
+    }
+
+    // There is a chance that we haven't found more than two fitting
+    // CloneSignature because not enough CloneSignatures passed the complexity
+    // requirement. As a CloneGroup with less than two members makes no sense,
+    // we ignore this CloneGroup and won't add it to the result.
+    if (Group.size() <= 1)
+      continue;
+
+    Result.push_back(Group);
+  }
+
+  std::vector<unsigned> IndexesToRemove;
+
+  // Compare every group in the result with the rest. If one groups contains
+  // another group, we only need to return the bigger group.
+  // Note: This doesn't scale well, so if possible avoid calling any heavy
+  // function from this loop to reduce the negative performance impact.
+  for (unsigned i = 0; i < Result.size(); ++i) {
+    for (unsigned j = 0; j < Result.size(); ++j) {
+      // Don't compare a group with itself.
+      if (i == j)
+        continue;
+
+      if (GroupContains(Result[j], Result[i])) {
+        IndexesToRemove.push_back(i);
+        break;
+      }
+    }
+  }
+
+  // We need to erase invalid results from the result vector with decreasing
+  // unique indexes. For this we sort all indexes in descending order and then
+  // remove any duplicates.
+  std::sort(IndexesToRemove.begin(), IndexesToRemove.end(),
+            std::greater<unsigned>());
+
+  auto UniqueEnd = std::unique(IndexesToRemove.begin(), IndexesToRemove.end());
+  IndexesToRemove.erase(UniqueEnd, IndexesToRemove.end());
+
+  for (unsigned i : IndexesToRemove) {
+    Result.erase(Result.begin() + i);
+  }
+}
Index: lib/Analysis/CMakeLists.txt
===================================================================
--- lib/Analysis/CMakeLists.txt
+++ lib/Analysis/CMakeLists.txt
@@ -9,6 +9,7 @@
   CFGReachabilityAnalysis.cpp
   CFGStmtMap.cpp
   CallGraph.cpp
+  CloneDetection.cpp
   CocoaConventions.cpp
   Consumed.cpp
   CodeInjector.cpp
Index: include/clang/StaticAnalyzer/Checkers/Checkers.td
===================================================================
--- include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -77,6 +77,8 @@
 def LLVM : Package<"llvm">;
 def Debug : Package<"debug">;
 
+def CloneDetectionAlpha : Package<"clone">, InPackage<Alpha>, Hidden;
+
 //===----------------------------------------------------------------------===//
 // Core Checkers.
 //===----------------------------------------------------------------------===//
@@ -657,3 +659,17 @@
   DescFile<"DebugCheckers.cpp">;
 
 } // end "debug"
+
+
+//===----------------------------------------------------------------------===//
+// Clone Detection
+//===----------------------------------------------------------------------===//
+
+let ParentPackage = CloneDetectionAlpha in {
+
+def CloneChecker : Checker<"CloneChecker">,
+  HelpText<"Reports similar pieces of code.">,
+  DescFile<"CloneChecker.cpp">;
+
+} // end "clone"
+
Index: include/clang/Analysis/CloneDetection.h
===================================================================
--- /dev/null
+++ include/clang/Analysis/CloneDetection.h
@@ -0,0 +1,216 @@
+//===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// /file
+/// This file defines classes for searching and anlyzing source code clones.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_CLONEDETECTION_H
+#define LLVM_CLANG_AST_CLONEDETECTION_H
+
+#include "clang/Basic/SourceLocation.h"
+
+#include <vector>
+
+namespace clang {
+
+class Stmt;
+class ASTContext;
+class CompoundStmt;
+class TranslationUnitDecl;
+
+/// \brief Identifies a list of statements.
+///
+/// Can either identify a single arbitrary Stmt object, a continuous sequence of
+/// child statements inside a CompoundStmt or no statements at all.
+class StmtSequence {
+  /// If this object identifies a sequence of statements inside a CompoundStmt,
+  /// S points to this CompoundStmt. If this object only identifies a single
+  /// Stmt, then S is a pointer to this Stmt.
+  Stmt const *S;
+
+  /// The related ASTContext for S.
+  ASTContext *Context;
+
+  /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence
+  /// instance is representing the CompoundStmt children inside the array
+  /// [StartIndex, EndIndex).
+  unsigned StartIndex;
+  unsigned EndIndex;
+
+public:
+  /// \brief Constructs a StmtSequence holding multiple statements.
+  ///
+  /// The resulting StmtSequence identifies a continuous sequence of statements
+  /// in the body of the given CompoundStmt. Which statements of the body should
+  /// be identified needs to be specified by providing a start and end index
+  /// that describe a non-empty sub-array in the body of the given CompoundStmt.
+  ///
+  /// \param Stmt A CompoundStmt that contains all statements in its body.
+  /// \param Context The ASTContext for the given CompoundStmt.
+  /// \param StartIndex The inclusive start index in the children array of
+  ///                   \p Stmt
+  /// \param EndIndex The exclusive end index in the children array of \p Stmt.
+  StmtSequence(CompoundStmt const *Stmt, ASTContext &Context,
+               unsigned StartIndex, unsigned EndIndex);
+
+  /// \brief Constructs a StmtSequence holding a single statement.
+  ///
+  /// \param Stmt An arbitrary Stmt.
+  /// \param Context The ASTContext for the given Stmt.
+  StmtSequence(Stmt const *Stmt, ASTContext &Context);
+
+  /// \brief Constructs an empty StmtSequence.
+  StmtSequence();
+
+  typedef Stmt const *const *iterator;
+
+  /// Returns an iterator pointing to the first statement in this sequence.
+  iterator begin() const;
+
+  /// Returns an iterator pointing behind the last statement in this sequence.
+  iterator end() const;
+
+  /// Returns the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  Stmt const *front() const {
+    assert(!empty());
+    return begin()[0];
+  }
+
+  /// Returns the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  Stmt const *back() const {
+    assert(!empty());
+    return begin()[size() - 1];
+  }
+
+  /// Returns the number of statements this object holds.
+  unsigned size() const {
+    if (holdsSequence())
+      return EndIndex - StartIndex;
+    if (S == nullptr)
+      return 0;
+    return 1;
+  }
+
+  /// Returns true if and only if this StmtSequence contains no statements.
+  bool empty() const { return size() == 0; }
+
+  /// Returns the related ASTContext for the stored Stmts.
+  ASTContext &getASTContext() const {
+    assert(Context);
+    return *Context;
+  }
+
+  /// Returns true if this objects holds a list of statements.
+  bool holdsSequence() const { return EndIndex != 0; }
+
+  /// Returns the start sourcelocation of the first statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getStartLoc() const;
+
+  /// Returns the end sourcelocation of the last statement in this sequence.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object.
+  SourceLocation getEndLoc() const;
+
+  bool operator==(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) ==
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  bool operator!=(const StmtSequence &Other) const {
+    return std::tie(S, StartIndex, EndIndex) !=
+           std::tie(Other.S, Other.StartIndex, Other.EndIndex);
+  }
+
+  /// Returns true if and only if this sequence covers a source range that
+  /// contains the source range of the given sequence \p Other.
+  ///
+  /// This method should only be called on a non-empty StmtSequence object
+  /// and passed a non-empty StmtSequence object.
+  bool contains(const StmtSequence &Other) const;
+};
+
+/// \brief Searches for clones in source code.
+///
+/// First, this class needs a translation unit which is passed via
+/// \p analyzeTranslationUnit . It will then generate and store search data
+/// for all statements inside the given translation unit.
+/// Afterwards the generated data can be used to find code clones by calling
+/// \p findClones .
+///
+/// The generates search data is needed for efficiently finding
+/// clones. This is done by hashing all statements using a locality-sensitive
+/// hash function that generates identical hash values for similar statement
+/// trees.
+///
+/// This class only searches for clones in exectuable source code
+/// (e.g. function bodies). Other clones (e.g. cloned comments or declarations)
+/// are not supported.
+class CloneDetector {
+public:
+  /// Holds the data about a StmtSequence that is needed during the search for
+  /// code clones.
+  struct CloneSignature {
+    /// The StmtSequence that is described by this CloneSignature.
+    StmtSequence Sequence;
+
+    /// The generated hash value for the StmtSequence. Two CloneSignature
+    /// objects with the same hash value will probably contain two StmtSequence
+    /// objects that are clones of each other. However, false positives are
+    /// possible here due to unintended hash function collisions.
+    ///
+    /// Two CloneSignature objects with a different hash value will never
+    /// contain two StmtSequence objects that are clones of each other.
+    unsigned Hash;
+
+    /// \brief The complexity of the StmtSequence.
+    ///
+    /// This scalar value serves as a simple way of filtering clones that are
+    /// too small to be reported. A greater value indicates that the related
+    /// StmtSequence is probably more interesting to the user.
+    unsigned Complexity;
+    CloneSignature() : Hash(0), Complexity(0) {}
+
+    CloneSignature(const StmtSequence &S, unsigned Hash, unsigned Complexity)
+        : Sequence(S), Hash(Hash), Complexity(Complexity) {}
+  };
+
+  /// \brief Generates and stores search data for all statements in the given
+  /// translation unit.
+  void analyzeTranslationUnit(TranslationUnitDecl *TU);
+
+  /// \brief Stores the CloneSignature to allow future querying.
+  void add(const CloneSignature &Data);
+
+  typedef llvm::SmallVector<StmtSequence, 4> CloneGroup;
+
+  /// \brief Searches the provided statements for clones.
+  ///
+  /// \param Result Output parameter that is filled with a list of found
+  ///               clone groups. Each group contains multiple StmtSequences
+  ///               that were identified to be clones of each other.
+  /// \param MinGroupComplexity Only return clones which have at least this
+  ///                           complexity value.
+  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity);
+
+private:
+  /// Stores all provided statements and their associated search data.
+  std::vector<CloneSignature> Signatures;
+};
+
+} // end namespace clang
+
+#endif // LLVM_CLANG_AST_CLONEDETECTION_H

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Re: [PATCH] D20795: Added basic capabilities to detect source code clones.

Reply via email to