teemperor updated this revision to Diff 67691.
teemperor added a comment.

- Rebased the patch to the newest code base.

This patch is live again as it turned out that the CloneDetector can't handle 
large code bases without it. The simpler approach that we implemented instead 
consumed too much memory (e.g. at least 15 GB for analyzing SQLite) because the 
large data vectors and the quadratic space complexity of the sub-sequence 
search didn't play together very well.


https://reviews.llvm.org/D22515

Files:
  include/clang/Analysis/CloneDetection.h
  lib/Analysis/CloneDetection.cpp
  lib/StaticAnalyzer/Checkers/CloneChecker.cpp

Index: lib/StaticAnalyzer/Checkers/CloneChecker.cpp
===================================================================
--- lib/StaticAnalyzer/Checkers/CloneChecker.cpp
+++ lib/StaticAnalyzer/Checkers/CloneChecker.cpp
@@ -51,15 +51,6 @@
                                                  "Related code clone is here.");
 
     for (CloneDetector::CloneGroup &Group : CloneGroups) {
-      // For readability reasons we sort the clones by line numbers.
-      std::sort(Group.Sequences.begin(), Group.Sequences.end(),
-                [&SM](const StmtSequence &LHS, const StmtSequence &RHS) {
-                  return SM.isBeforeInTranslationUnit(LHS.getStartLoc(),
-                                                      RHS.getStartLoc()) &&
-                         SM.isBeforeInTranslationUnit(LHS.getEndLoc(),
-                                                      RHS.getEndLoc());
-                });
-
       // We group the clones by printing the first as a warning and all others
       // as a note.
       DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);
Index: lib/Analysis/CloneDetection.cpp
===================================================================
--- lib/Analysis/CloneDetection.cpp
+++ lib/Analysis/CloneDetection.cpp
@@ -243,46 +243,35 @@
 /// This class defines what a code clone is: If it collects for two statements
 /// the same data, then those two statements are considered to be clones of each
 /// other.
-class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
+///
+/// All collected data is forwarded to the given data consumer of the type T.
+/// The data consumer class needs to provide a member method with the signature:
+///   write(const char *Data, size_t Size)
+template <typename T>
+class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector<T>> {
 
   ASTContext &Context;
-  std::vector<CloneDetector::DataPiece> &CollectedData;
+  /// \brief The data sink to which all data is forwarded.
+  T &DataConsumer;
 
 public:
   /// \brief Collects data of the given Stmt.
   /// \param S The given statement.
   /// \param Context The ASTContext of S.
-  /// \param D The given data vector to which all collected data is appended.
-  StmtDataCollector(const Stmt *S, ASTContext &Context,
-                    std::vector<CloneDetector::DataPiece> &D)
-      : Context(Context), CollectedData(D) {
-    Visit(S);
+  /// \param D The data sink to which all data is forwarded.
+  StmtDataCollector(const Stmt *S, ASTContext &Context, T &DataConsumer)
+      : Context(Context), DataConsumer(DataConsumer) {
+    this->Visit(S);
   }
 
   // Below are utility methods for appending different data to the vector.
 
   void addData(CloneDetector::DataPiece Integer) {
-    CollectedData.push_back(Integer);
+    DataConsumer.write(reinterpret_cast<char *>(&Integer), sizeof(Integer));
   }
 
-  // FIXME: The functions below add long strings to the data vector which are
-  // probably not good for performance. Replace the strings with pointer values
-  // or a some other unique integer.
-
   void addData(llvm::StringRef Str) {
-    if (Str.empty())
-      return;
-
-    const size_t OldSize = CollectedData.size();
-
-    const size_t PieceSize = sizeof(CloneDetector::DataPiece);
-    // Calculate how many vector units we need to accomodate all string bytes.
-    size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize;
-    // Allocate space for the string in the data vector.
-    CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber);
-
-    // Copy the string to the allocated space at the end of the vector.
-    std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size());
+    DataConsumer.write(Str.data(), Str.size());
   }
 
   void addData(const QualType &QT) { addData(QT.getAsString()); }
@@ -426,8 +415,10 @@
     // Create an empty signature that will be filled in this method.
     CloneDetector::CloneSignature Signature;
 
+    llvm::hash_stream Hash;
+
     // Collect all relevant data from S and put it into the empty signature.
-    StmtDataCollector(S, Context, Signature.Data);
+    StmtDataCollector<llvm::hash_stream>(S, Context, Hash);
 
     // If this code is generated by a macro, it won't increase the complexity
     // level of it's containing clone. This prevents false-positives caused by
@@ -457,7 +448,8 @@
       auto ChildSignature = generateSignatures(Child);
 
       // Add the collected data to the signature of the current statement.
-      Signature.add(ChildSignature);
+      Signature.Complexity += ChildSignature.Complexity;
+      Hash << static_cast<size_t>(ChildSignature.Hash);
 
       // If the current statement is a CompoundStatement, we need to store the
       // signature for the generation of the sub-sequences.
@@ -470,6 +462,8 @@
     if (CS)
       handleSubSequences(CS, ChildSignatures);
 
+    Signature.Hash = Hash.compute_hash();
+
     // Save the signature for the current statement in the CloneDetector object.
     CD.add(StmtSequence(S, Context), Signature);
 
@@ -498,11 +492,15 @@
         // Create an empty signature and add the signatures of all selected
         // child statements to it.
         CloneDetector::CloneSignature SubSignature;
+        llvm::hash_stream SubHash;
 
         for (unsigned i = Pos; i < Pos + Length; ++i) {
-          SubSignature.add(ChildSignatures[i]);
+          SubSignature.Complexity += ChildSignatures[i].Complexity;
+          SubHash << static_cast<size_t>(ChildSignatures[i].Hash);
         }
 
+        SubSignature.Hash = SubHash.compute_hash();
+
         // Save the signature together with the information about what children
         // sequence we selected.
         CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
@@ -528,25 +526,7 @@
 
 void CloneDetector::add(const StmtSequence &S,
                         const CloneSignature &Signature) {
-  // StringMap only works with StringRefs, so we create one for our data vector.
-  auto &Data = Signature.Data;
-  StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
-                                Data.size() * sizeof(unsigned));
-
-  // Search with the help of the signature if we already have encountered a
-  // clone of the given StmtSequence.
-  auto I = CloneGroupIndexes.find(DataRef);
-  if (I == CloneGroupIndexes.end()) {
-    // We haven't found an existing clone group, so we create a new clone group
-    // for this StmtSequence and store the index of it in our search map.
-    CloneGroupIndexes[DataRef] = CloneGroups.size();
-    CloneGroups.emplace_back(S, Signature.Complexity);
-    return;
-  }
-
-  // We have found an existing clone group and can expand it with the given
-  // StmtSequence.
-  CloneGroups[I->getValue()].Sequences.push_back(S);
+  Sequences.push_back(std::make_pair(Signature, S));
 }
 
 namespace {
@@ -579,14 +559,67 @@
 }
 } // end anonymous namespace
 
+
+namespace {
+/// \brief Wrapper around FoldingSetNodeID that it can be used as the template
+///        argument of the StmtDataCollector.
+class FoldingSetWrapper {
+
+  llvm::FoldingSetNodeID &FS;
+
+public:
+  FoldingSetWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}
+
+  void write(const char *Data, size_t Size) {
+    FS.AddString(StringRef(Data, Size));
+  }
+};
+}
+
+/// \brief Writes the relevant data from all statements and child statements
+///        in the given StmtSequence into the given FoldingSetNodeID.
+static void CollectStmtSequenceData(const StmtSequence &Sequence,
+                                    FoldingSetWrapper &OutputData) {
+  for (const Stmt *S : Sequence) {
+    StmtDataCollector<FoldingSetWrapper>(S, Sequence.getASTContext(), OutputData);
+
+    for (const Stmt *Child : S->children()) {
+      if (!Child)
+        continue;
+
+      CollectStmtSequenceData(StmtSequence(Child, Sequence.getASTContext()), OutputData);
+    }
+  }
+}
+
+/// \brief Returns true if both sequences are clones of each other.
+static bool areSequencesClones(const StmtSequence &LHS,
+                              const StmtSequence &RHS) {
+  // We collect the data from all statements in the sequence as we did before
+  // when generating a hash value for each sequence. But this time we don't
+  // hash the collected data and compare the whole data set instead. This
+  // prevents any false-positives due to hash code collisions.
+  llvm::FoldingSetNodeID DataLHS, DataRHS;
+  FoldingSetWrapper LHSWrapper(DataLHS);
+  FoldingSetWrapper RHSWrapper(DataRHS);
+
+  CollectStmtSequenceData(LHS, LHSWrapper);
+  CollectStmtSequenceData(RHS, RHSWrapper);
+
+  return DataLHS == DataRHS;
+}
+
 /// \brief Finds all actual clone groups in a single group of presumed clones.
-/// \param Result Output parameter to which all found groups are added. Every
-///               clone in a group that was added this way follows the same
-///               variable pattern as the other clones in its group.
-/// \param Group A group of clones. The clones are allowed to have a different
-///              variable pattern.
+/// \param Result Output parameter to which all found groups are added.
+/// \param Group A group of presumed clones. The clones are allowed to have a
+///              different variable pattern and may not be actual clones of each
+///              other.
+/// \param CheckVariablePatterns If true, every clone in a group that was added
+///              to the output follows the same variable pattern as the other
+///              clones in its group.
 static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
-                              const CloneDetector::CloneGroup &Group) {
+                              const CloneDetector::CloneGroup &Group,
+                              bool CheckVariablePattern) {
   // We remove the Sequences one by one, so a list is more appropriate.
   std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
                                               Group.Sequences.end());
@@ -598,7 +631,7 @@
     StmtSequence Prototype = UnassignedSequences.front();
     UnassignedSequences.pop_front();
 
-    CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Complexity);
+    CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature);
 
     // Analyze the variable pattern of the prototype. Every other StmtSequence
     // needs to have the same pattern to get into the new clone group.
@@ -608,34 +641,109 @@
     // and assign them to our new clone group.
     auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
     while (I != E) {
+      // If the sequence doesn't fit to the prototype, we have encountered
+      // an unintended hash code collision and we skip it.
+      if (!areSequencesClones(Prototype, *I)) {
+        ++I;
+        continue;
+      }
 
-      if (VariablePattern(*I).getPatternDifferences(PrototypeFeatures) == 0) {
+      // If we weren't asked to check for a matching variable pattern in clone
+      // groups we can add the sequence now to the new clone group.
+      // If we were asked to check for matching variable pattern, we first have
+      // to check that there are no differences between the two patterns and
+      // only proceed if they match.
+      if (!CheckVariablePattern || VariablePattern(*I).getPatternDifferences(PrototypeFeatures) == 0) {
         FilteredGroup.Sequences.push_back(*I);
         I = UnassignedSequences.erase(I);
         continue;
       }
+
+      // We didn't found a matching variable pattern, so we continue with the
+      // next sequence.
       ++I;
     }
 
     // Add a valid clone group to the list of found clone groups.
     if (!FilteredGroup.isValid())
       continue;
-
     Result.push_back(FilteredGroup);
   }
 }
 
 void CloneDetector::findClones(std::vector<CloneGroup> &Result,
                                unsigned MinGroupComplexity,
                                bool CheckPatterns) {
+  // A shortcut (and necessary for the for-loop later in this function).
+  if (Sequences.empty())
+    return;
+
+  // We need to search for groups of StmtSequences with the same hash code to
+  // create our initial clone groups. By sorting all known StmtSequences by
+  // their hash value we make sure that StmtSequences with the same hash code
+  // are grouped together in the Sequences vector.
+  // Note: We stable sort here because the StmtSequences are added in the order
+  // in which they appear in the source file. We want to preserve that order
+  // because we also want to report them in that order in the CloneChecker.
+  std::stable_sort(Sequences.begin(), Sequences.end(),
+                   [](std::pair<CloneSignature, StmtSequence> LHS,
+                      std::pair<CloneSignature, StmtSequence> RHS) {
+                     return LHS.first.Hash < RHS.first.Hash;
+                   });
+
+  std::vector<CloneGroup> CloneGroups;
+
+  // Check for each CloneSignature if its successor has the same hash value.
+  // We don't check the last CloneSignature as it has no successor.
+  // Note: The 'size - 1' in the condition is safe because we check for an empty
+  // Sequences vector at the beginning of this function.
+  for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
+    const auto Current = Sequences[i];
+    const auto Next = Sequences[i + 1];
+
+    if (Current.first.Hash != Next.first.Hash)
+      continue;
+
+    // It's likely that we just found an sequence of CloneSignatures that
+    // represent a CloneGroup, so we create a new group and start checking and
+    // adding the CloneSignatures in this sequence.
+    CloneGroup Group;
+    Group.Signature = Current.first;
+
+    for (; i < Sequences.size(); ++i) {
+      const auto &Signature = Sequences[i];
+
+      // A different hash value means we have reached the end of the sequence.
+      if (Current.first.Hash != Signature.first.Hash) {
+        // The current Signature could be the start of a new CloneGroup. So we
+        // decrement i so that we visit it again in the outer loop.
+        // Note: i can never be 0 at this point because we are just comparing
+        // the hash of the Current CloneSignature with itself in the 'if' above.
+        assert(i != 0);
+        --i;
+        break;
+      }
+
+      // Skip CloneSignatures that won't pass the complexity requirement.
+      if (Signature.first.Complexity < MinGroupComplexity)
+        continue;
+
+      Group.Sequences.push_back(Signature.second);
+    }
+
+    // There is a chance that we haven't found more than two fitting
+    // CloneSignature because not enough CloneSignatures passed the complexity
+    // requirement. As a CloneGroup with less than two members makes no sense,
+    // we ignore this CloneGroup and won't add it to the result.
+    if (!Group.isValid())
+      continue;
+
+    CloneGroups.push_back(Group);
+  }
+
   // Add every valid clone group that fulfills the complexity requirement.
   for (const CloneGroup &Group : CloneGroups) {
-    if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
-      if (CheckPatterns)
-        createCloneGroups(Result, Group);
-      else
-        Result.push_back(Group);
-    }
+    createCloneGroups(Result, Group, CheckPatterns);
   }
 
   std::vector<unsigned> IndexesToRemove;
Index: include/clang/Analysis/CloneDetection.h
===================================================================
--- include/clang/Analysis/CloneDetection.h
+++ include/clang/Analysis/CloneDetection.h
@@ -16,6 +16,7 @@
 #define LLVM_CLANG_AST_CLONEDETECTION_H
 
 #include "clang/Basic/SourceLocation.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringMap.h"
 
 #include <vector>
@@ -163,11 +164,20 @@
   /// Holds the data about a StmtSequence that is needed during the search for
   /// code clones.
   struct CloneSignature {
-    /// \brief Holds all relevant data of a StmtSequence.
+    /// \brief The hash code of the StmtSequence.
     ///
-    /// If this variable is equal for two different StmtSequences, then they can
-    /// be considered clones of each other.
-    std::vector<DataPiece> Data;
+    /// The initial clone groups that are formed during the search for clones
+    /// consist only of Sequences that share the same hash code. This makes this
+    /// value the central part of this heuristic that is needed to find clones
+    /// in a performant way. For this to work, the type of this variable
+    /// always needs to be small and fast to compare.
+    ///
+    /// Also, StmtSequences that are clones of each others have to share
+    /// the same hash code. StmtSequences that are not clones of each other
+    /// shouldn't share the same hash code, but if they do, it will only
+    /// degrade the performance of the hash search but doesn't influence
+    /// the correctness of the result.
+    llvm::hash_code Hash;
 
     /// \brief The complexity of the StmtSequence.
     ///
@@ -179,25 +189,21 @@
     /// \brief Creates an empty CloneSignature without any data.
     CloneSignature() : Complexity(1) {}
 
-    CloneSignature(const std::vector<unsigned> &Data, unsigned Complexity)
-        : Data(Data), Complexity(Complexity) {}
-
-    /// \brief Adds the data from the given CloneSignature to this one.
-    void add(const CloneSignature &Other) {
-      Data.insert(Data.end(), Other.Data.begin(), Other.Data.end());
-      Complexity += Other.Complexity;
-    }
+    CloneSignature(llvm::hash_code Hash, unsigned Complexity)
+        : Hash(Hash), Complexity(Complexity) {}
   };
 
   /// Holds group of StmtSequences that are clones of each other and the
   /// complexity value (see CloneSignature::Complexity) that all stored
   /// StmtSequences have in common.
   struct CloneGroup {
     std::vector<StmtSequence> Sequences;
-    unsigned Complexity;
+    CloneSignature Signature;
+
+    CloneGroup() {}
 
-    CloneGroup(const StmtSequence &Seq, unsigned Complexity)
-        : Complexity(Complexity) {
+    CloneGroup(const StmtSequence &Seq, CloneSignature Signature)
+        : Signature(Signature) {
       Sequences.push_back(Seq);
     }
 
@@ -262,11 +268,8 @@
                             unsigned MinGroupComplexity);
 
 private:
-  /// Stores all found clone groups including invalid groups with only a single
-  /// statement.
-  std::vector<CloneGroup> CloneGroups;
-  /// Maps search data to its related index in the \p CloneGroups vector.
-  llvm::StringMap<std::size_t> CloneGroupIndexes;
+  /// Stores all encountered StmtSequences alongside their CloneSignature.
+  std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
 };
 
 } // end namespace clang
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to