[PATCH] D85311: [analyzer][tests] Understand when diagnostics change between builds

Valeriy Savchenko via Phabricator via cfe-commits Thu, 06 Aug 2020 02:54:04 -0700

This revision was automatically updated to reflect the committed changes.
Closed by commit rG6ddef9247458: [analyzer][tests] Understand when diagnostics 
change between builds (authored by vsavchenko).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D85311/new/

https://reviews.llvm.org/D85311

Files:
  clang/utils/analyzer/CmpRuns.py

Index: clang/utils/analyzer/CmpRuns.py
===================================================================
--- clang/utils/analyzer/CmpRuns.py
+++ clang/utils/analyzer/CmpRuns.py
@@ -35,14 +35,16 @@
 from collections import defaultdict
 from copy import copy
 from enum import Enum
-from typing import (Any, cast, Dict, List, NamedTuple, Optional, Sequence,
-                    TextIO, TypeVar, Tuple, Union)
+from typing import (Any, DefaultDict, Dict, List, NamedTuple, Optional,
+                    Sequence, TextIO, TypeVar, Tuple, Union)
 
 
 Number = Union[int, float]
 Stats = Dict[str, Dict[str, Number]]
 Plist = Dict[str, Any]
 JSON = Dict[str, Any]
+# Diff in a form: field -> (before, after)
+JSONDiff = Dict[str, Tuple[str, str]]
 # Type for generics
 T = TypeVar('T')
 
@@ -136,6 +138,9 @@
     def get_description(self) -> str:
         return self._data['description']
 
+    def get_location(self) -> str:
+        return f"{self.get_file_name()}:{self.get_line()}:{self.get_column()}"
+
     def get_issue_identifier(self) -> str:
         id = self.get_file_name() + "+"
 
@@ -172,11 +177,32 @@
         return f"{file_prefix}{funcname_postfix}:{line}:{col}" \
             f", {self.get_category()}: {self.get_description()}"
 
+    KEY_FIELDS = ["check_name", "category", "description"]
+
+    def is_similar_to(self, other: "AnalysisDiagnostic") -> bool:
+        # We consider two diagnostics similar only if at least one
+        # of the key fields is the same in both diagnostics.
+        return len(self.get_diffs(other)) != len(self.KEY_FIELDS)
+
+    def get_diffs(self, other: "AnalysisDiagnostic") -> JSONDiff:
+        return {field: (self._data[field], other._data[field])
+                for field in self.KEY_FIELDS
+                if self._data[field] != other._data[field]}
+
     # Note, the data format is not an API and may change from one analyzer
     # version to another.
     def get_raw_data(self) -> Plist:
         return self._data
 
+    def __eq__(self, other: object) -> bool:
+        return hash(self) == hash(other)
+
+    def __ne__(self, other: object) -> bool:
+        return hash(self) != hash(other)
+
+    def __hash__(self) -> int:
+        return hash(self.get_issue_identifier())
+
 
 class AnalysisRun:
     def __init__(self, info: SingleRunInfo):
@@ -283,12 +309,39 @@
     return d.get_issue_identifier()
 
 
-PresentInBoth = Tuple[AnalysisDiagnostic, AnalysisDiagnostic]
-PresentOnlyInOld = Tuple[AnalysisDiagnostic, None]
-PresentOnlyInNew = Tuple[None, AnalysisDiagnostic]
-ComparisonResult = List[Union[PresentInBoth,
-                              PresentOnlyInOld,
-                              PresentOnlyInNew]]
+AnalysisDiagnosticPair = Tuple[AnalysisDiagnostic, AnalysisDiagnostic]
+
+
+class ComparisonResult:
+    def __init__(self):
+        self.present_in_both: List[AnalysisDiagnostic] = []
+        self.present_only_in_old: List[AnalysisDiagnostic] = []
+        self.present_only_in_new: List[AnalysisDiagnostic] = []
+        self.changed_between_new_and_old: List[AnalysisDiagnosticPair] = []
+
+    def add_common(self, issue: AnalysisDiagnostic):
+        self.present_in_both.append(issue)
+
+    def add_removed(self, issue: AnalysisDiagnostic):
+        self.present_only_in_old.append(issue)
+
+    def add_added(self, issue: AnalysisDiagnostic):
+        self.present_only_in_new.append(issue)
+
+    def add_changed(self, old_issue: AnalysisDiagnostic,
+                    new_issue: AnalysisDiagnostic):
+        self.changed_between_new_and_old.append((old_issue, new_issue))
+
+
+GroupedDiagnostics = DefaultDict[str, List[AnalysisDiagnostic]]
+
+
+def get_grouped_diagnostics(diagnostics: List[AnalysisDiagnostic]
+                            ) -> GroupedDiagnostics:
+    result: GroupedDiagnostics = defaultdict(list)
+    for diagnostic in diagnostics:
+        result[diagnostic.get_location()].append(diagnostic)
+    return result
 
 
 def compare_results(results_old: AnalysisRun, results_new: AnalysisRun,
@@ -302,52 +355,79 @@
     each element {a,b} is None or a matching element from the respective run
     """
 
-    res: ComparisonResult = []
+    res = ComparisonResult()
 
     # Map size_before -> size_after
     path_difference_data: List[float] = []
 
-    # Quickly eliminate equal elements.
-    neq_old: List[AnalysisDiagnostic] = []
-    neq_new: List[AnalysisDiagnostic] = []
-
-    diags_old = copy(results_old.diagnostics)
-    diags_new = copy(results_new.diagnostics)
-
-    diags_old.sort(key=cmp_analysis_diagnostic)
-    diags_new.sort(key=cmp_analysis_diagnostic)
-
-    while diags_old and diags_new:
-        a = diags_old.pop()
-        b = diags_new.pop()
-
-        if a.get_issue_identifier() == b.get_issue_identifier():
-            if a.get_path_length() != b.get_path_length():
-
-                if histogram == HistogramType.RELATIVE:
-                    path_difference_data.append(
-                        float(a.get_path_length()) / b.get_path_length())
-
-                elif histogram == HistogramType.LOG_RELATIVE:
-                    path_difference_data.append(
-                        log(float(a.get_path_length()) / b.get_path_length()))
-
-                elif histogram == HistogramType.ABSOLUTE:
-                    path_difference_data.append(
-                        a.get_path_length() - b.get_path_length())
-
-            res.append((a, b))
-
-        elif a.get_issue_identifier() > b.get_issue_identifier():
-            diags_new.append(b)
-            neq_old.append(a)
-
-        else:
-            diags_old.append(a)
-            neq_new.append(b)
-
-    neq_old.extend(diags_old)
-    neq_new.extend(diags_new)
+    diags_old = get_grouped_diagnostics(results_old.diagnostics)
+    diags_new = get_grouped_diagnostics(results_new.diagnostics)
+
+    locations_old = set(diags_old.keys())
+    locations_new = set(diags_new.keys())
+
+    common_locations = locations_old & locations_new
+
+    for location in common_locations:
+        old = diags_old[location]
+        new = diags_new[location]
+
+        # Quadratic algorithms in this part are fine because 'old' and 'new'
+        # are most commonly of size 1.
+        for a in copy(old):
+            for b in copy(new):
+                if a.get_issue_identifier() == b.get_issue_identifier():
+                    a_path_len = a.get_path_length()
+                    b_path_len = b.get_path_length()
+
+                    if a_path_len != b_path_len:
+
+                        if histogram == HistogramType.RELATIVE:
+                            path_difference_data.append(
+                                float(a_path_len) / b_path_len)
+
+                        elif histogram == HistogramType.LOG_RELATIVE:
+                            path_difference_data.append(
+                                log(float(a_path_len) / b_path_len))
+
+                        elif histogram == HistogramType.ABSOLUTE:
+                            path_difference_data.append(
+                                a_path_len - b_path_len)
+
+                    res.add_common(a)
+                    old.remove(a)
+                    new.remove(b)
+
+        for a in copy(old):
+            for b in copy(new):
+                if a.is_similar_to(b):
+                    res.add_changed(a, b)
+                    old.remove(a)
+                    new.remove(b)
+
+        # Whatever is left in 'old' doesn't have a corresponding diagnostic
+        # in 'new', so we need to mark it as 'removed'.
+        for a in old:
+            res.add_removed(a)
+
+        # Whatever is left in 'new' doesn't have a corresponding diagnostic
+        # in 'old', so we need to mark it as 'added'.
+        for b in new:
+            res.add_added(b)
+
+    only_old_locations = locations_old - common_locations
+    for location in only_old_locations:
+        for a in diags_old[location]:
+            # These locations have been found only in the old build, so we
+            # need to mark all of therm as 'removed'
+            res.add_removed(a)
+
+    only_new_locations = locations_new - common_locations
+    for location in only_new_locations:
+        for b in diags_new[location]:
+            # These locations have been found only in the new build, so we
+            # need to mark all of therm as 'added'
+            res.add_added(b)
 
     # FIXME: Add fuzzy matching. One simple and possible effective idea would
     # be to bin the diagnostics, print them in a normalized form (based solely
@@ -355,11 +435,6 @@
     # the basis for matching. This has the nice property that we don't depend
     # in any way on the diagnostic format.
 
-    for a in neq_old:
-        res.append((a, None))
-    for b in neq_new:
-        res.append((None, b))
-
     if histogram:
         from matplotlib import pyplot
         pyplot.hist(path_difference_data, bins=100)
@@ -476,47 +551,55 @@
 
     # Open the verbose log, if given.
     if verbose_log:
-        auxLog: Optional[TextIO] = open(verbose_log, "w")
+        aux_log: Optional[TextIO] = open(verbose_log, "w")
     else:
-        auxLog = None
+        aux_log = None
 
     diff = compare_results(results_old, results_new, histogram)
     found_diffs = 0
     total_added = 0
     total_removed = 0
-
-    for res in diff:
-        old, new = res
-        if old is None:
-            # TODO: mypy still doesn't understand that old and new can't be
-            #       both Nones, we should introduce a better type solution
-            new = cast(AnalysisDiagnostic, new)
-            out.write(f"ADDED: {new.get_readable_name()}\n")
-            found_diffs += 1
-            total_added += 1
-            if auxLog:
-                auxLog.write(f"('ADDED', {new.get_readable_name()}, "
-                             f"{new.get_html_report()})\n")
-
-        elif new is None:
-            out.write(f"REMOVED: {old.get_readable_name()}\n")
-            found_diffs += 1
-            total_removed += 1
-            if auxLog:
-                auxLog.write(f"('REMOVED', {old.get_readable_name()}, "
-                             f"{old.get_html_report()})\n")
-        else:
-            pass
+    total_modified = 0
+
+    for new in diff.present_only_in_new:
+        out.write(f"ADDED: {new.get_readable_name()}\n\n")
+        found_diffs += 1
+        total_added += 1
+        if aux_log:
+            aux_log.write(f"('ADDED', {new.get_readable_name()}, "
+                          f"{new.get_html_report()})\n")
+
+    for old in diff.present_only_in_old:
+        out.write(f"REMOVED: {old.get_readable_name()}\n\n")
+        found_diffs += 1
+        total_removed += 1
+        if aux_log:
+            aux_log.write(f"('REMOVED', {old.get_readable_name()}, "
+                          f"{old.get_html_report()})\n")
+
+    for old, new in diff.changed_between_new_and_old:
+        out.write(f"MODIFIED: {old.get_readable_name()}\n")
+        found_diffs += 1
+        total_modified += 1
+        diffs = old.get_diffs(new)
+        str_diffs = [f"          '{key}' changed: "
+                     f"'{old_value}' -> '{new_value}'"
+                     for key, (old_value, new_value) in diffs.items()]
+        out.write(",\n".join(str_diffs) + "\n\n")
+        if aux_log:
+            aux_log.write(f"('MODIFIED', {old.get_readable_name()}, "
+                          f"{old.get_html_report()})\n")
 
     total_reports = len(results_new.diagnostics)
     out.write(f"TOTAL REPORTS: {total_reports}\n")
     out.write(f"TOTAL ADDED: {total_added}\n")
     out.write(f"TOTAL REMOVED: {total_removed}\n")
+    out.write(f"TOTAL MODIFIED: {total_modified}\n")
 
-    if auxLog:
-        auxLog.write(f"('TOTAL NEW REPORTS', {total_reports})\n")
-        auxLog.write(f"('TOTAL DIFFERENCES', {found_diffs})\n")
-        auxLog.close()
+    if aux_log:
+        aux_log.write(f"('TOTAL NEW REPORTS', {total_reports})\n")
+        aux_log.write(f"('TOTAL DIFFERENCES', {found_diffs})\n")
+        aux_log.close()
 
     # TODO: change to NamedTuple
     return found_diffs, len(results_old.diagnostics), \

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D85311: [analyzer][tests] Understand when diagnostics change between builds

Reply via email to