chart2/source/model/template/HistogramCalculator.cxx      |   95 ++++++++------
 chart2/source/model/template/HistogramDataInterpreter.cxx |   24 ++-
 2 files changed, 76 insertions(+), 43 deletions(-)

New commits:
commit aebe61498bd226b2392cdfcfe5ad7db4489bd48e
Author:     varshneydevansh <varshney.devansh...@gmail.com>
AuthorDate: Mon Jul 15 22:27:22 2024 +0530
Commit:     Tomaž Vajngerl <qui...@gmail.com>
CommitDate: Sun Aug 4 11:01:08 2024 +0200

    tdf#82716 Implement Correct Calculation for Histogram Chart
    
    - Add Scott's rule for bin width calculation
    - Correct upper and lower limit for the Bin Range in the X-Axis
    
    This commit introduces Scott's Reference Rule as we are currently
    using the sqrt for histogram calculation, improving bin width
    determination and overall histogram accuracy.
    
    Change-Id: I4a430416e365781ebef8b1e0872dd4ed50492b9c
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170526
    Tested-by: Jenkins
    Reviewed-by: Tomaž Vajngerl <qui...@gmail.com>

diff --git a/chart2/source/model/template/HistogramCalculator.cxx 
b/chart2/source/model/template/HistogramCalculator.cxx
index d53ad0eb4460..ac0455196a53 100644
--- a/chart2/source/model/template/HistogramCalculator.cxx
+++ b/chart2/source/model/template/HistogramCalculator.cxx
@@ -26,70 +26,93 @@ void 
HistogramCalculator::computeBinFrequencyHistogram(const std::vector<double>
     maBinRanges.clear();
     maBinFrequencies.clear();
 
-    // Set min, max to the first value
+    // Calculate statistics
+    double fSum = 0.0;
+    double fSquareSum = 0.0;
     double fMinValue = rDataPoints[0];
     double fMaxValue = rDataPoints[0];
+    sal_Int32 nValidCount = 0;
 
     // Compute min and max values, ignoring non-finite values
-    for (auto const& rValue : rDataPoints)
+    for (const auto& rValue : rDataPoints)
     {
         if (std::isfinite(rValue))
         {
+            fSum += rValue;
+            fSquareSum += rValue * rValue;
             fMinValue = std::min(fMinValue, rValue);
             fMaxValue = std::max(fMaxValue, rValue);
+            ++nValidCount;
         }
     }
 
-    // Round min and max to 6 decimal places
-    // Not sure this is needed or desired
-    fMinValue = std::round(fMinValue * 1e6) / 1e6;
-    fMaxValue = std::round(fMaxValue * 1e6) / 1e6;
-
-    // Handle the case where all values are the same
-    if (fMinValue == fMaxValue)
+    if (nValidCount < 2 || fMinValue == fMaxValue) // Need at least two points 
for variance
     {
-        maBinRanges = { { fMinValue, fMinValue } };
-        maBinFrequencies = { sal_Int32(rDataPoints.size()) };
+        mnBins = 1;
+        mfBinWidth = 1.0;
+        maBinRanges = { { std::floor(fMinValue), std::ceil(fMinValue + 1.0) } 
};
+        maBinFrequencies = { nValidCount };
         return;
     }
 
-    mnBins = sal_Int32(std::sqrt(rDataPoints.size()));
-
-    // Calculate bin width, ensuring it's not zero and rounding to 6 decimal 
places
-    mfBinWidth = std::round((fMaxValue - fMinValue) / mnBins * 1e6) / 1e6;
+    double fMean = fSum / nValidCount;
+    double fVariance = (fSquareSum - fSum * fMean) / (nValidCount - 1);
+    double fStdDev = std::sqrt(fVariance);
 
-    if (mfBinWidth <= 0.0)
-    {
-        mfBinWidth = 0.000001; //minimum bin width of 0.000001
-        mnBins = sal_Int32(std::ceil((fMaxValue - fMinValue) / mfBinWidth));
-    }
+    // Apply Scott's rule for bin width
+    mfBinWidth = (3.5 * fStdDev) / std::cbrt(nValidCount);
 
-    //recalculate maxValue to ensure it's included in the last bin
-    fMaxValue = fMinValue + mfBinWidth * mnBins;
+    // Calculate number of bins
+    mnBins = static_cast<sal_Int32>(std::ceil((fMaxValue - fMinValue) / 
mfBinWidth));
+    mnBins = std::max<sal_Int32>(mnBins, 1); // Ensure at least one bin
 
-    // Initialize bin ranges and frequencies
-    maBinRanges.resize(mnBins);
-    maBinFrequencies.resize(mnBins, 0);
+    // Set up bin ranges
+    maBinRanges.reserve(mnBins);
+    double fBinStart = fMinValue;
 
-    // Calculate bin ranges
-    for (sal_Int32 nBin = 0; nBin < mnBins; ++nBin)
+    for (sal_Int32 i = 0; i < mnBins; ++i)
     {
-        double fBinStart = fMinValue + nBin * mfBinWidth;
-        double fBinEnd = (nBin == mnBins - 1) ? fMaxValue : (fBinStart + 
mfBinWidth);
-        maBinRanges[nBin] = { std::round(fBinStart * 1e6) / 1e6, 
std::round(fBinEnd * 1e6) / 1e6 };
+        double fBinEnd = fBinStart + mfBinWidth;
+
+        // Correct rounding to avoid discrepancies
+        fBinStart = std::round(fBinStart * 100.0) / 100.0;
+        fBinEnd = std::round(fBinEnd * 100.0) / 100.0;
+
+        if (i == 0)
+        {
+            // First bin includes the minimum value, so use closed interval 
[fMinValue, fBinEnd]
+            maBinRanges.emplace_back(fMinValue, fBinEnd);
+        }
+        else
+        {
+            // Subsequent bins use half-open interval (fBinStart, fBinEnd]
+            maBinRanges.emplace_back(fBinStart, fBinEnd);
+        }
+        fBinStart = fBinEnd;
     }
 
+    // Adjust the last bin end to be inclusive
+    maBinRanges.back().second = std::max(maBinRanges.back().second, fMaxValue);
+
     // Calculate frequencies
+    maBinFrequencies.assign(mnBins, 0);
     for (double fValue : rDataPoints)
     {
         if (std::isfinite(fValue))
         {
-            // Calculate into which bin the value falls into
-            sal_Int32 nBinIndex = sal_Int32((fValue - fMinValue) / mfBinWidth);
-            // Sanitize
-            nBinIndex = std::clamp(nBinIndex, sal_Int32(0), mnBins - 1);
-
-            maBinFrequencies[nBinIndex]++;
+            for (size_t i = 0; i < maBinRanges.size(); ++i)
+            {
+                if (i == 0 && fValue >= maBinRanges[i].first && fValue <= 
maBinRanges[i].second)
+                {
+                    maBinFrequencies[i]++;
+                    break;
+                }
+                else if (i > 0 && fValue > maBinRanges[i].first && fValue <= 
maBinRanges[i].second)
+                {
+                    maBinFrequencies[i]++;
+                    break;
+                }
+            }
         }
     }
 }
diff --git a/chart2/source/model/template/HistogramDataInterpreter.cxx 
b/chart2/source/model/template/HistogramDataInterpreter.cxx
index bfadbbd3c83f..43ccc5ee3d92 100644
--- a/chart2/source/model/template/HistogramDataInterpreter.cxx
+++ b/chart2/source/model/template/HistogramDataInterpreter.cxx
@@ -69,13 +69,23 @@ InterpretedData 
HistogramDataInterpreter::interpretDataSource(
     const auto& binFrequencies = aHistogramCalculator.getBinFrequencies();
 
     // Create labels and values for HistogramDataSequence
-    std::vector<OUString> labels;
-    std::vector<double> values;
+    std::vector<OUString> aLabels;
+    std::vector<double> aValues;
     for (size_t i = 0; i < binRanges.size(); ++i)
     {
-        labels.push_back(u"[" + OUString::number(binRanges[i].first) + u"-"
-                         + OUString::number(binRanges[i].second) + u")");
-        values.push_back(static_cast<double>(binFrequencies[i]));
+        OUString aLabel;
+        if (i == 0)
+        {
+            aLabel = u"["_ustr + OUString::number(binRanges[i].first) + 
u"-"_ustr
+                     + OUString::number(binRanges[i].second) + u"]"_ustr;
+        }
+        else
+        {
+            aLabel = u"("_ustr + OUString::number(binRanges[i].first) + 
u"-"_ustr
+                     + OUString::number(binRanges[i].second) + u"]"_ustr;
+        }
+        aLabels.push_back(aLabel);
+        aValues.push_back(static_cast<double>(binFrequencies[i]));
     }
 
     rtl::Reference<DataSeries> xSeries = new DataSeries;
@@ -83,8 +93,8 @@ InterpretedData HistogramDataInterpreter::interpretDataSource(
     aNewData.push_back(aData[0]);
 
     rtl::Reference<HistogramDataSequence> aValuesDataSequence = new 
HistogramDataSequence();
-    aValuesDataSequence->setValues(comphelper::containerToSequence(values));
-    aValuesDataSequence->setLabels(comphelper::containerToSequence(labels));
+    aValuesDataSequence->setValues(comphelper::containerToSequence(aValues));
+    aValuesDataSequence->setLabels(comphelper::containerToSequence(aLabels));
 
     uno::Reference<chart2::data::XDataSequence> aDataSequence = 
aValuesDataSequence;
     SetRole(aDataSequence, u"values-y"_ustr);

Reply via email to