chart2/source/model/template/HistogramCalculator.cxx | 95 ++++++++------ chart2/source/model/template/HistogramDataInterpreter.cxx | 24 ++- 2 files changed, 76 insertions(+), 43 deletions(-)
New commits: commit aebe61498bd226b2392cdfcfe5ad7db4489bd48e Author: varshneydevansh <varshney.devansh...@gmail.com> AuthorDate: Mon Jul 15 22:27:22 2024 +0530 Commit: Tomaž Vajngerl <qui...@gmail.com> CommitDate: Sun Aug 4 11:01:08 2024 +0200 tdf#82716 Implement Correct Calculation for Histogram Chart - Add Scott's rule for bin width calculation - Correct upper and lower limit for the Bin Range in the X-Axis This commit introduces Scott's Reference Rule as we are currently using the sqrt for histogram calculation, improving bin width determination and overall histogram accuracy. Change-Id: I4a430416e365781ebef8b1e0872dd4ed50492b9c Reviewed-on: https://gerrit.libreoffice.org/c/core/+/170526 Tested-by: Jenkins Reviewed-by: Tomaž Vajngerl <qui...@gmail.com> diff --git a/chart2/source/model/template/HistogramCalculator.cxx b/chart2/source/model/template/HistogramCalculator.cxx index d53ad0eb4460..ac0455196a53 100644 --- a/chart2/source/model/template/HistogramCalculator.cxx +++ b/chart2/source/model/template/HistogramCalculator.cxx @@ -26,70 +26,93 @@ void HistogramCalculator::computeBinFrequencyHistogram(const std::vector<double> maBinRanges.clear(); maBinFrequencies.clear(); - // Set min, max to the first value + // Calculate statistics + double fSum = 0.0; + double fSquareSum = 0.0; double fMinValue = rDataPoints[0]; double fMaxValue = rDataPoints[0]; + sal_Int32 nValidCount = 0; // Compute min and max values, ignoring non-finite values - for (auto const& rValue : rDataPoints) + for (const auto& rValue : rDataPoints) { if (std::isfinite(rValue)) { + fSum += rValue; + fSquareSum += rValue * rValue; fMinValue = std::min(fMinValue, rValue); fMaxValue = std::max(fMaxValue, rValue); + ++nValidCount; } } - // Round min and max to 6 decimal places - // Not sure this is needed or desired - fMinValue = std::round(fMinValue * 1e6) / 1e6; - fMaxValue = std::round(fMaxValue * 1e6) / 1e6; - - // Handle the case where all values are the same - if (fMinValue == fMaxValue) + if (nValidCount < 2 || fMinValue == fMaxValue) // Need at least two points for variance { - maBinRanges = { { fMinValue, fMinValue } }; - maBinFrequencies = { sal_Int32(rDataPoints.size()) }; + mnBins = 1; + mfBinWidth = 1.0; + maBinRanges = { { std::floor(fMinValue), std::ceil(fMinValue + 1.0) } }; + maBinFrequencies = { nValidCount }; return; } - mnBins = sal_Int32(std::sqrt(rDataPoints.size())); - - // Calculate bin width, ensuring it's not zero and rounding to 6 decimal places - mfBinWidth = std::round((fMaxValue - fMinValue) / mnBins * 1e6) / 1e6; + double fMean = fSum / nValidCount; + double fVariance = (fSquareSum - fSum * fMean) / (nValidCount - 1); + double fStdDev = std::sqrt(fVariance); - if (mfBinWidth <= 0.0) - { - mfBinWidth = 0.000001; //minimum bin width of 0.000001 - mnBins = sal_Int32(std::ceil((fMaxValue - fMinValue) / mfBinWidth)); - } + // Apply Scott's rule for bin width + mfBinWidth = (3.5 * fStdDev) / std::cbrt(nValidCount); - //recalculate maxValue to ensure it's included in the last bin - fMaxValue = fMinValue + mfBinWidth * mnBins; + // Calculate number of bins + mnBins = static_cast<sal_Int32>(std::ceil((fMaxValue - fMinValue) / mfBinWidth)); + mnBins = std::max<sal_Int32>(mnBins, 1); // Ensure at least one bin - // Initialize bin ranges and frequencies - maBinRanges.resize(mnBins); - maBinFrequencies.resize(mnBins, 0); + // Set up bin ranges + maBinRanges.reserve(mnBins); + double fBinStart = fMinValue; - // Calculate bin ranges - for (sal_Int32 nBin = 0; nBin < mnBins; ++nBin) + for (sal_Int32 i = 0; i < mnBins; ++i) { - double fBinStart = fMinValue + nBin * mfBinWidth; - double fBinEnd = (nBin == mnBins - 1) ? fMaxValue : (fBinStart + mfBinWidth); - maBinRanges[nBin] = { std::round(fBinStart * 1e6) / 1e6, std::round(fBinEnd * 1e6) / 1e6 }; + double fBinEnd = fBinStart + mfBinWidth; + + // Correct rounding to avoid discrepancies + fBinStart = std::round(fBinStart * 100.0) / 100.0; + fBinEnd = std::round(fBinEnd * 100.0) / 100.0; + + if (i == 0) + { + // First bin includes the minimum value, so use closed interval [fMinValue, fBinEnd] + maBinRanges.emplace_back(fMinValue, fBinEnd); + } + else + { + // Subsequent bins use half-open interval (fBinStart, fBinEnd] + maBinRanges.emplace_back(fBinStart, fBinEnd); + } + fBinStart = fBinEnd; } + // Adjust the last bin end to be inclusive + maBinRanges.back().second = std::max(maBinRanges.back().second, fMaxValue); + // Calculate frequencies + maBinFrequencies.assign(mnBins, 0); for (double fValue : rDataPoints) { if (std::isfinite(fValue)) { - // Calculate into which bin the value falls into - sal_Int32 nBinIndex = sal_Int32((fValue - fMinValue) / mfBinWidth); - // Sanitize - nBinIndex = std::clamp(nBinIndex, sal_Int32(0), mnBins - 1); - - maBinFrequencies[nBinIndex]++; + for (size_t i = 0; i < maBinRanges.size(); ++i) + { + if (i == 0 && fValue >= maBinRanges[i].first && fValue <= maBinRanges[i].second) + { + maBinFrequencies[i]++; + break; + } + else if (i > 0 && fValue > maBinRanges[i].first && fValue <= maBinRanges[i].second) + { + maBinFrequencies[i]++; + break; + } + } } } } diff --git a/chart2/source/model/template/HistogramDataInterpreter.cxx b/chart2/source/model/template/HistogramDataInterpreter.cxx index bfadbbd3c83f..43ccc5ee3d92 100644 --- a/chart2/source/model/template/HistogramDataInterpreter.cxx +++ b/chart2/source/model/template/HistogramDataInterpreter.cxx @@ -69,13 +69,23 @@ InterpretedData HistogramDataInterpreter::interpretDataSource( const auto& binFrequencies = aHistogramCalculator.getBinFrequencies(); // Create labels and values for HistogramDataSequence - std::vector<OUString> labels; - std::vector<double> values; + std::vector<OUString> aLabels; + std::vector<double> aValues; for (size_t i = 0; i < binRanges.size(); ++i) { - labels.push_back(u"[" + OUString::number(binRanges[i].first) + u"-" - + OUString::number(binRanges[i].second) + u")"); - values.push_back(static_cast<double>(binFrequencies[i])); + OUString aLabel; + if (i == 0) + { + aLabel = u"["_ustr + OUString::number(binRanges[i].first) + u"-"_ustr + + OUString::number(binRanges[i].second) + u"]"_ustr; + } + else + { + aLabel = u"("_ustr + OUString::number(binRanges[i].first) + u"-"_ustr + + OUString::number(binRanges[i].second) + u"]"_ustr; + } + aLabels.push_back(aLabel); + aValues.push_back(static_cast<double>(binFrequencies[i])); } rtl::Reference<DataSeries> xSeries = new DataSeries; @@ -83,8 +93,8 @@ InterpretedData HistogramDataInterpreter::interpretDataSource( aNewData.push_back(aData[0]); rtl::Reference<HistogramDataSequence> aValuesDataSequence = new HistogramDataSequence(); - aValuesDataSequence->setValues(comphelper::containerToSequence(values)); - aValuesDataSequence->setLabels(comphelper::containerToSequence(labels)); + aValuesDataSequence->setValues(comphelper::containerToSequence(aValues)); + aValuesDataSequence->setLabels(comphelper::containerToSequence(aLabels)); uno::Reference<chart2::data::XDataSequence> aDataSequence = aValuesDataSequence; SetRole(aDataSequence, u"values-y"_ustr);