buraksenn commented on code in PR #20846:
URL: https://github.com/apache/datafusion/pull/20846#discussion_r2914769502
##########
datafusion/physical-plan/src/union.rs:
##########
@@ -863,6 +863,92 @@ fn col_stats_union(
left
}
+fn union_distinct_count(
+ left: &ColumnStatistics,
+ right: &ColumnStatistics,
+) -> Precision<usize> {
+ let (ndv_left, ndv_right) = match (
+ left.distinct_count.get_value(),
+ right.distinct_count.get_value(),
+ ) {
+ (Some(&l), Some(&r)) => (l, r),
+ _ => return Precision::Absent,
+ };
+
+ // Even with exact inputs, the union NDV depends on how
+ // many distinct values are shared between the left and right.
+ // We can only estimate this via range overlap. Thus both paths
+ // below return `Inexact`.
+ if let Some(ndv) = estimate_ndv_with_overlap(left, right, ndv_left,
ndv_right) {
+ return Precision::Inexact(ndv);
+ }
+
+ Precision::Inexact(ndv_left + ndv_right)
+}
+
+/// Estimates the distinct count for a union using range overlap, following
+/// the approach used by Trino:
+///
+/// overlap_a = fraction of A's range that overlaps with B
+/// overlap_b = fraction of B's range that overlaps with A
+/// NDV = max(overlap_a * NDV_a, overlap_b * NDV_b) \[intersection\]
+/// + (1 - overlap_a) * NDV_a [only in A]
+/// + (1 - overlap_b) * NDV_b [only in B]
+fn estimate_ndv_with_overlap(
+ left: &ColumnStatistics,
+ right: &ColumnStatistics,
+ ndv_left: usize,
+ ndv_right: usize,
+) -> Option<usize> {
+ let min_left = left.min_value.get_value()?;
+ let max_left = left.max_value.get_value()?;
+ let min_right = right.min_value.get_value()?;
+ let max_right = right.max_value.get_value()?;
+
+ let range_left = max_left.distance(min_left)?;
+ let range_right = max_right.distance(min_right)?;
+
+ // Constant columns (range == 0) can't use the proportional overlap
+ // formula below, so check interval overlap directly instead.
+ if range_left == 0 || range_right == 0 {
+ let overlaps = min_left <= max_right && min_right <= max_left;
+ return Some(if overlaps {
+ usize::max(ndv_left, ndv_right)
+ } else {
+ ndv_left + ndv_right
+ });
+ }
+
+ let overlap_min = if min_left >= min_right {
+ min_left
+ } else {
+ min_right
+ };
+ let overlap_max = if max_left <= max_right {
+ max_left
+ } else {
+ max_right
+ };
+
+ if overlap_min > overlap_max {
Review Comment:
That's why I've returned actually but as you've said comment makes sense
here. Thanks for heads up I've added a comment
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]