asolimando commented on code in PR #20846:
URL: https://github.com/apache/datafusion/pull/20846#discussion_r2913829551


##########
datafusion/physical-plan/src/union.rs:
##########
@@ -863,6 +863,92 @@ fn col_stats_union(
     left
 }
 
+fn union_distinct_count(
+    left: &ColumnStatistics,
+    right: &ColumnStatistics,
+) -> Precision<usize> {
+    let (ndv_left, ndv_right) = match (
+        left.distinct_count.get_value(),
+        right.distinct_count.get_value(),
+    ) {
+        (Some(&l), Some(&r)) => (l, r),
+        _ => return Precision::Absent,
+    };
+
+    // Even with exact inputs, the union NDV depends on how
+    // many distinct values are shared between the left and right.
+    // We can only estimate this via range overlap. Thus both paths
+    // below return `Inexact`.
+    if let Some(ndv) = estimate_ndv_with_overlap(left, right, ndv_left, 
ndv_right) {
+        return Precision::Inexact(ndv);
+    }
+
+    Precision::Inexact(ndv_left + ndv_right)
+}
+
+/// Estimates the distinct count for a union using range overlap, following
+/// the approach used by Trino:
+///
+/// overlap_a = fraction of A's range that overlaps with B
+/// overlap_b = fraction of B's range that overlaps with A
+/// NDV = max(overlap_a * NDV_a, overlap_b * NDV_b)   \[intersection\]
+///     + (1 - overlap_a) * NDV_a                      [only in A]
+///     + (1 - overlap_b) * NDV_b                      [only in B]
+fn estimate_ndv_with_overlap(
+    left: &ColumnStatistics,
+    right: &ColumnStatistics,
+    ndv_left: usize,
+    ndv_right: usize,
+) -> Option<usize> {
+    let min_left = left.min_value.get_value()?;
+    let max_left = left.max_value.get_value()?;
+    let min_right = right.min_value.get_value()?;
+    let max_right = right.max_value.get_value()?;
+
+    let range_left = max_left.distance(min_left)?;
+    let range_right = max_right.distance(min_right)?;
+
+    // Constant columns (range == 0) can't use the proportional overlap
+    // formula below, so check interval overlap directly instead.
+    if range_left == 0 || range_right == 0 {
+        let overlaps = min_left <= max_right && min_right <= max_left;
+        return Some(if overlaps {
+            usize::max(ndv_left, ndv_right)
+        } else {
+            ndv_left + ndv_right
+        });
+    }
+
+    let overlap_min = if min_left >= min_right {
+        min_left
+    } else {
+        min_right
+    };
+    let overlap_max = if max_left <= max_right {
+        max_left
+    } else {
+        max_right
+    };
+
+    if overlap_min > overlap_max {

Review Comment:
   Nit: the formula naturally degrades to `sum` when there's no overlap 
(`overlap_range = 0` gives `overlap_left = overlap_right = 0`, so the result is 
`ndv_left + ndv_right`), so this is a short-circuit optimization, we might want 
to add that as a comment.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to