alamb commented on code in PR #15661:
URL: https://github.com/apache/datafusion/pull/15661#discussion_r2035858553


##########
datafusion/common/src/stats.rs:
##########
@@ -296,6 +311,24 @@ impl Statistics {
             .collect()
     }
 
+    /// Set the number of rows

Review Comment:
   I made these methods to reduce the boiler plate in the examples (otherwise I 
had to explicitly provide `Precision::Absent` for every field. This let's the 
examples be simpler)



##########
datafusion/common/src/stats.rs:
##########
@@ -521,6 +649,36 @@ impl ColumnStatistics {
         }
     }
 
+    /// Set the null count
+    pub fn with_null_count(mut self, null_count: Precision<usize>) -> Self {

Review Comment:
   Similarly, adding these setters to make it easier (less verbose) to write 
examples and tests



##########
datafusion/common/src/stats.rs:
##########
@@ -798,4 +958,193 @@ mod tests {
             distinct_count: Precision::Exact(100),
         }
     }
+
+    #[test]
+    fn test_try_merge_basic() {

Review Comment:
   these tests were moved from datasource



##########
datafusion/datasource/src/statistics.rs:
##########
@@ -545,255 +488,10 @@ pub fn compute_all_files_statistics(
     Ok((file_groups_with_stats, statistics))
 }
 
+#[deprecated(since = "47.0.0", note = "Use Statistics::add")]
 pub fn add_row_stats(
     file_num_rows: Precision<usize>,
     num_rows: Precision<usize>,
 ) -> Precision<usize> {
-    match (file_num_rows, &num_rows) {
-        (Precision::Absent, _) => num_rows.to_inexact(),
-        (lhs, Precision::Absent) => lhs.to_inexact(),
-        (lhs, rhs) => lhs.add(rhs),
-    }
-}
-
-/// If the given value is numerically greater than the original maximum value,
-/// return the new maximum value with appropriate exactness information.
-fn set_max_if_greater(
-    max_nominee: &Precision<ScalarValue>,
-    max_value: &mut Precision<ScalarValue>,
-) {
-    match (&max_value, max_nominee) {
-        (Precision::Exact(val1), Precision::Exact(val2)) if val1 < val2 => {
-            *max_value = max_nominee.clone();
-        }
-        (Precision::Exact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Exact(val2))
-            if val1 < val2 =>
-        {
-            *max_value = max_nominee.clone().to_inexact();
-        }
-        (Precision::Exact(_), Precision::Absent) => {
-            let exact_max = mem::take(max_value);
-            *max_value = exact_max.to_inexact();
-        }
-        (Precision::Absent, Precision::Exact(_)) => {
-            *max_value = max_nominee.clone().to_inexact();
-        }
-        (Precision::Absent, Precision::Inexact(_)) => {
-            *max_value = max_nominee.clone();
-        }
-        _ => {}
-    }
-}
-
-/// If the given value is numerically lesser than the original minimum value,
-/// return the new minimum value with appropriate exactness information.
-fn set_min_if_lesser(
-    min_nominee: &Precision<ScalarValue>,
-    min_value: &mut Precision<ScalarValue>,
-) {
-    match (&min_value, min_nominee) {
-        (Precision::Exact(val1), Precision::Exact(val2)) if val1 > val2 => {
-            *min_value = min_nominee.clone();
-        }
-        (Precision::Exact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Inexact(val2))
-        | (Precision::Inexact(val1), Precision::Exact(val2))
-            if val1 > val2 =>
-        {
-            *min_value = min_nominee.clone().to_inexact();
-        }
-        (Precision::Exact(_), Precision::Absent) => {
-            let exact_min = mem::take(min_value);
-            *min_value = exact_min.to_inexact();
-        }
-        (Precision::Absent, Precision::Exact(_)) => {
-            *min_value = min_nominee.clone().to_inexact();
-        }
-        (Precision::Absent, Precision::Inexact(_)) => {
-            *min_value = min_nominee.clone();
-        }
-        _ => {}
-    }
-}
-
-#[cfg(test)]
-mod tests {

Review Comment:
   tests are moved



##########
datafusion/common/src/stats.rs:
##########
@@ -414,6 +448,100 @@ impl Statistics {
         self.total_byte_size = Precision::Absent;
         Ok(self)
     }
+
+    /// Summarize zero or more statistics into a single `Statistics` instance.
+    ///
+    /// Returns an error if the statistics do not match the specified schemas.
+    pub fn try_merge_iter<'a, I>(items: I, schema: &Schema) -> 
Result<Statistics>
+    where
+        I: IntoIterator<Item = &'a Statistics>,
+    {
+        let mut items = items.into_iter();
+
+        let Some(init) = items.next() else {
+            return Ok(Statistics::new_unknown(schema));
+        };
+        items.try_fold(init.clone(), |acc: Statistics, item_stats: 
&Statistics| {
+            acc.try_merge(item_stats)
+        })
+    }
+
+    /// Merge this Statistics value with another Statistics value.
+    ///
+    /// Returns an error if the statistics do not match (different schemas).
+    ///
+    /// # Example
+    /// ```
+    /// # use datafusion_common::{ColumnStatistics, ScalarValue, Statistics};

Review Comment:
   here is an example showing how merge works



##########
datafusion/datasource/src/statistics.rs:
##########
@@ -409,62 +406,6 @@ pub async fn get_statistics_with_limit(
     Ok((result_files, statistics))
 }
 
-/// Generic function to compute statistics across multiple items that have 
statistics
-fn compute_summary_statistics<T, I>(

Review Comment:
   The point of this PR is to remove this function and replace it with 
`Statistics::try_merge_iter`
   
   



##########
datafusion/common/src/stats.rs:
##########
@@ -798,4 +958,193 @@ mod tests {
             distinct_count: Precision::Exact(100),
         }
     }
+
+    #[test]
+    fn test_try_merge_basic() {
+        // Create a schema with two columns
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("col1", DataType::Int32, false),
+            Field::new("col2", DataType::Int32, false),
+        ]));
+
+        // Create items with statistics
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Exact(100),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(1),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
+                    distinct_count: Precision::Absent,
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(2),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
+                    sum_value: 
Precision::Exact(ScalarValue::Int32(Some(1000))),
+                    distinct_count: Precision::Absent,
+                },
+            ],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Exact(15),
+            total_byte_size: Precision::Exact(150),
+            column_statistics: vec![
+                ColumnStatistics {
+                    null_count: Precision::Exact(2),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
+                    sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
+                    distinct_count: Precision::Absent,
+                },
+                ColumnStatistics {
+                    null_count: Precision::Exact(3),
+                    max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
+                    min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
+                    sum_value: 
Precision::Exact(ScalarValue::Int32(Some(1200))),
+                    distinct_count: Precision::Absent,
+                },
+            ],
+        };
+
+        let items = vec![stats1, stats2];
+
+        let summary_stats = Statistics::try_merge_iter(&items, 
&schema).unwrap();
+
+        // Verify the results
+        assert_eq!(summary_stats.num_rows, Precision::Exact(25)); // 10 + 15
+        assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); // 
100 + 150
+
+        // Verify column statistics
+        let col1_stats = &summary_stats.column_statistics[0];
+        assert_eq!(col1_stats.null_count, Precision::Exact(3)); // 1 + 2
+        assert_eq!(
+            col1_stats.max_value,
+            Precision::Exact(ScalarValue::Int32(Some(120)))
+        );
+        assert_eq!(
+            col1_stats.min_value,
+            Precision::Exact(ScalarValue::Int32(Some(-10)))
+        );
+        assert_eq!(
+            col1_stats.sum_value,
+            Precision::Exact(ScalarValue::Int32(Some(1100)))
+        ); // 500 + 600
+
+        let col2_stats = &summary_stats.column_statistics[1];
+        assert_eq!(col2_stats.null_count, Precision::Exact(5)); // 2 + 3
+        assert_eq!(
+            col2_stats.max_value,
+            Precision::Exact(ScalarValue::Int32(Some(200)))
+        );
+        assert_eq!(
+            col2_stats.min_value,
+            Precision::Exact(ScalarValue::Int32(Some(5)))
+        );
+        assert_eq!(
+            col2_stats.sum_value,
+            Precision::Exact(ScalarValue::Int32(Some(2200)))
+        ); // 1000 + 1200
+    }
+
+    #[test]
+    fn test_try_merge_mixed_precision() {
+        // Create a schema with one column
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        // Create items with different precision levels
+        let stats1 = Statistics {
+            num_rows: Precision::Exact(10),
+            total_byte_size: Precision::Inexact(100),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Exact(1),
+                max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
+                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
+                sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
+                distinct_count: Precision::Absent,
+            }],
+        };
+
+        let stats2 = Statistics {
+            num_rows: Precision::Inexact(15),
+            total_byte_size: Precision::Exact(150),
+            column_statistics: vec![ColumnStatistics {
+                null_count: Precision::Inexact(2),
+                max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
+                min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
+                sum_value: Precision::Absent,
+                distinct_count: Precision::Absent,
+            }],
+        };
+
+        let items = vec![stats1, stats2];
+
+        let summary_stats = Statistics::try_merge_iter(&items, 
&schema).unwrap();
+
+        assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
+        assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
+
+        let col_stats = &summary_stats.column_statistics[0];
+        assert_eq!(col_stats.null_count, Precision::Inexact(3));
+        assert_eq!(
+            col_stats.max_value,
+            Precision::Inexact(ScalarValue::Int32(Some(120)))
+        );
+        assert_eq!(
+            col_stats.min_value,
+            Precision::Inexact(ScalarValue::Int32(Some(-10)))
+        );
+        assert!(matches!(col_stats.sum_value, Precision::Absent));
+    }
+
+    #[test]
+    fn test_try_merge_empty() {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "col1",
+            DataType::Int32,
+            false,
+        )]));
+
+        // Empty collection
+        let items: Vec<Statistics> = vec![];
+
+        let summary_stats = Statistics::try_merge_iter(&items, 
&schema).unwrap();
+
+        // Verify default values for empty collection
+        assert_eq!(summary_stats.num_rows, Precision::Absent);
+        assert_eq!(summary_stats.total_byte_size, Precision::Absent);
+        assert_eq!(summary_stats.column_statistics.len(), 1);
+        assert_eq!(
+            summary_stats.column_statistics[0].null_count,
+            Precision::Absent
+        );
+    }
+
+    #[test]
+    fn test_try_merge_mismatched_size() {

Review Comment:
   mismatched size is a new test



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to