xudong963 commented on code in PR #17381:
URL: https://github.com/apache/datafusion/pull/17381#discussion_r2326464415


##########
datafusion/common/src/stats.rs:
##########
@@ -391,62 +391,85 @@ impl Statistics {
     /// parameter to compute global statistics in a multi-partition setting.
     pub fn with_fetch(
         mut self,
-        schema: SchemaRef,
+        _schema: SchemaRef,
         fetch: Option<usize>,
         skip: usize,
         n_partitions: usize,
     ) -> Result<Self> {
         let fetch_val = fetch.unwrap_or(usize::MAX);
 
-        self.num_rows = match self {
-            Statistics {
-                num_rows: Precision::Exact(nr),
-                ..
+        // Get the ratio of rows after / rows before on a per-partition basis
+        let num_rows_before = self.num_rows;
+
+        // Check for early return case before mutating self
+        if let Precision::Exact(nr) | Precision::Inexact(nr) = self.num_rows {
+            if nr <= fetch_val && skip == 0 {
+                // If the input does not reach the `fetch` globally, and `skip`
+                // is zero (meaning the input and output are identical), return
+                // input stats as is.
+                // TODO: Can input stats still be used, but adjusted, when 
`skip`
+                //       is non-zero?
+                return Ok(self);
             }
-            | Statistics {
-                num_rows: Precision::Inexact(nr),
-                ..
-            } => {
+        }
+
+        self.num_rows = match self.num_rows {
+            Precision::Exact(nr) | Precision::Inexact(nr) => {
                 // Here, the inexact case gives us an upper bound on the 
number of rows.
                 if nr <= skip {
                     // All input data will be skipped:
                     Precision::Exact(0)
-                } else if nr <= fetch_val && skip == 0 {
-                    // If the input does not reach the `fetch` globally, and 
`skip`
-                    // is zero (meaning the input and output are identical), 
return
-                    // input stats as is.
-                    // TODO: Can input stats still be used, but adjusted, when 
`skip`
-                    //       is non-zero?
-                    return Ok(self);
                 } else if nr - skip <= fetch_val {
                     // After `skip` input rows are skipped, the remaining rows 
are
                     // less than or equal to the `fetch` values, so `num_rows` 
must
                     // equal the remaining rows.
                     check_num_rows(
                         (nr - skip).checked_mul(n_partitions),
                         // We know that we have an estimate for the number of 
rows:
-                        self.num_rows.is_exact().unwrap(),
+                        matches!(num_rows_before, Precision::Exact(_)),
                     )
                 } else {
                     // At this point we know that we were given a `fetch` value
                     // as the `None` case would go into the branch above. Since
                     // the input has more rows than `fetch + skip`, the number
-                    // of rows will be the `fetch`, but we won't be able to
-                    // predict the other statistics.
+                    // of rows will be the `fetch`, other statistics will have 
to be downgraded to inexact.
                     check_num_rows(
                         fetch_val.checked_mul(n_partitions),
                         // We know that we have an estimate for the number of 
rows:
-                        self.num_rows.is_exact().unwrap(),
+                        matches!(num_rows_before, Precision::Exact(_)),
                     )
                 }
             }
-            Statistics {
-                num_rows: Precision::Absent,
-                ..
-            } => check_num_rows(fetch.and_then(|v| 
v.checked_mul(n_partitions)), false),
+            Precision::Absent => {
+                check_num_rows(fetch.and_then(|v| 
v.checked_mul(n_partitions)), false)
+            }
+        };
+        let ratio: f64 = match (num_rows_before, self.num_rows) {
+            (
+                Precision::Exact(nr_before) | Precision::Inexact(nr_before),
+                Precision::Exact(nr_after) | Precision::Inexact(nr_after),
+            ) => {
+                if nr_before == 0 {
+                    0.0
+                } else {
+                    nr_after as f64 / nr_before as f64
+                }
+            }
+            _ => 0.0,
+        };
+        self.column_statistics = self
+            .column_statistics
+            .into_iter()
+            .map(ColumnStatistics::to_inexact)
+            .collect();

Review Comment:
   > do you know of any cases where this would lead to a worse result?
   
   No, just out of a conservative mindset. 
   
   But I don't have a strong bias for this, we can adjust anytime.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to