xudong963 commented on code in PR #17381: URL: https://github.com/apache/datafusion/pull/17381#discussion_r2324235835
########## datafusion/common/src/stats.rs: ########## @@ -391,62 +391,85 @@ impl Statistics { /// parameter to compute global statistics in a multi-partition setting. pub fn with_fetch( mut self, - schema: SchemaRef, + _schema: SchemaRef, Review Comment: Are you keeping the unused param to avoid API change? ########## datafusion/common/src/stats.rs: ########## @@ -391,62 +391,85 @@ impl Statistics { /// parameter to compute global statistics in a multi-partition setting. pub fn with_fetch( mut self, - schema: SchemaRef, + _schema: SchemaRef, fetch: Option<usize>, skip: usize, n_partitions: usize, ) -> Result<Self> { let fetch_val = fetch.unwrap_or(usize::MAX); - self.num_rows = match self { - Statistics { - num_rows: Precision::Exact(nr), - .. + // Get the ratio of rows after / rows before on a per-partition basis + let num_rows_before = self.num_rows; + + // Check for early return case before mutating self + if let Precision::Exact(nr) | Precision::Inexact(nr) = self.num_rows { + if nr <= fetch_val && skip == 0 { + // If the input does not reach the `fetch` globally, and `skip` + // is zero (meaning the input and output are identical), return + // input stats as is. + // TODO: Can input stats still be used, but adjusted, when `skip` + // is non-zero? + return Ok(self); } - | Statistics { - num_rows: Precision::Inexact(nr), - .. - } => { + } + + self.num_rows = match self.num_rows { + Precision::Exact(nr) | Precision::Inexact(nr) => { // Here, the inexact case gives us an upper bound on the number of rows. if nr <= skip { // All input data will be skipped: Precision::Exact(0) - } else if nr <= fetch_val && skip == 0 { - // If the input does not reach the `fetch` globally, and `skip` - // is zero (meaning the input and output are identical), return - // input stats as is. - // TODO: Can input stats still be used, but adjusted, when `skip` - // is non-zero? - return Ok(self); } else if nr - skip <= fetch_val { // After `skip` input rows are skipped, the remaining rows are // less than or equal to the `fetch` values, so `num_rows` must // equal the remaining rows. check_num_rows( (nr - skip).checked_mul(n_partitions), // We know that we have an estimate for the number of rows: Review Comment: Do you think the purpose of the comment was to say that the `unwrap()` is safe? If so, I think we can remove the comment. ########## datafusion/common/src/stats.rs: ########## @@ -391,62 +391,85 @@ impl Statistics { /// parameter to compute global statistics in a multi-partition setting. pub fn with_fetch( mut self, - schema: SchemaRef, + _schema: SchemaRef, fetch: Option<usize>, skip: usize, n_partitions: usize, ) -> Result<Self> { let fetch_val = fetch.unwrap_or(usize::MAX); - self.num_rows = match self { - Statistics { - num_rows: Precision::Exact(nr), - .. + // Get the ratio of rows after / rows before on a per-partition basis + let num_rows_before = self.num_rows; + + // Check for early return case before mutating self + if let Precision::Exact(nr) | Precision::Inexact(nr) = self.num_rows { + if nr <= fetch_val && skip == 0 { + // If the input does not reach the `fetch` globally, and `skip` + // is zero (meaning the input and output are identical), return + // input stats as is. + // TODO: Can input stats still be used, but adjusted, when `skip` + // is non-zero? + return Ok(self); } - | Statistics { - num_rows: Precision::Inexact(nr), - .. - } => { + } + + self.num_rows = match self.num_rows { + Precision::Exact(nr) | Precision::Inexact(nr) => { // Here, the inexact case gives us an upper bound on the number of rows. if nr <= skip { // All input data will be skipped: Precision::Exact(0) - } else if nr <= fetch_val && skip == 0 { - // If the input does not reach the `fetch` globally, and `skip` - // is zero (meaning the input and output are identical), return - // input stats as is. - // TODO: Can input stats still be used, but adjusted, when `skip` - // is non-zero? - return Ok(self); } else if nr - skip <= fetch_val { // After `skip` input rows are skipped, the remaining rows are // less than or equal to the `fetch` values, so `num_rows` must // equal the remaining rows. check_num_rows( (nr - skip).checked_mul(n_partitions), // We know that we have an estimate for the number of rows: - self.num_rows.is_exact().unwrap(), + matches!(num_rows_before, Precision::Exact(_)), ) } else { // At this point we know that we were given a `fetch` value // as the `None` case would go into the branch above. Since // the input has more rows than `fetch + skip`, the number - // of rows will be the `fetch`, but we won't be able to - // predict the other statistics. + // of rows will be the `fetch`, other statistics will have to be downgraded to inexact. check_num_rows( fetch_val.checked_mul(n_partitions), // We know that we have an estimate for the number of rows: - self.num_rows.is_exact().unwrap(), + matches!(num_rows_before, Precision::Exact(_)), ) } } - Statistics { - num_rows: Precision::Absent, - .. - } => check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false), + Precision::Absent => { + check_num_rows(fetch.and_then(|v| v.checked_mul(n_partitions)), false) + } + }; + let ratio: f64 = match (num_rows_before, self.num_rows) { + ( + Precision::Exact(nr_before) | Precision::Inexact(nr_before), + Precision::Exact(nr_after) | Precision::Inexact(nr_after), + ) => { + if nr_before == 0 { + 0.0 + } else { + nr_after as f64 / nr_before as f64 + } + } + _ => 0.0, + }; + self.column_statistics = self + .column_statistics + .into_iter() + .map(ColumnStatistics::to_inexact) + .collect(); Review Comment: The `limit` can make the column_statistics super different from the original, so here I'm wondering if it's better to give it an unknown. I'm worried we'll use inexact column_statistics to do some estimation, but in fact, the column_statistics is very deviant -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org