alamb commented on code in PR #19094:
URL: https://github.com/apache/datafusion/pull/19094#discussion_r2590268232
##########
datafusion/datasource/src/file_scan_config.rs:
##########
@@ -799,27 +799,16 @@ impl FileScanConfig {
}
}
- fn projected_stats(&self) -> Statistics {
+ fn projected_stats(&self) -> Result<Statistics> {
let statistics = self.statistics();
-
- let table_cols_stats = self
- .projection_indices()
- .into_iter()
- .map(|idx| {
- if idx < self.file_schema().fields().len() {
- statistics.column_statistics[idx].clone()
- } else {
- // TODO provide accurate stat for partition column (#1186)
- ColumnStatistics::new_unknown()
- }
- })
- .collect();
-
- Statistics {
- num_rows: statistics.num_rows,
- // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
- total_byte_size: statistics.total_byte_size,
- column_statistics: table_cols_stats,
+ let projection = self.file_source.projection();
Review Comment:
👌
##########
datafusion/core/tests/physical_optimizer/partition_statistics.rs:
##########
@@ -418,7 +417,7 @@ mod test {
let expected_stats = Statistics {
num_rows: Precision::Inexact(4),
- total_byte_size: Precision::Inexact(220),
Review Comment:
I agree that using "bytes of arrow data" sounds better than "bytes of
parquet encoded data".
I would have thought the difference is due to your fixing this
(BTW I think you could mark this PR as fixing that issue as well):
```
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
total_byte_size: statistics.total_byte_size,
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]