nuno-faria commented on code in PR #19054:
URL: https://github.com/apache/datafusion/pull/19054#discussion_r2582689236
##########
datafusion-cli/src/main.rs:
##########
@@ -631,4 +647,90 @@ mod tests {
Ok(())
}
+
+ #[tokio::test]
+ async fn test_statistics_cache() -> Result<(), DataFusionError> {
+ let file_statistics_cache =
Arc::new(DefaultFileStatisticsCache::default());
+ let cache_config = CacheManagerConfig::default()
+ .with_files_statistics_cache(Some(file_statistics_cache.clone()));
+ let runtime = RuntimeEnvBuilder::new()
+ .with_cache_manager(cache_config)
+ .build()?;
+ let config = SessionConfig::new().with_collect_statistics(true);
+ let ctx = SessionContext::new_with_config_rt(config,
Arc::new(runtime));
+
+ ctx.register_udtf(
+ "statistics_cache",
+ Arc::new(StatisticsCacheFunc::new(
+ ctx.task_ctx().runtime_env().cache_manager.clone(),
+ )),
+ );
+
+ ctx.sql(
+ "
+ create external table alltypes_plain
+ stored as parquet
+ location '../parquet-testing/data/alltypes_plain.parquet'",
+ )
+ .await?
+ .collect()
+ .await?;
+
+ ctx.sql(
+ "
+ create external table alltypes_tiny_pages
+ stored as parquet
+ location '../parquet-testing/data/alltypes_tiny_pages.parquet'",
+ )
+ .await?
+ .collect()
+ .await?;
+
+ ctx.sql(
+ "
+ create external table lz4_raw_compressed_larger
+ stored as parquet
+ location
'../parquet-testing/data/lz4_raw_compressed_larger.parquet'",
+ )
+ .await?
+ .collect()
+ .await?;
+
+ let sql = "SELECT split_part(path, '/', -1) as filename,
file_size_bytes, num_rows, num_columns, table_size_bytes from
statistics_cache() order by filename";
+ let df = ctx.sql(sql).await?;
+ let rbs = df.collect().await?;
+ assert_snapshot!(batches_to_string(&rbs),@r"
+ ++
+ ++
+ ");
Review Comment:
This confirms that the file statistics cache is not populated when the table
is created; it is only populated after the table has been accessed once.
##########
datafusion/execution/src/cache/cache_unit.rs:
##########
@@ -42,6 +43,29 @@ pub struct DefaultFileStatisticsCache {
statistics: DashMap<Path, (ObjectMeta, Arc<Statistics>)>,
}
+impl FileStatisticsCache for DefaultFileStatisticsCache {
+ fn list_entries(&self) -> HashMap<Path, FileStatisticsCacheEntry> {
+ let mut entries = HashMap::<Path, FileStatisticsCacheEntry>::new();
+
+ for entry in &self.statistics {
+ let path = entry.key();
+ let (object_meta, stats) = entry.value();
+ entries.insert(
+ path.clone(),
+ FileStatisticsCacheEntry {
+ object_meta: object_meta.clone(),
+ num_rows: stats.num_rows,
+ num_columns: stats.column_statistics.len(),
+ table_size_bytes: stats.total_byte_size,
+ statistics_size_bytes: 0, // TODO: set to the real size in
the future
Review Comment:
`statistics_size_bytes` is hardcoded to 0 for now; in the future we need to set it to the real size.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]