This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 0808f3a8d2 Improvements to `list_files_cache` table function (#19703)
0808f3a8d2 is described below
commit 0808f3a8d2646c9435557db059759653c3f2c383
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jan 14 16:55:10 2026 -0500
Improvements to `list_files_cache` table function (#19703)
## Which issue does this PR close?
- Follow on to https://github.com/apache/datafusion/pull/19616
## Rationale for this change
I had a few minor comments / suggestions while reviewing
https://github.com/apache/datafusion/pull/19616 from @jizezhang, but they
weren't needed for the initial merge, so I would like to propose them
in a follow-up PR.
## What changes are included in this PR?
1. Improve documentation
2. Improve handling of `table_ref` in `ListingTableUrl`
3. Use NULL rather than the string `"NULL"` in the `list_files_cache` table function
I can break this into separate PRs if that would help
## Are these changes tested?
Yes, by CI
## Are there any user-facing changes?
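The `table` column returned by the `list_files_cache` function is now nullable: entries with no table reference return NULL instead of the string `"NULL"`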
---
datafusion-cli/src/functions.rs | 13 ++++++++-----
datafusion/core/src/datasource/listing_table_factory.rs | 4 +---
datafusion/datasource/src/url.rs | 12 +++++++-----
datafusion/execution/src/cache/cache_manager.rs | 1 +
datafusion/execution/src/cache/list_files_cache.rs | 5 +++++
5 files changed, 22 insertions(+), 13 deletions(-)
diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs
index 6a97c5355f..e50339d296 100644
--- a/datafusion-cli/src/functions.rs
+++ b/datafusion-cli/src/functions.rs
@@ -703,10 +703,13 @@ impl TableFunctionImpl for StatisticsCacheFunc {
}
}
-// Implementation of the `list_files_cache` table function in datafusion-cli.
+/// Implementation of the `list_files_cache` table function in datafusion-cli.
+///
+/// This function returns the cached results of running a LIST command on a
+/// particular object store path for a table. The object metadata is returned
as
+/// a List of Structs, with one Struct for each object. DataFusion uses these
+/// cached results to plan queries against external tables.
///
-/// This function returns the cached results of running a LIST command on a
particular object store path for a table. The object metadata is returned as a
List of Structs, with one Struct for each object.
-/// DataFusion uses these cached results to plan queries against external
tables.
/// # Schema
/// ```sql
/// > describe select * from list_files_cache();
@@ -788,7 +791,7 @@ impl TableFunctionImpl for ListFilesCacheFunc {
Field::new("metadata", DataType::Struct(nested_fields.clone()),
true);
let schema = Arc::new(Schema::new(vec![
- Field::new("table", DataType::Utf8, false),
+ Field::new("table", DataType::Utf8, true),
Field::new("path", DataType::Utf8, false),
Field::new("metadata_size_bytes", DataType::UInt64, false),
// expires field in ListFilesEntry has type Instant when set, from
which we cannot get "the number of seconds", hence using Duration instead of
Timestamp as data type.
@@ -821,7 +824,7 @@ impl TableFunctionImpl for ListFilesCacheFunc {
let mut current_offset: i32 = 0;
for (path, entry) in list_files_cache.list_entries() {
- table_arr.push(path.table.map_or("NULL".to_string(), |t|
t.to_string()));
+ table_arr.push(path.table.map(|t| t.to_string()));
path_arr.push(path.path.to_string());
metadata_size_bytes_arr.push(entry.size_bytes as u64);
// calculates time left before entry expires
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs
b/datafusion/core/src/datasource/listing_table_factory.rs
index 86af691fd7..98f61a8528 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -161,9 +161,7 @@ impl TableProviderFactory for ListingTableFactory {
}
None => format!("*.{}", cmd.file_type.to_lowercase()),
};
- table_path = table_path
- .with_glob(glob.as_ref())?
- .with_table_ref(cmd.name.clone());
+ table_path = table_path.with_glob(glob.as_ref())?;
}
let schema = options.infer_schema(session_state,
&table_path).await?;
let df_schema = Arc::clone(&schema).to_dfschema()?;
diff --git a/datafusion/datasource/src/url.rs b/datafusion/datasource/src/url.rs
index 678bd280fc..0c274806c0 100644
--- a/datafusion/datasource/src/url.rs
+++ b/datafusion/datasource/src/url.rs
@@ -43,7 +43,7 @@ pub struct ListingTableUrl {
prefix: Path,
/// An optional glob expression used to filter files
glob: Option<Pattern>,
-
+ /// Optional table reference for the table this url belongs to
table_ref: Option<TableReference>,
}
@@ -341,17 +341,19 @@ impl ListingTableUrl {
}
/// Returns a copy of current [`ListingTableUrl`] with a specified `glob`
- pub fn with_glob(self, glob: &str) -> Result<Self> {
- let glob =
- Pattern::new(glob).map_err(|e|
DataFusionError::External(Box::new(e)))?;
- Self::try_new(self.url, Some(glob))
+ pub fn with_glob(mut self, glob: &str) -> Result<Self> {
+ self.glob =
+ Some(Pattern::new(glob).map_err(|e|
DataFusionError::External(Box::new(e)))?);
+ Ok(self)
}
+ /// Set the table reference for this [`ListingTableUrl`]
pub fn with_table_ref(mut self, table_ref: TableReference) -> Self {
self.table_ref = Some(table_ref);
self
}
+ /// Return the table reference for this [`ListingTableUrl`]
pub fn get_table_ref(&self) -> &Option<TableReference> {
&self.table_ref
}
diff --git a/datafusion/execution/src/cache/cache_manager.rs
b/datafusion/execution/src/cache/cache_manager.rs
index 4cc5586440..bd34c441bd 100644
--- a/datafusion/execution/src/cache/cache_manager.rs
+++ b/datafusion/execution/src/cache/cache_manager.rs
@@ -196,6 +196,7 @@ pub trait ListFilesCache: CacheAccessor<TableScopedPath,
CachedFileList> {
/// Retrieves the information about the entries currently cached.
fn list_entries(&self) -> HashMap<TableScopedPath, ListFilesEntry>;
+ /// Drop all entries for the given table reference.
fn drop_table_entries(&self, table_ref: &Option<TableReference>) ->
Result<()>;
}
diff --git a/datafusion/execution/src/cache/list_files_cache.rs
b/datafusion/execution/src/cache/list_files_cache.rs
index c86a03574e..b1b8e6b500 100644
--- a/datafusion/execution/src/cache/list_files_cache.rs
+++ b/datafusion/execution/src/cache/list_files_cache.rs
@@ -139,6 +139,11 @@ pub const DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT: usize =
1024 * 1024; // 1MiB
/// The default cache TTL for the [`DefaultListFilesCache`]
pub const DEFAULT_LIST_FILES_CACHE_TTL: Option<Duration> = None; // Infinite
+/// Key for [`DefaultListFilesCache`]
+///
+/// Each entry is scoped to its use within a specific table so that the cache
+/// can differentiate between identical paths in different tables, and
+/// table-level cache invalidation.
#[derive(PartialEq, Eq, Hash, Clone, Debug)]
pub struct TableScopedPath {
pub table: Option<TableReference>,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]