This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 39f1f395b172 fix(common): Filter stray files when loading partitions
in AbstractTableFileSystemView (#18047)
39f1f395b172 is described below
commit 39f1f395b172378093cf1bae67d6515c57dfac35
Author: Prashant Wason <[email protected]>
AuthorDate: Wed Mar 11 10:05:48 2026 -0700
fix(common): Filter stray files when loading partitions in
AbstractTableFileSystemView (#18047)
When loading files in AbstractTableFileSystemView.listPartition(), all
files in a partition are loaded without validating that they are valid HUDI
data or log files. This can cause validation exceptions later in the code when
stray files (temporary files, hidden files, or files with corrupted names) are
processed, since HUDI requires file names to have specific formats.
Summary and Changelog
Added filtering to ensure only valid HUDI data files (base files and log
files) are loaded when listing partitions.
Changes:
- Added filterValidDataFiles() method that uses FSUtils.isDataFile() to
  validate files
- Updated getAllFilesInPartition() to filter files after loading from metadata
- Updated ensurePartitionsLoadedCorrectly() to filter files returned by
  listPartitions() before adding them to the view
Co-Authored-By: Claude Opus 4.6 <[email protected]>
---------
Co-authored-by: Claude Opus 4.5 <[email protected]>
---
.../metadata/FileSystemBackedTableMetadata.java | 11 +----
.../TestFileSystemBackedTableMetadata.java | 48 ++++++++++++++++++++++
2 files changed, 49 insertions(+), 10 deletions(-)
diff --git
a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
index 4f6b0e439efa..a5c51b594641 100644
---
a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
+++
b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
@@ -351,16 +351,7 @@ public class FileSystemBackedTableMetadata extends
AbstractHoodieTableMetadata {
for (Pair<String, StoragePath> partitionPair : partitionPathList) {
StoragePath absolutePartitionPath = partitionPair.getRight();
- try {
- pathInfoMap.put(partitionPair,
getStorage().listDirectEntries(absolutePartitionPath));
- } catch (IOException e) {
- if (!getStorage().exists(absolutePartitionPath)) {
- pathInfoMap.put(partitionPair, Collections.emptyList());
- } else {
- // in case the partition path was created by another caller
- pathInfoMap.put(partitionPair,
getStorage().listDirectEntries(absolutePartitionPath));
- }
- }
+ pathInfoMap.put(partitionPair,
getAllFilesInPartition(absolutePartitionPath));
}
return pathInfoMap;
}
diff --git
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
index a9305123291f..48a2851c0924 100644
---
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
+++
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
@@ -21,6 +21,7 @@ package org.apache.hudi.metadata;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.testutils.HoodieTestTable;
+import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.StoragePathInfo;
@@ -30,6 +31,7 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
+import java.io.OutputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@@ -178,6 +180,52 @@ public class TestFileSystemBackedTableMetadata extends
HoodieCommonTestHarness {
}
}
+ /**
+ * Test that non-conformant files (stray files that are not valid HUDI data
or log files)
+ * are filtered out when listing partition files.
+ */
+ @Test
+ public void testStrayFilesAreFilteredOut() throws Exception {
+ String partition = "2024/01/01";
+ hoodieTestTable = hoodieTestTable.addCommit("100")
+ .withPartitionMetaFiles(partition)
+ .withBaseFilesInPartition(partition, IntStream.range(0, 5).toArray());
+
+ // Create stray files that should be filtered out
+ StoragePath partitionPath = new StoragePath(basePath + "/" + partition);
+ String[] strayFileNames = {
+ ".tmp_copy_file", // hidden temp file
+ "_temporary_data", // underscore-prefixed temp file
+ "random_file.txt", // non-hudi file
+ "corrupted_name", // file with no valid extension
+ ".crc" // checksum file
+ };
+ for (String strayFile : strayFileNames) {
+ StoragePath strayPath = new StoragePath(partitionPath, strayFile);
+ try (OutputStream out = metaClient.getStorage().create(strayPath)) {
+ out.write("test".getBytes());
+ }
+ }
+
+ HoodieLocalEngineContext localEngineContext =
+ new HoodieLocalEngineContext(metaClient.getStorageConf());
+ FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
+ new FileSystemBackedTableMetadata(localEngineContext,
metaClient.getTableConfig(),
+ metaClient.getStorage(), basePath);
+
+ // getAllFilesInPartition should only return the 5 valid base files
+ List<StoragePathInfo> files =
fileSystemBackedTableMetadata.getAllFilesInPartition(partitionPath);
+ Assertions.assertEquals(5, files.size(), "Stray files should be filtered
out by getAllFilesInPartition");
+
+ // listPartitions should also only return the 5 valid base files
+ List<Pair<String, StoragePath>> partitionPathList =
Collections.singletonList(
+ Pair.of(partition, partitionPath));
+ Map<Pair<String, StoragePath>, List<StoragePathInfo>> partitionFilesMap =
+ fileSystemBackedTableMetadata.listPartitions(partitionPathList);
+ Assertions.assertEquals(5,
partitionFilesMap.get(partitionPathList.get(0)).size(),
+ "Stray files should be filtered out by listPartitions");
+ }
+
@Test
public void testMultiLevelEmptyPartitionTable() throws Exception {
String instant = "100";