This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new 39f1f395b172 fix(common): Filter stray files when loading partitions in AbstractTableFileSystemView (#18047)
39f1f395b172 is described below

commit 39f1f395b172378093cf1bae67d6515c57dfac35
Author: Prashant Wason <[email protected]>
AuthorDate: Wed Mar 11 10:05:48 2026 -0700

    fix(common): Filter stray files when loading partitions in AbstractTableFileSystemView (#18047)
    
    When loading files in AbstractTableFileSystemView.listPartition(), all
    files in a partition are loaded without validating that they are valid
    HUDI data or log files. This can cause validation exceptions later in the
    code when stray files (temporary files, hidden files, or files with
    corrupted names) are processed, since HUDI requires file names to have
    specific formats.
    
    Summary and Changelog
    Added filtering to ensure only valid HUDI data files (base files and log
    files) are loaded when listing partitions.
    
    Changes:
    
    - Added filterValidDataFiles() method that uses FSUtils.isDataFile() to
      validate files
    - Updated getAllFilesInPartition() to filter files after loading from
      metadata
    - Updated ensurePartitionsLoadedCorrectly() to filter files returned by
      listPartitions() before adding them to the view
    
    Co-Authored-By: Claude Opus 4.6 <[email protected]>
    
    ---------
    
    Co-authored-by: Claude Opus 4.5 <[email protected]>
---
 .../metadata/FileSystemBackedTableMetadata.java    | 11 +----
 .../TestFileSystemBackedTableMetadata.java         | 48 ++++++++++++++++++++++
 2 files changed, 49 insertions(+), 10 deletions(-)

diff --git 
a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
 
b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
index 4f6b0e439efa..a5c51b594641 100644
--- 
a/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
+++ 
b/hudi-common/src/main/java/org/apache/hudi/metadata/FileSystemBackedTableMetadata.java
@@ -351,16 +351,7 @@ public class FileSystemBackedTableMetadata extends 
AbstractHoodieTableMetadata {
 
     for (Pair<String, StoragePath> partitionPair : partitionPathList) {
       StoragePath absolutePartitionPath = partitionPair.getRight();
-      try {
-        pathInfoMap.put(partitionPair, 
getStorage().listDirectEntries(absolutePartitionPath));
-      } catch (IOException e) {
-        if (!getStorage().exists(absolutePartitionPath)) {
-          pathInfoMap.put(partitionPair, Collections.emptyList());
-        } else {
-          // in case the partition path was created by another caller
-          pathInfoMap.put(partitionPair, 
getStorage().listDirectEntries(absolutePartitionPath));
-        }
-      }
+      pathInfoMap.put(partitionPair, 
getAllFilesInPartition(absolutePartitionPath));
     }
     return pathInfoMap;
   }
diff --git 
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
 
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
index a9305123291f..48a2851c0924 100644
--- 
a/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
+++ 
b/hudi-hadoop-common/src/test/java/org/apache/hudi/metadata/TestFileSystemBackedTableMetadata.java
@@ -21,6 +21,7 @@ package org.apache.hudi.metadata;
 import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
 import org.apache.hudi.common.testutils.HoodieTestTable;
+import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.storage.StoragePath;
 import org.apache.hudi.storage.StoragePathInfo;
 
@@ -30,6 +31,7 @@ import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 
 import java.io.IOException;
+import java.io.OutputStream;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
@@ -178,6 +180,52 @@ public class TestFileSystemBackedTableMetadata extends 
HoodieCommonTestHarness {
     }
   }
 
+  /**
+   * Test that non-conformant files (stray files that are not valid HUDI data 
or log files)
+   * are filtered out when listing partition files.
+   */
+  @Test
+  public void testStrayFilesAreFilteredOut() throws Exception {
+    String partition = "2024/01/01";
+    hoodieTestTable = hoodieTestTable.addCommit("100")
+        .withPartitionMetaFiles(partition)
+        .withBaseFilesInPartition(partition, IntStream.range(0, 5).toArray());
+
+    // Create stray files that should be filtered out
+    StoragePath partitionPath = new StoragePath(basePath + "/" + partition);
+    String[] strayFileNames = {
+        ".tmp_copy_file",           // hidden temp file
+        "_temporary_data",          // underscore-prefixed temp file
+        "random_file.txt",          // non-hudi file
+        "corrupted_name",           // file with no valid extension
+        ".crc"                      // checksum file
+    };
+    for (String strayFile : strayFileNames) {
+      StoragePath strayPath = new StoragePath(partitionPath, strayFile);
+      try (OutputStream out = metaClient.getStorage().create(strayPath)) {
+        out.write("test".getBytes());
+      }
+    }
+
+    HoodieLocalEngineContext localEngineContext =
+        new HoodieLocalEngineContext(metaClient.getStorageConf());
+    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
+        new FileSystemBackedTableMetadata(localEngineContext, 
metaClient.getTableConfig(),
+            metaClient.getStorage(), basePath);
+
+    // getAllFilesInPartition should only return the 5 valid base files
+    List<StoragePathInfo> files = 
fileSystemBackedTableMetadata.getAllFilesInPartition(partitionPath);
+    Assertions.assertEquals(5, files.size(), "Stray files should be filtered 
out by getAllFilesInPartition");
+
+    // listPartitions should also only return the 5 valid base files
+    List<Pair<String, StoragePath>> partitionPathList = 
Collections.singletonList(
+        Pair.of(partition, partitionPath));
+    Map<Pair<String, StoragePath>, List<StoragePathInfo>> partitionFilesMap =
+        fileSystemBackedTableMetadata.listPartitions(partitionPathList);
+    Assertions.assertEquals(5, 
partitionFilesMap.get(partitionPathList.get(0)).size(),
+        "Stray files should be filtered out by listPartitions");
+  }
+
   @Test
   public void testMultiLevelEmptyPartitionTable() throws Exception {
     String instant = "100";

Reply via email to