This is an automated email from the ASF dual-hosted git repository.
xushiyan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new 0f703a7e15 [HUDI-4507] Improve file name extraction logic in metadata
utils (#6250)
0f703a7e15 is described below
commit 0f703a7e15833493037f7f7a07882cd73044ee65
Author: Y Ethan Guo <[email protected]>
AuthorDate: Fri Jul 29 19:25:57 2022 -0700
[HUDI-4507] Improve file name extraction logic in metadata utils (#6250)
---
.../java/org/apache/hudi/common/fs/FSUtils.java | 18 ++++++++++++++++++
.../hudi/metadata/HoodieTableMetadataUtil.java | 21 ++++-----------------
.../java/org/apache/hudi/common/fs/TestFSUtils.java | 12 ++++++++++--
3 files changed, 32 insertions(+), 19 deletions(-)
diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
index cfc143e3d0..d940f3bb45 100644
--- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
+++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
@@ -615,6 +615,24 @@ public class FSUtils {
return StringUtils.isNullOrEmpty(partitionPath) ? basePath : new
Path(basePath, partitionPath);
}
+ /**
+ * Extracts the file name from a file path that is relative to the table base path,
+ * by skipping the partition prefix and the single "/" that follows it. For example:
+ * "/2022/07/29/file1.parquet", "/2022/07/29" -> "file1.parquet"
+ * "2022/07/29/file2.parquet", "2022/07/29" -> "file2.parquet"
+ * "/file3.parquet", "" -> "file3.parquet"
+ * "file4.parquet", "" -> "file4.parquet"
+ *
+ * NOTE: the partition is not matched against the path; only its length is used to
+ * compute where the file name starts. Both arguments must therefore follow the same
+ * leading-"/" convention: "/2022/07/29/file1.parquet" with "2022/07/29" yields
+ * "/file1.parquet", not "file1.parquet".
+ *
+ * @param filePathWithPartition the file path relative to the table base path; must not be
+ * null, since it is dereferenced unconditionally.
+ * @param partition the relative partition path for a partitioned table. For a
+ * non-partitioned table it is null or empty (as decided by
+ * StringUtils.isNullOrEmpty), in which case only one leading "/"
+ * is dropped from the path, if present.
+ * @return the part of {@code filePathWithPartition} that follows the partition prefix.
+ * @throws StringIndexOutOfBoundsException if {@code filePathWithPartition} is shorter than
+ * {@code partition.length() + 1} characters.
+ */
+ public static String getFileName(String filePathWithPartition, String
partition) {
+ int offset = StringUtils.isNullOrEmpty(partition)
+ ? (filePathWithPartition.startsWith("/") ? 1 : 0) : partition.length()
+ 1;
+ return filePathWithPartition.substring(offset);
+ }
+
/**
* Get DFS full partition path (e.g. hdfs://ip-address:8020:/<absolute path>)
*/
diff --git
a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
index d41f09990e..2c5b8db0ed 100644
---
a/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
+++
b/hudi-common/src/main/java/org/apache/hudi/metadata/HoodieTableMetadataUtil.java
@@ -325,16 +325,13 @@ public class HoodieTableMetadataUtil {
return map;
}
- int offset = partition.equals(NON_PARTITIONED_NAME)
- ? (pathWithPartition.startsWith("/") ? 1 : 0)
- : partition.length() + 1;
- String filename = pathWithPartition.substring(offset);
+ String fileName =
FSUtils.getFileName(pathWithPartition, partitionStatName);
// Since write-stats are coming in no particular
order, if the same
// file have previously been appended to w/in the txn,
we simply pick max
// of the sizes as reported after every write, since
file-sizes are
// monotonically increasing (ie file-size never goes
down, unless deleted)
- map.merge(filename, stat.getFileSizeInBytes(),
Math::max);
+ map.merge(fileName, stat.getFileSizeInBytes(),
Math::max);
return map;
},
@@ -410,12 +407,7 @@ public class HoodieTableMetadataUtil {
return Collections.emptyListIterator();
}
- // For partitioned table, "partition" contains the relative partition
path;
- // for non-partitioned table, "partition" is empty
- int offset = StringUtils.isNullOrEmpty(partition)
- ? (pathWithPartition.startsWith("/") ? 1 : 0) : partition.length() +
1;
-
- final String fileName = pathWithPartition.substring(offset);
+ String fileName = FSUtils.getFileName(pathWithPartition, partition);
if (!FSUtils.isBaseFile(new Path(fileName))) {
return Collections.emptyListIterator();
}
@@ -1162,13 +1154,8 @@ public class HoodieTableMetadataUtil {
HoodieTableMetaClient datasetMetaClient,
List<String>
columnsToIndex,
boolean isDeleted)
{
- String partitionName = getPartitionIdentifier(partitionPath);
- // NOTE: We have to chop leading "/" to make sure Hadoop does not treat it
like
- // absolute path
String filePartitionPath = filePath.startsWith("/") ?
filePath.substring(1) : filePath;
- String fileName = partitionName.equals(NON_PARTITIONED_NAME)
- ? filePartitionPath
- : filePartitionPath.substring(partitionName.length() + 1);
+ String fileName = FSUtils.getFileName(filePath, partitionPath);
if (isDeleted) {
// TODO we should delete records instead of stubbing them
diff --git
a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java
b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java
index 7506e659c9..481bb1dd45 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/fs/TestFSUtils.java
@@ -326,9 +326,17 @@ public class TestFSUtils extends HoodieCommonTestHarness {
Files.createFile(partitionPath.resolve(log3));
assertEquals(3, (int) FSUtils.getLatestLogVersion(FSUtils.getFs(basePath,
new Configuration()),
- new Path(partitionPath.toString()), fileId, LOG_EXTENTION,
instantTime).get().getLeft());
+ new Path(partitionPath.toString()), fileId, LOG_EXTENTION,
instantTime).get().getLeft());
assertEquals(4, FSUtils.computeNextLogVersion(FSUtils.getFs(basePath, new
Configuration()),
- new Path(partitionPath.toString()), fileId, LOG_EXTENTION,
instantTime));
+ new Path(partitionPath.toString()), fileId, LOG_EXTENTION,
instantTime));
+ }
+
+ @Test
+ public void testGetFilename() { // partitioned and non-partitioned inputs, each with and without a leading "/"
+ assertEquals("file1.parquet",
FSUtils.getFileName("/2022/07/29/file1.parquet", "/2022/07/29")); // partitioned, leading "/" on both arguments
+ assertEquals("file2.parquet",
FSUtils.getFileName("2022/07/29/file2.parquet", "2022/07/29")); // partitioned, no leading "/" on either argument
+ assertEquals("file3.parquet", FSUtils.getFileName("/file3.parquet", "")); // empty partition, leading "/" is dropped
+ assertEquals("file4.parquet", FSUtils.getFileName("file4.parquet", "")); // empty partition, path returned unchanged
}
private void prepareTestDirectory(FileSystem fileSystem, String rootDir)
throws IOException {