umehrot2 commented on a change in pull request #1768:
URL: https://github.com/apache/hudi/pull/1768#discussion_r447264716
##########
File path: hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java
##########
@@ -199,16 +201,40 @@ public static String getRelativePartitionPath(Path
basePath, Path partitionPath)
return partitions;
}
- public static List<String> getAllDataFilesForMarkers(FileSystem fs, String
basePath, String instantTs,
- String markerDir, String baseFileExtension) throws IOException {
- List<String> dataFiles = new LinkedList<>();
- processFiles(fs, markerDir, (status) -> {
- String pathStr = status.getPath().toString();
- if (pathStr.endsWith(HoodieTableMetaClient.MARKER_EXTN)) {
- dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr,
instantTs, baseFileExtension));
+ public static Set<String> getAllDataFilesForMarkers(JavaSparkContext jsc,
FileSystem fs, String basePath,
+ String instantTs, String markerDir, String baseFileExtension, int
parallelism) throws IOException {
+ FileStatus[] topLevelStatuses = fs.listStatus(new Path(markerDir));
+
+ Set<String> dataFiles = new HashSet<>();
+
+ List<String> subDirectories = new ArrayList<>();
+ for (FileStatus topLevelStatus: topLevelStatuses) {
+ if (topLevelStatus.isFile()) {
+ String pathStr = topLevelStatus.getPath().toString();
+ if (pathStr.endsWith(HoodieTableMetaClient.MARKER_EXTN)) {
+ dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr,
instantTs, baseFileExtension));
+ }
+ } else {
+ subDirectories.add(topLevelStatus.getPath().toString());
}
- return true;
- }, false);
+ }
+
+ parallelism = subDirectories.size() < parallelism ? subDirectories.size()
: parallelism;
Review comment:
Will address in the next revision, once I have more feedback as needed
from @n3nash
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]