yihua commented on code in PR #13229:
URL: https://github.com/apache/hudi/pull/13229#discussion_r2078724155


##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java:
##########
@@ -514,23 +533,65 @@ public boolean purgePendingClustering(String 
clusteringInstant) {
     return false;
   }
 
+  protected abstract HoodieWriteMetadata<O> 
convertToOutputMetadata(HoodieWriteMetadata<T> writeMetadata);
+
+  private Map<String, List<String>> 
getPartitionToReplacedFileIds(HoodieClusteringPlan clusteringPlan, 
HoodieWriteMetadata<?> writeMetadata) {
+    Set<HoodieFileGroupId> newFilesWritten = 
writeMetadata.getWriteStats().get().stream()
+        .map(s -> new HoodieFileGroupId(s.getPartitionPath(), 
s.getFileId())).collect(Collectors.toSet());
+
+    return ClusteringUtils.getFileGroupsFromClusteringPlan(clusteringPlan)
+        .filter(fg -> 
"org.apache.hudi.client.clustering.run.strategy.SparkSingleFileSortExecutionStrategy"
+            .equals(config.getClusteringExecutionStrategyClass())
+            || !newFilesWritten.contains(fg))
+        .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, 
Collectors.mapping(HoodieFileGroupId::getFileId, Collectors.toList())));
+  }
+
   /**
-   * Delete expired partition by config.
-   *
-   * @param instantTime Instant Time for the action
-   * @return HoodieWriteMetadata
+   * Check if any validators are configured and run those validations. If any 
of the validations fail, throws HoodieValidationException.
    */
-  public HoodieWriteMetadata<T> managePartitionTTL(String instantTime) {
-    HoodieTable<?, I, ?, T> table = createTable(config, 
context.getStorageConf());
-    return table.managePartitionTTL(context, instantTime);
-  }
+  protected void runPrecommitValidationForClustering(HoodieWriteMetadata<O> 
writeMetadata, HoodieTable table, String instantTime) {
+    if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
+      return;
+    }
+    throw new HoodieIOException("Precommit validation not implemented for all 
engines yet");
+  }
+
+  private void commitClustering(HoodieWriteMetadata<O> clusteringWriteMetadata,
+                           HoodieTable table,
+                           String clusteringCommitTime) {
+    // triggering the dag for the first time for clustering
+    List<HoodieWriteStat> writeStats = 
triggerWritesAndFetchWriteStats(clusteringWriteMetadata);
+    clusteringWriteMetadata.setWriteStats(writeStats);
+    // Fetch Replace commit metadata and update HoodieWriteStats annd 
Partition to Replace FileIds
+    HoodieReplaceCommitMetadata replaceCommitMetadata = 
(HoodieReplaceCommitMetadata) clusteringWriteMetadata.getCommitMetadata().get();
+    for (HoodieWriteStat writeStat: writeStats) {
+      replaceCommitMetadata.addWriteStat(writeStat.getPartitionPath(), 
writeStat);
+    }
+    HoodieClusteringPlan clusteringPlan = 
ClusteringUtils.getPendingClusteringPlan(table.getMetaClient(), 
clusteringCommitTime);
+    Map<String, List<String>> partitionToReplaceFileIds = 
getPartitionToReplacedFileIds(clusteringPlan, clusteringWriteMetadata);
+    
clusteringWriteMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
+    
replaceCommitMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
 
-  protected abstract HoodieWriteMetadata<O> 
convertToOutputMetadata(HoodieWriteMetadata<T> writeMetadata);
+    
clusteringWriteMetadata.setCommitMetadata(Option.of(replaceCommitMetadata));
+    runPrecommitValidationForClustering(clusteringWriteMetadata, table, 
clusteringCommitTime);

Review Comment:
   Could you help me understand where this clustering logic lived before this change?



##########
hudi-client/hudi-client-common/src/main/java/org/apache/hudi/client/BaseHoodieTableServiceClient.java:
##########
@@ -318,25 +320,33 @@ protected HoodieWriteMetadata<O> compact(HoodieTable<?, 
I, ?, T> table, String c
     }
     compactionTimer = metrics.getCompactionCtx();
     HoodieWriteMetadata<T> writeMetadata = table.compact(context, 
compactionInstantTime);
-    HoodieWriteMetadata<O> compactionMetadata = 
convertToOutputMetadata(writeMetadata);
-    if (shouldComplete && compactionMetadata.getCommitMetadata().isPresent()) {
-      completeCompaction(compactionMetadata.getCommitMetadata().get(), table, 
compactionInstantTime);
+    HoodieWriteMetadata<O> compactionWriteMetadata = 
convertToOutputMetadata(writeMetadata);
+    if (shouldComplete) {
+      commitCompaction(compactionInstantTime, compactionWriteMetadata, 
Option.of(table));
     }
-    return compactionMetadata;
+    return compactionWriteMetadata;
   }
 
-  /**
-   * Commit a compaction operation. Allow passing additional meta-data to be 
stored in commit instant file.
-   *
-   * @param compactionInstantTime Compaction Instant Time
-   * @param metadata              All the metadata that gets stored along with 
a commit
-   * @param extraMetadata         Extra Metadata to be stored
-   */
-  public void commitCompaction(String compactionInstantTime, 
HoodieCommitMetadata metadata, Option<Map<String, String>> extraMetadata) {
-    extraMetadata.ifPresent(m -> m.forEach(metadata::addMetadata));
-    completeCompaction(metadata, createTable(config, 
context.getStorageConf()), compactionInstantTime);
+  public void commitCompaction(String compactionInstantTime, 
HoodieWriteMetadata<O> compactionWriteMetadata, Option<HoodieTable> tableOpt) {
+    // dereferencing the write dag for compaction for the first time.
+    List<HoodieWriteStat> writeStats = 
triggerWritesAndFetchWriteStats(compactionWriteMetadata);
+    // Fetch commit metadata from HoodieWriteMetadata and update 
HoodieWriteStat
+    HoodieCommitMetadata commitMetadata = 
compactionWriteMetadata.getCommitMetadata().get();
+    commitMetadata.setCompacted(true);
+    for (HoodieWriteStat stat : writeStats) {
+      commitMetadata.addWriteStat(stat.getPartitionPath(), stat);
+    }

Review Comment:
   To clarify, is this newly added logic, or existing logic migrated from
somewhere else?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to