xushiyan commented on code in PR #6098:
URL: https://github.com/apache/hudi/pull/6098#discussion_r928194615
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala:
##########
@@ -48,12 +50,24 @@ class HoodieStreamingSink(sqlContext: SQLContext,
private val log = LogManager.getLogger(classOf[HoodieStreamingSink])
- private val retryCnt =
options.getOrDefault(DataSourceWriteOptions.STREAMING_RETRY_CNT.key,
- DataSourceWriteOptions.STREAMING_RETRY_CNT.defaultValue).toInt
- private val retryIntervalMs =
options.getOrDefault(DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.key,
- DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.defaultValue).toLong
- private val ignoreFailedBatch =
options.getOrDefault(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.key,
-
DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.defaultValue).toBoolean
+ private val tablePath = new Path(options.getOrElse("path", "Missing 'path'
option"))
+ private var metaClient: Option[HoodieTableMetaClient] = {
+ try {
+
Some(HoodieTableMetaClient.builder().setConf(sqlContext.sparkContext.hadoopConfiguration).setBasePath(tablePath.toString).build())
+ } catch {
+ case _: TableNotFoundException =>
+ log.warn("Ignore TableNotFoundException as it is first microbatch.")
+ Option.empty
+ }
+ }
+ private val retryCnt = options.getOrDefault(STREAMING_RETRY_CNT.key,
+ STREAMING_RETRY_CNT.defaultValue).toInt
+ private val retryIntervalMs =
options.getOrDefault(STREAMING_RETRY_INTERVAL_MS.key,
+ STREAMING_RETRY_INTERVAL_MS.defaultValue).toLong
+ private val ignoreFailedBatch =
options.getOrDefault(STREAMING_IGNORE_FAILED_BATCH.key,
+ STREAMING_IGNORE_FAILED_BATCH.defaultValue).toBoolean
+ // This constant serves as the checkpoint key for streaming sink so that
each microbatch is processed exactly-once.
+ private val SinkCheckpointKey = "_streaming_sink_checkpoint"
Review Comment:
Also, shouldn't `SinkCheckpointKey` be ALL_CAPS, per the convention for constants?
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala:
##########
@@ -247,4 +285,18 @@ class HoodieStreamingSink(sqlContext: SQLContext,
writeClient = Option.empty
}
}
+
+ private def canSkipBatch(batchId: Long): Boolean = {
+ // get the latest checkpoint from the commit metadata to check if the
microbatch has already been prcessed or not
+ val lastCommit =
metaClient.get.getActiveTimeline.getCommitsTimeline.filterCompletedInstants().lastInstant()
+ if (lastCommit.isPresent) {
+ val commitMetadata = HoodieCommitMetadata.fromBytes(
+
metaClient.get.getActiveTimeline.getInstantDetails(lastCommit.get()).get(),
classOf[HoodieCommitMetadata])
+ val lastCheckpoint = commitMetadata.getMetadata(SinkCheckpointKey)
+ if (!StringUtils.isNullOrEmpty(lastCheckpoint)) {
+ latestBatchId = lastCheckpoint.toLong
+ }
+ }
+ latestBatchId >= batchId
Review Comment:
OK for the single-writer case, but in the multi-writer case I'm not sure the current commit metadata, in the form of a single KV pair, would satisfy future design requirements. We may need a more flexible structure like k={a:b}? I just want to avoid having to handle backward compatibility (BWC) later.
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieStreamingSink.scala:
##########
@@ -48,12 +50,24 @@ class HoodieStreamingSink(sqlContext: SQLContext,
private val log = LogManager.getLogger(classOf[HoodieStreamingSink])
- private val retryCnt =
options.getOrDefault(DataSourceWriteOptions.STREAMING_RETRY_CNT.key,
- DataSourceWriteOptions.STREAMING_RETRY_CNT.defaultValue).toInt
- private val retryIntervalMs =
options.getOrDefault(DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.key,
- DataSourceWriteOptions.STREAMING_RETRY_INTERVAL_MS.defaultValue).toLong
- private val ignoreFailedBatch =
options.getOrDefault(DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.key,
-
DataSourceWriteOptions.STREAMING_IGNORE_FAILED_BATCH.defaultValue).toBoolean
+ private val tablePath = new Path(options.getOrElse("path", "Missing 'path'
option"))
+ private var metaClient: Option[HoodieTableMetaClient] = {
+ try {
+
Some(HoodieTableMetaClient.builder().setConf(sqlContext.sparkContext.hadoopConfiguration).setBasePath(tablePath.toString).build())
+ } catch {
+ case _: TableNotFoundException =>
+ log.warn("Ignore TableNotFoundException as it is first microbatch.")
+ Option.empty
+ }
+ }
+ private val retryCnt = options.getOrDefault(STREAMING_RETRY_CNT.key,
+ STREAMING_RETRY_CNT.defaultValue).toInt
+ private val retryIntervalMs =
options.getOrDefault(STREAMING_RETRY_INTERVAL_MS.key,
+ STREAMING_RETRY_INTERVAL_MS.defaultValue).toLong
+ private val ignoreFailedBatch =
options.getOrDefault(STREAMING_IGNORE_FAILED_BATCH.key,
+ STREAMING_IGNORE_FAILED_BATCH.defaultValue).toBoolean
+ // This constant serves as the checkpoint key for streaming sink so that
each microbatch is processed exactly-once.
+ private val SinkCheckpointKey = "_streaming_sink_checkpoint"
Review Comment:
The name `_streaming_sink_checkpoint` may be too generic, since we only mean it for Spark Streaming; it could be read as implying Flink or other streaming writers as well.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]