HeartSaVioR commented on code in PR #40561: URL: https://github.com/apache/spark/pull/40561#discussion_r2096770151
########## sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/statefulOperators.scala: ########## @@ -980,3 +1023,67 @@ object StreamingDeduplicateExec { private val EMPTY_ROW = UnsafeProjection.create(Array[DataType](NullType)).apply(InternalRow.apply(null)) } + +case class StreamingDeduplicateWithinWatermarkExec( + keyExpressions: Seq[Attribute], + child: SparkPlan, + stateInfo: Option[StatefulOperatorStateInfo] = None, + eventTimeWatermarkForLateEvents: Option[Long] = None, + eventTimeWatermarkForEviction: Option[Long] = None) + extends BaseStreamingDeduplicateExec { + + protected val schemaForValueRow: StructType = StructType( + Array(StructField("expiresAtMicros", LongType, nullable = false))) + + protected val extraOptionOnStateStore: Map[String, String] = Map.empty + + private val eventTimeCol: Attribute = WatermarkSupport.findEventTimeColumn(child.output, + allowMultipleEventTimeColumns = false).get + private val delayThresholdMs = eventTimeCol.metadata.getLong(EventTimeWatermark.delayKey) + private val eventTimeColOrdinal: Int = child.output.indexOf(eventTimeCol) + + protected def initializeReusedDupInfoRow(): Option[UnsafeRow] = { + val timeoutToUnsafeRow = UnsafeProjection.create(schemaForValueRow) + val timeoutRow = timeoutToUnsafeRow(new SpecificInternalRow(schemaForValueRow)) + Some(timeoutRow) + } + + protected def putDupInfoIntoState( + store: StateStore, + data: UnsafeRow, + key: UnsafeRow, + reusedDupInfoRow: Option[UnsafeRow]): Unit = { + assert(reusedDupInfoRow.isDefined, "This should have reused row.") + val timeoutRow = reusedDupInfoRow.get + + // We expect data type of event time column to be TimestampType or TimestampNTZType which both + // are internally represented as Long. + val timestamp = data.getLong(eventTimeColOrdinal) + // The unit of timestamp in Spark is microseconds, convert the delay threshold to micros. 
+ val expiresAt = timestamp + DateTimeUtils.millisToMicros(delayThresholdMs) Review Comment: My major concern about the semantics here is that I don't think there is a great way to respect them very strictly. I'd rather go back to the rationale - why does the need for this operator come up? We want to deduplicate "as many as possible", but we also want to avoid keeping state indefinitely. Given this, I think TTL is the right way to go - it's just that I think updating expiresAt each time a duplicate event is processed would be much easier to understand. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org