HeartSaVioR commented on code in PR #49121:
URL: https://github.com/apache/spark/pull/49121#discussion_r1881386268
##########
sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingDeduplicationSuite.scala:
##########
@@ -574,6 +574,24 @@ class StreamingDeduplicationSuite extends
StateStoreMetricsTest {
matchPVals = true
)
}
+
+ test("test that avro encoding is not supported") {
+ val inputData = MemoryStream[String]
+ val result = inputData.toDS().dropDuplicates()
+
+ val ex = intercept[Exception] {
+ withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro")
{
+ testStream(result, Append)(
+ AddData(inputData, "a"),
+ CheckLastBatch("a"),
Review Comment:
ditto, no need to have the full code here
##########
sql/core/src/test/scala/org/apache/spark/sql/streaming/TransformWithStateChainingSuite.scala:
##########
@@ -269,14 +279,28 @@ class TransformWithStateChainingSuite extends StreamTest {
OutputMode.Append())
.dropDuplicatesWithinWatermark()
- testStream(result, OutputMode.Append())(
- AddData(inputData, InputEventRow("k1", timestamp("2024-02-01
00:00:00"), "e1"),
- InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")),
- CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"), 2)),
- Execute("assertWatermarkEquals") { q =>
- assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00"))
+ if (!isAvroEnabled) {
+ testStream(result, OutputMode.Append())(
+ AddData(inputData, InputEventRow("k1", timestamp("2024-02-01
00:00:00"), "e1"),
+ InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")),
+ CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"), 2)),
+ Execute("assertWatermarkEquals") { q =>
+ assertWatermarkEquals(q, timestamp("2024-01-31 23:59:00"))
+ }
+ )
+ } else {
+ val ex = intercept[Exception] {
+ testStream(result, OutputMode.Append())(
+ AddData(inputData, InputEventRow("k1", timestamp("2024-02-01
00:00:00"), "e1"),
+ InputEventRow("k1", timestamp("2024-02-01 00:00:00"), "e1")),
+ CheckNewAnswer(OutputRow("k1", timestamp("2024-02-01 00:00:00"),
2)),
Review Comment:
nit: You can just call ProcessAllAvailable() and expect it to fail. There is
no need to have validation code here.
##########
sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala:
##########
@@ -874,6 +874,34 @@ class StreamingAggregationSuite extends
StateStoreMetricsTest with Assertions {
)
}
+ testWithAllStateVersions("test that avro encoding is not supported") {
+ val inputData = MemoryStream[Int]
+
+ val aggregated =
+ inputData.toDF()
+ .groupBy($"value")
+ .agg(count("*"))
+ .as[(Int, Long)]
+
+ val ex = intercept[Exception] {
+ withSQLConf(SQLConf.STREAMING_STATE_STORE_ENCODING_FORMAT.key -> "avro")
{
+ testStream(aggregated, Update)(
+ AddData(inputData, 3),
+ CheckLastBatch((3, 1)),
Review Comment:
nit: ditto, no need to have the full code here
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala:
##########
@@ -140,6 +142,37 @@ object UnsupportedOperationChecker extends Logging {
}
}
+ private def checkAvroSupportForStatefulOperator(p: LogicalPlan): Boolean = p
match {
+ // TODO: remove operators from this list as support for avro encoding is
added
+ case s: Aggregate if s.isStreaming => false
+ // Since the Distinct node will be replaced to Aggregate in the optimizer
rule
+ // [[ReplaceDistinctWithAggregate]], here we also need to check all
Distinct node by
+ // assuming it as Aggregate.
+ case d @ Distinct(_: LogicalPlan) if d.isStreaming => false
+ case _ @ Join(left, right, _, _, _) if left.isStreaming &&
right.isStreaming => false
+ case f: FlatMapGroupsWithState if f.isStreaming => false
+ case f: FlatMapGroupsInPandasWithState if f.isStreaming => false
+ case d: Deduplicate if d.isStreaming => false
+ case d: DeduplicateWithinWatermark if d.isStreaming => false
+ case _ => true
+ }
+
+ // Rule to check that avro encoding format is not supported in case any
+ // non-transformWithState stateful streaming operators are present in the
query.
+ def checkSupportedStoreEncodingFormats(plan: LogicalPlan): Unit = {
+ val storeEncodingFormat = SQLConf.get.stateStoreEncodingFormat
+ if (storeEncodingFormat.toLowerCase(Locale.ROOT) == "avro") {
+ plan.foreach { subPlan =>
+ if (!checkAvroSupportForStatefulOperator(subPlan)) {
+ val errorMsg = "State store encoding format as avro is not supported
for " +
Review Comment:
nit: It'd be more informative if we could name the operator which triggers
this. We could either report one of them or report all of them. Reporting
just one of them wouldn't be problematic; that is how
UnsupportedOperationChecker has been working.
Also, let's be direct when providing the guidance - IMHO the `till all stateful
operators support avro encoding` part is not necessary for users.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]