chris-twiner commented on code in PR #50023: URL: https://github.com/apache/spark/pull/50023#discussion_r1971704899
########## sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala: ########## @@ -2841,6 +2860,265 @@ class DatasetSuite extends QueryTest checkDataset(Seq(seqMutableSet).toDS(), seqMutableSet) checkDataset(Seq(mapMutableSet).toDS(), mapMutableSet) } + + // below tests are related to SPARK-49960 and TransformingEncoder usage + test("Incorrect derived nullability with TransformingEncoder - non nullable") { + val sparkI = spark + type T = Tuple2[Seq[Seq[Int]], Seq[Int]] + val data: Seq[T] = Seq( ( Seq( Seq(1, 2, 3) ), Seq(1, 2, 3) ) ) + // for reference only + val sparkDataTypeOG = { + import sparkI.implicits._ + val ds = spark.createDataset[Tuple2[Seq[Seq[Int]], Seq[Int]]](data) + ds.schema + } + + val provider = () => + new Codec[Seq[Int], Seq[Int]]{ + override def encode(in: Seq[Int]): Seq[Int] = in + override def decode(out: Seq[Int]): Seq[Int] = out + } + + val transformingSeq = + TransformingEncoder[Seq[Int], Seq[Int]]( + implicitly[ClassTag[Seq[Int]]], + IterableEncoder[Seq[Int], Int](implicitly[ClassTag[Seq[Int]]], + PrimitiveIntEncoder, false, false), + provider + ) + + val enc = + ProductEncoder( + implicitly[ClassTag[T]], + Seq( + EncoderField("_1", + IterableEncoder[Seq[Seq[Int]], Seq[Int]](implicitly[ClassTag[Seq[Seq[Int]]]], + transformingSeq, false, false), false, Metadata.empty), + EncoderField( "_2", transformingSeq, false, Metadata.empty) + ), + None + ) + val sparkViaAgnostic = { + val ds = spark.createDataset(data)(enc) + ds.schema + } + // the nullability without TransformingEncoder nullability (SerializerBuilderHelper) + // is incorrect (_2 is inferred as nullable) + assert(enc.dataType === sparkViaAgnostic) + } + + def provider[A]: () => Codec[V[A], A] = () => + new Codec[V[A], A]{ + override def encode(in: V[A]): A = in.v + override def decode(out: A): V[A] = V(out) + } + + def transforming[A](underlying: AgnosticEncoder[A]): TransformingEncoder[V[A], A] = + TransformingEncoder[V[A], A]( + implicitly[ClassTag[V[A]]], + underlying, + 
provider + ) + + val V_INT = StructType(Seq(StructField("v", IntegerType, nullable = false))) + + // "value" usage for single field, a wrapping nullable type is required + val OPTION_OF_V_INT = StructType(Seq(StructField("value", + V_INT, nullable = true))) + + // product encoder for a non-nullable V + val V_OF_INT = + ProductEncoder( + classTag[V[Int]], + Seq(EncoderField("v", PrimitiveIntEncoder, nullable = false, Metadata.empty)), + None + ) + + test("""Encoder derivation with nested TransformingEncoder of OptionEncoder""".stripMargin) { + val sparkI = spark + type T = V[V[Option[V[Int]]]] + val data: Seq[T] = Seq(V(V(None)), V(V(Some(V(1))))) + // for reference - datatype will introduce nested classes as expected + val sparkDataTypeOG = { + import sparkI.implicits._ + val ds = spark.createDataset[V[V[Option[V[Int]]]]](data) + ds.schema + } + + /* attempt to behave as if value class semantics except the last product, + using a final transforming instead of a product serializes */ + val enc = + transforming( + transforming( + OptionEncoder( + // works + // transforming(PrimitiveIntEncoder) + // does not work + V_OF_INT + ) + ) + ) + + assert(enc.schema === OPTION_OF_V_INT) + + val sparkViaAgnostic = { + val ds = spark.createDataset(data)(enc) + ds.schema + } + + /* The schema has been changed to just the Product V[Int], the wrapping Option value + struct has been removed - it should not have been */ + assert(sparkViaAgnostic === enc.schema) + + val ds = spark.createDataset(data)(enc) + assert(ds.collect().toVector === data.toVector) + } + + test("""Encoder derivation with TransformingEncoder of OptionEncoder""".stripMargin) { Review Comment: Not really; it only tests that the recursion works in the ExpressionEncoder() detection. Happy to remove it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org