ericm-db commented on code in PR #49277: URL: https://github.com/apache/spark/pull/49277#discussion_r1915765970
########## sql/core/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala: ########## @@ -372,6 +373,153 @@ object SchemaConverters extends Logging { schema } } + + private def getDefaultValue(dataType: DataType): Any = { + def createNestedDefault(st: StructType): java.util.HashMap[String, Any] = { + val defaultMap = new java.util.HashMap[String, Any]() + st.fields.foreach { field => + // For nullable fields in a struct, create a wrapped default value + val fieldDefault = getDefaultValue(field.dataType) + if (field.nullable) { + defaultMap.put(field.name, null) + } else { + defaultMap.put(field.name, fieldDefault) + } + } + defaultMap + } + + dataType match { + // Basic types + case BooleanType => null + case ByteType | ShortType | IntegerType => null + case LongType => null + case FloatType => null + case DoubleType => null + case StringType => null + case BinaryType => null + + // Complex types + case ArrayType(_, _) => new java.util.ArrayList[Any]() + case MapType(StringType, _, _) => new java.util.HashMap[String, Any]() + case st: StructType => createNestedDefault(st) + + // Special types + case _: DecimalType => java.nio.ByteBuffer.allocate(0) + case DateType => null + case TimestampType => null + case TimestampNTZType => null + case NullType => null + case _ => null + } + } + + def toAvroTypeWithDefaults( + catalystType: DataType, + nullable: Boolean = false, + recordName: String = "topLevelRecord", + namespace: String = "", + nestingLevel: Int = 0): Schema = { + if (nestingLevel == 0) { + assert(catalystType.isInstanceOf[StructType], + "toAvroTypeWithDefaults should only be called with StructType") + } + val builder = SchemaBuilder.builder() + + def getNestedRecordName(baseName: String): String = { + if (nestingLevel == 0) baseName + else s"${baseName}_nested_$nestingLevel" + } + + def processStructFields( + st: StructType, + fieldsAssembler: FieldAssembler[Schema]): FieldAssembler[Schema] = { + st.foreach { field => + val isLeafType = 
field.dataType match { + case _: StructType | _: ArrayType | _: MapType => false + case _ => true + } + + val innerType = toAvroTypeWithDefaults( + field.dataType, + nullable = isLeafType, // Only make leaf types nullable Review Comment: We wanted to keep Avro in sync with Spark SQL, where we set the StructFields to nullable as well. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org