ericm-db commented on code in PR #49277:
URL: https://github.com/apache/spark/pull/49277#discussion_r1915765970


##########
sql/core/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala:
##########
@@ -372,6 +373,153 @@ object SchemaConverters extends Logging {
       schema
     }
   }
+
+  /**
+   * Picks the default value attached to an Avro field generated for `dataType`.
+   *
+   * Arrays and string-keyed maps default to empty Java collections, structs to a map
+   * of per-field defaults, decimals to a zero-length byte buffer, and every other
+   * type to null.
+   *
+   * @param dataType the Catalyst type whose default is requested
+   * @return the default value, or null when the type has none
+   */
+  private def getDefaultValue(dataType: DataType): Any = {
+    // Builds the default for a struct: one entry per field, keyed by field name.
+    def structDefault(struct: StructType): java.util.HashMap[String, Any] = {
+      val defaults = new java.util.HashMap[String, Any]()
+      struct.fields.foreach { f =>
+        // Nullable fields default to null; only non-nullable ones need a computed value.
+        val value = if (f.nullable) null else getDefaultValue(f.dataType)
+        defaults.put(f.name, value)
+      }
+      defaults
+    }
+
+    dataType match {
+      // Container types start out empty.
+      case _: ArrayType => new java.util.ArrayList[Any]()
+      case MapType(StringType, _, _) => new java.util.HashMap[String, Any]()
+      case struct: StructType => structDefault(struct)
+
+      // Decimals default to a zero-length buffer.
+      case _: DecimalType => java.nio.ByteBuffer.allocate(0)
+
+      // Boolean, integral, floating-point, string, binary, date, timestamp and null
+      // types, plus anything not matched above (e.g. maps whose key type does not
+      // match StringType), have no default value.
+      case _ => null
+    }
+  }
+
+  def toAvroTypeWithDefaults(
+      catalystType: DataType,
+      nullable: Boolean = false,
+      recordName: String = "topLevelRecord",
+      namespace: String = "",
+      nestingLevel: Int = 0): Schema = {
+    if (nestingLevel == 0) {
+      assert(catalystType.isInstanceOf[StructType],
+        "toAvroTypeWithDefaults should only be called with StructType")
+    }
+    val builder = SchemaBuilder.builder()
+
+    def getNestedRecordName(baseName: String): String = {
+      if (nestingLevel == 0) baseName
+      else s"${baseName}_nested_$nestingLevel"
+    }
+
+    def processStructFields(
+        st: StructType,
+        fieldsAssembler: FieldAssembler[Schema]): FieldAssembler[Schema] = {
+      st.foreach { field =>
+        val isLeafType = field.dataType match {
+          case _: StructType | _: ArrayType | _: MapType => false
+          case _ => true
+        }
+
+        val innerType = toAvroTypeWithDefaults(
+          field.dataType,
+          nullable = isLeafType,  // Only make leaf types nullable

Review Comment:
   So we wanted to keep Avro in sync with Spark SQL, where we set the StructFields to nullable as well.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to