xiaonanyang-db commented on code in PR #50300: URL: https://github.com/apache/spark/pull/50300#discussion_r2035848625
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala: ########## @@ -897,4 +914,219 @@ object StaxXmlParser { curRecord } } + + /** + * Parse the input XML string as a Variant value + */ + def parseVariant(xml: String, options: XmlOptions): VariantVal = { + val parser = StaxXmlParserUtils.filteredReader(xml) + val rootEvent = + StaxXmlParserUtils.skipUntil(parser, XMLStreamConstants.START_ELEMENT) + val rootAttributes = rootEvent.asStartElement.getAttributes.asScala.toArray + val variant = convertVariant(parser, rootAttributes, options) + val v = new VariantVal(variant.getValue, variant.getMetadata) + parser.close() + v + } + + /** + * Parse an XML element from the XML event stream into a Variant. + * This method transforms the XML element along with its attributes and child elements + * into a hierarchical Variant data structure that preserves the XML structure. + * + * @param parser The XML event stream reader positioned after the start element + * @param attributes The attributes of the current XML element to be included in the Variant + * @param options Configuration options that control how XML is parsed into Variants + * @return A Variant representing the XML element with its attributes and child content + */ + def convertVariant( + parser: XMLEventReader, + attributes: Array[Attribute], + options: XmlOptions): Variant = { + // The variant builder for the root startElement + val rootBuilder = new VariantBuilder(false) + val start = rootBuilder.getWritePos + + // Map to store the variant values of all child fields + // Each field could have multiple entries, which means it's an array + val fieldToVariants = collection.mutable.TreeMap.empty[String, java.util.ArrayList[Variant]] + + // Handle attributes first + StaxXmlParserUtils.convertAttributesToValuesMap(attributes, options).foreach { + case (f, v) => + val builder = new VariantBuilder(false) + appendXMLCharacterToVariant(builder, v, options) + val variants = 
fieldToVariants.getOrElseUpdate(f, new java.util.ArrayList[Variant]()) + variants.add(builder.result()) + } + + var shouldStop = false + while (!shouldStop) { + parser.nextEvent() match { + case s: StartElement => + // For each child element, convert it to a variant and keep track of it in + // fieldToVariants + val attributes = s.getAttributes.asScala.map(_.asInstanceOf[Attribute]).toArray + val field = StaxXmlParserUtils.getName(s.asStartElement.getName, options) + val variants = fieldToVariants.getOrElseUpdate(field, new java.util.ArrayList[Variant]()) + variants.add(convertVariant(parser, attributes, options)) + + case c: Characters if !c.isWhiteSpace => + // Treat the character as a value tag field, where we use the [[XmlOptions.valueTag]] as + // the field key + val builder = new VariantBuilder(false) + appendXMLCharacterToVariant(builder, c.getData, options) + val variants = fieldToVariants.getOrElseUpdate( + options.valueTag, + new java.util.ArrayList[Variant]() + ) + variants.add(builder.result()) + + case _: EndElement => + if (fieldToVariants.nonEmpty) { + val onlyValueTagField = fieldToVariants.keySet.forall(_ == options.valueTag) + if (onlyValueTagField) { + // If the element only has a value tag field, parse the element as a variant primitive + rootBuilder.appendVariant(fieldToVariants(options.valueTag).get(0)) + } else { + writeVariantObject(rootBuilder, start, fieldToVariants) + } + } + shouldStop = true + + case _: EndDocument => shouldStop = true + + case _ => // do nothing + } + } + + // If the element is empty, we treat it as a Variant null + if (rootBuilder.getWritePos == start) { + rootBuilder.appendNull() Review Comment: IIRC, it will error out when we call `result()` on an empty builder. cc @chenhao-db correct me if I'm wrong -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org