cloud-fan commented on code in PR #51287:
URL: https://github.com/apache/spark/pull/51287#discussion_r2238839566
########## sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlPartitioningSuite.scala:
##########
@@ -36,9 +35,31 @@ final class XmlPartitioningSuite extends SparkFunSuite with Matchers with Before
     try {
       val fileName = s"test-data/xml-resources/fias_house${if (large) ".large" else ""}.xml$suffix"
       val xmlFile = getClass.getClassLoader.getResource(fileName).getFile
-      val results = spark.read.option("rowTag", "House").option("mode", "FAILFAST").xml(xmlFile)
-      // Test file has 37 records; large file is 20x the records
-      assert(results.count() === (if (large) 740 else 37))
+      if (large) {
+        // The large file is invalid because it concatenates several XML files together, and thus
+        // there is more than one root tag, and each one has a BOM character at the beginning.
+
+        // In FAILFAST mode, we should throw an exception.
+        val error = intercept[SparkException] {
+          spark.read.option("rowTag", "House").option("mode", "FAILFAST").xml(xmlFile)
+        }
+        checkError(
+          exception = error,
+          condition = "MALFORMED_RECORD_IN_PARSING.WITHOUT_SUGGESTION",
+          parameters = Map("badRecord" -> "_corrupt_record", "failFastMode" -> "FAILFAST")
+        )
+
+        // In PERMISSIVE mode, we should read the records in the first root tag and ignore the
+        // rest of the content.

Review Comment:
   What was the behavior without this optimization? We should call out all the behavior changes clearly.
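For context, a minimal standalone sketch of the two parse modes the diff exercises. This is not the PR's test code: the file path, `rowTag` value, and corrupt-record column name are illustrative assumptions for a hypothetical multi-root XML input.

```scala
import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession

object XmlParseModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("xml-parse-mode-sketch")
      .master("local[*]")
      .getOrCreate()

    // Hypothetical input: several XML documents concatenated into one file,
    // so the file has multiple root tags and is invalid as a single document.
    val xmlFile = "/tmp/multi-root.xml"

    // FAILFAST: a malformed record aborts the read with a SparkException
    // (the suite above asserts the MALFORMED_RECORD_IN_PARSING error condition).
    try {
      spark.read
        .option("rowTag", "House")
        .option("mode", "FAILFAST")
        .xml(xmlFile)
        .count()
    } catch {
      case e: SparkException =>
        println(s"FAILFAST aborted as expected: ${e.getMessage}")
    }

    // PERMISSIVE (the default): malformed content is routed to the corrupt
    // record column instead of failing the whole read.
    val permissive = spark.read
      .option("rowTag", "House")
      .option("mode", "PERMISSIVE")
      .option("columnNameOfCorruptRecord", "_corrupt_record")
      .xml(xmlFile)
    permissive.show(truncate = false)

    spark.stop()
  }
}
```

The reviewer's question targets the PERMISSIVE branch: the diff says the new behavior reads only the records under the first root tag, so the old behavior (and any change in returned row counts) should be spelled out in the test or the PR description.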