Hi,

While migrating to Spark 3, I'm getting a NoSuchElementException when getting the partitions of a parquet DataFrame.
The code I'm trying to execute is:

    val df = sparkSession.read.parquet(inputFilePath)
    val partitions = df.rdd.partitions

and the Spark session is created like so:

    val conf = new SparkConf()
      .setMaster("local[8]")
      .setAppName("test")
      .set("spark.ui.enabled", "false")
      .set("spark.sql.crossJoin.enabled", "true")
      .set("spark.sql.catalogImplementation", "in-memory")
      .set("spark.sql.retainGroupColumns", "false")
      .set("spark.sql.caseSensitive", "true")
      .set("spark.default.parallelism", "1")

    sparkSession = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()

Stacktrace:

    java.util.NoSuchElementException: None.get
      at scala.None$.get(Option.scala:529)
      at scala.None$.get(Option.scala:527)
      at org.apache.spark.sql.execution.FileSourceScanExec.needsUnsafeRowConversion$lzycompute(DataSourceScanExec.scala:178)
      at org.apache.spark.sql.execution.FileSourceScanExec.needsUnsafeRowConversion(DataSourceScanExec.scala:176)
      at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:462)
      at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
      at org.apache.spark.sql.execution.SparkPlan$$Lambda$2249/464938461.apply(Unknown Source)
      at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
      at org.apache.spark.sql.execution.SparkPlan$$Lambda$2250/357763388.apply(Unknown Source)
      at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
      at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
      at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
      at org.apache.spark.sql.execution.DeserializeToObjectExec.doExecute(objects.scala:96)
      at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
      at org.apache.spark.sql.execution.SparkPlan$$Lambda$2249/464938461.apply(Unknown Source)
      at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
      at org.apache.spark.sql.execution.SparkPlan$$Lambda$2250/357763388.apply(Unknown Source)
      at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
      at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
      at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
      at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:122)
      at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:121)
      at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:3198)
      at org.apache.spark.sql.Dataset.rdd(Dataset.scala:3196)

This was not happening in our previous Spark code, which was on 2.4.5. I could find some related JIRAs for this:

https://issues.apache.org/jira/browse/SPARK-21418
https://issues.apache.org/jira/browse/SPARK-32589

Is there any workaround in Spark 3 for this issue?

Thanks a lot in advance.
-AB
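
P.S. From the stack trace, the None.get happens inside FileSourceScanExec.needsUnsafeRowConversion, which appears to look up the currently active SparkSession, so my guess (untested) is that the session is not registered as active on the thread that evaluates df.rdd. Below is a minimal sketch of the workaround I was considering, assuming that is the cause; the object name and input path are placeholders, and the config is trimmed down from the one above:

    import org.apache.spark.SparkConf
    import org.apache.spark.sql.SparkSession

    object PartitionsRepro {
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
          .setMaster("local[8]")
          .setAppName("test")
          .set("spark.ui.enabled", "false")

        val sparkSession = SparkSession.builder().config(conf).getOrCreate()

        // Hypothetical path -- replace with the real parquet location.
        val inputFilePath = "/tmp/input.parquet"

        // Guess based on the stack trace: register the session as the active
        // one on this thread before df.rdd is evaluated, so that the lookup
        // of the active session inside FileSourceScanExec is defined.
        SparkSession.setActiveSession(sparkSession)

        val df = sparkSession.read.parquet(inputFilePath)
        val partitions = df.rdd.partitions
        println(s"number of partitions: ${partitions.length}")

        sparkSession.stop()
      }
    }

If the real code evaluates df.rdd on a different thread (for example from a thread pool), I assume the setActiveSession call would have to run on that thread. Would this be a reasonable approach, or is there a proper fix in Spark 3?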