Hello spark-dev,

I have loaded TPC-DS data in Parquet format using Spark *3.0.2*, and while reading it from Spark *3.2.1* my query fails with the error below.
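For context, the query expects ss_sold_time_sk as bigint while the file footers declare it as INT32 (see the error and the parquet-tools output further below). A quick spark-shell check along these lines should confirm the mismatch; the table and path names here are just placeholders for my actual locations:

// what the catalog table expects for the column (bigint, i.e. LongType, per the error below)
spark.table("store_sales").schema("ss_sold_time_sk").dataType

// what the files themselves contain (schema read from the Parquet footers: INT32, i.e. IntegerType)
spark.read.parquet("obj_store_location/store_sales/ss_sold_date_sk=2451121").schema("ss_sold_time_sk").dataType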
Later I set spark.sql.parquet.enableVectorizedReader=false, but it resulted in a different error. I am also providing the output of parquet-tools below.

The original failure (with the default vectorized reader):

spark-sql> select * from store_sales limit 100;
org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file obj_store_location/store_sales/ss_sold_date_sk=2451121/part-00440-eac89ce9-041a-4254-b90a-6aceb3c8e6c4.c000.snappy.parquet. Column: [ss_sold_time_sk], Expected: bigint, Found: INT32
    at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:570)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:195)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:104)
    at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:522)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:349)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException
    at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1104)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:181)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:161)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:298)
    at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:196)
    at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:104)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:191)
    ... 21 more
23/10/30 10:24:48 WARN TaskSetManager: Lost task 0.2 in stage 1.0 (TID 1826) org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file objstore_location/store_sales/ss_sold_date_sk=2451121/part-00440-eac89ce9-041a-4254-b90a-6aceb3c8e6c4.c000.snappy.parquet. Column: [ss_sold_time_sk], Expected: bigint, Found: INT32
    at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:570)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:195)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:104)
    at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:522)
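After that first failure I turned the vectorized reader off from the spark-sql CLI and re-ran the query; the spark-shell equivalent would be roughly:

// disable the vectorized Parquet reader for the current session
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

With the vectorized reader disabled, the scan goes through the parquet-mr record reader and ParquetRowConverter instead, but that path fails as well: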
set spark.sql.parquet.enableVectorizedReader=false;
spark.sql.parquet.enableVectorizedReader	false
Time taken: 0.023 seconds, Fetched 1 row(s)
23/10/30 12:08:50 INFO SparkSQLCLIDriver: Time taken: 0.023 seconds, Fetched 1 row(s)
spark-sql> select * from store_sales limit 10;
23/10/30 12:09:06 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 1847) (trinoprwn5.subnetpoc1.vcn12231050.oraclevcn.com executor 11): org.apache.spark.sql.execution.QueryExecutionException: Encounter error while reading parquet files. One possible cause: Parquet column cannot be converted in the corresponding files. Details:
    at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotReadParquetFilesError(QueryExecutionErrors.scala:577)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:200)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:104)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
    at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:349)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
    at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
Caused by: org.apache.parquet.io.ParquetDecodingException: Can not read value at 1 in block 0 in file objstorelocation/store_sales/ss_sold_date_sk=2451121/part-00440-eac89ce9-041a-4254-b90a-6aceb3c8e6c4.c000.snappy.parquet
    at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:254)
    at org.apache.parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:207)
    at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
    at org.apache.spark.sql.execution.datasources.RecordReaderIterator$$anon$1.hasNext(RecordReaderIterator.scala:61)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:104)
    at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:191)
    ... 17 more
Caused by: java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.MutableLong cannot be cast to org.apache.spark.sql.catalyst.expressions.MutableInt
    at org.apache.spark.sql.catalyst.expressions.SpecificInternalRow.setInt(SpecificInternalRow.scala:254)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetRowConverter$RowUpdater.setInt(ParquetRowConverter.scala:179)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetPrimitiveConverter.addInt(ParquetRowConverter.scala:89)
    at org.apache.parquet.column.impl.ColumnReaderBase$2$3.writeValue(ColumnReaderBase.java:297)
    at org.apache.parquet.column.impl.ColumnReaderBase.writeCurrentValueToConverter(ColumnReaderBase.java:440)
    at org.apache.parquet.column.impl.ColumnReaderImpl.writeCurrentValueToConverter(ColumnReaderImpl.java:30)
    at org.apache.parquet.io.RecordReaderImplementation.read(RecordReaderImplementation.java:406)
    at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:229)
    ... 22 more
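A possible workaround I am considering is to bypass the table definition and read the files directly, so that the schema inferred from the Parquet footers (with the int columns) is used; a rough sketch, where the path is a placeholder for my object-store location:

// read the Parquet files directly; the footer schema (int columns) is picked up,
// and the partition column ss_sold_date_sk is added by partition discovery
val storeSales = spark.read.parquet("obj_store_location/store_sales")
storeSales.createOrReplaceTempView("store_sales_files")
spark.sql("select * from store_sales_files limit 100").show()

For reference, here is the parquet-tools meta output for one of the files: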
java -cp /usr/lib/hadoop/hadoop-common.jar:/usr/lib/hadoop/lib/*:/home/opc/parquet_file/parquet-tools-1.9.0.jar org.apache.parquet.tools.Main meta part-00042-eac89ce9-041a-4254-b90a-6aceb3c8e6c4.c000.snappy.parquet

file:        file:/home/opc/parquet_file/part-00042-eac89ce9-041a-4254-b90a-6aceb3c8e6c4.c000.snappy.parquet
creator:     parquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)
extra:       org.apache.spark.version = 3.0.2
extra:       org.apache.spark.sql.parquet.row.metadata = {"type":"struct","fields":[{"name":"ss_sold_time_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_item_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_customer_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_cdemo_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_hdemo_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_addr_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_store_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_promo_sk","type":"integer","nullable":true,"metadata":{}},{"name":"ss_ticket_number","type":"long","nullable":true,"metadata":{}},{"name":"ss_quantity","type":"integer","nullable":true,"metadata":{}},{"name":"ss_wholesale_cost","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_list_price","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_sales_price","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_ext_discount_amt","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_ext_sales_price","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_ext_wholesale_cost","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_ext_list_price","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_ext_tax","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_coupon_amt","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_net_paid","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_net_paid_inc_tax","type":"decimal(7,2)","nullable":true,"metadata":{}},{"name":"ss_net_profit","type":"decimal(7,2)","nullable":true,"metadata":{}}]}

file schema: spark_schema
--------------------------------------------------------------------------------
ss_sold_time_sk:       OPTIONAL INT32 R:0 D:1
ss_item_sk:            OPTIONAL INT32 R:0 D:1
ss_customer_sk:        OPTIONAL INT32 R:0 D:1
ss_cdemo_sk:           OPTIONAL INT32 R:0 D:1
ss_hdemo_sk:           OPTIONAL INT32 R:0 D:1
ss_addr_sk:            OPTIONAL INT32 R:0 D:1
ss_store_sk:           OPTIONAL INT32 R:0 D:1
ss_promo_sk:           OPTIONAL INT32 R:0 D:1
ss_ticket_number:      OPTIONAL INT64 R:0 D:1
ss_quantity:           OPTIONAL INT32 R:0 D:1
ss_wholesale_cost:     OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_list_price:         OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_sales_price:        OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_ext_discount_amt:   OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_ext_sales_price:    OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_ext_wholesale_cost: OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_ext_list_price:     OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_ext_tax:            OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_coupon_amt:         OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_net_paid:           OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_net_paid_inc_tax:   OPTIONAL INT32 O:DECIMAL R:0 D:1
ss_net_profit:         OPTIONAL INT32 O:DECIMAL R:0 D:1

*Please let me know if this is a known issue or fixed in later versions.*

Thanks