pan3793 commented on code in PR #50765: URL: https://github.com/apache/spark/pull/50765#discussion_r2075150208
########## sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala: ########## @@ -207,13 +210,23 @@ class ParquetFileFormat val sharedConf = broadcastedHadoopConf.value.value - val fileFooter = if (enableVectorizedReader) { + val (inputFile: Option[HadoopInputFile], + inputStream: Option[SeekableInputStream], + fileFooter: ParquetMetadata) = if (enableVectorizedReader) { // When there are vectorized reads, we can avoid reading the footer twice by reading // all row groups in advance and filter row groups according to filters that require // push down (no need to read the footer metadata again). - ParquetFooterReader.readFooter(sharedConf, file, ParquetFooterReader.WITH_ROW_GROUPS) + val hadoopInputFile = HadoopInputFile.fromStatus(file.fileStatus, sharedConf) + val fileInputStream = hadoopInputFile.newStream() + val footerFilter = ParquetFooterReader.footerFilter( + sharedConf, file, ParquetFooterReader.WITH_ROW_GROUPS) + val footer = ParquetFooterReader.readFooter( + hadoopInputFile, fileInputStream, footerFilter) Review Comment: TODO: close the reader while keeping the `fileInputStream` open. This is blocked until https://github.com/apache/parquet-java/pull/3208 is available. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org