mbutrovich commented on code in PR #1568: URL: https://github.com/apache/datafusion-comet/pull/1568#discussion_r2016848830
##########
spark/src/main/scala/org/apache/spark/sql/comet/execution/shuffle/CometShuffleExchangeExec.scala:
##########
@@ -232,14 +222,17 @@ object CometShuffleExchangeExec extends ShimCometShuffleExchangeExec {
           (0, _)
         ), // adding fake partitionId that is always 0 because ShuffleDependency requires it
         serializer = serializer,
-        shuffleWriterProcessor =
-          new CometShuffleWriteProcessor(outputPartitioning, outputAttributes, metrics, numParts),
+        shuffleWriterProcessor = ShuffleExchangeExec.createShuffleWriteProcessor(metrics),
         shuffleType = CometNativeShuffle,
         partitioner = new Partitioner {
           override def numPartitions: Int = outputPartitioning.numPartitions
           override def getPartition(key: Any): Int = key.asInstanceOf[Int]
         },
-        decodeTime = metrics("decode_time"))
+        decodeTime = metrics("decode_time"),
+        outputPartitioning = Some(outputPartitioning),
+        outputAttributes = outputAttributes,
+        shuffleWriteMetrics = metrics,
+        numParts = numParts)
     dependency
   }

Review Comment:
line 241 below: while you're in this file, could you change `Boson` -> `Comet` please?

##########
spark/src/test/scala/org/apache/comet/exec/CometNativeShuffleSuite.scala:
##########
@@ -201,6 +204,17 @@ class CometNativeShuffleSuite extends CometTestBase with AdaptiveSparkPlanHelper
     }
   }

+  test("fix: Comet native shuffle deletes shuffle files after query") {
+    withParquetTable((0 until 5).map(i => (i, i + 1)), "tbl") {
+      sql("SELECT count(_2), sum(_2) FROM tbl GROUP BY _1").collect()
+      val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager
+      eventually(timeout(30.seconds), interval(1.seconds)) {

Review Comment:
Can we also assert the "before" state here, i.e. that the files list is non-empty?

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org