This is an automated email from the ASF dual-hosted git repository.

hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new e944668752 [VL] gluten-it: Fixes for clickbench (#10291)
e944668752 is described below

commit e944668752df9ddc9153f4ada6ad19eed56016a9
Author: Hongze Zhang <[email protected]>
AuthorDate: Wed Jul 30 12:55:28 2025 +0100

    [VL] gluten-it: Fixes for clickbench (#10291)
---
 .../src/main/resources/clickbench-queries/q39.sql   |  2 +-
 .../src/main/resources/clickbench-queries/q40.sql   |  2 +-
 .../src/main/resources/clickbench-queries/q41.sql   |  2 +-
 .../src/main/resources/clickbench-queries/q42.sql   |  2 +-
 .../src/main/resources/clickbench-queries/q43.sql   |  2 +-
 .../integration/clickbench/ClickBenchDataGen.scala  | 21 ++++++++++++++++-----
 .../clickbench/ClickBenchTableCreator.scala         |  7 ++++---
 7 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql
index 93b691f37c..62c201b02c 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql
@@ -1 +1 @@
-SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC OFFSET 1000 LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql
index d97b607720..e12daa1b11 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql
@@ -1 +1 @@
-SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC OFFSET 1000 LIMIT 10;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END, URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql
index 321a06e3e7..c5cef47cc9 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql
@@ -1 +1 @@
-SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC OFFSET 100 LIMIT 10;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql
index 46b81c5be4..f76163cc5e 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql
@@ -1 +1 @@
-SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC OFFSET 10000 LIMIT 10;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql
index b54c092114..252d993538 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql
@@ -1 +1 @@
-SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) OFFSET 1000 LIMIT 10;
+SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala
index add7b01feb..80357f3eff 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala
@@ -18,7 +18,7 @@ package org.apache.gluten.integration.clickbench
 
 import org.apache.commons.io.FileUtils
 import org.apache.gluten.integration.DataGen
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{SparkSession, functions}
 
 import java.io.File
 import scala.language.postfixOps
@@ -29,20 +29,31 @@ class ClickBenchDataGen(val spark: SparkSession, dir: String) extends DataGen {
   override def gen(): Unit = {
     println(s"Start to download ClickBench Parquet dataset from URL: $DATA_URL... ")
     // Directly download from official URL.
-    val target = new File(dir + File.separator + FILE_NAME)
-    FileUtils.forceMkdirParent(target)
+    val tempFile = new File(dir + File.separator + TMP_FILE_NAME)
+    FileUtils.forceMkdirParent(tempFile)
     val cmd =
-      s"wget --no-verbose --show-progress --progress=bar:force:noscroll -O $target $DATA_URL"
+      s"wget --no-verbose --show-progress --progress=bar:force:noscroll -O $tempFile $DATA_URL"
     println(s"Executing command: $cmd")
     val code = Process(cmd) !;
     if (code != 0) {
       throw new RuntimeException("Download failed")
     }
-    println(s"ClickBench Parquet dataset successfully downloaded to $target.")
+    println(s"ClickBench Parquet dataset successfully downloaded to $tempFile.")
+
+    val sparkDataFile = new File(dir + File.separator + FILE_NAME)
+    println(s"Starting to write a data file $sparkDataFile that is compatible with Spark... ")
+    spark.read.parquet(tempFile.getAbsolutePath)
+      .withColumn("eventtime", functions.col("eventtime").cast("timestamp"))
+      .withColumn("clienteventtime", functions.col("clienteventtime").cast("timestamp"))
+      .withColumn("localeventtime", functions.col("localeventtime").cast("timestamp"))
+      .write
+      .parquet(sparkDataFile.getAbsolutePath)
+    println(s"ClickBench Parquet dataset (Spark compatible) successfully created at $sparkDataFile.")
   }
 }
 
 object ClickBenchDataGen {
   private val DATA_URL = "https://datasets.clickhouse.com/hits_compatible/hits.parquet";
+  private val TMP_FILE_NAME = "hits.parquet.tmp"
   private[clickbench] val FILE_NAME = "hits.parquet"
 }
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala
index 33eac38629..c8e04e137d 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala
@@ -19,11 +19,12 @@ package org.apache.gluten.integration.clickbench
 
 import org.apache.gluten.integration.TableCreator
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{AnalysisException, SparkSession}
+import org.apache.spark.sql.{AnalysisException, SparkSession, functions}
 
 import java.io.File
 
 object ClickBenchTableCreator extends TableCreator {
+  private val SPARK_DATA_FILE_NAME = "hits-spark.parquet"
   private val TABLE_NAME = "hits"
   private val SCHEMA: StructType = StructType.fromDDL("""
       |watchid bigint,
@@ -134,15 +135,15 @@ object ClickBenchTableCreator extends TableCreator {
       |""".stripMargin)
 
   override def create(spark: SparkSession, dataPath: String): Unit = {
-    val file = new File(dataPath + File.separator + ClickBenchDataGen.FILE_NAME)
     if (spark.catalog.tableExists(TABLE_NAME)) {
       println("Table exists: " + TABLE_NAME)
       return
     }
     println("Creating catalog table: " + TABLE_NAME)
+    val file = new File(dataPath + File.separator + ClickBenchDataGen.FILE_NAME)
     spark.catalog.createTable(TABLE_NAME, "parquet", SCHEMA, Map("path" -> file.getAbsolutePath))
     try {
-      spark.catalog.recoverPartitions(file.getName)
+      spark.catalog.recoverPartitions(TABLE_NAME)
     } catch {
       case _: AnalysisException =>
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to