This is an automated email from the ASF dual-hosted git repository.
hongze pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new e944668752 [VL] gluten-it: Fixes for clickbench (#10291)
e944668752 is described below
commit e944668752df9ddc9153f4ada6ad19eed56016a9
Author: Hongze Zhang <[email protected]>
AuthorDate: Wed Jul 30 12:55:28 2025 +0100
[VL] gluten-it: Fixes for clickbench (#10291)
---
.../src/main/resources/clickbench-queries/q39.sql | 2 +-
.../src/main/resources/clickbench-queries/q40.sql | 2 +-
.../src/main/resources/clickbench-queries/q41.sql | 2 +-
.../src/main/resources/clickbench-queries/q42.sql | 2 +-
.../src/main/resources/clickbench-queries/q43.sql | 2 +-
.../integration/clickbench/ClickBenchDataGen.scala | 21 ++++++++++++++++-----
.../clickbench/ClickBenchTableCreator.scala | 7 ++++---
7 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql
index 93b691f37c..62c201b02c 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q39.sql
@@ -1 +1 @@
-SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate
>= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND
IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC OFFSET 1000
LIMIT 10;
+SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate
>= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 AND
IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10
OFFSET 1000;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql
index d97b607720..e12daa1b11 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q40.sql
@@ -1 +1 @@
-SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID
= 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*)
AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01'
AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID,
SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0)
THEN Referer ELSE '' END, URL ORDER BY PageViews DESC OFFSET 1000 LIMIT 10;
+SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID
= 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*)
AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01'
AND EventDate <= DATE '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID,
SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0)
THEN Referer ELSE '' END, URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql
index 321a06e3e7..c5cef47cc9 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q41.sql
@@ -1 +1 @@
-SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID =
62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND
IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash =
3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC OFFSET
100 LIMIT 10;
+SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID =
62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE '2013-07-31' AND
IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash =
3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT
10 OFFSET 100;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql
index 46b81c5be4..f76163cc5e 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q42.sql
@@ -1 +1 @@
-SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits
WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE
'2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash =
2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY
PageViews DESC OFFSET 10000 LIMIT 10;
+SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits
WHERE CounterID = 62 AND EventDate >= DATE '2013-07-01' AND EventDate <= DATE
'2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash =
2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY
PageViews DESC LIMIT 10 OFFSET 10000;
diff --git a/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql b/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql
index b54c092114..252d993538 100644
--- a/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql
+++ b/tools/gluten-it/common/src/main/resources/clickbench-queries/q43.sql
@@ -1 +1 @@
-SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits
WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE
'2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY
DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) OFFSET
1000 LIMIT 10;
+SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits
WHERE CounterID = 62 AND EventDate >= DATE '2013-07-14' AND EventDate <= DATE
'2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY
DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT
10 OFFSET 1000;
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala
index add7b01feb..80357f3eff 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchDataGen.scala
@@ -18,7 +18,7 @@ package org.apache.gluten.integration.clickbench
import org.apache.commons.io.FileUtils
import org.apache.gluten.integration.DataGen
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{SparkSession, functions}
import java.io.File
import scala.language.postfixOps
@@ -29,20 +29,31 @@ class ClickBenchDataGen(val spark: SparkSession, dir: String) extends DataGen {
override def gen(): Unit = {
println(s"Start to download ClickBench Parquet dataset from URL:
$DATA_URL... ")
// Directly download from official URL.
- val target = new File(dir + File.separator + FILE_NAME)
- FileUtils.forceMkdirParent(target)
+ val tempFile = new File(dir + File.separator + TMP_FILE_NAME)
+ FileUtils.forceMkdirParent(tempFile)
val cmd =
- s"wget --no-verbose --show-progress --progress=bar:force:noscroll -O
$target $DATA_URL"
+ s"wget --no-verbose --show-progress --progress=bar:force:noscroll -O
$tempFile $DATA_URL"
println(s"Executing command: $cmd")
val code = Process(cmd) !;
if (code != 0) {
throw new RuntimeException("Download failed")
}
- println(s"ClickBench Parquet dataset successfully downloaded to $target.")
+ println(s"ClickBench Parquet dataset successfully downloaded to
$tempFile.")
+
+ val sparkDataFile = new File(dir + File.separator + FILE_NAME)
+ println(s"Starting to write a data file $sparkDataFile that is compatible
with Spark... ")
+ spark.read.parquet(tempFile.getAbsolutePath)
+ .withColumn("eventtime", functions.col("eventtime").cast("timestamp"))
+ .withColumn("clienteventtime",
functions.col("clienteventtime").cast("timestamp"))
+ .withColumn("localeventtime",
functions.col("localeventtime").cast("timestamp"))
+ .write
+ .parquet(sparkDataFile.getAbsolutePath)
+ println(s"ClickBench Parquet dataset (Spark compatible) successfully
created at $sparkDataFile.")
}
}
object ClickBenchDataGen {
private val DATA_URL =
"https://datasets.clickhouse.com/hits_compatible/hits.parquet"
+ private val TMP_FILE_NAME = "hits.parquet.tmp"
private[clickbench] val FILE_NAME = "hits.parquet"
}
diff --git a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala
index 33eac38629..c8e04e137d 100644
--- a/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala
+++ b/tools/gluten-it/common/src/main/scala/org/apache/gluten/integration/clickbench/ClickBenchTableCreator.scala
@@ -19,11 +19,12 @@ package org.apache.gluten.integration.clickbench
import org.apache.gluten.integration.TableCreator
import org.apache.spark.sql.types.StructType
-import org.apache.spark.sql.{AnalysisException, SparkSession}
+import org.apache.spark.sql.{AnalysisException, SparkSession, functions}
import java.io.File
object ClickBenchTableCreator extends TableCreator {
+ private val SPARK_DATA_FILE_NAME = "hits-spark.parquet"
private val TABLE_NAME = "hits"
private val SCHEMA: StructType = StructType.fromDDL("""
|watchid bigint,
@@ -134,15 +135,15 @@ object ClickBenchTableCreator extends TableCreator {
|""".stripMargin)
override def create(spark: SparkSession, dataPath: String): Unit = {
- val file = new File(dataPath + File.separator +
ClickBenchDataGen.FILE_NAME)
if (spark.catalog.tableExists(TABLE_NAME)) {
println("Table exists: " + TABLE_NAME)
return
}
println("Creating catalog table: " + TABLE_NAME)
+ val file = new File(dataPath + File.separator +
ClickBenchDataGen.FILE_NAME)
spark.catalog.createTable(TABLE_NAME, "parquet", SCHEMA, Map("path" ->
file.getAbsolutePath))
try {
- spark.catalog.recoverPartitions(file.getName)
+ spark.catalog.recoverPartitions(TABLE_NAME)
} catch {
case _: AnalysisException =>
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]