This is an automated email from the ASF dual-hosted git repository.
zhangzc pushed a commit to branch kylin-on-parquet-v2
in repository https://gitbox.apache.org/repos/asf/kylin.git
The following commit(s) were added to refs/heads/kylin-on-parquet-v2 by this
push:
new d16e7f0 KYLIN-4980 Support pruning segments from complex filter
conditions
d16e7f0 is described below
commit d16e7f053116cf659a3998affba233320d3d1dca
Author: zhengshengjun <[email protected]>
AuthorDate: Tue Apr 20 10:05:56 2021 +0800
KYLIN-4980 Support pruning segments from complex filter conditions
---
.../resources/query/sql_prune_segment/query02.sql | 28 +++++++++++++
.../query02.sql.expected/._SUCCESS.crc | Bin 0 -> 8 bytes
...343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc | Bin 0 -> 12 bytes
.../query02.sql.expected/_SUCCESS | 0
...0-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv | 3 ++
.../sql/execution/datasource/FilePruner.scala | 46 +++++++++++++++++++--
6 files changed, 74 insertions(+), 3 deletions(-)
diff --git a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql
new file mode 100644
index 0000000..7eab250
--- /dev/null
+++ b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql
@@ -0,0 +1,28 @@
+--
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing, software
+-- distributed under the License is distributed on an "AS IS" BASIS,
+-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+-- See the License for the specific language governing permissions and
+-- limitations under the License.
+--
+
+select lo_orderdate, lo_quantity, sum(lo_revenue) from ssb.p_lineorder
+where
+(lo_orderdate = 19920906 and lo_quantity = 4) or
+(lo_orderdate = 19920905 and lo_quantity = 9) and
+(lo_orderdate = 19920904 and lo_quantity = 7) or
+(
+ lo_orderdate > 19920906 and lo_orderdate <= 19920907 and (lo_quantity = 6
or lo_quantity = 49)
+)
+group by lo_orderdate, lo_quantity
+;{"scanRowCount":9,"scanBytes":0,"scanFiles":2,"cuboidId":[7],"exactlyMatched":[false]}
\ No newline at end of file
diff --git
a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc
new file mode 100644
index 0000000..3b7b044
Binary files /dev/null and
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/._SUCCESS.crc
differ
diff --git
a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc
new file mode 100644
index 0000000..71996cc
Binary files /dev/null and
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/.part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv.crc
differ
diff --git
a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/_SUCCESS
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git
a/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv
new file mode 100644
index 0000000..9770131
--- /dev/null
+++
b/kylin-it/src/test/resources/query/sql_prune_segment/query02.sql.expected/part-00000-45343019-e3fd-4ce5-b509-d744f9ccb327-c000.csv
@@ -0,0 +1,3 @@
+19920907,6,1192201
+19920907,49,8718385
+19920906,4,396435
diff --git
a/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
b/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
index b6008c2..91faab4 100644
---
a/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
+++
b/kylin-spark-project/kylin-spark-common/src/main/scala/org/apache/spark/sql/execution/datasource/FilePruner.scala
@@ -237,7 +237,7 @@ class FilePruner(cubeInstance: CubeInstance,
require(isResolved)
val startTime = System.nanoTime
- val timePartitionFilters = getSpecFilter(dataFilters, timePartitionColumn)
+ val timePartitionFilters = getSegmentFilter(dataFilters,
timePartitionColumn)
logInfo(s"Applying time partition filters:
${timePartitionFilters.mkString(",")}")
val fsc = ShardFileStatusCache.getFileStatusCache(session)
@@ -295,8 +295,48 @@ class FilePruner(cubeInstance: CubeInstance,
}
}
- private def getSpecFilter(dataFilters: Seq[Expression], col: Attribute):
Seq[Expression] = {
- dataFilters.filter(_.references.subsetOf(AttributeSet(col)))
+ private def getSegmentFilter(dataFilters: Seq[Expression], col: Attribute):
Seq[Expression] = {
+ dataFilters.map(extractSegmentFilter(_,
col)).filter(!_.equals(None)).map(_.get)
+ }
+
+ private def extractSegmentFilter(filter: Expression, col: Attribute):
Option[Expression] = {
+ filter match {
+ case expressions.Or(left, right) =>
+ val leftChild = extractSegmentFilter(left, col)
+ val rightChild = extractSegmentFilter(right, col)
+
+ //if there exists a leaf node that doesn't contain the partition column, the
parent filter is
+ //unnecessary for segment pruning.
+ //e.g. "where a = xxx or partition = xxx", we can't filter any segment
+ if (leftChild.eq(None) || rightChild.eq(None)) {
+ None
+ } else {
+ Some(expressions.Or(leftChild.get, rightChild.get))
+ }
+ case expressions.And(left, right) =>
+ val leftChild = extractSegmentFilter(left, col)
+ val rightChild = extractSegmentFilter(right, col)
+
+ //if there is only one leaf-node that contains partition column
+ //e.g. "where a = xxx and partition = xxx",
+ //then we can filter segment using "where partition = xxx"
+ if (!leftChild.eq(None) && !rightChild.eq(None)) {
+ Some(expressions.And(leftChild.get, rightChild.get))
+ } else if (!rightChild.eq(None)) {
+ rightChild
+ } else if (!leftChild.eq(None)) {
+ leftChild
+ } else {
+ None
+ }
+ case _ =>
+ //other unary filter like EqualTo, GreaterThan, GreaterThanOrEqual,
etc.
+ if (filter.references.subsetOf(AttributeSet(col))) {
+ Some(filter)
+ } else {
+ None
+ }
+ }
}
private def pruneSegments(filters: Seq[Expression],