This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new e10c97693 IMPALA-13181: Disable tuple caching for locations with limits
e10c97693 is described below
commit e10c97693df941f4428a56e213e53f871df230b7
Author: Joe McDonnell <[email protected]>
AuthorDate: Sat Aug 3 11:09:00 2024 -0700
IMPALA-13181: Disable tuple caching for locations with limits
Limits can be non-deterministic, because they can depend on
the order in which rows are processed. This marks locations with
limits ineligible for tuple caching. In the future, this rule can be
refined to allow caching in locations that are
deterministic (e.g. limits on sorted inputs).
Testing:
- Added test cases to TupleCacheTest
Change-Id: Iedcb73082325f1dfe397fbda07c2b7b4817efa5e
Reviewed-on: http://gerrit.cloudera.org:8080/21857
Reviewed-by: Michael Smith <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../org/apache/impala/common/ThriftSerializationCtx.java | 10 ++++++++++
fe/src/main/java/org/apache/impala/planner/PlanNode.java | 14 +++++++++++---
.../java/org/apache/impala/planner/TupleCacheInfo.java | 4 ++++
.../java/org/apache/impala/planner/TupleCacheTest.java | 16 ++++++++++++++++
4 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java b/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java
index be39266ca..2c69701ec 100644
--- a/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java
+++ b/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java
@@ -20,6 +20,7 @@ package org.apache.impala.common;
import org.apache.impala.analysis.SlotId;
import org.apache.impala.analysis.TupleId;
import org.apache.impala.planner.TupleCacheInfo;
+import org.apache.impala.planner.TupleCacheInfo.IneligibilityReason;
import org.apache.impala.planner.HdfsScanNode;
/**
@@ -91,6 +92,15 @@ public class ThriftSerializationCtx {
}
}
+ /**
+ * Set this location as ineligible for tuple caching due to the specified reason.
+ */
+ public void setTupleCachingIneligible(IneligibilityReason reason) {
+ if (isTupleCache()) {
+ tupleCacheInfo_.setIneligible(reason);
+ }
+ }
+
/**
* translateTupleId() is designed to be applied to every TupleId incorporated into a
* Thrift structure. For tuple caching, translateTupleId() translates the passed in
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index c6d0e1ffc..11ed81064 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -47,6 +47,7 @@ import org.apache.impala.common.PrintUtils;
import org.apache.impala.common.ThriftSerializationCtx;
import org.apache.impala.common.TreeNode;
import org.apache.impala.planner.RuntimeFilterGenerator.RuntimeFilter;
+import org.apache.impala.planner.TupleCacheInfo.IneligibilityReason;
import org.apache.impala.thrift.TExecNodePhase;
import org.apache.impala.thrift.TExecStats;
import org.apache.impala.thrift.TExplainLevel;
@@ -99,7 +100,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
// unique w/in plan tree; assigned by planner, and not necessarily in c'tor
protected PlanNodeId id_;
- protected long limit_; // max. # of rows to be returned; 0: no limit_
+ protected long limit_; // max. # of rows to be returned; -1: no limit_
// ids materialized by the tree rooted at this node
protected List<TupleId> tupleIds_;
@@ -518,6 +519,10 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
*/
private void initThrift(TPlanNode msg, ThriftSerializationCtx serialCtx) {
msg.limit = limit_;
+ if (hasLimit()) {
+ LOG.trace("{} ineligible for caching due to limit", this);
+ serialCtx.setTupleCachingIneligible(IneligibilityReason.LIMIT);
+ }
if (!serialCtx.isTupleCache()) {
msg.node_id = id_.asInt();
@@ -1378,12 +1383,15 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
}
// Incorporate this node's information
- // TODO: This will also calculate eligibility via initThrift/toThrift.
- // TODO: This will adjust the output of initThrift/toThrift to mask out items.
TPlanNode msg = new TPlanNode();
ThriftSerializationCtx serialCtx = new ThriftSerializationCtx(tupleCacheInfo_);
initThrift(msg, serialCtx);
toThrift(msg, serialCtx);
+ // Do not continue if initThrift or toThrift marked this as ineligible.
+ if (!tupleCacheInfo_.isEligible()) {
+ tupleCacheInfo_.finalizeHash();
+ return;
+ }
tupleCacheInfo_.hashThrift(msg);
if (getChildCount() == 0 && queryOptsHash != null) {
// Leaf node, add query options hash.
diff --git a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
index 5c8af2b1d..dddee3e81 100644
--- a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
+++ b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
@@ -95,6 +95,10 @@ public class TupleCacheInfo {
public enum IneligibilityReason {
NOT_IMPLEMENTED,
CHILDREN_INELIGIBLE,
+ // Limits are ineligible because they are implemented in a non-deterministic
+ // way. In future, this can support locations that are deterministic (e.g.
+ // limits on a sorted input).
+ LIMIT,
}
private EnumSet<IneligibilityReason> ineligibilityReasons_;
diff --git a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
index c43bb18a9..03510895c 100644
--- a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
@@ -132,6 +132,13 @@ public class TupleCacheTest extends PlannerTestBase {
verifyCacheIneligible("select a.id from functional.alltypes a, " +
"functional_kudu.alltypes b where a.id = b.id");
+ // Verify that a limit makes a statement ineligible
+ verifyCacheIneligible("select id from functional.alltypes limit 10");
+ // A limit higher in the plan doesn't impact cache eligibility for the scan nodes.
+ verifyIdenticalCacheKeys(
+ "select a.id from functional.alltypes a, functional.alltypes b",
+ "select a.id from functional.alltypes a, functional.alltypes b limit 10");
+
// TODO: Random functions should make a location ineligible
// rand()/random()/uuid()
// verifyCacheIneligible(
@@ -171,6 +178,15 @@ public class TupleCacheTest extends PlannerTestBase {
"select straight_join probe.id from functional.alltypes probe, " +
"functional.alltypes build where probe.id = build.id and probe.int_col=100");
+ // A limit on the probe side doesn't impact eligibility on the build side.
+ verifyOverlappingCacheKeys(
+ "select straight_join probe.id from " +
+ "(select id from functional.alltypes limit 10) probe, " +
+ "functional.alltypes build where probe.id = build.id",
+ "select straight_join probe.id from " +
+ "(select id from functional.alltypes) probe, " +
+ "functional.alltypes build where probe.id = build.id");
+
// Build side is the same. Probe side has an extra int_col = 100 filter.
// Note that this test requires id translation, because the probe side's reference
// to int_col takes up a slot id and shifts the slot ids of the build side.