This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new e10c97693 IMPALA-13181: Disable tuple caching for locations with limits
e10c97693 is described below

commit e10c97693df941f4428a56e213e53f871df230b7
Author: Joe McDonnell <[email protected]>
AuthorDate: Sat Aug 3 11:09:00 2024 -0700

    IMPALA-13181: Disable tuple caching for locations with limits
    
    Limits can be non-deterministic, because they can depend on
    the order in which rows are processed. This marks locations with
    limits ineligible for tuple caching. In the future, this rule can
    be refined to allow caching in locations that are
    deterministic (e.g. limits on sorted inputs).
    
    Testing:
     - Added test cases to TupleCacheTest
    
    Change-Id: Iedcb73082325f1dfe397fbda07c2b7b4817efa5e
    Reviewed-on: http://gerrit.cloudera.org:8080/21857
    Reviewed-by: Michael Smith <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../org/apache/impala/common/ThriftSerializationCtx.java | 10 ++++++++++
 fe/src/main/java/org/apache/impala/planner/PlanNode.java | 14 +++++++++++---
 .../java/org/apache/impala/planner/TupleCacheInfo.java   |  4 ++++
 .../java/org/apache/impala/planner/TupleCacheTest.java   | 16 ++++++++++++++++
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git 
a/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java 
b/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java
index be39266ca..2c69701ec 100644
--- a/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java
+++ b/fe/src/main/java/org/apache/impala/common/ThriftSerializationCtx.java
@@ -20,6 +20,7 @@ package org.apache.impala.common;
 import org.apache.impala.analysis.SlotId;
 import org.apache.impala.analysis.TupleId;
 import org.apache.impala.planner.TupleCacheInfo;
+import org.apache.impala.planner.TupleCacheInfo.IneligibilityReason;
 import org.apache.impala.planner.HdfsScanNode;
 
 /**
@@ -91,6 +92,15 @@ public class ThriftSerializationCtx {
     }
   }
 
+  /**
+   * Set this location as ineligible for tuple caching due to the specified 
reason.
+   */
+  public void setTupleCachingIneligible(IneligibilityReason reason) {
+    if (isTupleCache()) {
+      tupleCacheInfo_.setIneligible(reason);
+    }
+  }
+
   /**
    * translateTupleId() is designed to be applied to every TupleId 
incorporated into a
    * Thrift structure. For tuple caching, translateTupleId() translates the 
passed in
diff --git a/fe/src/main/java/org/apache/impala/planner/PlanNode.java 
b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
index c6d0e1ffc..11ed81064 100644
--- a/fe/src/main/java/org/apache/impala/planner/PlanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/PlanNode.java
@@ -47,6 +47,7 @@ import org.apache.impala.common.PrintUtils;
 import org.apache.impala.common.ThriftSerializationCtx;
 import org.apache.impala.common.TreeNode;
 import org.apache.impala.planner.RuntimeFilterGenerator.RuntimeFilter;
+import org.apache.impala.planner.TupleCacheInfo.IneligibilityReason;
 import org.apache.impala.thrift.TExecNodePhase;
 import org.apache.impala.thrift.TExecStats;
 import org.apache.impala.thrift.TExplainLevel;
@@ -99,7 +100,7 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
   // unique w/in plan tree; assigned by planner, and not necessarily in c'tor
   protected PlanNodeId id_;
 
-  protected long limit_; // max. # of rows to be returned; 0: no limit_
+  protected long limit_; // max. # of rows to be returned; -1: no limit_
 
   // ids materialized by the tree rooted at this node
   protected List<TupleId> tupleIds_;
@@ -518,6 +519,10 @@ abstract public class PlanNode extends TreeNode<PlanNode> {
    */
   private void initThrift(TPlanNode msg, ThriftSerializationCtx serialCtx) {
     msg.limit = limit_;
+    if (hasLimit()) {
+      LOG.trace("{} ineligible for caching due to limit", this);
+      serialCtx.setTupleCachingIneligible(IneligibilityReason.LIMIT);
+    }
 
     if (!serialCtx.isTupleCache()) {
       msg.node_id = id_.asInt();
@@ -1378,12 +1383,15 @@ abstract public class PlanNode extends 
TreeNode<PlanNode> {
     }
 
     // Incorporate this node's information
-    // TODO: This will also calculate eligibility via initThrift/toThrift.
-    // TODO: This will adjust the output of initThrift/toThrift to mask out 
items.
     TPlanNode msg = new TPlanNode();
     ThriftSerializationCtx serialCtx = new 
ThriftSerializationCtx(tupleCacheInfo_);
     initThrift(msg, serialCtx);
     toThrift(msg, serialCtx);
+    // Do not continue if initThrift or toThrift marked this as ineligible.
+    if (!tupleCacheInfo_.isEligible()) {
+      tupleCacheInfo_.finalizeHash();
+      return;
+    }
     tupleCacheInfo_.hashThrift(msg);
     if (getChildCount() == 0 && queryOptsHash != null) {
       // Leaf node, add query options hash.
diff --git a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java 
b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
index 5c8af2b1d..dddee3e81 100644
--- a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
+++ b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
@@ -95,6 +95,10 @@ public class TupleCacheInfo {
   public enum IneligibilityReason {
     NOT_IMPLEMENTED,
     CHILDREN_INELIGIBLE,
+    // Limits are ineligible because they are implemented in a 
non-deterministic
+    // way. In future, this can support locations that are deterministic (e.g.
+    // limits on a sorted input).
+    LIMIT,
   }
   private EnumSet<IneligibilityReason> ineligibilityReasons_;
 
diff --git a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java 
b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
index c43bb18a9..03510895c 100644
--- a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
@@ -132,6 +132,13 @@ public class TupleCacheTest extends PlannerTestBase {
     verifyCacheIneligible("select a.id from functional.alltypes a, " +
         "functional_kudu.alltypes b where a.id = b.id");
 
+    // Verify that a limit makes a statement ineligible
+    verifyCacheIneligible("select id from functional.alltypes limit 10");
+    // A limit higher in the plan doesn't impact cache eligibility for the 
scan nodes.
+    verifyIdenticalCacheKeys(
+        "select a.id from functional.alltypes a, functional.alltypes b",
+        "select a.id from functional.alltypes a, functional.alltypes b limit 
10");
+
     // TODO: Random functions should make a location ineligible
     // rand()/random()/uuid()
     // verifyCacheIneligible(
@@ -171,6 +178,15 @@ public class TupleCacheTest extends PlannerTestBase {
         "select straight_join probe.id from functional.alltypes probe, " +
         "functional.alltypes build where probe.id = build.id and 
probe.int_col=100");
 
+    // A limit on the probe side doesn't impact eligibility on the build side.
+    verifyOverlappingCacheKeys(
+        "select straight_join probe.id from " +
+        "(select id from functional.alltypes limit 10) probe, " +
+        "functional.alltypes build where probe.id = build.id",
+        "select straight_join probe.id from " +
+        "(select id from functional.alltypes) probe, " +
+        "functional.alltypes build where probe.id = build.id");
+
     // Build side is the same. Probe side has an extra int_col = 100 filter.
     // Note that this test requires id translation, because the probe side's 
reference
     // to int_col takes up a slot id and shifts the slot ids of the build side.

Reply via email to