This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 68c42a5d6 IMPALA-13179: Make non-deterministic functions ineligible 
for tuple caching
68c42a5d6 is described below

commit 68c42a5d660be4c411a61668af474ad43d730b69
Author: Joe McDonnell <[email protected]>
AuthorDate: Thu Oct 31 13:06:41 2024 -0700

    IMPALA-13179: Make non-deterministic functions ineligible for tuple caching
    
    Non-deterministic functions should make a location ineligible
    for caching. Unlike existing definitions of non-determinism
    like FunctionCallExpr.isNondeterministicBuiltinFn(),
    the non-determinism needs to apply over time and across query
    boundaries, so it is a broader list of functions.
    
    The following are considered non-deterministic in this change:
     1. Random functions like rand/random/uuid
     2. Current time functions like now/current_timestamp
     3. Session/system information like current_user/pid/coordinator
     4. AI functions
     5. UDFs
    
    With enable_expr_rewrites=true, constant folding can replace
    some of these with a single constant (e.g. now() becomes a specific
    timestamp). This is not a correctness problem for tuple caching,
    because the specific value is incorporated into the cache key.
    
    Testing:
     - Added test cases to TupleCacheTest
    
    Change-Id: I9601dba87b3c8f24cbe42eca0d8070db42b50488
    Reviewed-on: http://gerrit.cloudera.org:8080/22011
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../apache/impala/analysis/FunctionCallExpr.java   | 113 +++++++++++++++------
 .../org/apache/impala/planner/TupleCacheInfo.java  |   1 +
 .../org/apache/impala/planner/TupleCacheTest.java  |  59 ++++++++---
 3 files changed, 130 insertions(+), 43 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java 
b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
index f9c30599c..f4f59df18 100644
--- a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
@@ -33,7 +33,9 @@ import org.apache.impala.catalog.ScalarFunction;
 import org.apache.impala.catalog.ScalarType;
 import org.apache.impala.catalog.Type;
 import org.apache.impala.common.AnalysisException;
+import org.apache.impala.common.ThriftSerializationCtx;
 import org.apache.impala.common.TreeNode;
+import org.apache.impala.planner.TupleCacheInfo.IneligibilityReason;
 import org.apache.impala.service.FrontendProfile;
 import org.apache.impala.thrift.TAggregateExpr;
 import org.apache.impala.thrift.TColumnType;
@@ -47,6 +49,7 @@ import org.slf4j.LoggerFactory;
 import com.google.common.base.Joiner;
 import com.google.common.base.MoreObjects;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 
 public class FunctionCallExpr extends Expr {
@@ -137,19 +140,43 @@ public class FunctionCallExpr extends Expr {
     return functionCallExpr;
   }
 
-  /** Returns true if fnName is a built-in with given name. */
-  private static boolean functionNameEqualsBuiltin(FunctionName fnName, String 
name) {
-    // We could either have a function path, or the function name and optional 
database.
+  /** Returns the function name if this is a built-in. Otherwise, it returns 
null.
+   * NOTE: This function can be used before the FunctionName is analyzed, so 
the function
+   * has not been validated. It can be used to compare against valid builtin 
functions.
+   */
+  private static String getBuiltinFunctionName(FunctionName fnName) {
     if (fnName.getFnNamePath() != null) {
-      return fnName.getFnNamePath().size() == 1
-             && fnName.getFnNamePath().get(0).equalsIgnoreCase(name)
-          || fnName.getFnNamePath().size() == 2
-             && fnName.getFnNamePath().get(0).equals(BuiltinsDb.NAME)
-             && fnName.getFnNamePath().get(1).equalsIgnoreCase(name);
+      if (fnName.getFnNamePath().size() == 1) {
+        return fnName.getFnNamePath().get(0);
+      } else if (fnName.getFnNamePath().size() == 2) {
+        if (fnName.getFnNamePath().get(0).equals(BuiltinsDb.NAME)) {
+          return fnName.getFnNamePath().get(1);
+        }
+      }
     } else {
-      return (fnName.getDb() == null || 
fnName.getDb().equals(BuiltinsDb.NAME)) &&
-          fnName.getFunction().equalsIgnoreCase(name);
+      if (fnName.getDb() == null || fnName.getDb().equals(BuiltinsDb.NAME)) {
+        return fnName.getFunction();
+      }
     }
+    // Invalid or not a builtin function
+    return null;
+  }
+
+  /** Returns true if fnName is a built-in with given name. */
+  private static boolean functionNameEqualsBuiltin(FunctionName fnName, String 
name) {
+    String fnNameStr = getBuiltinFunctionName(fnName);
+    if (fnNameStr == null) return false;
+    return fnNameStr.equalsIgnoreCase(name);
+  }
+
+  /** Returns true if fnName is a built-in with a name in the provided set of
+   * lower-case function names.
+   */
+  private static boolean functionNameInBuiltinSet(FunctionName fnName,
+      Set<String> lowerCaseNames) {
+    String fnNameStr = getBuiltinFunctionName(fnName);
+    if (fnNameStr == null) return false;
+    return lowerCaseNames.contains(fnNameStr.toLowerCase());
   }
 
   /**
@@ -355,35 +382,61 @@ public class FunctionCallExpr extends Expr {
    * about user defined functions.
    */
   public boolean isNondeterministicBuiltinFn() {
-    return functionNameEqualsBuiltin(fnName_, "rand") ||
-        functionNameEqualsBuiltin(fnName_, "random") ||
-        functionNameEqualsBuiltin(fnName_, "uuid");
+    return functionNameInBuiltinSet(fnName_, ImmutableSet.of("rand", "random", 
"uuid"));
+  }
+
+  /**
+   * Returns true if function is non-deterministic over time, i.e. it may 
produce
+   * different results for the same inputs in a different query execution. 
This is a more
+   * expansive definition than isNondeterministicBuiltinFn(), because there 
are many
+   * functions that can be deterministic inside a single query execution that 
would vary
+   * over time. For example, timestamp functions and session/system 
information functions
+   * are not deterministic over time.
+   */
+  public boolean isNondeterministicAcrossQueries() {
+    // UDFs are not known to be deterministic over time
+    // TODO: Provide a way to mark UDFs as deterministic over time
+    if (!fnName_.isBuiltin()) return true;
+
+    // TODO: rand()/random() is actually deterministic in scan nodes, because 
it resets
+    // the RNG for each scan range. This can be improved to consider 
rand()/random()
+    // deterministic in that context.
+    if (isNondeterministicBuiltinFn()) return true;
+
+    // Functions that vary across different query invocations / different 
sessions / etc
+    Set knownNondeterministicFns =
+        ImmutableSet.of(
+            // Current timestamp functions
+            "current_date", "current_timestamp", "now", "timeofday", 
"unix_timestamp",
+            "utc_timestamp",
+            // Session / system information
+            "coordinator", "current_database", "current_session", 
"current_user",
+            "effective_user", "logged_in_user", "pid", "user", "version",
+            // AI Functions
+            "ai_generate_text", "ai_generate_text_default");
+    return functionNameInBuiltinSet(fnName_, knownNondeterministicFns);
   }
 
   /**
    * Returns true if function is a conditional builtin function
    */
   public boolean isConditionalBuiltinFn() {
-    return functionNameEqualsBuiltin(fnName_, "coalesce") ||
-        functionNameEqualsBuiltin(fnName_, "decode") ||
-        functionNameEqualsBuiltin(fnName_, "if") ||
-        functionNameEqualsBuiltin(fnName_, "ifnull") ||
-        functionNameEqualsBuiltin(fnName_, "isfalse") ||
-        functionNameEqualsBuiltin(fnName_, "isnotfalse") ||
-        functionNameEqualsBuiltin(fnName_, "isnottrue") ||
-        functionNameEqualsBuiltin(fnName_, "isnull") ||
-        functionNameEqualsBuiltin(fnName_, "istrue") ||
-        functionNameEqualsBuiltin(fnName_, "nonnullvalue") ||
-        functionNameEqualsBuiltin(fnName_, "nullif") ||
-        functionNameEqualsBuiltin(fnName_, "nullifzero") ||
-        functionNameEqualsBuiltin(fnName_, "nullvalue") ||
-        functionNameEqualsBuiltin(fnName_, "nvl") ||
-        functionNameEqualsBuiltin(fnName_, "nvl2") ||
-        functionNameEqualsBuiltin(fnName_, "zeroifnull");
+    return functionNameInBuiltinSet(fnName_,
+        ImmutableSet.of("coalesce", "decode", "if", "ifnull", "isfalse", 
"isnotfalse",
+            "isnottrue", "isnull", "istrue", "nonnullvalue", "nullif", 
"nullifzero",
+            "nullvalue", "nvl", "nvl2", "zeroifnull"));
+  }
+
+  @Override protected void toThrift(TExprNode msg) {
+    Preconditions.checkState(false,
+        "Unexpected use of old FunctionCallExpr::toThrift() signature");
   }
 
   @Override
-  protected void toThrift(TExprNode msg) {
+  protected void toThrift(TExprNode msg, ThriftSerializationCtx serialCtx) {
+    if (serialCtx.isTupleCache() && isNondeterministicAcrossQueries()) {
+      
serialCtx.setTupleCachingIneligible(IneligibilityReason.NONDETERMINISTIC_FN);
+    }
     if (isAggregateFunction() || isAnalyticFnCall_) {
       msg.node_type = TExprNodeType.AGGREGATE_EXPR;
       List<TColumnType> aggFnArgTypes = Lists.newArrayList();
diff --git a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java 
b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
index dddee3e81..0e67524d3 100644
--- a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
+++ b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
@@ -99,6 +99,7 @@ public class TupleCacheInfo {
     // way. In future, this can support locations that are deterministic (e.g.
     // limits on a sorted input).
     LIMIT,
+    NONDETERMINISTIC_FN,
   }
   private EnumSet<IneligibilityReason> ineligibilityReasons_;
 
diff --git a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java 
b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
index 03510895c..b5ccc05ac 100644
--- a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
@@ -26,6 +26,7 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
+import org.apache.impala.catalog.Type;
 import org.apache.impala.common.ImpalaException;
 import org.apache.impala.service.Frontend.PlanCtx;
 import org.apache.impala.testutil.TestUtils;
@@ -33,6 +34,8 @@ import org.apache.impala.thrift.TQueryCtx;
 import org.apache.impala.thrift.TQueryOptions;
 import org.junit.Test;
 
+import com.google.common.collect.Lists;
+
 /**
  * Test the planner's tuple cache node keys and eligibility
  *
@@ -139,21 +142,51 @@ public class TupleCacheTest extends PlannerTestBase {
         "select a.id from functional.alltypes a, functional.alltypes b",
         "select a.id from functional.alltypes a, functional.alltypes b limit 
10");
 
-    // TODO: Random functions should make a location ineligible
+    // Random functions should make a location ineligible
     // rand()/random()/uuid()
-    // verifyCacheIneligible(
-    //     "select id from functional.alltypes where id < 7300 * rand()");
-    // verifyCacheIneligible(
-    //     "select id from functional.alltypes where id < 7300 * random()");
-    // verifyCacheIneligible(
-    //     "select id from functional.alltypes where string_col != uuid()");
-
-    // TODO: Time functions are replaced by constant folding
+    verifyCacheIneligible(
+        "select id from functional.alltypes where id < 7300 * rand()");
+    verifyCacheIneligible(
+        "select id from functional.alltypes where id < 7300 * random()");
+    verifyCacheIneligible(
+        "select id from functional.alltypes where string_col != uuid()");
+
+    // NOTE: Expression rewrites can replace the constant functions with 
constants,
+    // so the following tests only work for enable_expr_rewrites=false (which 
is the
+    // default for getPlan()). If they are replaced with a constant, then the 
constant
+    // is hashed into the cache key. This can lead to cache misses, but it 
does not
+    // lead to incorrect results.
+
     // 
now()/current_date()/current_timestamp()/unix_timestamp()/utc_timestamp()/etc.
-    // verifyCacheIneligible(
-    //     "select timestamp_col from functional.alltypes where timestamp_col 
< now()");
-    // verifyCacheIneligible(
-    //     "select date_col from functional.date_tbl where date_col < 
current_date()");
+    verifyCacheIneligible(
+        "select timestamp_col from functional.alltypes where timestamp_col < 
now()");
+    verifyCacheIneligible(
+        "select date_col from functional.date_tbl where date_col < 
current_date()");
+
+    // Mixed-case equivalent to check that it is not case sensitive
+    verifyCacheIneligible(
+        "select date_col from functional.date_tbl where date_col < 
CURRENT_date()");
+
+    // System / session information can change over time (e.g. different 
sessions/users
+    // can be connected to different coordinators, etc).
+    verifyCacheIneligible(
+        "select string_col from functional.alltypes where string_col = 
current_user()");
+    verifyCacheIneligible(
+        "select string_col from functional.alltypes where string_col = 
coordinator()");
+
+    // AI functions are nondeterministic
+    verifyCacheIneligible(
+       "select string_col from functional.alltypes " +
+       "where string_col = ai_generate_text_default(string_col)");
+
+    // UDFs are considered nondeterministic
+    addTestFunction("TestUdf", Lists.newArrayList(Type.INT), false);
+    verifyCacheIneligible(
+       "select string_col from functional.alltypes where TestUdf(int_col) = 
int_col");
+
+    // Mixing eligible conjuncts with ineligible conjuncts doesn't change the 
eligibility
+    verifyCacheIneligible(
+       "select * from functional.alltypes where id < 5 and string_col = 
current_user()");
   }
 
   /**

Reply via email to