This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 68c42a5d6 IMPALA-13179: Make non-deterministic functions ineligible
for tuple caching
68c42a5d6 is described below
commit 68c42a5d660be4c411a61668af474ad43d730b69
Author: Joe McDonnell <[email protected]>
AuthorDate: Thu Oct 31 13:06:41 2024 -0700
IMPALA-13179: Make non-deterministic functions ineligible for tuple caching
Non-deterministic functions should make a location ineligible
for caching. Unlike existing definitions of non-determinism
like FunctionCallExpr.isNondeterministicBuiltinFn(),
the non-determinism needs to apply over time and across query
boundaries, so it is a broader list of functions.
The following are considered non-deterministic in this change:
1. Random functions like rand/random/uuid
2. Current time functions like now/current_timestamp
3. Session/system information like current_user/pid/coordinator
4. AI functions
5. UDFs
With enable_expr_rewrites=true, constant folding can replace
some of these with a single constant (e.g. now() becomes a specific
timestamp). This is not a correctness problem for tuple caching,
because the specific value is incorporated into the cache key.
Testing:
- Added test cases to TupleCacheTest
Change-Id: I9601dba87b3c8f24cbe42eca0d8070db42b50488
Reviewed-on: http://gerrit.cloudera.org:8080/22011
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../apache/impala/analysis/FunctionCallExpr.java | 113 +++++++++++++++------
.../org/apache/impala/planner/TupleCacheInfo.java | 1 +
.../org/apache/impala/planner/TupleCacheTest.java | 59 ++++++++---
3 files changed, 130 insertions(+), 43 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
index f9c30599c..f4f59df18 100644
--- a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java
@@ -33,7 +33,9 @@ import org.apache.impala.catalog.ScalarFunction;
import org.apache.impala.catalog.ScalarType;
import org.apache.impala.catalog.Type;
import org.apache.impala.common.AnalysisException;
+import org.apache.impala.common.ThriftSerializationCtx;
import org.apache.impala.common.TreeNode;
+import org.apache.impala.planner.TupleCacheInfo.IneligibilityReason;
import org.apache.impala.service.FrontendProfile;
import org.apache.impala.thrift.TAggregateExpr;
import org.apache.impala.thrift.TColumnType;
@@ -47,6 +49,7 @@ import org.slf4j.LoggerFactory;
import com.google.common.base.Joiner;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
public class FunctionCallExpr extends Expr {
@@ -137,19 +140,43 @@ public class FunctionCallExpr extends Expr {
return functionCallExpr;
}
- /** Returns true if fnName is a built-in with given name. */
- private static boolean functionNameEqualsBuiltin(FunctionName fnName, String
name) {
- // We could either have a function path, or the function name and optional
database.
+ /** Returns the function name if this is a built-in. Otherwise, it returns
null.
+ * NOTE: This function can be used before the FunctionName is analyzed, so
the function
+ * has not been validated. It can be used to compare against valid builtin
functions.
+ */
+ private static String getBuiltinFunctionName(FunctionName fnName) {
if (fnName.getFnNamePath() != null) {
- return fnName.getFnNamePath().size() == 1
- && fnName.getFnNamePath().get(0).equalsIgnoreCase(name)
- || fnName.getFnNamePath().size() == 2
- && fnName.getFnNamePath().get(0).equals(BuiltinsDb.NAME)
- && fnName.getFnNamePath().get(1).equalsIgnoreCase(name);
+ if (fnName.getFnNamePath().size() == 1) {
+ return fnName.getFnNamePath().get(0);
+ } else if (fnName.getFnNamePath().size() == 2) {
+ if (fnName.getFnNamePath().get(0).equals(BuiltinsDb.NAME)) {
+ return fnName.getFnNamePath().get(1);
+ }
+ }
} else {
- return (fnName.getDb() == null ||
fnName.getDb().equals(BuiltinsDb.NAME)) &&
- fnName.getFunction().equalsIgnoreCase(name);
+ if (fnName.getDb() == null || fnName.getDb().equals(BuiltinsDb.NAME)) {
+ return fnName.getFunction();
+ }
}
+ // Invalid or not a builtin function
+ return null;
+ }
+
+ /** Returns true if fnName is a built-in with given name. */
+ private static boolean functionNameEqualsBuiltin(FunctionName fnName, String
name) {
+ String fnNameStr = getBuiltinFunctionName(fnName);
+ if (fnNameStr == null) return false;
+ return fnNameStr.equalsIgnoreCase(name);
+ }
+
+ /** Returns true if fnName is a built-in with a name in the provided set of
+ * lower-case function names.
+ */
+ private static boolean functionNameInBuiltinSet(FunctionName fnName,
+ Set<String> lowerCaseNames) {
+ String fnNameStr = getBuiltinFunctionName(fnName);
+ if (fnNameStr == null) return false;
+ return lowerCaseNames.contains(fnNameStr.toLowerCase());
}
/**
@@ -355,35 +382,61 @@ public class FunctionCallExpr extends Expr {
* about user defined functions.
*/
public boolean isNondeterministicBuiltinFn() {
- return functionNameEqualsBuiltin(fnName_, "rand") ||
- functionNameEqualsBuiltin(fnName_, "random") ||
- functionNameEqualsBuiltin(fnName_, "uuid");
+ return functionNameInBuiltinSet(fnName_, ImmutableSet.of("rand", "random",
"uuid"));
+ }
+
+ /**
+ * Returns true if function is non-deterministic over time, i.e. it may
produce
+ * different results for the same inputs in a different query execution.
This is a more
+ * expansive definition than isNondeterministicBuiltinFn(), because there
are many
+ * functions that can be deterministic inside a single query execution that
would vary
+ * over time. For example, timestamp functions and session/system
information functions
+ * are not deterministic over time.
+ */
+ public boolean isNondeterministicAcrossQueries() {
+ // UDFs are not known to be deterministic over time
+ // TODO: Provide a way to mark UDFs as deterministic over time
+ if (!fnName_.isBuiltin()) return true;
+
+ // TODO: rand()/random() is actually deterministic in scan nodes, because
it resets
+ // the RNG for each scan range. This can be improved to consider
rand()/random()
+ // deterministic in that context.
+ if (isNondeterministicBuiltinFn()) return true;
+
+ // Functions that vary across different query invocations / different
sessions / etc
+ Set knownNondeterministicFns =
+ ImmutableSet.of(
+ // Current timestamp functions
+ "current_date", "current_timestamp", "now", "timeofday",
"unix_timestamp",
+ "utc_timestamp",
+ // Session / system information
+ "coordinator", "current_database", "current_session",
"current_user",
+ "effective_user", "logged_in_user", "pid", "user", "version",
+ // AI Functions
+ "ai_generate_text", "ai_generate_text_default");
+ return functionNameInBuiltinSet(fnName_, knownNondeterministicFns);
}
/**
* Returns true if function is a conditional builtin function
*/
public boolean isConditionalBuiltinFn() {
- return functionNameEqualsBuiltin(fnName_, "coalesce") ||
- functionNameEqualsBuiltin(fnName_, "decode") ||
- functionNameEqualsBuiltin(fnName_, "if") ||
- functionNameEqualsBuiltin(fnName_, "ifnull") ||
- functionNameEqualsBuiltin(fnName_, "isfalse") ||
- functionNameEqualsBuiltin(fnName_, "isnotfalse") ||
- functionNameEqualsBuiltin(fnName_, "isnottrue") ||
- functionNameEqualsBuiltin(fnName_, "isnull") ||
- functionNameEqualsBuiltin(fnName_, "istrue") ||
- functionNameEqualsBuiltin(fnName_, "nonnullvalue") ||
- functionNameEqualsBuiltin(fnName_, "nullif") ||
- functionNameEqualsBuiltin(fnName_, "nullifzero") ||
- functionNameEqualsBuiltin(fnName_, "nullvalue") ||
- functionNameEqualsBuiltin(fnName_, "nvl") ||
- functionNameEqualsBuiltin(fnName_, "nvl2") ||
- functionNameEqualsBuiltin(fnName_, "zeroifnull");
+ return functionNameInBuiltinSet(fnName_,
+ ImmutableSet.of("coalesce", "decode", "if", "ifnull", "isfalse",
"isnotfalse",
+ "isnottrue", "isnull", "istrue", "nonnullvalue", "nullif",
"nullifzero",
+ "nullvalue", "nvl", "nvl2", "zeroifnull"));
+ }
+
+ @Override protected void toThrift(TExprNode msg) {
+ Preconditions.checkState(false,
+ "Unexpected use of old FunctionCallExpr::toThrift() signature");
}
@Override
- protected void toThrift(TExprNode msg) {
+ protected void toThrift(TExprNode msg, ThriftSerializationCtx serialCtx) {
+ if (serialCtx.isTupleCache() && isNondeterministicAcrossQueries()) {
+
serialCtx.setTupleCachingIneligible(IneligibilityReason.NONDETERMINISTIC_FN);
+ }
if (isAggregateFunction() || isAnalyticFnCall_) {
msg.node_type = TExprNodeType.AGGREGATE_EXPR;
List<TColumnType> aggFnArgTypes = Lists.newArrayList();
diff --git a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
index dddee3e81..0e67524d3 100644
--- a/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
+++ b/fe/src/main/java/org/apache/impala/planner/TupleCacheInfo.java
@@ -99,6 +99,7 @@ public class TupleCacheInfo {
// way. In future, this can support locations that are deterministic (e.g.
// limits on a sorted input).
LIMIT,
+ NONDETERMINISTIC_FN,
}
private EnumSet<IneligibilityReason> ineligibilityReasons_;
diff --git a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
index 03510895c..b5ccc05ac 100644
--- a/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/TupleCacheTest.java
@@ -26,6 +26,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import org.apache.impala.catalog.Type;
import org.apache.impala.common.ImpalaException;
import org.apache.impala.service.Frontend.PlanCtx;
import org.apache.impala.testutil.TestUtils;
@@ -33,6 +34,8 @@ import org.apache.impala.thrift.TQueryCtx;
import org.apache.impala.thrift.TQueryOptions;
import org.junit.Test;
+import com.google.common.collect.Lists;
+
/**
* Test the planner's tuple cache node keys and eligibility
*
@@ -139,21 +142,51 @@ public class TupleCacheTest extends PlannerTestBase {
"select a.id from functional.alltypes a, functional.alltypes b",
"select a.id from functional.alltypes a, functional.alltypes b limit
10");
- // TODO: Random functions should make a location ineligible
+ // Random functions should make a location ineligible
// rand()/random()/uuid()
- // verifyCacheIneligible(
- // "select id from functional.alltypes where id < 7300 * rand()");
- // verifyCacheIneligible(
- // "select id from functional.alltypes where id < 7300 * random()");
- // verifyCacheIneligible(
- // "select id from functional.alltypes where string_col != uuid()");
-
- // TODO: Time functions are replaced by constant folding
+ verifyCacheIneligible(
+ "select id from functional.alltypes where id < 7300 * rand()");
+ verifyCacheIneligible(
+ "select id from functional.alltypes where id < 7300 * random()");
+ verifyCacheIneligible(
+ "select id from functional.alltypes where string_col != uuid()");
+
+ // NOTE: Expression rewrites can replace the constant functions with
constants,
+ // so the following tests only work for enable_expr_rewrites=false (which
is the
+ // default for getPlan()). If they are replaced with a constant, then the
constant
+ // is hashed into the cache key. This can lead to cache misses, but it
does not
+ // lead to incorrect results.
+
//
now()/current_date()/current_timestamp()/unix_timestamp()/utc_timestamp()/etc.
- // verifyCacheIneligible(
- // "select timestamp_col from functional.alltypes where timestamp_col
< now()");
- // verifyCacheIneligible(
- // "select date_col from functional.date_tbl where date_col <
current_date()");
+ verifyCacheIneligible(
+ "select timestamp_col from functional.alltypes where timestamp_col <
now()");
+ verifyCacheIneligible(
+ "select date_col from functional.date_tbl where date_col <
current_date()");
+
+ // Mixed-case equivalent to check that it is not case sensitive
+ verifyCacheIneligible(
+ "select date_col from functional.date_tbl where date_col <
CURRENT_date()");
+
+ // System / session information can change over time (e.g. different
sessions/users
+ // can be connected to different coordinators, etc).
+ verifyCacheIneligible(
+ "select string_col from functional.alltypes where string_col =
current_user()");
+ verifyCacheIneligible(
+ "select string_col from functional.alltypes where string_col =
coordinator()");
+
+ // AI functions are nondeterministic
+ verifyCacheIneligible(
+ "select string_col from functional.alltypes " +
+ "where string_col = ai_generate_text_default(string_col)");
+
+ // UDFs are considered nondeterministic
+ addTestFunction("TestUdf", Lists.newArrayList(Type.INT), false);
+ verifyCacheIneligible(
+ "select string_col from functional.alltypes where TestUdf(int_col) =
int_col");
+
+ // Mixing eligible conjuncts with ineligible conjuncts doesn't change the
eligibility
+ verifyCacheIneligible(
+ "select * from functional.alltypes where id < 5 and string_col =
current_user()");
}
/**