This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 4681666e9 IMPALA-12800: Add cache for isTrueWithNullSlots() evaluation
4681666e9 is described below
commit 4681666e9386d87c647d19d6333750c16b6fa0c1
Author: Michael Smith <[email protected]>
AuthorDate: Fri May 31 11:47:39 2024 -0700
IMPALA-12800: Add cache for isTrueWithNullSlots() evaluation
isTrueWithNullSlots() can be expensive when it has to query the backend.
Many of the expressions will look similar, especially in large
auto-generated expressions. Adds a cache based on the nullified
expression to avoid querying the backend for expressions with identical
structure.
With DEBUG logging enabled for the Analyzer, computes and logs stats
about the null slots cache.
Adds 'use_null_slots_cache' query option to disable caching. Documents
the new option.
Change-Id: Ib63f5553284f21f775d2097b6c5d6bbb63699acd
Reviewed-on: http://gerrit.cloudera.org:8080/21484
Reviewed-by: Quanlong Huang <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/service/query-options.cc | 4 ++
be/src/service/query-options.h | 3 +-
common/thrift/ImpalaService.thrift | 5 ++
common/thrift/Query.thrift | 3 ++
docs/impala.ditamap | 3 +-
docs/topics/impala_use_null_slots_cache.xml | 61 ++++++++++++++++++++++
.../java/org/apache/impala/analysis/Analyzer.java | 52 +++++++++++++++++-
.../java/org/apache/impala/planner/Planner.java | 1 +
8 files changed, 129 insertions(+), 3 deletions(-)
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 49aa2fc74..7e561200b 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -1293,6 +1293,10 @@ Status impala::SetQueryOption(TImpalaQueryOptions::type
option, const string& va
query_options->__set_clean_dbcp_ds_cache(IsTrue(value));
break;
}
+ case TImpalaQueryOptions::USE_NULL_SLOTS_CACHE: {
+ query_options->__set_use_null_slots_cache(IsTrue(value));
+ break;
+ }
default:
string key = to_string(option);
if (IsRemovedQueryOption(key)) {
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index aa6d73ffb..96402d4e8 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -52,7 +52,7 @@ typedef std::unordered_map<string,
beeswax::TQueryOptionLevel::type>
// time we add or remove a query option to/from the enum TImpalaQueryOptions.
#define QUERY_OPTS_TABLE
\
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),
\
- TImpalaQueryOptions::CLEAN_DBCP_DS_CACHE + 1);
\
+ TImpalaQueryOptions::USE_NULL_SLOTS_CACHE + 1);
\
REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded,
ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \
QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)
\
REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)
\
@@ -332,6 +332,7 @@ typedef std::unordered_map<string,
beeswax::TQueryOptionLevel::type>
RUNTIME_FILTER_IDS_TO_SKIP, TQueryOptionLevel::DEVELOPMENT)
\
QUERY_OPT_FN(slot_count_strategy, SLOT_COUNT_STRATEGY,
TQueryOptionLevel::ADVANCED) \
QUERY_OPT_FN(clean_dbcp_ds_cache, CLEAN_DBCP_DS_CACHE,
TQueryOptionLevel::ADVANCED) \
+ QUERY_OPT_FN(use_null_slots_cache, USE_NULL_SLOTS_CACHE,
TQueryOptionLevel::ADVANCED) \
;
/// Enforce practical limits on some query options to avoid undesired query
state.
diff --git a/common/thrift/ImpalaService.thrift
b/common/thrift/ImpalaService.thrift
index c83f46cdb..fdf7710dc 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -940,6 +940,11 @@ enum TImpalaQueryOptions {
// cache when its reference count equals 0. By caching DBCP DataSource
objects, we can
// avoid to reload JDBC driver.
CLEAN_DBCP_DS_CACHE = 178
+
+ // Enables cache for isTrueWithNullSlots, which can be expensive when
evaluating lots
+ // of expressions. The cache helps with generated expressions, which often
contain lots
+ // of repeated patterns.
+ USE_NULL_SLOTS_CACHE = 179
}
// The summary of a DML statement.
diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift
index 075cf2739..31e6a6aca 100644
--- a/common/thrift/Query.thrift
+++ b/common/thrift/Query.thrift
@@ -729,6 +729,9 @@ struct TQueryOptions {
TSlotCountStrategy.LARGEST_FRAGMENT
179: optional bool clean_dbcp_ds_cache = true;
+
+ // See comment in ImpalaService.thrift
+ 180: optional bool use_null_slots_cache = true;
}
// Impala currently has three types of sessions: Beeswax, HiveServer2 and
external
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 046721002..cf4d2e728 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -194,6 +194,7 @@ under the License.
<topicref href="topics/impala_enable_expr_rewrites.xml"/>
<topicref href="topics/impala_exec_single_node_rows_threshold.xml"/>
<topicref href="topics/impala_exec_time_limit_s.xml"/>
+ <topicref rev="4.2.0" href="topics/impala_expand_complex_types.xml"/>
<topicref href="topics/impala_explain_level.xml">
<topicref rev="2.5.0"
href="topics/impala_max_num_runtime_filters.xml"/>
</topicref>
@@ -255,8 +256,8 @@ under the License.
<topicref href="topics/impala_thread_reservation_limit.xml"/>
<topicref href="topics/impala_timezone.xml"/>
<topicref href="topics/impala_topn_bytes_limit.xml"/>
+ <topicref rev="4.5.0" href="topics/impala_use_null_slots_cache.xml"/>
<topicref href="topics/impala_utf8_mode.xml"/>
- <topicref href="topics/impala_expand_complex_types.xml"/>
</topicref>
<topicref href="topics/impala_show.xml"/>
<topicref href="topics/impala_shutdown.xml"/>
diff --git a/docs/topics/impala_use_null_slots_cache.xml
b/docs/topics/impala_use_null_slots_cache.xml
new file mode 100644
index 000000000..55678bec8
--- /dev/null
+++ b/docs/topics/impala_use_null_slots_cache.xml
@@ -0,0 +1,61 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="use_null_slots_cache">
+
+ <title>USE_NULL_SLOTS_CACHE Query Option</title>
+ <titlealts audience="PDF"><navtitle>USE NULL SLOTS
CACHE</navtitle></titlealts>
+ <prolog>
+ <metadata>
+ <data name="Category" value="Impala"/>
+ <data name="Category" value="Impala Query Options"/>
+ <data name="Category" value="Troubleshooting"/>
+ <data name="Category" value="Querying"/>
+ <data name="Category" value="Developers"/>
+ <data name="Category" value="Data Analysts"/>
+ </metadata>
+ </prolog>
+
+ <conbody>
+ <p>
+ <indexterm audience="hidden">USE_NULL_SLOTS_CACHE Query Option</indexterm>
Impala
+ optimizes expressions that contain null-rejecting conjuncts - expressions
that are
+ false if all "slots" (values returned from a table) are
<codeph>NULL</codeph>.
+ Evaluating whether a conjunct is null-rejecting can be expensive when
performed over a
+ lot of expressions. The "null slots cache" speeds analysis of generated
queries with
+ many similar expressions by caching the result of evaluating an expression
after slots
+ have been replaced with <codeph>NULL</codeph>s.
+ </p>
+
+ <p>
+ If query analysis uses a lot of memory, performance of the "null slots
cache" can be
+ reviewed by enabling logging
<codeph>org.apache.impala.analysis.Analyzer=DEBUG</codeph>
+ and searching coordinator logs for messages like
+ </p>
+
+<codeblock>
+null slots cache size: 286, median entry: 4416.0, 99th entry: 252688.88, hit
rate: 0.99456364
+</codeblock>
+
+ <p><b>Type: </b>BOOLEAN</p>
+ <p><b>Default: </b>TRUE</p>
+ <p><b>Added in: </b>Impala 4.5</p>
+ </conbody>
+</concept>
diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
index 2590236b4..926b0263d 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java
@@ -34,6 +34,7 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
+import java.util.concurrent.ExecutionException;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
@@ -108,6 +109,7 @@ import org.apache.impala.service.FeSupport;
import org.apache.impala.thrift.QueryConstants;
import org.apache.impala.thrift.TAccessEvent;
import org.apache.impala.thrift.TCatalogObjectType;
+import org.apache.impala.thrift.TImpalaQueryOptions;
import org.apache.impala.thrift.TLineageGraph;
import org.apache.impala.thrift.TNetworkAddress;
import org.apache.impala.thrift.TQueryCtx;
@@ -124,12 +126,19 @@ import org.apache.impala.util.ListMap;
import org.apache.impala.util.MetaStoreUtil;
import org.apache.impala.util.TSessionStateUtil;
import org.apache.kudu.client.KuduClient;
+import org.github.jamm.CannotAccessFieldException;
+import org.github.jamm.MemoryMeter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.codahale.metrics.Histogram;
+import com.codahale.metrics.Snapshot;
+import com.codahale.metrics.UniformReservoir;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
@@ -574,6 +583,10 @@ public class Analyzer {
// Expr rewriter for normalizing and rewriting expressions.
private final ExprRewriter exprRewriter_;
+ // Null slots cache - for expressions with slots replaced by nulls - to
reduce
+ // backend expression evaluation when query contains many similar
expressions.
+ private final Cache<Expr, Boolean> nullSlotsCache;
+
// Total number of expressions across the statement (including all
subqueries). This
// is used to enforce a limit on the total number of expressions.
Incremented by
// incrementNumStmtExprs(). Note that this does not include expressions
that do not
@@ -648,6 +661,9 @@ public class Analyzer {
}
rules.add(CountStarToConstRule.INSTANCE);
exprRewriter_ = new ExprRewriter(rules);
+ nullSlotsCache =
+ queryCtx.getClient_request().getQuery_options().use_null_slots_cache
?
+ CacheBuilder.newBuilder().concurrencyLevel(1).recordStats().build()
: null;
}
};
@@ -2958,7 +2974,41 @@ public class Analyzer {
*/
public boolean isTrueWithNullSlots(Expr p) throws InternalException {
Expr nullTuplePred = substituteNullSlots(p);
- return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx());
+ if (globalState_.nullSlotsCache == null) {
+ return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx());
+ }
+
+ try {
+ return globalState_.nullSlotsCache.get(nullTuplePred,
+ () -> FeSupport.EvalPredicate(nullTuplePred, getQueryCtx()));
+ } catch (ExecutionException e) {
+ Preconditions.checkState(e.getCause() instanceof InternalException,
+ "Internal error using null slots cache: %s\nDisable null slots cache
with " +
+ "the %s query option.", e,
TImpalaQueryOptions.USE_NULL_SLOTS_CACHE.name());
+ throw (InternalException) e.getCause();
+ }
+ }
+
+ /**
+ * Log hit rate and size of null slots cache.
+ */
+ public void logCacheStats() {
+ if (!LOG.isDebugEnabled() || globalState_.nullSlotsCache == null) return;
+
+ Histogram exprSize = new Histogram(new UniformReservoir());
+ MemoryMeter meter = MemoryMeter.builder().build();
+ for (Expr expr : globalState_.nullSlotsCache.asMap().keySet()) {
+ try {
+ exprSize.update(meter.measureDeep(expr));
+ } catch (CannotAccessFieldException e) {
+ // This may happen if we miss an add-opens call for lambdas in Java 17.
+ LOG.warn("Unable to weigh cache entry, additional add-opens needed",
e);
+ }
+ }
+ Snapshot snap = exprSize.getSnapshot();
+ LOG.debug("null slots cache size: {}, median entry: {}, 99th percentile
entry: {}, "+
+ "hit rate: {}", globalState_.nullSlotsCache.size(), snap.getMedian(),
+ snap.get99thPercentile(),
globalState_.nullSlotsCache.stats().hitRate());
}
/**
diff --git a/fe/src/main/java/org/apache/impala/planner/Planner.java
b/fe/src/main/java/org/apache/impala/planner/Planner.java
index 8c1d7791d..29685f924 100644
--- a/fe/src/main/java/org/apache/impala/planner/Planner.java
+++ b/fe/src/main/java/org/apache/impala/planner/Planner.java
@@ -212,6 +212,7 @@ public class Planner {
Collections.reverse(fragments);
ctx_.getTimeline().markEvent("Distributed plan created");
+ ctx_.getRootAnalyzer().logCacheStats();
ColumnLineageGraph graph = ctx_.getRootAnalyzer().getColumnLineageGraph();
if (BackendConfig.INSTANCE.getComputeLineage() ||
RuntimeEnv.INSTANCE.isTestEnv()) {