This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.4.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit 51661d335b22fad0da72ddaa8980539cd5ef62c8 Author: Michael Smith <[email protected]> AuthorDate: Fri May 31 11:47:39 2024 -0700 IMPALA-12800: Add cache for isTrueWithNullSlots() evaluation isTrueWithNullSlots() can be expensive when it has to query the backend. Many of the expressions will look similar, especially in large auto-generated expressions. Adds a cache based on the nullified expression to avoid querying the backend for expressions with identical structure. With DEBUG logging enabled for the Analyzer, computes and logs stats about the null slots cache. Adds 'use_null_slots_cache' query option to disable caching. Documents the new option. Change-Id: Ib63f5553284f21f775d2097b6c5d6bbb63699acd Reviewed-on: http://gerrit.cloudera.org:8080/21484 Reviewed-by: Quanlong Huang <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/service/query-options.cc | 4 ++ be/src/service/query-options.h | 3 +- common/thrift/ImpalaService.thrift | 5 ++ common/thrift/Query.thrift | 3 ++ docs/impala.ditamap | 3 +- docs/topics/impala_use_null_slots_cache.xml | 61 ++++++++++++++++++++++ .../java/org/apache/impala/analysis/Analyzer.java | 52 +++++++++++++++++- .../java/org/apache/impala/planner/Planner.java | 1 + 8 files changed, 129 insertions(+), 3 deletions(-) diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 49aa2fc74..7e561200b 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -1293,6 +1293,10 @@ Status impala::SetQueryOption(TImpalaQueryOptions::type option, const string& va query_options->__set_clean_dbcp_ds_cache(IsTrue(value)); break; } + case TImpalaQueryOptions::USE_NULL_SLOTS_CACHE: { + query_options->__set_use_null_slots_cache(IsTrue(value)); + break; + } default: string key = to_string(option); if (IsRemovedQueryOption(key)) { diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index aa6d73ffb..96402d4e8 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -52,7 +52,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> // time we add or remove a query option to/from the enum TImpalaQueryOptions. #define QUERY_OPTS_TABLE \ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(), \ - TImpalaQueryOptions::CLEAN_DBCP_DS_CACHE + 1); \ + TImpalaQueryOptions::USE_NULL_SLOTS_CACHE + 1); \ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR) \ REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS) \ @@ -332,6 +332,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> RUNTIME_FILTER_IDS_TO_SKIP, TQueryOptionLevel::DEVELOPMENT) \ QUERY_OPT_FN(slot_count_strategy, SLOT_COUNT_STRATEGY, TQueryOptionLevel::ADVANCED) \ QUERY_OPT_FN(clean_dbcp_ds_cache, CLEAN_DBCP_DS_CACHE, TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(use_null_slots_cache, USE_NULL_SLOTS_CACHE, TQueryOptionLevel::ADVANCED) \ ; /// Enforce practical limits on some query options to avoid undesired query state. diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index c83f46cdb..fdf7710dc 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -940,6 +940,11 @@ enum TImpalaQueryOptions { // cache when its reference count equals 0. By caching DBCP DataSource objects, we can // avoid to reload JDBC driver. CLEAN_DBCP_DS_CACHE = 178 + + // Enables cache for isTrueWithNullSlots, which can be expensive when evaluating lots + // of expressions. The cache helps with generated expressions, which often contain lots + // of repeated patterns. + USE_NULL_SLOTS_CACHE = 179 } // The summary of a DML statement. diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift index 075cf2739..31e6a6aca 100644 --- a/common/thrift/Query.thrift +++ b/common/thrift/Query.thrift @@ -729,6 +729,9 @@ struct TQueryOptions { TSlotCountStrategy.LARGEST_FRAGMENT 179: optional bool clean_dbcp_ds_cache = true; + + // See comment in ImpalaService.thrift + 180: optional bool use_null_slots_cache = true; } // Impala currently has three types of sessions: Beeswax, HiveServer2 and external diff --git a/docs/impala.ditamap b/docs/impala.ditamap index 046721002..cf4d2e728 100644 --- a/docs/impala.ditamap +++ b/docs/impala.ditamap @@ -194,6 +194,7 @@ under the License. <topicref href="topics/impala_enable_expr_rewrites.xml"/> <topicref href="topics/impala_exec_single_node_rows_threshold.xml"/> <topicref href="topics/impala_exec_time_limit_s.xml"/> + <topicref rev="4.2.0" href="topics/impala_expand_complex_types.xml"/> <topicref href="topics/impala_explain_level.xml"> <topicref rev="2.5.0" href="topics/impala_max_num_runtime_filters.xml"/> </topicref> @@ -255,8 +256,8 @@ under the License. <topicref href="topics/impala_thread_reservation_limit.xml"/> <topicref href="topics/impala_timezone.xml"/> <topicref href="topics/impala_topn_bytes_limit.xml"/> + <topicref rev="4.5.0" href="topics/impala_use_null_slots_cache.xml"/> <topicref href="topics/impala_utf8_mode.xml"/> - <topicref href="topics/impala_expand_complex_types.xml"/> </topicref> <topicref href="topics/impala_show.xml"/> <topicref href="topics/impala_shutdown.xml"/> diff --git a/docs/topics/impala_use_null_slots_cache.xml b/docs/topics/impala_use_null_slots_cache.xml new file mode 100644 index 000000000..55678bec8 --- /dev/null +++ b/docs/topics/impala_use_null_slots_cache.xml @@ -0,0 +1,61 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> +<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> +<concept id="use_null_slots_cache"> + + <title>USE_NULL_SLOTS_CACHE Query Option</title> + <titlealts audience="PDF"><navtitle>USE NULL SLOTS CACHE</navtitle></titlealts> + <prolog> + <metadata> + <data name="Category" value="Impala"/> + <data name="Category" value="Impala Query Options"/> + <data name="Category" value="Troubleshooting"/> + <data name="Category" value="Querying"/> + <data name="Category" value="Developers"/> + <data name="Category" value="Data Analysts"/> + </metadata> + </prolog> + + <conbody> + <p> + <indexterm audience="hidden">USE_NULL_SLOTS_CACHE Query Option</indexterm> Impala + optimizes expressions that contain null-rejecting conjuncts - expressions that are + false if all "slots" (values returned from a table) are <codeph>NULL</codeph>. + Evaluating whether a conjunct is null-rejecting can be expensive when performed over a + lot of expressions. The "null slots cache" speeds analysis of generated queries with + many similar expressions by caching the result of evaluating an expression after slots + have been replaced with <codeph>NULL</codeph>s. + </p> + + <p> + If query analysis uses a lot of memory, performance of the "null slots cache" can be + reviewed by enabling logging <codeph>org.apache.impala.analysis.Analyzer=DEBUG</codeph> + and searching coordinator logs for messages like + </p> + +<codeblock> +null slots cache size: 286, median entry: 4416.0, 99th entry: 252688.88, hit rate: 0.99456364 +</codeblock> + + <p><b>Type: </b>BOOLEAN</p> + <p><b>Default: </b>TRUE</p> + <p><b>Added in: </b>Impala 4.5</p> + </conbody> +</concept> diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index b825f88d8..15be58a8b 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -34,6 +34,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.ExecutionException; import java.util.function.Function; import org.apache.commons.lang3.StringUtils; @@ -108,6 +109,7 @@ import org.apache.impala.service.FeSupport; import org.apache.impala.thrift.QueryConstants; import org.apache.impala.thrift.TAccessEvent; import org.apache.impala.thrift.TCatalogObjectType; +import org.apache.impala.thrift.TImpalaQueryOptions; import org.apache.impala.thrift.TLineageGraph; import org.apache.impala.thrift.TNetworkAddress; import org.apache.impala.thrift.TQueryCtx; @@ -124,12 +126,19 @@ import org.apache.impala.util.ListMap; import org.apache.impala.util.MetaStoreUtil; import org.apache.impala.util.TSessionStateUtil; import org.apache.kudu.client.KuduClient; +import org.github.jamm.CannotAccessFieldException; +import org.github.jamm.MemoryMeter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Snapshot; +import com.codahale.metrics.UniformReservoir; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.base.Predicates; +import com.google.common.cache.Cache; +import com.google.common.cache.CacheBuilder; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -574,6 +583,10 @@ public class Analyzer { // Expr rewriter for normalizing and rewriting expressions. private final ExprRewriter exprRewriter_; + // Null slots cache - for expressions with slots replaced by nulls - to reduce + // backend expression evaluation when query contains many similar expressions. + private final Cache<Expr, Boolean> nullSlotsCache; + // Total number of expressions across the statement (including all subqueries). This // is used to enforce a limit on the total number of expressions. Incremented by // incrementNumStmtExprs(). Note that this does not include expressions that do not @@ -648,6 +661,9 @@ public class Analyzer { } rules.add(CountStarToConstRule.INSTANCE); exprRewriter_ = new ExprRewriter(rules); + nullSlotsCache = + queryCtx.getClient_request().getQuery_options().use_null_slots_cache ? + CacheBuilder.newBuilder().concurrencyLevel(1).recordStats().build() : null; } }; @@ -2941,7 +2957,41 @@ public class Analyzer { */ public boolean isTrueWithNullSlots(Expr p) throws InternalException { Expr nullTuplePred = substituteNullSlots(p); - return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx()); + if (globalState_.nullSlotsCache == null) { + return FeSupport.EvalPredicate(nullTuplePred, getQueryCtx()); + } + + try { + return globalState_.nullSlotsCache.get(nullTuplePred, + () -> FeSupport.EvalPredicate(nullTuplePred, getQueryCtx())); + } catch (ExecutionException e) { + Preconditions.checkState(e.getCause() instanceof InternalException, + "Internal error using null slots cache: %s\nDisable null slots cache with " + + "the %s query option.", e, TImpalaQueryOptions.USE_NULL_SLOTS_CACHE.name()); + throw (InternalException) e.getCause(); + } + } + + /** + * Log hit rate and size of null slots cache. + */ + public void logCacheStats() { + if (!LOG.isDebugEnabled() || globalState_.nullSlotsCache == null) return; + + Histogram exprSize = new Histogram(new UniformReservoir()); + MemoryMeter meter = MemoryMeter.builder().build(); + for (Expr expr : globalState_.nullSlotsCache.asMap().keySet()) { + try { + exprSize.update(meter.measureDeep(expr)); + } catch (CannotAccessFieldException e) { + // This may happen if we miss an add-opens call for lambdas in Java 17. + LOG.warn("Unable to weigh cache entry, additional add-opens needed", e); + } + } + Snapshot snap = exprSize.getSnapshot(); + LOG.debug("null slots cache size: {}, median entry: {}, 99th percentile entry: {}, "+ + "hit rate: {}", globalState_.nullSlotsCache.size(), snap.getMedian(), + snap.get99thPercentile(), globalState_.nullSlotsCache.stats().hitRate()); } /** diff --git a/fe/src/main/java/org/apache/impala/planner/Planner.java b/fe/src/main/java/org/apache/impala/planner/Planner.java index 8c1d7791d..29685f924 100644 --- a/fe/src/main/java/org/apache/impala/planner/Planner.java +++ b/fe/src/main/java/org/apache/impala/planner/Planner.java @@ -212,6 +212,7 @@ public class Planner { Collections.reverse(fragments); ctx_.getTimeline().markEvent("Distributed plan created"); + ctx_.getRootAnalyzer().logCacheStats(); ColumnLineageGraph graph = ctx_.getRootAnalyzer().getColumnLineageGraph(); if (BackendConfig.INSTANCE.getComputeLineage() || RuntimeEnv.INSTANCE.isTestEnv()) {
