This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.4.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit bd70701984ec6f962939f8ddc86f1bdb2683b125 Author: stiga-huang <[email protected]> AuthorDate: Tue May 7 13:30:06 2024 +0800 IMPALA-13034: Add logs and counters for HTTP profile requests blocking client fetches There are several endpoints in WebUI that can dump a query profile: /query_profile, /query_profile_encoded, /query_profile_plain_text, /query_profile_json. The HTTP handler thread goes into ImpalaServer::GetRuntimeProfileOutput() which acquires the lock of the ClientRequestState. This could block client requests in fetching query results. To help identify this issue, this patch adds warning logs when such profile dumping requests run slow and the query is still in-flight. Also adds a profile counter, GetInFlightProfileTimeStats, for the summary stats of this time. Dumping the profiles after the query is archived (e.g. closed) won't be tracked. Logs for slow HTTP responses are also added. The thresholds are defined by two new flags, slow_profile_dump_warning_threshold_ms, and slow_http_response_warning_threshold_ms. Note that dumping the profile in-flight won't always block the query, e.g. if there are no client fetch requests or if the coordinator fragment is idle waiting for executor fragment instances. So a long time shown in GetInFlightProfileTimeStats doesn't mean it's hitting the issue. To better identify this issue, this patch adds another profile counter, ClientFetchLockWaitTimer, as the cumulative time client fetch requests spend waiting for locks. Also fixes false positive logs complaining about invalid query handles. Such logs are added in GetQueryHandle() when the query is not found in the active query map, but it could still exist in the query log. This removes the logs in GetQueryHandle() and lets the callers decide whether to log the error. 
Tests: - Added e2e test - Ran CORE tests Change-Id: I538ebe914f70f460bc8412770a8f7a1cc8b505dc Reviewed-on: http://gerrit.cloudera.org:8080/21412 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Michael Smith <[email protected]> --- be/src/service/client-request-state.cc | 9 +++++++++ be/src/service/client-request-state.h | 17 +++++++++++++++++ be/src/service/impala-beeswax-server.cc | 10 ++++++++-- be/src/service/impala-hs2-server.cc | 11 +++++++++-- be/src/service/impala-http-handler.cc | 4 ++++ be/src/service/impala-server.cc | 21 +++++++++++++++------ be/src/util/webserver.cc | 16 +++++++++++++--- tests/query_test/test_observability.py | 26 ++++++++++++++++++++++++++ 8 files changed, 101 insertions(+), 13 deletions(-) diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc index 86150ec29..cffdebd60 100644 --- a/be/src/service/client-request-state.cc +++ b/be/src/service/client-request-state.cc @@ -82,6 +82,11 @@ DECLARE_bool(use_local_catalog); namespace impala { +PROFILE_DEFINE_TIMER(ClientFetchLockWaitTimer, UNSTABLE, + "Cumulative time client fetch requests waiting for locks."); +PROFILE_DEFINE_SUMMARY_STATS_TIMER(GetInFlightProfileTimeStats, UNSTABLE, + "Summary stats of the time dumping profiles when the query is still in-flight."); + // Keys into the info string map of the runtime profile referring to specific // items used by CM for monitoring purposes. 
static const string PER_HOST_MEM_KEY = "Estimated Per-Host Mem"; @@ -140,6 +145,10 @@ ClientRequestState::ClientRequestState(const TQueryCtx& query_ctx, Frontend* fro rpc_read_timer_ = ADD_TIMER(server_profile_, "RPCReadTimer"); rpc_write_timer_ = ADD_TIMER(server_profile_, "RPCWriteTimer"); rpc_count_ = ADD_COUNTER(server_profile_, "RPCCount", TUnit::UNIT); + get_inflight_profile_time_stats_ = + PROFILE_GetInFlightProfileTimeStats.Instantiate(server_profile_); + client_fetch_lock_wait_timer_ = + PROFILE_ClientFetchLockWaitTimer.Instantiate(server_profile_); profile_->set_name("Query (id=" + PrintId(query_id()) + ")"); summary_profile_->AddInfoString("Session ID", PrintId(session_id())); diff --git a/be/src/service/client-request-state.h b/be/src/service/client-request-state.h index d5c582a57..71fcf5d6a 100644 --- a/be/src/service/client-request-state.h +++ b/be/src/service/client-request-state.h @@ -28,6 +28,7 @@ #include "runtime/hdfs-fs-cache.h" #include "util/condition-variable.h" #include "util/runtime-profile.h" +#include "util/runtime-profile-counters.h" #include "gen-cpp/Frontend_types.h" #include "gen-cpp/ImpalaHiveServer2Service.h" @@ -494,6 +495,17 @@ class ClientRequestState { void UnRegisterRemainingRPCs(); // Copy pending RPCs for a retried query void CopyRPCs(ClientRequestState& from_request); + + // Update the profile counter 'GetInFlightProfileTimeStats' which is the summary stats + // of time spent in dumping the profile before the query is archived. + void UpdateGetInFlightProfileTime(int64_t elapsed_time_ns) { + get_inflight_profile_time_stats_->UpdateCounter(elapsed_time_ns); + } + // Update the profile counter 'ClientFetchLockWaitTimer' which is the cumulative time + // that client fetch requests waiting for locks. + void AddClientFetchLockWaitTime(int64_t lock_wait_time_ns) { + client_fetch_lock_wait_timer_->Add(lock_wait_time_ns); + } protected: /// Updates the end_time_us_ of this query if it isn't set. 
The end time is determined /// when this function is called for the first time, calling it multiple times does not @@ -653,6 +665,11 @@ class ClientRequestState { MonotonicStopWatch client_wait_sw_; int64_t last_client_wait_time_ = 0; + // Tracks time spent in dumping the profile before the query is archived. + RuntimeProfile::SummaryStatsCounter* get_inflight_profile_time_stats_; + // Tracks time client fetch requests waiting for locks. + RuntimeProfile::Counter* client_fetch_lock_wait_timer_; + // Tracks time spent by client calls reading RPC arguments RuntimeProfile::Counter* rpc_read_timer_; // Tracks time spent by client calls writing RPC results diff --git a/be/src/service/impala-beeswax-server.cc b/be/src/service/impala-beeswax-server.cc index 5bf658188..6b797283a 100644 --- a/be/src/service/impala-beeswax-server.cc +++ b/be/src/service/impala-beeswax-server.cc @@ -347,8 +347,11 @@ void ImpalaServer::get_log(string& log, const LogContextId& context) { if (query_handle->IsRetriedQuery()) { QueryHandle original_query_handle; - RAISE_IF_ERROR(GetQueryHandle(query_id, &original_query_handle), - SQLSTATE_GENERAL_ERROR); + Status status = GetQueryHandle(query_id, &original_query_handle); + if (UNLIKELY(!status.ok())) { + VLOG(1) << "Error in get_log, could not get query handle: " << status.GetDetail(); + RaiseBeeswaxException(status.GetDetail(), SQLSTATE_GENERAL_ERROR); + } DCHECK(!original_query_handle->query_status().ok()); error_log_ss << Substitute(GET_LOG_QUERY_RETRY_INFO_FORMAT, original_query_handle->query_status().GetDetail(), @@ -597,8 +600,11 @@ Status ImpalaServer::FetchInternal(TUniqueId query_id, const bool start_over, return Status::OK(); } + int64_t start_time_ns = MonotonicNanos(); lock_guard<mutex> frl(*query_handle->fetch_rows_lock()); lock_guard<mutex> l(*query_handle->lock()); + int64_t lock_wait_time_ns = MonotonicNanos() - start_time_ns; + query_handle->AddClientFetchLockWaitTime(lock_wait_time_ns); if (query_handle->num_rows_fetched() == 0) 
{ query_handle->set_fetched_rows(); diff --git a/be/src/service/impala-hs2-server.cc b/be/src/service/impala-hs2-server.cc index e651a8439..30c48deb8 100644 --- a/be/src/service/impala-hs2-server.cc +++ b/be/src/service/impala-hs2-server.cc @@ -203,8 +203,11 @@ Status ImpalaServer::FetchInternal(TUniqueId query_id, SessionState* session, return Status::OK(); } + int64_t start_time_ns = MonotonicNanos(); lock_guard<mutex> frl(*query_handle->fetch_rows_lock()); lock_guard<mutex> l(*query_handle->lock()); + int64_t lock_wait_time_ns = MonotonicNanos() - start_time_ns; + query_handle->AddClientFetchLockWaitTime(lock_wait_time_ns); // Check for cancellation or an error. RETURN_IF_ERROR(query_handle->query_status()); @@ -1091,8 +1094,12 @@ void ImpalaServer::GetLog(TGetLogResp& return_val, const TGetLogReq& request) { // Report the query status, if the query failed or has been retried. if (query_handle->IsRetriedQuery()) { QueryHandle original_query_handle; - HS2_RETURN_IF_ERROR(return_val, GetQueryHandle(query_id, &original_query_handle), - SQLSTATE_GENERAL_ERROR); + Status status = GetQueryHandle(query_id, &original_query_handle); + if (UNLIKELY(!status.ok())) { + VLOG(1) << "Error in GetLog, could not get query handle: " << status.GetDetail(); + HS2_RETURN_ERROR(return_val, status.GetDetail(), SQLSTATE_GENERAL_ERROR); + return; + } DCHECK(!original_query_handle->query_status().ok()); ss << Substitute(GET_LOG_QUERY_RETRY_INFO_FORMAT, original_query_handle->query_status().GetDetail(), diff --git a/be/src/service/impala-http-handler.cc b/be/src/service/impala-http-handler.cc index 6d11efced..1a7173833 100644 --- a/be/src/service/impala-http-handler.cc +++ b/be/src/service/impala-http-handler.cc @@ -1174,6 +1174,8 @@ void ImpalaHttpHandler::QueryBackendsHandler( return; } query_handle->GetCoordinator()->BackendsToJson(document); + } else { + VLOG(1) << "Failed to get backends of the query: " << status.GetDetail(); } } @@ -1197,6 +1199,8 @@ void 
ImpalaHttpHandler::QueryFInstancesHandler( return; } query_handle->GetCoordinator()->FInstanceStatsToJson(document); + } else { + VLOG(1) << "Failed to get fragment instances of the query: " << status.GetDetail(); } } diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc index 029ae61dc..22207b349 100644 --- a/be/src/service/impala-server.cc +++ b/be/src/service/impala-server.cc @@ -389,6 +389,9 @@ DEFINE_int32(wait_for_new_catalog_service_id_max_iterations, 10, "this case it will wait indefinitely if " "'--wait_for_new_catalog_service_id_timeout_sec' is not set."); +DEFINE_int64(slow_profile_dump_warning_threshold_ms, 500, + "(Advanced) Threshold for considering dumping a profile to be unusually slow."); + // Flags for JWT token based authentication. DECLARE_bool(jwt_token_auth); DECLARE_bool(jwt_validate_signature); @@ -734,6 +737,7 @@ Status ImpalaServer::GetRuntimeProfileOutput(const string& user, return Status::Expected("Query plan is not ready."); } lock_guard<mutex> l(*query_handle->lock()); + int64_t start_time_ns = MonotonicNanos(); RETURN_IF_ERROR(CheckProfileAccess( user, query_handle->effective_user(), query_handle->user_has_profile_access())); if (query_handle->GetCoordinator() != nullptr) { @@ -750,6 +754,15 @@ Status ImpalaServer::GetRuntimeProfileOutput(const string& user, DCHECK_EQ(format, TRuntimeProfileFormat::STRING); query_handle->profile()->PrettyPrint(profile->string_output); } + int64_t elapsed_time_ns = MonotonicNanos() - start_time_ns; + if (elapsed_time_ns > FLAGS_slow_profile_dump_warning_threshold_ms * 1000 * 1000) { + LOG(WARNING) << "Slow in dumping " << format << " profile of " + << PrintId(query_handle->query_id()) << ". User " << user + << " held the lock for " << PrettyPrinter::Print(elapsed_time_ns, TUnit::TIME_NS) + << " (" << elapsed_time_ns << "ns). 
This might block client in fetching query " + << "results."; + } + query_handle->UpdateGetInFlightProfileTime(elapsed_time_ns); return Status::OK(); } @@ -1730,15 +1743,11 @@ Status ImpalaServer::GetQueryHandle( DCHECK(query_handle != nullptr); shared_ptr<QueryDriver> query_driver = GetQueryDriver(query_id, return_unregistered); if (UNLIKELY(query_driver == nullptr)) { - Status err = Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); - VLOG(1) << err.GetDetail(); - return err; + return Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); } ClientRequestState* request_state = query_driver->GetClientRequestState(query_id); if (UNLIKELY(request_state == nullptr)) { - Status err = Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); - VLOG(1) << err.GetDetail(); - return err; + return Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); } query_handle->SetHandle(query_driver, request_state); return Status::OK(); diff --git a/be/src/util/webserver.cc b/be/src/util/webserver.cc index 1289bde9d..daa9a3f78 100644 --- a/be/src/util/webserver.cc +++ b/be/src/util/webserver.cc @@ -149,6 +149,9 @@ DEFINE_bool(disable_content_security_policy_header, false, "If true then the webserver will not add the Content-Security-Policy " "HTTP header to HTTP responses"); +DEFINE_int64(slow_http_response_warning_threshold_ms, 500, + "(Advanced) Threshold for considering a HTTP response to be unusually slow."); + DECLARE_bool(enable_ldap_auth); DECLARE_string(hostname); DECLARE_bool(is_coordinator); @@ -938,9 +941,16 @@ sq_callback_result_t Webserver::BeginRequestCallback(struct sq_connection* conne connection, req, *url_handler, &output, &content_type, cookie_rand_value); } - VLOG(3) << "Rendering page " << request_info->uri << " took " - << PrettyPrinter::Print(sw.ElapsedTime(), TUnit::TIME_NS); - + uint64_t elapsed_time_ns = sw.ElapsedTime(); + if (elapsed_time_ns > FLAGS_slow_http_response_warning_threshold_ms * 
1000 * 1000) { + LOG(WARNING) << "Rendering page " << request_info->uri << " took " + << PrettyPrinter::Print(sw.ElapsedTime(), TUnit::TIME_NS) + << ". User: " << req.source_user << ". Address: " << req.source_socket + << ". Args: " << req.query_string; + } else { + VLOG(3) << "Rendering page " << request_info->uri << " took " + << PrettyPrinter::Print(sw.ElapsedTime(), TUnit::TIME_NS); + } SendResponse(connection, HttpStatusCodeToString(response), Webserver::GetMimeType(content_type), output.str(), response_headers); diff --git a/tests/query_test/test_observability.py b/tests/query_test/test_observability.py index 60761e57c..fbfa86918 100644 --- a/tests/query_test/test_observability.py +++ b/tests/query_test/test_observability.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function from collections import defaultdict from datetime import datetime from tests.beeswax.impala_beeswax import ImpalaBeeswaxException +from tests.common.impala_cluster import ImpalaCluster from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.skip import SkipIfFS, SkipIfLocal, SkipIfNotHdfsMinicluster from tests.util.filesystem_utils import IS_EC, WAREHOUSE @@ -962,6 +963,31 @@ class TestObservability(ImpalaTestSuite): runtime_profile = result.runtime_profile assert "cardinality=575.77K(filtered from 6.00M)" in runtime_profile + def test_query_profile_contains_get_inflight_profile_counter(self): + """Test that counter for getting inflight profiles appears in the profile""" + # This query runs 15s + query = "select count(*) from functional.alltypes where bool_col = sleep(50)" + handle = self.execute_query_async(query) + query_id = handle.get_handle().id + + cluster = ImpalaCluster.get_e2e_test_cluster() + impalad = cluster.get_first_impalad() + profile_urls = [ + "query_profile?query_id=", + "query_profile_encoded?query_id=", + "query_profile_json?query_id=", + "query_profile_plain_text?query_id=", + ] + for url in profile_urls: + 
impalad.service.read_debug_webpage(url + query_id) + profile = self.client.get_runtime_profile(handle) + assert "GetInFlightProfileTimeStats:" in profile + assert "ClientFetchLockWaitTimer:" in profile + # Make sure the counter actually records the requests + samples_search = re.search(r"GetInFlightProfileTimeStats:.*samples: (\d+)", profile) + num_samples = int(samples_search.group(1)) + assert num_samples > 0 + class TestQueryStates(ImpalaTestSuite): """Test that the 'Query State' and 'Impala Query State' are set correctly in the
