This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch branch-4.4.1 in repository https://gitbox.apache.org/repos/asf/impala.git
commit bd70701984ec6f962939f8ddc86f1bdb2683b125 Author: stiga-huang <[email protected]> AuthorDate: Tue May 7 13:30:06 2024 +0800 IMPALA-13034: Add logs and counters for HTTP profile requests blocking client fetches There are several endpoints in WebUI that can dump a query profile: /query_profile, /query_profile_encoded, /query_profile_plain_text, /query_profile_json. The HTTP handler thread goes into ImpalaServer::GetRuntimeProfileOutput() which acquires the lock of the ClientRequestState. This could block client requests in fetching query results. To help identify this issue, this patch adds warning logs when such profile dumping requests run slow and the query is still in-flight. Also adds a profile counter, GetInFlightProfileTimeStats, for the summary stats of this time. Dumping the profiles after the query is archived (e.g. closed) won't be tracked. Logs for slow HTTP responses are also added. The thresholds are defined by two new flags, slow_profile_dump_warning_threshold_ms, and slow_http_response_warning_threshold_ms. Note that dumping the profile in-flight won't always block the query, e.g. if there are no client fetch requests or if the coordinator fragment is idle waiting for executor fragment instances. So a long time shown in GetInFlightProfileTimeStats doesn't mean it's hitting the issue. To better identify this issue, this patch adds another profile counter, ClientFetchLockWaitTimer, as the cumulative time client fetch requests spend waiting for locks. Also fixes false positive logs complaining about invalid query handles. Such logs are added in GetQueryHandle() when the query is not found in the active query map, but it could still exist in the query log. This removes the logs in GetQueryHandle() and lets the callers decide whether to log the error. 
Tests: - Added e2e test - Ran CORE tests Change-Id: I538ebe914f70f460bc8412770a8f7a1cc8b505dc Reviewed-on: http://gerrit.cloudera.org:8080/21412 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Michael Smith <[email protected]> --- be/src/service/client-request-state.cc | 9 +++++++++ be/src/service/client-request-state.h | 17 +++++++++++++++++ be/src/service/impala-beeswax-server.cc | 10 ++++++++-- be/src/service/impala-hs2-server.cc | 11 +++++++++-- be/src/service/impala-http-handler.cc | 4 ++++ be/src/service/impala-server.cc | 21 +++++++++++++++------ be/src/util/webserver.cc | 16 +++++++++++++--- tests/query_test/test_observability.py | 26 ++++++++++++++++++++++++++ 8 files changed, 101 insertions(+), 13 deletions(-) diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc index 86150ec29..cffdebd60 100644 --- a/be/src/service/client-request-state.cc +++ b/be/src/service/client-request-state.cc @@ -82,6 +82,11 @@ DECLARE_bool(use_local_catalog); namespace impala { +PROFILE_DEFINE_TIMER(ClientFetchLockWaitTimer, UNSTABLE, + "Cumulative time client fetch requests waiting for locks."); +PROFILE_DEFINE_SUMMARY_STATS_TIMER(GetInFlightProfileTimeStats, UNSTABLE, + "Summary stats of the time dumping profiles when the query is still in-flight."); + // Keys into the info string map of the runtime profile referring to specific // items used by CM for monitoring purposes. 
static const string PER_HOST_MEM_KEY = "Estimated Per-Host Mem"; @@ -140,6 +145,10 @@ ClientRequestState::ClientRequestState(const TQueryCtx& query_ctx, Frontend* fro rpc_read_timer_ = ADD_TIMER(server_profile_, "RPCReadTimer"); rpc_write_timer_ = ADD_TIMER(server_profile_, "RPCWriteTimer"); rpc_count_ = ADD_COUNTER(server_profile_, "RPCCount", TUnit::UNIT); + get_inflight_profile_time_stats_ = + PROFILE_GetInFlightProfileTimeStats.Instantiate(server_profile_); + client_fetch_lock_wait_timer_ = + PROFILE_ClientFetchLockWaitTimer.Instantiate(server_profile_); profile_->set_name("Query (id=" + PrintId(query_id()) + ")"); summary_profile_->AddInfoString("Session ID", PrintId(session_id())); diff --git a/be/src/service/client-request-state.h b/be/src/service/client-request-state.h index d5c582a57..71fcf5d6a 100644 --- a/be/src/service/client-request-state.h +++ b/be/src/service/client-request-state.h @@ -28,6 +28,7 @@ #include "runtime/hdfs-fs-cache.h" #include "util/condition-variable.h" #include "util/runtime-profile.h" +#include "util/runtime-profile-counters.h" #include "gen-cpp/Frontend_types.h" #include "gen-cpp/ImpalaHiveServer2Service.h" @@ -494,6 +495,17 @@ class ClientRequestState { void UnRegisterRemainingRPCs(); // Copy pending RPCs for a retried query void CopyRPCs(ClientRequestState& from_request); + + // Update the profile counter 'GetInFlightProfileTimeStats' which is the summary stats + // of time spent in dumping the profile before the query is archived. + void UpdateGetInFlightProfileTime(int64_t elapsed_time_ns) { + get_inflight_profile_time_stats_->UpdateCounter(elapsed_time_ns); + } + // Update the profile counter 'ClientFetchLockWaitTimer' which is the cumulative time + // that client fetch requests waiting for locks. + void AddClientFetchLockWaitTime(int64_t lock_wait_time_ns) { + client_fetch_lock_wait_timer_->Add(lock_wait_time_ns); + } protected: /// Updates the end_time_us_ of this query if it isn't set. 
The end time is determined /// when this function is called for the first time, calling it multiple times does not @@ -653,6 +665,11 @@ class ClientRequestState { MonotonicStopWatch client_wait_sw_; int64_t last_client_wait_time_ = 0; + // Tracks time spent in dumping the profile before the query is archived. + RuntimeProfile::SummaryStatsCounter* get_inflight_profile_time_stats_; + // Tracks time client fetch requests waiting for locks. + RuntimeProfile::Counter* client_fetch_lock_wait_timer_; + // Tracks time spent by client calls reading RPC arguments RuntimeProfile::Counter* rpc_read_timer_; // Tracks time spent by client calls writing RPC results diff --git a/be/src/service/impala-beeswax-server.cc b/be/src/service/impala-beeswax-server.cc index 5bf658188..6b797283a 100644 --- a/be/src/service/impala-beeswax-server.cc +++ b/be/src/service/impala-beeswax-server.cc @@ -347,8 +347,11 @@ void ImpalaServer::get_log(string& log, const LogContextId& context) { if (query_handle->IsRetriedQuery()) { QueryHandle original_query_handle; - RAISE_IF_ERROR(GetQueryHandle(query_id, &original_query_handle), - SQLSTATE_GENERAL_ERROR); + Status status = GetQueryHandle(query_id, &original_query_handle); + if (UNLIKELY(!status.ok())) { + VLOG(1) << "Error in get_log, could not get query handle: " << status.GetDetail(); + RaiseBeeswaxException(status.GetDetail(), SQLSTATE_GENERAL_ERROR); + } DCHECK(!original_query_handle->query_status().ok()); error_log_ss << Substitute(GET_LOG_QUERY_RETRY_INFO_FORMAT, original_query_handle->query_status().GetDetail(), @@ -597,8 +600,11 @@ Status ImpalaServer::FetchInternal(TUniqueId query_id, const bool start_over, return Status::OK(); } + int64_t start_time_ns = MonotonicNanos(); lock_guard<mutex> frl(*query_handle->fetch_rows_lock()); lock_guard<mutex> l(*query_handle->lock()); + int64_t lock_wait_time_ns = MonotonicNanos() - start_time_ns; + query_handle->AddClientFetchLockWaitTime(lock_wait_time_ns); if (query_handle->num_rows_fetched() == 0) 
{ query_handle->set_fetched_rows(); diff --git a/be/src/service/impala-hs2-server.cc b/be/src/service/impala-hs2-server.cc index e651a8439..30c48deb8 100644 --- a/be/src/service/impala-hs2-server.cc +++ b/be/src/service/impala-hs2-server.cc @@ -203,8 +203,11 @@ Status ImpalaServer::FetchInternal(TUniqueId query_id, SessionState* session, return Status::OK(); } + int64_t start_time_ns = MonotonicNanos(); lock_guard<mutex> frl(*query_handle->fetch_rows_lock()); lock_guard<mutex> l(*query_handle->lock()); + int64_t lock_wait_time_ns = MonotonicNanos() - start_time_ns; + query_handle->AddClientFetchLockWaitTime(lock_wait_time_ns); // Check for cancellation or an error. RETURN_IF_ERROR(query_handle->query_status()); @@ -1091,8 +1094,12 @@ void ImpalaServer::GetLog(TGetLogResp& return_val, const TGetLogReq& request) { // Report the query status, if the query failed or has been retried. if (query_handle->IsRetriedQuery()) { QueryHandle original_query_handle; - HS2_RETURN_IF_ERROR(return_val, GetQueryHandle(query_id, &original_query_handle), - SQLSTATE_GENERAL_ERROR); + Status status = GetQueryHandle(query_id, &original_query_handle); + if (UNLIKELY(!status.ok())) { + VLOG(1) << "Error in GetLog, could not get query handle: " << status.GetDetail(); + HS2_RETURN_ERROR(return_val, status.GetDetail(), SQLSTATE_GENERAL_ERROR); + return; + } DCHECK(!original_query_handle->query_status().ok()); ss << Substitute(GET_LOG_QUERY_RETRY_INFO_FORMAT, original_query_handle->query_status().GetDetail(), diff --git a/be/src/service/impala-http-handler.cc b/be/src/service/impala-http-handler.cc index 6d11efced..1a7173833 100644 --- a/be/src/service/impala-http-handler.cc +++ b/be/src/service/impala-http-handler.cc @@ -1174,6 +1174,8 @@ void ImpalaHttpHandler::QueryBackendsHandler( return; } query_handle->GetCoordinator()->BackendsToJson(document); + } else { + VLOG(1) << "Failed to get backends of the query: " << status.GetDetail(); } } @@ -1197,6 +1199,8 @@ void 
ImpalaHttpHandler::QueryFInstancesHandler( return; } query_handle->GetCoordinator()->FInstanceStatsToJson(document); + } else { + VLOG(1) << "Failed to get fragment instances of the query: " << status.GetDetail(); } } diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc index 029ae61dc..22207b349 100644 --- a/be/src/service/impala-server.cc +++ b/be/src/service/impala-server.cc @@ -389,6 +389,9 @@ DEFINE_int32(wait_for_new_catalog_service_id_max_iterations, 10, "this case it will wait indefinitely if " "'--wait_for_new_catalog_service_id_timeout_sec' is not set."); +DEFINE_int64(slow_profile_dump_warning_threshold_ms, 500, + "(Advanced) Threshold for considering dumping a profile to be unusually slow."); + // Flags for JWT token based authentication. DECLARE_bool(jwt_token_auth); DECLARE_bool(jwt_validate_signature); @@ -734,6 +737,7 @@ Status ImpalaServer::GetRuntimeProfileOutput(const string& user, return Status::Expected("Query plan is not ready."); } lock_guard<mutex> l(*query_handle->lock()); + int64_t start_time_ns = MonotonicNanos(); RETURN_IF_ERROR(CheckProfileAccess( user, query_handle->effective_user(), query_handle->user_has_profile_access())); if (query_handle->GetCoordinator() != nullptr) { @@ -750,6 +754,15 @@ Status ImpalaServer::GetRuntimeProfileOutput(const string& user, DCHECK_EQ(format, TRuntimeProfileFormat::STRING); query_handle->profile()->PrettyPrint(profile->string_output); } + int64_t elapsed_time_ns = MonotonicNanos() - start_time_ns; + if (elapsed_time_ns > FLAGS_slow_profile_dump_warning_threshold_ms * 1000 * 1000) { + LOG(WARNING) << "Slow in dumping " << format << " profile of " + << PrintId(query_handle->query_id()) << ". User " << user + << " held the lock for " << PrettyPrinter::Print(elapsed_time_ns, TUnit::TIME_NS) + << " (" << elapsed_time_ns << "ns). 
This might block client in fetching query " + << "results."; + } + query_handle->UpdateGetInFlightProfileTime(elapsed_time_ns); return Status::OK(); } @@ -1730,15 +1743,11 @@ Status ImpalaServer::GetQueryHandle( DCHECK(query_handle != nullptr); shared_ptr<QueryDriver> query_driver = GetQueryDriver(query_id, return_unregistered); if (UNLIKELY(query_driver == nullptr)) { - Status err = Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); - VLOG(1) << err.GetDetail(); - return err; + return Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); } ClientRequestState* request_state = query_driver->GetClientRequestState(query_id); if (UNLIKELY(request_state == nullptr)) { - Status err = Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); - VLOG(1) << err.GetDetail(); - return err; + return Status::Expected(TErrorCode::INVALID_QUERY_HANDLE, PrintId(query_id)); } query_handle->SetHandle(query_driver, request_state); return Status::OK(); diff --git a/be/src/util/webserver.cc b/be/src/util/webserver.cc index 1289bde9d..daa9a3f78 100644 --- a/be/src/util/webserver.cc +++ b/be/src/util/webserver.cc @@ -149,6 +149,9 @@ DEFINE_bool(disable_content_security_policy_header, false, "If true then the webserver will not add the Content-Security-Policy " "HTTP header to HTTP responses"); +DEFINE_int64(slow_http_response_warning_threshold_ms, 500, + "(Advanced) Threshold for considering a HTTP response to be unusually slow."); + DECLARE_bool(enable_ldap_auth); DECLARE_string(hostname); DECLARE_bool(is_coordinator); @@ -938,9 +941,16 @@ sq_callback_result_t Webserver::BeginRequestCallback(struct sq_connection* conne connection, req, *url_handler, &output, &content_type, cookie_rand_value); } - VLOG(3) << "Rendering page " << request_info->uri << " took " - << PrettyPrinter::Print(sw.ElapsedTime(), TUnit::TIME_NS); - + uint64_t elapsed_time_ns = sw.ElapsedTime(); + if (elapsed_time_ns > FLAGS_slow_http_response_warning_threshold_ms * 
1000 * 1000) { + LOG(WARNING) << "Rendering page " << request_info->uri << " took " + << PrettyPrinter::Print(sw.ElapsedTime(), TUnit::TIME_NS) + << ". User: " << req.source_user << ". Address: " << req.source_socket + << ". Args: " << req.query_string; + } else { + VLOG(3) << "Rendering page " << request_info->uri << " took " + << PrettyPrinter::Print(sw.ElapsedTime(), TUnit::TIME_NS); + } SendResponse(connection, HttpStatusCodeToString(response), Webserver::GetMimeType(content_type), output.str(), response_headers); diff --git a/tests/query_test/test_observability.py b/tests/query_test/test_observability.py index 60761e57c..fbfa86918 100644 --- a/tests/query_test/test_observability.py +++ b/tests/query_test/test_observability.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function from collections import defaultdict from datetime import datetime from tests.beeswax.impala_beeswax import ImpalaBeeswaxException +from tests.common.impala_cluster import ImpalaCluster from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.skip import SkipIfFS, SkipIfLocal, SkipIfNotHdfsMinicluster from tests.util.filesystem_utils import IS_EC, WAREHOUSE @@ -962,6 +963,31 @@ class TestObservability(ImpalaTestSuite): runtime_profile = result.runtime_profile assert "cardinality=575.77K(filtered from 6.00M)" in runtime_profile + def test_query_profile_contains_get_inflight_profile_counter(self): + """Test that counter for getting inflight profiles appears in the profile""" + # This query runs 15s + query = "select count(*) from functional.alltypes where bool_col = sleep(50)" + handle = self.execute_query_async(query) + query_id = handle.get_handle().id + + cluster = ImpalaCluster.get_e2e_test_cluster() + impalad = cluster.get_first_impalad() + profile_urls = [ + "query_profile?query_id=", + "query_profile_encoded?query_id=", + "query_profile_json?query_id=", + "query_profile_plain_text?query_id=", + ] + for url in profile_urls: + 
impalad.service.read_debug_webpage(url + query_id) + profile = self.client.get_runtime_profile(handle) + assert "GetInFlightProfileTimeStats:" in profile + assert "ClientFetchLockWaitTimer:" in profile + # Make sure the counter actually records the requests + samples_search = re.search(r"GetInFlightProfileTimeStats:.*samples: (\d+)", profile) + num_samples = int(samples_search.group(1)) + assert num_samples > 0 + class TestQueryStates(ImpalaTestSuite): """Test that the 'Query State' and 'Impala Query State' are set correctly in the
