This is an automated email from the ASF dual-hosted git repository.
jasonmfehr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new ec809fc16 IMPALA-14433: Fix OpenTelemetry Tracing Deadlock
ec809fc16 is described below
commit ec809fc16c32afd47ad795d4b8a26880413bf660
Author: jasonmfehr <[email protected]>
AuthorDate: Fri Sep 12 16:24:07 2025 -0700
IMPALA-14433: Fix OpenTelemetry Tracing Deadlock
All functions in the SpanManager class operate under the assumption
that child_span_mu_ in the SpanManager class will be locked before
the ClientRequestState lock. However, the
ImpalaServer::ExecuteInternal function takes the ClientRequestState
lock before calling SpanManager::EndChildSpanPlanning. If another
function in the SpanManager class has already taken the
child_span_mu_ lock and is waiting for the ClientRequestState lock,
a deadlock occurs.
This issue was found by running end-to-end tests with OpenTelemetry
tracing enabled and a release buildof Impala.
Testing accomplished by re-running the end-to-end tests with
OpenTelemetry tracing enabled and verifying that the deadlock no
longer occurs.
Change-Id: I7b43dba794cfe61d283bdd476e4056b9304d8947
Reviewed-on: http://gerrit.cloudera.org:8080/23422
Reviewed-by: Joe McDonnell <[email protected]>
Reviewed-by: Michael Smith <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/observe/span-manager.cc | 1 +
be/src/observe/span-manager.h | 9 ++++-----
be/src/service/impala-server.cc | 6 +++---
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/be/src/observe/span-manager.cc b/be/src/observe/span-manager.cc
index f5b21ac15..c556fb3a9 100644
--- a/be/src/observe/span-manager.cc
+++ b/be/src/observe/span-manager.cc
@@ -252,6 +252,7 @@ void SpanManager::StartChildSpanPlanning() {
void SpanManager::EndChildSpanPlanning() {
lock_guard<mutex> l(child_span_mu_);
+ lock_guard<mutex> crs_lock(*(client_request_state_->lock()));
DoEndChildSpanPlanning();
} // function EndChildSpanPlanning
diff --git a/be/src/observe/span-manager.h b/be/src/observe/span-manager.h
index 643e404b0..2d8d4ccd5 100644
--- a/be/src/observe/span-manager.h
+++ b/be/src/observe/span-manager.h
@@ -92,15 +92,11 @@ public:
// client_request_state_->lock().
void EndChildSpanInit();
void EndChildSpanSubmitted();
+ void EndChildSpanPlanning();
void EndChildSpanAdmissionControl(const Status& cause);
void EndChildSpanQueryExecution();
void EndChildSpanClose();
- // Function to end the Planning child span. This function takes ownership of
- // child_span_mu_ BUT NOT client_request_state_->lock(). The code calling
this function
- // already holds client_request_state_->lock().
- void EndChildSpanPlanning();
-
private:
// Tracer instance used to construct spans.
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer_;
@@ -120,6 +116,9 @@ private:
std::shared_ptr<TimedSpan> root_;
// TimedSpan instance for the current child span and the mutex to protect it.
+ // To ensure no deadlocks, locks must be acquired in the following order:
+ // 1. SpanManager::child_span_mu_
+ // 2. ClientRequestState::lock_
std::unique_ptr<TimedSpan> current_child_;
std::mutex child_span_mu_;
diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc
index 23be7ee4b..7b9b639ab 100644
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -1422,10 +1422,10 @@ Status ImpalaServer::ExecuteInternal(const TQueryCtx&
query_ctx,
if (result.__isset.result_set_metadata) {
(*query_handle)->set_result_metadata(result.result_set_metadata);
}
+ }
- if ((*query_handle)->otel_trace_query()) {
- (*query_handle)->otel_span_manager()->EndChildSpanPlanning();
- }
+ if ((*query_handle)->otel_trace_query()) {
+ (*query_handle)->otel_span_manager()->EndChildSpanPlanning();
}
VLOG(2) << "Execution request: "