This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 1970cc709 IMPALA-13844: Add /stacks web page to dump native thread
stacks
1970cc709 is described below
commit 1970cc709df637f1e416198fb9a6553b2947bffa
Author: Arnab Karmakar <[email protected]>
AuthorDate: Fri Nov 7 11:30:51 2025 +0530
IMPALA-13844: Add /stacks web page to dump native thread stacks
Kudu 1.7 introduced a utility (KUDU-2291) and a /stacks web page
for dumping stack traces of all native threads. Impala already
has the corresponding debug util ported from Kudu, so this patch
adds a similar /stacks endpoint to Impala’s WebUI.
This enables developers to easily inspect thread states and
diagnose deadlocks or hangs via the web interface.
Testing:
Added unit tests to verify that the /stacks endpoint
is available and returns valid thread stack information.
TODO:
Add jstack output to the /stacks handler via a parameter like
`/stacks?include_jvm=true`. Note that `JniUtil.java` currently uses
`ThreadMXBean.dumpAllThreads()`, but Java’s `ThreadInfo` does not
expose native thread IDs (nids), limiting correlation with
native stacks.
Change-Id: I9171c89d1d36726d98a4a61ca040d69254f50e92
Reviewed-on: http://gerrit.cloudera.org:8080/23652
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/common/init.cc | 28 ++++++++++++
be/src/util/default-path-handlers.cc | 83 ++++++++++++++++++++++++++++++++++++
be/src/util/thread.cc | 26 +++++++++++
be/src/util/thread.h | 5 +++
tests/webserver/test_web_pages.py | 72 +++++++++++++++++++++++++++++++
www/threadz.tmpl | 1 +
6 files changed, 215 insertions(+)
diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index ee4ea6dd7..f4ea31b1d 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -41,6 +41,7 @@
#include "runtime/lib-cache.h"
#include "runtime/mem-tracker.h"
#include "service/impala-server.h"
+#include "kudu/util/debug-util.h"
#include "util/cgroup-util.h"
#include "util/cpu-info.h"
#include "util/debug-util.h"
@@ -585,6 +586,33 @@ void impala::InitCommonRuntime(int argc, char** argv, bool
init_jvm,
}
}
+ // Initialize the signal handler for stack trace collection BEFORE
+ // InitGoogleLoggingSafe. This must happen before
+ // google::InstallFailureSignalHandler() is called (which happens
+ // inside InitGoogleLoggingSafe), otherwise that might block our signal.
+ //
+ // We use SIGRTMIN+10 instead of SIGUSR1/SIGUSR2 because:
+ // - SIGUSR1 is used by minidump handling
+ // - SIGUSR2 crashes the embedded JVM
+ //
+ // IMPORTANT: This signal handler is used INTERNALLY by the /stacks web endpoint.
+ // It is NOT meant to be triggered manually via kill/pkill. The handler expects
+ // specific data structures to be set up when triggered, which only happens during
+ // a web request to /stacks. Manually sending this signal will crash the process.
+ // If you want to see thread stacks, visit the /stacks web UI page instead.
+ const int stack_trace_signal = SIGRTMIN + 10;
+ kudu::Status stack_trace_status =
kudu::SetStackTraceSignal(stack_trace_signal);
+ if (!stack_trace_status.ok()) {
+ // Using fprintf since LOG isn't available yet
+ fprintf(stderr, "WARNING: Failed to initialize stack trace signal handler
with "
+ "signal %d: %s. The /stacks endpoint will not work.\n",
stack_trace_signal,
+ stack_trace_status.ToString().c_str());
+ } else {
+ fprintf(stderr, "INFO: Stack trace signal handler initialized with signal
%d "
+ "(SIGRTMIN+10). Access thread stacks via the /stacks web UI (DO
NOT manually "
+ "send this signal).\n", stack_trace_signal);
+ }
+
impala::InitGoogleLoggingSafe(argv[0]);
// Breakpad needs flags and logging to initialize.
if (!external_fe) {
diff --git a/be/src/util/default-path-handlers.cc
b/be/src/util/default-path-handlers.cc
index f5429cc87..f4bfb511e 100644
--- a/be/src/util/default-path-handlers.cc
+++ b/be/src/util/default-path-handlers.cc
@@ -27,7 +27,13 @@
#include "common/daemon-env.h"
#include "common/logging.h"
+#include "kudu/gutil/strings/strip.h"
+#include "kudu/util/array_view.h"
+#include "kudu/util/debug-util.h"
+#include "kudu/util/env.h"
+#include "kudu/util/faststring.h"
#include "kudu/util/flags.h"
+#include "kudu/util/monotime.h"
#include "rpc/jni-thrift-util.h"
#include "runtime/exec-env.h"
#include "runtime/mem-tracker.h"
@@ -44,6 +50,7 @@
#include "util/pprof-path-handlers.h"
#include "util/process-state-info.h"
#include "util/runtime-profile-counters.h"
+#include "util/thread.h"
#include "common/names.h"
@@ -177,6 +184,81 @@ void ProfileDocsHandler(const Webserver::WebRequest& req,
Document* document) {
document->AddMember("profile_docs", profile_docs, document->GetAllocator());
}
+// Registered to handle "/stacks"; produces a plain text dump of all thread stacks.
+void StacksHandler(const Webserver::WebRequest& req, Document* document) {
+ // Tell the webserver to render as plain text instead of HTML
+ document->AddMember(rapidjson::StringRef(Webserver::ENABLE_RAW_HTML_KEY),
true,
+ document->GetAllocator());
+
+ // Snapshot of all native thread stacks; MonoTime (below) times the collection
+ kudu::StackTraceSnapshot snapshot;
+ // Disable automatic thread name collection - we'll use Impala's ThreadMgr instead
+ // to get consistent names with the /thread-group page
+ snapshot.set_capture_thread_names(false);
+
+ kudu::MonoTime start = kudu::MonoTime::Now();
+ kudu::Status status = snapshot.SnapshotAllStacks();
+
+ stringstream output;
+ if (!status.ok()) {
+ output << "Error collecting stack traces: " << status.ToString() << "\n";
+ } else {
+ kudu::MonoDelta elapsed = kudu::MonoTime::Now() - start;
+ output << "Collected stacks from " << snapshot.num_threads() << " threads
in "
+ << elapsed.ToSeconds() << "s\n";
+
+ if (snapshot.num_failed() > 0) {
+ output << "Failed to collect stacks for " << snapshot.num_failed() << "
threads "
+ << "(they may have exited while we were iterating over the threads)\n";
+ }
+ output << "\n";
+
+ // Visit groups of threads with the same stack trace
+ snapshot.VisitGroups(
+ [&output](kudu::ArrayView<kudu::StackTraceSnapshot::ThreadInfo> group) {
+ // Print the thread count if there are multiple threads with the same stack
+ if (group.size() > 1) {
+ output << group.size() << " threads with same stack:\n";
+ }
+
+ // Print thread IDs and names
+ for (auto& info : group) {
+ // Try to get the thread name from Impala's ThreadMgr first for consistency
+ // with the /thread-group page. If not found (e.g., JVM threads or library
+ // threads), fall back to reading from /proc.
+ string thread_name;
+ if (!GetThreadNameByTid(info.tid, &thread_name)) {
+ // Thread not managed by Impala - read from /proc/self/task/[tid]/comm
+ kudu::faststring buf;
+ kudu::Status s = kudu::ReadFileToString(kudu::Env::Default(),
+ strings::Substitute("/proc/self/task/$0/comm", info.tid), &buf);
+ if (!s.ok()) {
+ thread_name = "<unknown>";
+ } else {
+ thread_name = buf.ToString();
+ StripTrailingNewline(&thread_name);
+ }
+ }
+ output << "TID " << info.tid << " (" << thread_name << "):\n";
+ }
+
+ // Print the stack trace (or error if collection failed)
+ if (group[0].status.ok() && group[0].stack.HasCollected()) {
+ output << group[0].stack.Symbolize();
+ } else if (!group[0].status.ok()) {
+ output << " <stack trace collection failed: " <<
group[0].status.ToString()
+ << ">\n";
+ } else {
+ output << " <stack trace not collected>\n";
+ }
+ output << "\n\n";
+ });
+ }
+
+ Value contents(output.str().c_str(), document->GetAllocator());
+ document->AddMember("contents", contents, document->GetAllocator());
+}
+
void JmxHandler(const Webserver::WebRequest& req, Document* document) {
document->AddMember(rapidjson::StringRef(Webserver::ENABLE_PLAIN_JSON_KEY),
true,
document->GetAllocator());
@@ -344,6 +426,7 @@ void AddDefaultUrlCallbacks(Webserver* webserver,
MetricGroup* metric_group,
webserver->RegisterUrlCallback("/varz", "flags.tmpl", FlagsHandler, true);
webserver->RegisterUrlCallback(
"/profile_docs", "profile_docs.tmpl", ProfileDocsHandler, true);
+ webserver->RegisterUrlCallback("/stacks", "raw_text.tmpl", StacksHandler,
false);
if (JniUtil::is_jvm_inited()) {
// JmxHandler outputs a plain JSON string and does not require a template
to
// render. However RawUrlCallback only supports PLAIN content type.
diff --git a/be/src/util/thread.cc b/be/src/util/thread.cc
index 315f2f6b6..9303cfa0a 100644
--- a/be/src/util/thread.cc
+++ b/be/src/util/thread.cc
@@ -144,6 +144,11 @@ class ThreadMgr {
// ]
void ThreadGroupUrlCallback(const Webserver::WebRequest& req, Document*
output);
+ // Looks up a thread name by its system TID. Returns true and sets 'name' if found;
+ // returns false if the thread is not managed by ThreadMgr (e.g., JVM threads or
+ // threads created by libraries).
+ bool GetThreadNameByTid(int64_t tid, string* name);
+
private:
// Container class for any details we want to capture about a thread
// TODO: Add start-time.
@@ -294,6 +299,20 @@ void ThreadMgr::ThreadGroupUrlCallback(const
Webserver::WebRequest& req,
document->AddMember("threads", lst, document->GetAllocator());
}
+bool ThreadMgr::GetThreadNameByTid(int64_t tid, string* name) {
+ DCHECK(name != nullptr);
+ lock_guard<mutex> l(lock_);
+ for (const ThreadCategoryMap::value_type& category : thread_categories_) {
+ for (const auto& thread : category.second.threads_by_id) {
+ if (thread.second.thread_id() == tid) {
+ *name = thread.second.name();
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
Status Thread::StartThread(const std::string& category, const std::string&
name,
const ThreadFunctor& functor, unique_ptr<Thread>* thread,
bool fault_injection_eligible) {
@@ -374,6 +393,13 @@ int ThreadGroup::Size() const {
return threads_.size();
}
+bool GetThreadNameByTid(int64_t tid, string* name) {
+ if (thread_manager) {
+ return thread_manager->GetThreadNameByTid(tid, name);
+ }
+ return false;
+}
+
namespace {
void RegisterUrlCallbacks(bool include_jvm_threads, Webserver* webserver) {
diff --git a/be/src/util/thread.h b/be/src/util/thread.h
index ab94250e7..0a8b109e3 100644
--- a/be/src/util/thread.h
+++ b/be/src/util/thread.h
@@ -225,6 +225,11 @@ void InitThreading();
/// live JVM threads in the web UI.
Status StartThreadInstrumentation(MetricGroup* metrics, Webserver* webserver,
bool include_jvm_threads) WARN_UNUSED_RESULT;
+
+/// Looks up a thread name by its system TID (thread ID). Returns true and sets 'name'
+/// to the thread's full descriptive name if the thread is managed by Impala's ThreadMgr.
+/// Returns false, leaving 'name' unchanged, if the thread is not found (e.g., JVM
+/// threads, library threads) or if no ThreadMgr instance exists.
+bool GetThreadNameByTid(int64_t tid, std::string* name);
}
#endif
diff --git a/tests/webserver/test_web_pages.py
b/tests/webserver/test_web_pages.py
index 4a8d5fa10..63f5d2b51 100644
--- a/tests/webserver/test_web_pages.py
+++ b/tests/webserver/test_web_pages.py
@@ -66,6 +66,7 @@ class TestWebPage(ImpalaTestSuite):
EVENT_PROCESSOR_URL = "http://localhost:{0}/events"
HADOOP_VARZ_URL = "http://localhost:{0}/hadoop-varz"
JVM_THREADZ_URL = "http://localhost:{0}/jvm-threadz"
+ STACKS_URL = "http://localhost:{0}/stacks"
# log4j changes do not apply to the statestore since it doesn't
# have an embedded JVM. So we make two sets of ports to test the
@@ -200,6 +201,77 @@ class TestWebPage(ImpalaTestSuite):
except ValueError:
assert False, "Invalid JSON returned from /jmx endpoint: %s" % jmx_json
+ def test_stacks_endpoint(self):
+ """Tests that the /stacks endpoint on all daemons returns valid stack
traces."""
+ for port in self.TEST_PORTS_WITH_SS:
+ input_url = self.STACKS_URL.format(port)
+ response = requests.get(input_url)
+ assert response.status_code == requests.codes.ok
+ # The /stacks endpoint returns plain text
+ assert "text/plain" in response.headers['Content-Type']
+
+ # Verify the response contains expected stack trace elements
+ stack_text = response.text
+ assert "Collected stacks from" in stack_text, \
+ "Missing collection summary in response from port %s" % port
+ assert "threads in" in stack_text, \
+ "Missing thread count in response from port %s" % port
+ assert "TID" in stack_text, \
+ "Missing thread IDs in response from port %s" % port
+
+ # Verify it contains at least one thread with a stack trace
+ # Stack traces should have lines starting with '@' for frame information
+ assert "@" in stack_text, \
+ "Missing stack frames in response from port %s" % port
+
+ def test_stacks_endpoint_format(self):
+ """Tests that the /stacks endpoint returns properly formatted stack
traces."""
+ for port in self.TEST_PORTS_WITH_SS:
+ input_url = self.STACKS_URL.format(port)
+ response = requests.get(input_url)
+ assert response.status_code == requests.codes.ok
+
+ stack_text = response.text
+ lines = stack_text.split('\n')
+
+ # Verify the header line format
+ header_pattern = re.compile(r'Collected stacks from \d+ threads in
\d+\.\d+s')
+ assert any(header_pattern.match(line) for line in lines), \
+ "Missing or malformed collection summary header"
+
+ # Verify thread ID format (e.g., "TID 1911294 (statestored):" or
+ # "TID 262192 (subscriber-priority-update-worker(2:10)):" with nested parens).
+ # Use a greedy match that captures everything between "TID <num> (" and "):"
+ tid_pattern = re.compile(r'TID \d+ \(.+\):')
+ assert any(tid_pattern.match(line) for line in lines), \
+ "Missing or malformed thread ID lines"
+
+ # Verify stack frame format (lines starting with '@' and containing addresses)
+ frame_pattern = re.compile(r'^\s*@\s+0x[0-9a-f]+\s+.*')
+ assert any(frame_pattern.match(line) for line in lines), \
+ "Missing or malformed stack frame lines"
+
+ # Check for thread grouping format (e.g., "3 threads with same stack:")
+ # This may or may not be present depending on whether there are duplicate stacks
+ group_pattern = re.compile(r'^\d+ threads with same stack:')
+ has_grouped_threads = any(group_pattern.match(line) for line in lines)
+
+ # If there are grouped threads, verify that the TIDs are listed before the stack
+ if has_grouped_threads:
+ # Find a group header
+ for i, line in enumerate(lines):
+ if group_pattern.match(line):
+ # The next few lines should be TID lines
+ assert i + 1 < len(lines), "Group header at end of output"
+ # At least one TID should follow
+ found_tid = False
+ for j in range(i + 1, min(i + 10, len(lines))):
+ if tid_pattern.match(lines[j]):
+ found_tid = True
+ break
+ assert found_tid, "No TID found after group header"
+ break
+
def get_and_check_status(
self, url, string_to_search="", ports_to_test=None, regex=False,
headers=None):
"""Helper method that polls a given url and asserts the return code is ok
and
diff --git a/www/threadz.tmpl b/www/threadz.tmpl
index 5f52a3d93..a4a2d0cca 100644
--- a/www/threadz.tmpl
+++ b/www/threadz.tmpl
@@ -27,6 +27,7 @@ under the License.
{{/total-threads}}
<a href="{{ __common__.host-url }}/thread-group?all"><h5>All threads</h5></a>
+<a href="{{ __common__.host-url }}/stacks"><h5>Stacks</h5></a>
{{#thread-groups}}
<a href='{{ __common__.host-url }}/thread-group?group={{name}}'>