This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 1970cc709 IMPALA-13844: Add /stacks web page to dump native thread stacks
1970cc709 is described below

commit 1970cc709df637f1e416198fb9a6553b2947bffa
Author: Arnab Karmakar <[email protected]>
AuthorDate: Fri Nov 7 11:30:51 2025 +0530

    IMPALA-13844: Add /stacks web page to dump native thread stacks
    
    Kudu 1.7 introduced a utility (KUDU-2291) and a /stacks web page
    for dumping stack traces of all native threads. Impala already
    has the corresponding debug util ported from Kudu, so this patch
    adds a similar /stacks endpoint to Impala’s WebUI.
    
    This enables developers to easily inspect thread states and
    diagnose deadlocks or hangs via the web interface.
    
    Testing:
    Added unit tests to verify that the /stacks endpoint
    is available and returns valid thread stack information.
    
    TODO:
    Add jstack output to the /stacks handler via a parameter like
    `/stacks?include_jvm=true`. Note that `JniUtil.java` currently uses
    `ThreadMXBean.dumpAllThreads()`, but Java’s `ThreadInfo` does not
    expose native thread IDs (nids), limiting correlation with
    native stacks.
    
    Change-Id: I9171c89d1d36726d98a4a61ca040d69254f50e92
    Reviewed-on: http://gerrit.cloudera.org:8080/23652
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/common/init.cc                | 28 ++++++++++++
 be/src/util/default-path-handlers.cc | 83 ++++++++++++++++++++++++++++++++++++
 be/src/util/thread.cc                | 26 +++++++++++
 be/src/util/thread.h                 |  5 +++
 tests/webserver/test_web_pages.py    | 72 +++++++++++++++++++++++++++++++
 www/threadz.tmpl                     |  1 +
 6 files changed, 215 insertions(+)

diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index ee4ea6dd7..f4ea31b1d 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -41,6 +41,7 @@
 #include "runtime/lib-cache.h"
 #include "runtime/mem-tracker.h"
 #include "service/impala-server.h"
+#include "kudu/util/debug-util.h"
 #include "util/cgroup-util.h"
 #include "util/cpu-info.h"
 #include "util/debug-util.h"
@@ -585,6 +586,33 @@ void impala::InitCommonRuntime(int argc, char** argv, bool init_jvm,
     }
   }
 
+  // Initialize the signal handler for stack trace collection BEFORE
+  // InitGoogleLoggingSafe. This must happen before
+  // google::InstallFailureSignalHandler() is called (which happens
+  // inside InitGoogleLoggingSafe), otherwise that might block our signal.
+  //
+  // We use SIGRTMIN+10 instead of SIGUSR1/SIGUSR2 because:
+  //   - SIGUSR1 is used by minidump handling
+  //   - SIGUSR2 crashes the embedded JVM
+  //
+  // IMPORTANT: This signal handler is used INTERNALLY by the /stacks web endpoint.
+  // It is NOT meant to be triggered manually via kill/pkill. The handler expects
+  // specific data structures to be set up when triggered, which only happens during
+  // a web request to /stacks. Manually sending this signal will crash the process.
+  // If you want to see thread stacks, visit the /stacks web UI page instead.
+  const int stack_trace_signal = SIGRTMIN + 10;
+  kudu::Status stack_trace_status = kudu::SetStackTraceSignal(stack_trace_signal);
+  if (!stack_trace_status.ok()) {
+    // Using fprintf since LOG isn't available yet
+    fprintf(stderr, "WARNING: Failed to initialize stack trace signal handler with "
+            "signal %d: %s. The /stacks endpoint will not work.\n", stack_trace_signal,
+            stack_trace_status.ToString().c_str());
+  } else {
+    fprintf(stderr, "INFO: Stack trace signal handler initialized with signal %d "
+            "(SIGRTMIN+10). Access thread stacks via the /stacks web UI (DO NOT manually "
+            "send this signal).\n", stack_trace_signal);
+  }
+
   impala::InitGoogleLoggingSafe(argv[0]);
   // Breakpad needs flags and logging to initialize.
   if (!external_fe) {
diff --git a/be/src/util/default-path-handlers.cc b/be/src/util/default-path-handlers.cc
index f5429cc87..f4bfb511e 100644
--- a/be/src/util/default-path-handlers.cc
+++ b/be/src/util/default-path-handlers.cc
@@ -27,7 +27,13 @@
 
 #include "common/daemon-env.h"
 #include "common/logging.h"
+#include "kudu/gutil/strings/strip.h"
+#include "kudu/util/array_view.h"
+#include "kudu/util/debug-util.h"
+#include "kudu/util/env.h"
+#include "kudu/util/faststring.h"
 #include "kudu/util/flags.h"
+#include "kudu/util/monotime.h"
 #include "rpc/jni-thrift-util.h"
 #include "runtime/exec-env.h"
 #include "runtime/mem-tracker.h"
@@ -44,6 +50,7 @@
 #include "util/pprof-path-handlers.h"
 #include "util/process-state-info.h"
 #include "util/runtime-profile-counters.h"
+#include "util/thread.h"
 
 #include "common/names.h"
 
@@ -177,6 +184,81 @@ void ProfileDocsHandler(const Webserver::WebRequest& req, Document* document) {
   document->AddMember("profile_docs", profile_docs, document->GetAllocator());
 }
 
+// Registered to handle "/stacks", and produces a plain text dump of all thread stacks.
+void StacksHandler(const Webserver::WebRequest& req, Document* document) {
+  // Tell the webserver to render as plain text instead of HTML
+  document->AddMember(rapidjson::StringRef(Webserver::ENABLE_RAW_HTML_KEY), true,
+      document->GetAllocator());
+
+  // Use MonoTime to measure elapsed time
+  kudu::StackTraceSnapshot snapshot;
+  // Disable automatic thread name collection - we'll use Impala's ThreadMgr instead
+  // to get consistent names with the /thread-group page
+  snapshot.set_capture_thread_names(false);
+
+  kudu::MonoTime start = kudu::MonoTime::Now();
+  kudu::Status status = snapshot.SnapshotAllStacks();
+
+  stringstream output;
+  if (!status.ok()) {
+    output << "Error collecting stack traces: " << status.ToString() << "\n";
+  } else {
+    kudu::MonoDelta elapsed = kudu::MonoTime::Now() - start;
+    output << "Collected stacks from " << snapshot.num_threads() << " threads in "
+      << elapsed.ToSeconds() << "s\n";
+
+    if (snapshot.num_failed() > 0) {
+      output << "Failed to collect stacks for " << snapshot.num_failed() << " threads "
+        << "(they may have exited while we were iterating over the threads)\n";
+    }
+    output << "\n";
+
+    // Visit groups of threads with the same stack trace
+    snapshot.VisitGroups(
+      [&output](kudu::ArrayView<kudu::StackTraceSnapshot::ThreadInfo> group) {
+      // Print the thread count if there are multiple threads with the same stack
+      if (group.size() > 1) {
+        output << group.size() << " threads with same stack:\n";
+      }
+
+      // Print thread IDs and names
+      for (auto& info : group) {
+        // Try to get the thread name from Impala's ThreadMgr first for consistency
+        // with the /thread-group page. If not found (e.g., JVM threads or library
+        // threads), fall back to reading from /proc.
+        string thread_name;
+        if (!GetThreadNameByTid(info.tid, &thread_name)) {
+          // Thread not managed by Impala - read from /proc/self/task/[tid]/comm
+          kudu::faststring buf;
+          kudu::Status s = kudu::ReadFileToString(kudu::Env::Default(),
+              strings::Substitute("/proc/self/task/$0/comm", info.tid), &buf);
+          if (!s.ok()) {
+            thread_name = "<unknown>";
+          } else {
+            thread_name = buf.ToString();
+            StripTrailingNewline(&thread_name);
+          }
+        }
+        output << "TID " << info.tid << " (" << thread_name << "):\n";
+      }
+
+      // Print the stack trace (or error if collection failed)
+      if (group[0].status.ok() && group[0].stack.HasCollected()) {
+        output << group[0].stack.Symbolize();
+      } else if (!group[0].status.ok()) {
+        output << "    <stack trace collection failed: " << group[0].status.ToString()
+          << ">\n";
+      } else {
+        output << "    <stack trace not collected>\n";
+      }
+      output << "\n\n";
+    });
+  }
+
+  Value contents(output.str().c_str(), document->GetAllocator());
+  document->AddMember("contents", contents, document->GetAllocator());
+}
+
 void JmxHandler(const Webserver::WebRequest& req, Document* document) {
   document->AddMember(rapidjson::StringRef(Webserver::ENABLE_PLAIN_JSON_KEY), true,
       document->GetAllocator());
@@ -344,6 +426,7 @@ void AddDefaultUrlCallbacks(Webserver* webserver, MetricGroup* metric_group,
   webserver->RegisterUrlCallback("/varz", "flags.tmpl", FlagsHandler, true);
   webserver->RegisterUrlCallback(
       "/profile_docs", "profile_docs.tmpl", ProfileDocsHandler, true);
+  webserver->RegisterUrlCallback("/stacks", "raw_text.tmpl", StacksHandler, false);
   if (JniUtil::is_jvm_inited()) {
     // JmxHandler outputs a plain JSON string and does not require a template to
     // render. However RawUrlCallback only supports PLAIN content type.
diff --git a/be/src/util/thread.cc b/be/src/util/thread.cc
index 315f2f6b6..9303cfa0a 100644
--- a/be/src/util/thread.cc
+++ b/be/src/util/thread.cc
@@ -144,6 +144,11 @@ class ThreadMgr {
   //        ]
   void ThreadGroupUrlCallback(const Webserver::WebRequest& req, Document* output);
 
+  // Looks up a thread name by its system TID. Returns true and sets 'name' if found,
+  // returns false if the thread is not managed by ThreadMgr (e.g., JVM threads or
+  // threads created by libraries).
+  bool GetThreadNameByTid(int64_t tid, string* name);
+
  private:
   // Container class for any details we want to capture about a thread
   // TODO: Add start-time.
@@ -294,6 +299,20 @@ void ThreadMgr::ThreadGroupUrlCallback(const Webserver::WebRequest& req,
   document->AddMember("threads", lst, document->GetAllocator());
 }
 
+bool ThreadMgr::GetThreadNameByTid(int64_t tid, string* name) {
+  DCHECK(name != nullptr);
+  lock_guard<mutex> l(lock_);
+  for (const ThreadCategoryMap::value_type& category : thread_categories_) {
+    for (const auto& thread : category.second.threads_by_id) {
+      if (thread.second.thread_id() == tid) {
+        *name = thread.second.name();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 Status Thread::StartThread(const std::string& category, const std::string& name,
     const ThreadFunctor& functor, unique_ptr<Thread>* thread,
     bool fault_injection_eligible) {
@@ -374,6 +393,13 @@ int ThreadGroup::Size() const {
   return threads_.size();
 }
 
+bool GetThreadNameByTid(int64_t tid, string* name) {
+  if (thread_manager) {
+    return thread_manager->GetThreadNameByTid(tid, name);
+  }
+  return false;
+}
+
 namespace {
 
 void RegisterUrlCallbacks(bool include_jvm_threads, Webserver* webserver) {
diff --git a/be/src/util/thread.h b/be/src/util/thread.h
index ab94250e7..0a8b109e3 100644
--- a/be/src/util/thread.h
+++ b/be/src/util/thread.h
@@ -225,6 +225,11 @@ void InitThreading();
 /// live JVM threads in the web UI.
 Status StartThreadInstrumentation(MetricGroup* metrics, Webserver* webserver,
     bool include_jvm_threads) WARN_UNUSED_RESULT;
+
+/// Looks up a thread name by its system TID (thread ID). Returns true and sets 'name'
+/// to the thread's full descriptive name if the thread is managed by Impala's ThreadMgr.
+/// Returns false if the thread is not found (e.g., JVM threads, library threads)
+bool GetThreadNameByTid(int64_t tid, std::string* name);
 }
 
 #endif
diff --git a/tests/webserver/test_web_pages.py b/tests/webserver/test_web_pages.py
index 4a8d5fa10..63f5d2b51 100644
--- a/tests/webserver/test_web_pages.py
+++ b/tests/webserver/test_web_pages.py
@@ -66,6 +66,7 @@ class TestWebPage(ImpalaTestSuite):
   EVENT_PROCESSOR_URL = "http://localhost:{0}/events";
   HADOOP_VARZ_URL = "http://localhost:{0}/hadoop-varz";
   JVM_THREADZ_URL = "http://localhost:{0}/jvm-threadz";
+  STACKS_URL = "http://localhost:{0}/stacks";
 
   # log4j changes do not apply to the statestore since it doesn't
   # have an embedded JVM. So we make two sets of ports to test the
@@ -200,6 +201,77 @@ class TestWebPage(ImpalaTestSuite):
       except ValueError:
         assert False, "Invalid JSON returned from /jmx endpoint: %s" % jmx_json
 
+  def test_stacks_endpoint(self):
+    """Tests that the /stacks endpoint on all daemons returns valid stack traces."""
+    for port in self.TEST_PORTS_WITH_SS:
+      input_url = self.STACKS_URL.format(port)
+      response = requests.get(input_url)
+      assert response.status_code == requests.codes.ok
+      # The /stacks endpoint returns plain text
+      assert "text/plain" in response.headers['Content-Type']
+
+      # Verify the response contains expected stack trace elements
+      stack_text = response.text
+      assert "Collected stacks from" in stack_text, \
+          "Missing collection summary in response from port %s" % port
+      assert "threads in" in stack_text, \
+          "Missing thread count in response from port %s" % port
+      assert "TID" in stack_text, \
+          "Missing thread IDs in response from port %s" % port
+
+      # Verify it contains at least one thread with a stack trace
+      # Stack traces should have lines starting with '@' for frame information
+      assert "@" in stack_text, \
+          "Missing stack frames in response from port %s" % port
+
+  def test_stacks_endpoint_format(self):
+    """Tests that the /stacks endpoint returns properly formatted stack traces."""
+    for port in self.TEST_PORTS_WITH_SS:
+      input_url = self.STACKS_URL.format(port)
+      response = requests.get(input_url)
+      assert response.status_code == requests.codes.ok
+
+      stack_text = response.text
+      lines = stack_text.split('\n')
+
+      # Verify the header line format
+      header_pattern = re.compile(r'Collected stacks from \d+ threads in \d+\.\d+s')
+      assert any(header_pattern.match(line) for line in lines), \
+          "Missing or malformed collection summary header"
+
+      # Verify thread ID format (e.g., "TID 1911294 (statestored):" or
+      # "TID 262192 (subscriber-priority-update-worker(2:10)):" with nested parens)
+      # Use a greedy match that captures everything between "TID <num> (" and "):"
+      tid_pattern = re.compile(r'TID \d+ \(.+\):')
+      assert any(tid_pattern.match(line) for line in lines), \
+          "Missing or malformed thread ID lines"
+
+      # Verify stack frame format (lines starting with '@' and containing addresses)
+      frame_pattern = re.compile(r'^\s*@\s+0x[0-9a-f]+\s+.*')
+      assert any(frame_pattern.match(line) for line in lines), \
+          "Missing or malformed stack frame lines"
+
+      # Check for thread grouping format (e.g., "3 threads with same stack:")
+      # This may or may not be present depending on whether there are duplicate stacks
+      group_pattern = re.compile(r'^\d+ threads with same stack:')
+      has_grouped_threads = any(group_pattern.match(line) for line in lines)
+
+      # If there are grouped threads, verify that the TIDs are listed before the stack
+      if has_grouped_threads:
+        # Find a group header
+        for i, line in enumerate(lines):
+          if group_pattern.match(line):
+            # The next few lines should be TID lines
+            assert i + 1 < len(lines), "Group header at end of output"
+            # At least one TID should follow
+            found_tid = False
+            for j in range(i + 1, min(i + 10, len(lines))):
+              if tid_pattern.match(lines[j]):
+                found_tid = True
+                break
+            assert found_tid, "No TID found after group header"
+            break
+
   def get_and_check_status(
       self, url, string_to_search="", ports_to_test=None, regex=False, headers=None):
     """Helper method that polls a given url and asserts the return code is ok and
diff --git a/www/threadz.tmpl b/www/threadz.tmpl
index 5f52a3d93..a4a2d0cca 100644
--- a/www/threadz.tmpl
+++ b/www/threadz.tmpl
@@ -27,6 +27,7 @@ under the License.
 {{/total-threads}}
 
 <a href="{{ __common__.host-url }}/thread-group?all"><h5>All threads</h5></a>
+<a href="{{ __common__.host-url }}/stacks"><h5>Stacks</h5></a>
 
 {{#thread-groups}}
 <a href='{{ __common__.host-url }}/thread-group?group={{name}}'>

Reply via email to