This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new e755d64e62f [feature](be jvm monitor)append enable_jvm_monitor in be.conf to control jvm monitor. (#35608) (#35764) e755d64e62f is described below commit e755d64e62fa6c315c4a833a7dff62a59e2bce66 Author: Mingyu Chen <morning...@163.com> AuthorDate: Sun Jun 2 00:18:44 2024 +0800 [feature](be jvm monitor)append enable_jvm_monitor in be.conf to control jvm monitor. (#35608) (#35764) bp #35608 Co-authored-by: daidai <2017501...@qq.com> --- be/src/common/config.cpp | 3 + be/src/common/config.h | 3 + be/src/util/jvm_metrics.cpp | 112 ++++++++++++++++++++----- be/src/util/jvm_metrics.h | 16 ++-- regression-test/pipeline/external/conf/be.conf | 3 + regression-test/pipeline/p0/conf/be.conf | 3 + regression-test/pipeline/p1/conf/be.conf | 5 +- 7 files changed, 116 insertions(+), 29 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 8d8cb8f222e..6d0c866cebe 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1222,6 +1222,9 @@ DEFINE_mInt32(thrift_client_open_num_tries, "1"); DEFINE_mBool(ignore_schema_change_check, "false"); +//JVM monitoring enable. To prevent be from crashing due to jvm compatibility issues. The default setting is off. +DEFINE_Bool(enable_jvm_monitor, "false"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 0334cb085c6..b172a3406d6 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1303,6 +1303,9 @@ DECLARE_mInt32(thrift_client_open_num_tries); DECLARE_mBool(ignore_schema_change_check); +//JVM monitoring enable. To prevent be from crashing due to jvm compatibility issues. 
+DECLARE_Bool(enable_jvm_monitor); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp index e55cf8f3fbe..fc30d1073ac 100644 --- a/be/src/util/jvm_metrics.cpp +++ b/be/src/util/jvm_metrics.cpp @@ -17,10 +17,12 @@ #include "jvm_metrics.h" +#include <util/jni-util.h> + #include <functional> +#include "common/config.h" #include "util/metrics.h" - namespace doris { #define DEFINE_JVM_SIZE_BYTES_METRIC(name, type) \ @@ -76,15 +78,28 @@ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(jvm_gc_g1_old_generation_time_ms, MetricUni const char* JvmMetrics::_s_hook_name = "jvm_metrics"; -JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env) { +JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) { DCHECK(registry != nullptr); _registry = registry; _server_entity = _registry->register_entity("server"); DCHECK(_server_entity != nullptr); - if (_jvm_stats.init_complete()) { + + do { + if (!doris::config::enable_jvm_monitor) { + break; + } + try { + _jvm_stats.init(env); + } catch (...) { + LOG(WARNING) << "JVM STATS INIT FAIL"; + break; + } + if (!_jvm_stats.init_complete()) { + break; + } _server_entity->register_hook(_s_hook_name, std::bind(&JvmMetrics::update, this)); - } + } while (false); INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_max); INT_GAUGE_METRIC_REGISTER(_server_entity, jvm_heap_size_bytes_committed); @@ -117,11 +132,58 @@ JvmMetrics::JvmMetrics(MetricRegistry* registry, JNIEnv* env) : _jvm_stats(env) } void JvmMetrics::update() { - _jvm_stats.refresh(this); + static long fail_count = 0; + bool have_exception = false; + try { + _jvm_stats.refresh(this); + } catch (...) { + have_exception = true; + LOG(WARNING) << "JVM MONITOR UPDATE FAIL!"; + fail_count++; + } + + //When 30 consecutive exceptions occur, turn off jvm information collection. 
+ if (!have_exception) { + fail_count = 0; + } + if (fail_count >= 30) { + LOG(WARNING) << "JVM MONITOR CLOSE!"; + _jvm_stats.set_complete(false); + _server_entity->deregister_hook(_s_hook_name); + + jvm_heap_size_bytes_max->set_value(0); + jvm_heap_size_bytes_committed->set_value(0); + jvm_heap_size_bytes_used->set_value(0); + + jvm_non_heap_size_bytes_used->set_value(0); + jvm_non_heap_size_bytes_committed->set_value(0); + + jvm_young_size_bytes_used->set_value(0); + jvm_young_size_bytes_peak_used->set_value(0); + jvm_young_size_bytes_max->set_value(0); + + jvm_old_size_bytes_used->set_value(0); + jvm_old_size_bytes_peak_used->set_value(0); + jvm_old_size_bytes_max->set_value(0); + + jvm_thread_count->set_value(0); + jvm_thread_peak_count->set_value(0); + jvm_thread_new_count->set_value(0); + jvm_thread_runnable_count->set_value(0); + jvm_thread_blocked_count->set_value(0); + jvm_thread_waiting_count->set_value(0); + jvm_thread_timed_waiting_count->set_value(0); + jvm_thread_terminated_count->set_value(0); + + jvm_gc_g1_young_generation_count->set_value(0); + jvm_gc_g1_young_generation_time_ms->set_value(0); + jvm_gc_g1_old_generation_count->set_value(0); + jvm_gc_g1_old_generation_time_ms->set_value(0); + } } -#include <util/jni-util.h> -jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) { +void JvmStats::init(JNIEnv* ENV) { + env = ENV; _managementFactoryClass = env->FindClass("java/lang/management/ManagementFactory"); if (_managementFactoryClass == nullptr) { LOG(WARNING) @@ -244,15 +306,19 @@ jvmStats::jvmStats(JNIEnv* ENV) : env(ENV) { LOG(INFO) << "Start JVM monitoring."; _init_complete = true; + return; } -#include "jni.h" - -void jvmStats::refresh(JvmMetrics* jvm_metrics) { +void JvmStats::refresh(JvmMetrics* jvm_metrics) { if (!_init_complete) { return; } - static_cast<void>(JniUtil::GetJNIEnv(&env)); + + Status st = JniUtil::GetJNIEnv(&env); + if (!st.ok()) { + LOG(WARNING) << "JVM STATS GET JNI ENV FAIL"; + return; + } jobject memoryMXBeanObj = 
env->CallStaticObjectMethod(_managementFactoryClass, _getMemoryMXBeanMethod); @@ -302,8 +368,8 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) { jstring name = (jstring)env->CallObjectMethod(memoryPoolMXBean, _getMemoryPollMXBeanNameMethod); - const char* nameStr = env->GetStringUTFChars(name, NULL); - if (nameStr != NULL) { + const char* nameStr = env->GetStringUTFChars(name, nullptr); + if (nameStr != nullptr) { auto it = _memoryPoolName.find(nameStr); if (it == _memoryPoolName.end()) { continue; @@ -408,16 +474,22 @@ void jvmStats::refresh(JvmMetrics* jvm_metrics) { env->DeleteLocalRef(threadMXBean); env->DeleteLocalRef(gcMXBeansList); } -jvmStats::~jvmStats() { +JvmStats::~JvmStats() { if (!_init_complete) { return; } - env->DeleteLocalRef(_newThreadStateObj); - env->DeleteLocalRef(_runnableThreadStateObj); - env->DeleteLocalRef(_blockedThreadStateObj); - env->DeleteLocalRef(_waitingThreadStateObj); - env->DeleteLocalRef(_timedWaitingThreadStateObj); - env->DeleteLocalRef(_terminatedThreadStateObj); + try { + env->DeleteLocalRef(_newThreadStateObj); + env->DeleteLocalRef(_runnableThreadStateObj); + env->DeleteLocalRef(_blockedThreadStateObj); + env->DeleteLocalRef(_waitingThreadStateObj); + env->DeleteLocalRef(_timedWaitingThreadStateObj); + env->DeleteLocalRef(_terminatedThreadStateObj); + + } catch (...) { + // When be is killed, DeleteLocalRef may fail. + // In order to exit more gracefully, we catch the exception here. 
+ } } } // namespace doris diff --git a/be/src/util/jvm_metrics.h b/be/src/util/jvm_metrics.h index 5f9929d8cf0..459a3cbf938 100644 --- a/be/src/util/jvm_metrics.h +++ b/be/src/util/jvm_metrics.h @@ -17,8 +17,6 @@ #pragma once -#include <jni.h> - #include "jni.h" #include "util/jni-util.h" #include "util/metrics.h" @@ -27,7 +25,7 @@ namespace doris { class JvmMetrics; -class jvmStats { +class JvmStats { private: JNIEnv* env = nullptr; jclass _managementFactoryClass = nullptr; @@ -98,16 +96,18 @@ private: bool _init_complete = false; public: - jvmStats(JNIEnv* ENV); - bool init_complete() { return _init_complete; } + // JvmStats(JNIEnv* ENV); + void init(JNIEnv* ENV); + bool init_complete() const { return _init_complete; } + void set_complete(bool val) { _init_complete = val; } void refresh(JvmMetrics* jvm_metrics); - ~jvmStats(); + ~JvmStats(); }; class JvmMetrics { public: JvmMetrics(MetricRegistry* registry, JNIEnv* env); - ~JvmMetrics() {} + ~JvmMetrics() = default; void update(); IntGauge* jvm_heap_size_bytes_max = nullptr; @@ -140,7 +140,7 @@ public: IntGauge* jvm_gc_g1_old_generation_time_ms = nullptr; private: - jvmStats _jvm_stats; + JvmStats _jvm_stats; std::shared_ptr<MetricEntity> _server_entity; static const char* _s_hook_name; MetricRegistry* _registry = nullptr; diff --git a/regression-test/pipeline/external/conf/be.conf b/regression-test/pipeline/external/conf/be.conf index 9a5b3641b84..2bd810e55cc 100644 --- a/regression-test/pipeline/external/conf/be.conf +++ b/regression-test/pipeline/external/conf/be.conf @@ -70,3 +70,6 @@ fragment_pool_thread_num_max=5000 enable_fuzzy_mode=true enable_set_in_bitmap_value=true enable_feature_binlog=true + +enable_jvm_monitor = true + diff --git a/regression-test/pipeline/p0/conf/be.conf b/regression-test/pipeline/p0/conf/be.conf index 15f19ec4f42..b5d6944acae 100644 --- a/regression-test/pipeline/p0/conf/be.conf +++ b/regression-test/pipeline/p0/conf/be.conf @@ -82,3 +82,6 @@ user_files_secure_path=/ 
enable_debug_points=true # debug scanner context dead loop enable_debug_log_timeout_secs=0 + +enable_jvm_monitor = true + diff --git a/regression-test/pipeline/p1/conf/be.conf b/regression-test/pipeline/p1/conf/be.conf index e1ae9653c78..0c450c9281e 100644 --- a/regression-test/pipeline/p1/conf/be.conf +++ b/regression-test/pipeline/p1/conf/be.conf @@ -71,4 +71,7 @@ fragment_pool_thread_num_max=5000 enable_fuzzy_mode=true enable_set_in_bitmap_value=true enable_feature_binlog=true -max_sys_mem_available_low_water_mark_bytes=69206016 \ No newline at end of file +max_sys_mem_available_low_water_mark_bytes=69206016 + +enable_jvm_monitor = true + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org