Re: [PR] [feat](cloud) Add system rate limit for meta-service [doris]

via GitHub Thu, 26 Mar 2026 23:55:27 -0700


bobhan1 commented on code in PR #61516:
URL: https://github.com/apache/doris/pull/61516#discussion_r2999323464



##########
cloud/src/meta-service/meta_service_rate_limit_helper.cpp:
##########
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "meta-service/meta_service_rate_limit_helper.h"
+
+#include <fmt/format.h>
+#include <sys/resource.h>
+#include <sys/sysinfo.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <deque>
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <string>
+#include <thread>
+
+#include "common/config.h"
+#include "common/logging.h"
+
+namespace doris::cloud {
+namespace {
+constexpr int64_t kNanosecondsPerMillisecond = 1000 * 1000;
+constexpr int64_t kInvalidPercent = -1;
+
+struct WindowSample {
+    int64_t second {0};
+    int64_t fdb_client_thread_busyness_percent {BVAR_FDB_INVALID_VALUE};
+    int64_t ms_cpu_usage_percent {kInvalidPercent};
+    int64_t ms_memory_usage_percent {kInvalidPercent};
+};
+
+struct ProcessResourceSample {
+    int64_t cpu_usage_percent {kInvalidPercent};
+    int64_t memory_usage_percent {kInvalidPercent};
+};
+
+class ProcessResourceSampler {
+public:
+    ProcessResourceSample sample() {
+        using namespace std::chrono;
+
+        const auto now = steady_clock::now();
+        const int64_t current_cpu_time_ns = get_process_cpu_time_ns();
+        ProcessResourceSample sample;
+        sample.memory_usage_percent = get_process_memory_usage_percent();
+
+        const auto current_wall_time_ns =
+                duration_cast<nanoseconds>(now.time_since_epoch()).count();
+        std::lock_guard lock(mutex_);
+        if (last_cpu_time_ns_ != kInvalidPercent && current_cpu_time_ns != 
kInvalidPercent &&
+            current_wall_time_ns > last_wall_time_ns_) {
+            const double delta_cpu_ns = current_cpu_time_ns - 
last_cpu_time_ns_;
+            const double delta_wall_ns = current_wall_time_ns - 
last_wall_time_ns_;
+            const uint32_t cpu_cores = std::max<uint32_t>(1, 
std::thread::hardware_concurrency());
+            sample.cpu_usage_percent =
+                    static_cast<int64_t>(delta_cpu_ns * 100.0 / delta_wall_ns 
/ cpu_cores);
+        }
+        last_cpu_time_ns_ = current_cpu_time_ns;
+        last_wall_time_ns_ = current_wall_time_ns;
+        return sample;
+    }
+
+private:
+    static int64_t get_process_cpu_time_ns() {
+        rusage usage {};
+        if (getrusage(RUSAGE_SELF, &usage) != 0) {
+            return kInvalidPercent;
+        }
+        return usage.ru_utime.tv_sec * 1000L * 1000 * 1000 + 
usage.ru_utime.tv_usec * 1000L +
+               usage.ru_stime.tv_sec * 1000L * 1000 * 1000 + 
usage.ru_stime.tv_usec * 1000L;
+    }
+
+    static int64_t get_process_memory_usage_percent() {
+        std::ifstream status("/proc/self/status");
+        if (!status.is_open()) {
+            return kInvalidPercent;
+        }
+
+        int64_t rss_kb = kInvalidPercent;
+        std::string line;
+        while (std::getline(status, line)) {
+            if (!line.starts_with("VmRSS:")) {
+                continue;
+            }
+            size_t pos = std::string("VmRSS:").size();
+            while (pos < line.size() && line[pos] == ' ') {
+                ++pos;
+            }
+            rss_kb = std::stoll(line.substr(pos));
+            break;
+        }
+        if (rss_kb == kInvalidPercent) {
+            return kInvalidPercent;
+        }
+
+        struct sysinfo info {};
+        if (sysinfo(&info) != 0) {
+            return kInvalidPercent;
+        }
+        const double total_memory_bytes =
+                static_cast<double>(info.totalram) * 
static_cast<double>(info.mem_unit);
+        if (total_memory_bytes <= 0) {
+            return kInvalidPercent;
+        }
+        const double rss_bytes = static_cast<double>(rss_kb) * 1024.0;
+        return static_cast<int64_t>(rss_bytes * 100.0 / total_memory_bytes);
+    }
+
+    std::mutex mutex_;
+    int64_t last_cpu_time_ns_ {kInvalidPercent};
+    int64_t last_wall_time_ns_ {0};
+};
+
+MsStressMetrics collect_ms_stress_metrics(ProcessResourceSampler* sampler) {
+    MsStressMetrics metrics;
+    metrics.fdb_commit_latency_ns = 
g_bvar_fdb_latency_probe_commit_ns.get_value();
+    metrics.fdb_read_latency_ns = g_bvar_fdb_latency_probe_read_ns.get_value();
+    metrics.fdb_performance_limited_by_name = 
g_bvar_fdb_performance_limited_by_name.get_value();
+    metrics.fdb_client_thread_busyness_percent =
+            g_bvar_fdb_client_thread_busyness_percent.get_value();
+    const auto resource_sample = sampler->sample();
+    metrics.ms_cpu_usage_percent = resource_sample.cpu_usage_percent;
+    metrics.ms_memory_usage_percent = resource_sample.memory_usage_percent;
+    return metrics;
+}
+
+class MsStressDetector {
+public:
+    ~MsStressDetector() { stop(); }
+
+    // Compute decision from metrics and store it in latest_decision_.
+    // Called by the background thread or synchronously in tests.
+    void update(int64_t now_ms, const MsStressMetrics& metrics) {
+        auto decision = std::make_shared<MsStressDecision>();
+        decision->fdb_commit_latency_ns = metrics.fdb_commit_latency_ns;
+        decision->fdb_read_latency_ns = metrics.fdb_read_latency_ns;
+        decision->fdb_performance_limited_by_name = 
metrics.fdb_performance_limited_by_name;
+        decision->fdb_client_thread_busyness_percent = 
metrics.fdb_client_thread_busyness_percent;
+        decision->ms_cpu_usage_percent = metrics.ms_cpu_usage_percent;
+        decision->ms_memory_usage_percent = metrics.ms_memory_usage_percent;
+        const bool commit_latency_high =
+                metrics.fdb_commit_latency_ns != BVAR_FDB_INVALID_VALUE &&
+                metrics.fdb_commit_latency_ns >
+                        config::ms_rate_limit_fdb_commit_latency_ms * 
kNanosecondsPerMillisecond;
+        const bool read_latency_high =
+                metrics.fdb_read_latency_ns != BVAR_FDB_INVALID_VALUE &&
+                metrics.fdb_read_latency_ns >
+                        config::ms_rate_limit_fdb_read_latency_ms * 
kNanosecondsPerMillisecond;
+        decision->fdb_cluster_under_pressure = (commit_latency_high || 
read_latency_high) &&
+                                               
metrics.fdb_performance_limited_by_name != 0;

Review Comment:
   **[Bug]** `fdb_performance_limited_by_name` 的初始值为 `BVAR_FDB_INVALID_VALUE`（非零）。在该 bvar 尚未被 metric.cpp 更新时（如 MS 刚启动），如果 commit/read latency 
恰好超过阈值，`metrics.fdb_performance_limited_by_name != 0` 为 true，会**误判为 FDB 存在非 workload 瓶颈**而触发限流。
   
   建议增加 INVALID_VALUE 过滤：
   ```cpp
   decision->fdb_cluster_under_pressure = (commit_latency_high || 
read_latency_high) &&
       metrics.fdb_performance_limited_by_name != BVAR_FDB_INVALID_VALUE &&
       metrics.fdb_performance_limited_by_name != 0;
   ```



##########
cloud/src/meta-service/meta_service_rate_limit_helper.cpp:
##########
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "meta-service/meta_service_rate_limit_helper.h"
+
+#include <fmt/format.h>
+#include <sys/resource.h>
+#include <sys/sysinfo.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <deque>
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <string>
+#include <thread>
+
+#include "common/config.h"
+#include "common/logging.h"
+
+namespace doris::cloud {
+namespace {
+constexpr int64_t kNanosecondsPerMillisecond = 1000 * 1000;
+constexpr int64_t kInvalidPercent = -1;
+
+struct WindowSample {
+    int64_t second {0};
+    int64_t fdb_client_thread_busyness_percent {BVAR_FDB_INVALID_VALUE};
+    int64_t ms_cpu_usage_percent {kInvalidPercent};
+    int64_t ms_memory_usage_percent {kInvalidPercent};
+};
+
+struct ProcessResourceSample {
+    int64_t cpu_usage_percent {kInvalidPercent};
+    int64_t memory_usage_percent {kInvalidPercent};
+};
+
+class ProcessResourceSampler {
+public:
+    ProcessResourceSample sample() {
+        using namespace std::chrono;
+
+        const auto now = steady_clock::now();
+        const int64_t current_cpu_time_ns = get_process_cpu_time_ns();
+        ProcessResourceSample sample;
+        sample.memory_usage_percent = get_process_memory_usage_percent();
+
+        const auto current_wall_time_ns =
+                duration_cast<nanoseconds>(now.time_since_epoch()).count();
+        std::lock_guard lock(mutex_);
+        if (last_cpu_time_ns_ != kInvalidPercent && current_cpu_time_ns != 
kInvalidPercent &&
+            current_wall_time_ns > last_wall_time_ns_) {
+            const double delta_cpu_ns = current_cpu_time_ns - 
last_cpu_time_ns_;
+            const double delta_wall_ns = current_wall_time_ns - 
last_wall_time_ns_;
+            const uint32_t cpu_cores = std::max<uint32_t>(1, 
std::thread::hardware_concurrency());
+            sample.cpu_usage_percent =
+                    static_cast<int64_t>(delta_cpu_ns * 100.0 / delta_wall_ns 
/ cpu_cores);
+        }
+        last_cpu_time_ns_ = current_cpu_time_ns;
+        last_wall_time_ns_ = current_wall_time_ns;
+        return sample;
+    }
+
+private:
+    static int64_t get_process_cpu_time_ns() {
+        rusage usage {};
+        if (getrusage(RUSAGE_SELF, &usage) != 0) {
+            return kInvalidPercent;
+        }
+        return usage.ru_utime.tv_sec * 1000L * 1000 * 1000 + 
usage.ru_utime.tv_usec * 1000L +
+               usage.ru_stime.tv_sec * 1000L * 1000 * 1000 + 
usage.ru_stime.tv_usec * 1000L;
+    }
+
+    static int64_t get_process_memory_usage_percent() {
+        std::ifstream status("/proc/self/status");
+        if (!status.is_open()) {
+            return kInvalidPercent;
+        }
+
+        int64_t rss_kb = kInvalidPercent;
+        std::string line;
+        while (std::getline(status, line)) {
+            if (!line.starts_with("VmRSS:")) {
+                continue;
+            }
+            size_t pos = std::string("VmRSS:").size();
+            while (pos < line.size() && line[pos] == ' ') {
+                ++pos;
+            }
+            rss_kb = std::stoll(line.substr(pos));
+            break;

Review Comment:
   **[Bug]** `std::stoll` 在输入为空或非法（无法解析出数字）时会抛出 `std::invalid_argument`，在数值越界时会抛出 
`std::out_of_range`；这里没有捕获异常，可能导致进程 crash。
   
   VmRSS 行格式为 `"VmRSS:   1234 kB"`，`stoll` 会跳过前导空白并在数字之后的空格处停止解析，所以正常情况没问题；但建议做防御性处理，使用 
`std::from_chars` 或 try-catch 保护。注意 `std::from_chars` 不会跳过前导空白，改用时需要先跳过 `VmRSS:` 之后的空白字符（空格或制表符）。



##########
cloud/src/meta-service/meta_service_helper.h:
##########
@@ -311,6 +312,20 @@ inline MetaServiceCode cast_as(TxnErrorCode code) {
     [[maybe_unused]] std::string instance_id;                                  
               \
     [[maybe_unused]] bool drop_request = false;                                
               \
     [[maybe_unused]] KVStats stats;                                            
               \
+    [[maybe_unused]] MsStressDecision ms_stress_decision;                      
               \
+    if (config::enable_ms_rate_limit || 
config::enable_ms_rate_limit_injection) {             \
+        ms_stress_decision = get_ms_stress_decision();                         
               \
+    }                                                                          
               \
+    if ((config::enable_ms_rate_limit || 
config::enable_ms_rate_limit_injection) &&           \
+        ms_stress_decision.under_greate_stress()) {                            
               \
+        drop_request = true;                                                   
               \
+        code = MetaServiceCode::MS_TOO_BUSY;                                   
               \
+        msg = ms_stress_decision.debug_string();                               
               \
+        response->mutable_status()->set_code(code);                            
               \
+        response->mutable_status()->set_msg(msg);                              
               \
+        finish_rpc(#func_name, ctrl, request, response);                       
               \
+        return;                                                                
               \

Review Comment:
   **[Design]** 限流触发时手动调用 `finish_rpc()` 然后 `return`。`DORIS_CLOUD_DEFER` 定义在这个 
`return` 之后，提前返回时还没有注册，所以不会执行——这部分正确。
   
   但需确认 `brpc::ClosureGuard closure_guard` 析构时（return 触发）是否会重复调用 
`done->Run()`。如果 `finish_rpc` 内部也触发了 closure，则可能 double-run。
   
   建议在 return 前加 `closure_guard.reset(nullptr)`，或者将限流检查移到 `DORIS_CLOUD_DEFER` 
之后、业务逻辑之前（与现有 `RPC_RATE_LIMIT` 宏同样的位置），这样可以复用 DEFER 中的 `finish_rpc` 逻辑。



##########
cloud/src/common/metric.cpp:
##########
@@ -130,6 +130,16 @@ static void export_fdb_status_details(const std::string& 
status_str) {
         if (node->value.IsArray()) return node->value.Size();
         return BVAR_FDB_INVALID_VALUE;
     };
+    auto get_string_value = [&](const std::vector<const char*>& v) -> 
std::string {
+        if (v.empty()) return "invalid";
+        auto node = document.FindMember("cluster");

Review Comment:
   **[Bug]** 如果 JSON 中不存在 `"cluster"` key，`document.FindMember("cluster")` 返回 
`MemberEnd()`，后续 `node->value` 访问是**未定义行为**。
   
   建议加检查：
   ```cpp
   auto node = document.FindMember("cluster");
   if (node == document.MemberEnd()) return "invalid";
   ```
   
   （注：已有的 `get_value`/`get_nanoseconds` 也有同样问题，但作为新增代码建议先修正。）



##########
be/src/cloud/cloud_meta_mgr.cpp:
##########
@@ -423,6 +423,8 @@ Status retry_rpc(std::string_view op_name, const Request& 
req, Response* res,
         } else if (res->status().code() == MetaServiceCode::INVALID_ARGUMENT) {
             return Status::Error<ErrorCode::INVALID_ARGUMENT, false>("failed 
to {}: {}", op_name,
                                                                      
res->status().msg());
+        } else if (res->status().code() == MetaServiceCode::MS_TOO_BUSY) {

Review Comment:
   **[Design]** `MS_TOO_BUSY` 与 `KV_TXN_CONFLICT` 走相同重试路径，但限流场景有不同需求：
   
   1. **退避策略**：限流时应使用更长的退避时间（指数退避），否则会在 MS 过载时继续施压
   2. **独立重试上限**：类似 `meta_service_conflict_error_retry_times`，`MS_TOO_BUSY` 
也应有独立配置控制重试次数
   3. **日志信息**：retry loop 中的 WARNING 日志打印的是 `cntl.ErrorText()`，而不是 `error_msg`（MS 
返回的限流原因），不利于排查



##########
cloud/src/common/config.h:
##########
@@ -193,6 +193,20 @@ CONF_Int64(default_max_qps_limit, "1000000");
 CONF_String(specific_max_qps_limit, "get_cluster:5000000;begin_txn:5000000");
 CONF_Bool(enable_rate_limit, "true");
 CONF_Int64(bvar_qps_update_second, "5");
+CONF_Bool(enable_ms_rate_limit, "true");

Review Comment:
   **[Design]** `enable_ms_rate_limit` 是 `CONF_Bool`（非 mutable），修改需重启 
MS。其他阈值参数均为 `CONF_mXxx` 可热更新。如果线上误触发限流需要紧急关闭，无法运行时修改是一个**可用性风险**。
   
   建议改为 `CONF_mBool`。



##########
cloud/src/meta-service/meta_service_rate_limit_helper.h:
##########
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "common/bvars.h"
+
+namespace doris::cloud {
+
+struct MsStressMetrics {
+    int64_t fdb_commit_latency_ns {BVAR_FDB_INVALID_VALUE};
+    int64_t fdb_read_latency_ns {BVAR_FDB_INVALID_VALUE};
+    int64_t fdb_performance_limited_by_name {BVAR_FDB_INVALID_VALUE};
+    int64_t fdb_client_thread_busyness_percent {BVAR_FDB_INVALID_VALUE};
+    int64_t ms_cpu_usage_percent {-1};
+    int64_t ms_memory_usage_percent {-1};
+};
+
+struct MsStressDecision {
+    bool fdb_cluster_under_pressure {false};
+    bool fdb_client_thread_under_pressure {false};
+    bool ms_resource_under_pressure {false};
+    bool rate_limit_injected_for_test {false};
+    int64_t fdb_commit_latency_ns {BVAR_FDB_INVALID_VALUE};
+    int64_t fdb_read_latency_ns {BVAR_FDB_INVALID_VALUE};
+    int64_t fdb_performance_limited_by_name {BVAR_FDB_INVALID_VALUE};
+    int64_t fdb_client_thread_busyness_percent {BVAR_FDB_INVALID_VALUE};
+    double fdb_client_thread_busyness_avg_percent {-1};
+    int64_t ms_cpu_usage_percent {-1};
+    double ms_cpu_usage_avg_percent {-1};
+    int64_t ms_memory_usage_percent {-1};
+    double ms_memory_usage_avg_percent {-1};
+    int32_t rate_limit_injected_random_value {-1};
+
+    [[nodiscard]] bool under_greate_stress() const {

Review Comment:
   **[Minor/Typo]** `under_greate_stress` → `under_great_stress`。作为公共 
API，建议在合入前修正，避免后续改名成本。同理 `check_ms_if_under_greate_stress()` 也需修正。



##########
cloud/src/meta-service/meta_service_rate_limit_helper.cpp:
##########
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "meta-service/meta_service_rate_limit_helper.h"
+
+#include <fmt/format.h>
+#include <sys/resource.h>
+#include <sys/sysinfo.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <deque>
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <string>
+#include <thread>
+
+#include "common/config.h"
+#include "common/logging.h"
+
+namespace doris::cloud {
+namespace {
+constexpr int64_t kNanosecondsPerMillisecond = 1000 * 1000;
+constexpr int64_t kInvalidPercent = -1;
+
+struct WindowSample {
+    int64_t second {0};
+    int64_t fdb_client_thread_busyness_percent {BVAR_FDB_INVALID_VALUE};
+    int64_t ms_cpu_usage_percent {kInvalidPercent};
+    int64_t ms_memory_usage_percent {kInvalidPercent};
+};
+
+struct ProcessResourceSample {
+    int64_t cpu_usage_percent {kInvalidPercent};
+    int64_t memory_usage_percent {kInvalidPercent};
+};
+
+class ProcessResourceSampler {
+public:
+    ProcessResourceSample sample() {
+        using namespace std::chrono;
+
+        const auto now = steady_clock::now();
+        const int64_t current_cpu_time_ns = get_process_cpu_time_ns();
+        ProcessResourceSample sample;
+        sample.memory_usage_percent = get_process_memory_usage_percent();
+
+        const auto current_wall_time_ns =
+                duration_cast<nanoseconds>(now.time_since_epoch()).count();
+        std::lock_guard lock(mutex_);
+        if (last_cpu_time_ns_ != kInvalidPercent && current_cpu_time_ns != 
kInvalidPercent &&
+            current_wall_time_ns > last_wall_time_ns_) {
+            const double delta_cpu_ns = current_cpu_time_ns - 
last_cpu_time_ns_;
+            const double delta_wall_ns = current_wall_time_ns - 
last_wall_time_ns_;
+            const uint32_t cpu_cores = std::max<uint32_t>(1, 
std::thread::hardware_concurrency());
+            sample.cpu_usage_percent =
+                    static_cast<int64_t>(delta_cpu_ns * 100.0 / delta_wall_ns 
/ cpu_cores);
+        }
+        last_cpu_time_ns_ = current_cpu_time_ns;
+        last_wall_time_ns_ = current_wall_time_ns;
+        return sample;
+    }
+
+private:
+    static int64_t get_process_cpu_time_ns() {
+        rusage usage {};
+        if (getrusage(RUSAGE_SELF, &usage) != 0) {
+            return kInvalidPercent;
+        }
+        return usage.ru_utime.tv_sec * 1000L * 1000 * 1000 + 
usage.ru_utime.tv_usec * 1000L +
+               usage.ru_stime.tv_sec * 1000L * 1000 * 1000 + 
usage.ru_stime.tv_usec * 1000L;
+    }
+
+    static int64_t get_process_memory_usage_percent() {
+        std::ifstream status("/proc/self/status");
+        if (!status.is_open()) {
+            return kInvalidPercent;
+        }
+
+        int64_t rss_kb = kInvalidPercent;
+        std::string line;
+        while (std::getline(status, line)) {
+            if (!line.starts_with("VmRSS:")) {
+                continue;
+            }
+            size_t pos = std::string("VmRSS:").size();
+            while (pos < line.size() && line[pos] == ' ') {
+                ++pos;
+            }
+            rss_kb = std::stoll(line.substr(pos));
+            break;
+        }
+        if (rss_kb == kInvalidPercent) {
+            return kInvalidPercent;
+        }
+
+        struct sysinfo info {};
+        if (sysinfo(&info) != 0) {
+            return kInvalidPercent;
+        }
+        const double total_memory_bytes =
+                static_cast<double>(info.totalram) * 
static_cast<double>(info.mem_unit);
+        if (total_memory_bytes <= 0) {
+            return kInvalidPercent;
+        }
+        const double rss_bytes = static_cast<double>(rss_kb) * 1024.0;
+        return static_cast<int64_t>(rss_bytes * 100.0 / total_memory_bytes);
+    }
+
+    std::mutex mutex_;
+    int64_t last_cpu_time_ns_ {kInvalidPercent};
+    int64_t last_wall_time_ns_ {0};
+};
+
+MsStressMetrics collect_ms_stress_metrics(ProcessResourceSampler* sampler) {
+    MsStressMetrics metrics;
+    metrics.fdb_commit_latency_ns = 
g_bvar_fdb_latency_probe_commit_ns.get_value();
+    metrics.fdb_read_latency_ns = g_bvar_fdb_latency_probe_read_ns.get_value();
+    metrics.fdb_performance_limited_by_name = 
g_bvar_fdb_performance_limited_by_name.get_value();
+    metrics.fdb_client_thread_busyness_percent =
+            g_bvar_fdb_client_thread_busyness_percent.get_value();
+    const auto resource_sample = sampler->sample();
+    metrics.ms_cpu_usage_percent = resource_sample.cpu_usage_percent;
+    metrics.ms_memory_usage_percent = resource_sample.memory_usage_percent;
+    return metrics;
+}
+
+class MsStressDetector {
+public:
+    ~MsStressDetector() { stop(); }
+
+    // Compute decision from metrics and store it in latest_decision_.
+    // Called by the background thread or synchronously in tests.
+    void update(int64_t now_ms, const MsStressMetrics& metrics) {
+        auto decision = std::make_shared<MsStressDecision>();
+        decision->fdb_commit_latency_ns = metrics.fdb_commit_latency_ns;
+        decision->fdb_read_latency_ns = metrics.fdb_read_latency_ns;
+        decision->fdb_performance_limited_by_name = 
metrics.fdb_performance_limited_by_name;
+        decision->fdb_client_thread_busyness_percent = 
metrics.fdb_client_thread_busyness_percent;
+        decision->ms_cpu_usage_percent = metrics.ms_cpu_usage_percent;
+        decision->ms_memory_usage_percent = metrics.ms_memory_usage_percent;
+        const bool commit_latency_high =
+                metrics.fdb_commit_latency_ns != BVAR_FDB_INVALID_VALUE &&
+                metrics.fdb_commit_latency_ns >
+                        config::ms_rate_limit_fdb_commit_latency_ms * 
kNanosecondsPerMillisecond;
+        const bool read_latency_high =
+                metrics.fdb_read_latency_ns != BVAR_FDB_INVALID_VALUE &&
+                metrics.fdb_read_latency_ns >
+                        config::ms_rate_limit_fdb_read_latency_ms * 
kNanosecondsPerMillisecond;
+        decision->fdb_cluster_under_pressure = (commit_latency_high || 
read_latency_high) &&
+                                               
metrics.fdb_performance_limited_by_name != 0;
+
+        const int64_t current_second = now_ms / 1000;
+        // No mutex needed: update() is only called from a single thread
+        // (background thread in production, test thread in tests).
+        record_sample(current_second, metrics);
+
+        const double avg_busyness =
+                get_window_avg(current_second, 
&WindowSample::fdb_client_thread_busyness_percent,
+                               BVAR_FDB_INVALID_VALUE);
+        decision->fdb_client_thread_busyness_avg_percent = avg_busyness;
+        if (avg_busyness >= 0 &&
+            metrics.fdb_client_thread_busyness_percent != 
BVAR_FDB_INVALID_VALUE) {
+            decision->fdb_client_thread_under_pressure =
+                    avg_busyness > 
config::ms_rate_limit_fdb_client_thread_busyness_avg_percent &&
+                    metrics.fdb_client_thread_busyness_percent >
+                            
config::ms_rate_limit_fdb_client_thread_busyness_instant_percent;
+        }
+
+        const double avg_cpu = get_window_avg(current_second, 
&WindowSample::ms_cpu_usage_percent,
+                                              kInvalidPercent);
+        const double avg_memory = get_window_avg(
+                current_second, &WindowSample::ms_memory_usage_percent, 
kInvalidPercent);
+        decision->ms_cpu_usage_avg_percent = avg_cpu;
+        decision->ms_memory_usage_avg_percent = avg_memory;
+        if (avg_cpu >= 0 && metrics.ms_cpu_usage_percent != kInvalidPercent) {
+            decision->ms_resource_under_pressure =
+                    metrics.ms_cpu_usage_percent > 
config::ms_rate_limit_cpu_usage_percent &&
+                    avg_cpu > config::ms_rate_limit_cpu_usage_percent;
+        }
+        if (avg_memory >= 0 && metrics.ms_memory_usage_percent != 
kInvalidPercent) {
+            decision->ms_resource_under_pressure =
+                    decision->ms_resource_under_pressure ||
+                    (metrics.ms_memory_usage_percent > 
config::ms_rate_limit_memory_usage_percent &&
+                     avg_memory > config::ms_rate_limit_memory_usage_percent);
+        }
+        latest_decision_.store(std::move(decision));
+    }
+
+    // Lock-free read of the latest decision. Returns nullptr before first 
update.
+    std::shared_ptr<const MsStressDecision> get_latest_decision() const {
+        return latest_decision_.load();
+    }
+
+    void reset() { samples_.clear(); }
+
+    // Start the background thread that periodically collects metrics and 
updates.
+    void start() {
+        std::unique_lock lock(mtx_);
+        if (running_.load() != 0) {
+            return;
+        }
+        running_.store(1);
+        bg_thread_ = std::make_unique<std::thread>([this] {
+            pthread_setname_np(pthread_self(), "ms_stress_det");
+            LOG(INFO) << "MsStressDetector background thread started";
+            ProcessResourceSampler sampler;
+            while (running_.load() == 1) {
+                const auto now_ms = 
std::chrono::duration_cast<std::chrono::milliseconds>(
+                                            
std::chrono::steady_clock::now().time_since_epoch())
+                                            .count();
+                const auto metrics = collect_ms_stress_metrics(&sampler);
+                update(now_ms, metrics);
+                std::unique_lock l(mtx_);
+                cv_.wait_for(l, std::chrono::seconds(1), [this]() { return 
running_.load() != 1; });
+            }
+            LOG(INFO) << "MsStressDetector background thread stopped";
+        });
+    }
+
+    void stop() {
+        {
+            std::unique_lock lock(mtx_);
+            if (running_.load() != 1) {
+                return;
+            }
+            running_.store(2);
+            cv_.notify_all();
+        }
+        if (bg_thread_ && bg_thread_->joinable()) {
+            bg_thread_->join();
+            bg_thread_.reset();
+        }
+    }
+
+private:
+    using SampleField = int64_t WindowSample::*;
+
+    void record_sample(int64_t current_second, const MsStressMetrics& metrics) 
{
+        WindowSample sample;
+        sample.second = current_second;
+        sample.fdb_client_thread_busyness_percent = 
metrics.fdb_client_thread_busyness_percent;
+        sample.ms_cpu_usage_percent = metrics.ms_cpu_usage_percent;
+        sample.ms_memory_usage_percent = metrics.ms_memory_usage_percent;
+        if (!samples_.empty() && samples_.back().second == current_second) {
+            samples_.back() = sample;
+        } else {
+            samples_.push_back(sample);
+        }
+
+        const int64_t window_start =
+                current_second - std::max<int64_t>(1, 
config::ms_rate_limit_window_seconds) + 1;
+        while (!samples_.empty() && samples_.front().second < window_start) {
+            samples_.pop_front();
+        }
+    }
+
+    double get_window_avg(int64_t current_second, SampleField field, int64_t 
invalid_value) const {
+        if (samples_.empty()) {
+            return -1;
+        }
+        const int64_t required_span =
+                std::max<int64_t>(1, config::ms_rate_limit_window_seconds) - 1;
+        if (samples_.back().second != current_second ||
+            current_second - samples_.front().second < required_span) {
+            return -1;
+        }
+
+        double sum = 0;
+        int64_t valid_count = 0;
+        for (const auto& sample : samples_) {
+            if (sample.*field == invalid_value) {
+                continue;
+            }
+            sum += sample.*field;
+            ++valid_count;
+        }
+        if (valid_count == 0) {
+            return -1;
+        }
+        return sum / valid_count;
+    }
+
+    std::atomic<std::shared_ptr<const MsStressDecision>> latest_decision_;
+    std::deque<WindowSample> samples_;
+
+    // Background thread lifecycle
+    std::atomic<int> running_ {0};
+    mutable std::mutex mtx_;
+    std::condition_variable cv_;
+    std::unique_ptr<std::thread> bg_thread_;
+};
+
+MsStressDetector& global_ms_stress_detector() {
+    static MsStressDetector detector;
+    // Auto-start background thread on first access.
+    // start() is idempotent: subsequent calls are no-ops.
+    detector.start();
+    return detector;
+}
+
+int32_t get_ms_rate_limit_injection_random_value() {
+    thread_local std::mt19937 gen(std::random_device {}());
+    thread_local std::uniform_int_distribution<int32_t> dist(0, 99);
+    return dist(gen);
+}
+
+void maybe_apply_ms_rate_limit_injection(MsStressDecision* decision, int32_t random_value) {
+    if (!config::enable_ms_rate_limit_injection) {
+        return;
+    }
+    if (random_value < 0 || random_value >= config::ms_rate_limit_injection_probability) {
+        return;
+    }
+    decision->rate_limit_injected_for_test = true;
+    decision->rate_limit_injected_random_value = random_value;
+}
+} // namespace
+
+std::string MsStressDecision::debug_string() const {
+    if (!under_greate_stress()) {
+        return "meta service rate limited: no stress condition matched";
+    }
+
+    std::vector<std::string> reasons;
+    if (fdb_cluster_under_pressure) {
+        reasons.push_back(fmt::format(
+                "fdb_cluster(commit_latency_ms={}, read_latency_ms={}, performance_limited_by={})",
+                fdb_commit_latency_ns == BVAR_FDB_INVALID_VALUE ? -1
+                                                                : fdb_commit_latency_ns / 1000000,
+                fdb_read_latency_ns == BVAR_FDB_INVALID_VALUE ? -1 : fdb_read_latency_ns / 1000000,
+                fdb_performance_limited_by_name));
+    }
+    if (fdb_client_thread_under_pressure) {
+        reasons.push_back(fmt::format(
+                "fdb_client_thread(busyness_avg={:.2f}%, busyness_instant={}%, thresholds=avg>{}% "
+                "and instant>{}%)",
+                fdb_client_thread_busyness_avg_percent, fdb_client_thread_busyness_percent,
+                config::ms_rate_limit_fdb_client_thread_busyness_avg_percent,
+                config::ms_rate_limit_fdb_client_thread_busyness_instant_percent));
+    }
+    if (ms_resource_under_pressure) {
+        reasons.push_back(
+                fmt::format("ms_resource(cpu_current={}%, cpu_avg={:.2f}%, memory_current={}%, "
+                            "memory_avg={:.2f}%, thresholds=cpu>{}% or memory>{}%)",
+                            ms_cpu_usage_percent, ms_cpu_usage_avg_percent, ms_memory_usage_percent,
+                            ms_memory_usage_avg_percent, config::ms_rate_limit_cpu_usage_percent,
+                            config::ms_rate_limit_memory_usage_percent));
+    }
+    if (rate_limit_injected_for_test) {
+        reasons.push_back(fmt::format("test_injection(random_value={}, probability<{}%)",
+                                      rate_limit_injected_random_value,
+                                      config::ms_rate_limit_injection_probability));
+    }
+    return fmt::format("meta service rate limited by {}", fmt::join(reasons, "; "));
+}
+
+MsStressDecision get_ms_stress_decision() {
+    auto decision_ptr = global_ms_stress_detector().get_latest_decision();
+    MsStressDecision decision;
+    if (decision_ptr) {
+        decision = *decision_ptr;
+    }
+    // Rate limit injection is per-request (random), so apply it here, not in the background thread.
+    maybe_apply_ms_rate_limit_injection(&decision, get_ms_rate_limit_injection_random_value());
+    return decision;
+}
+
+bool check_ms_if_under_greate_stress() {

Review Comment:
   **[Minor]** `check_ms_if_under_greate_stress()` 在本 PR 中未被任何地方调用，属于死代码。建议移除，待需要时再加。



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [feat](cloud) Add system rate limit for meta-service [doris]

Reply via email to