This is an automated email from the ASF dual-hosted git repository.

hello-stephen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new ff954cc45f3 [test](pipeline) collect Cloud P0 resource usage (#65125)
ff954cc45f3 is described below

commit ff954cc45f3fd568c80dcee0ce75877ade8fba9c
Author: shuke <[email protected]>
AuthorDate: Fri Jul 3 11:02:21 2026 +0800

    [test](pipeline) collect Cloud P0 resource usage (#65125)
    
    ## Proposed changes
    - Sample system resource usage every minute during Cloud P0 regression
    runs.
    - Save snapshots to output/regression-test/log/system_resource_usage.log
    so they are archived with Doris logs.
    - Capture free, meminfo, top 10 processes by RSS/CPU, top output, and a
    final snapshot before archive.
    
    ## Testing
    - bash -n regression-test/pipeline/common/doris-utils.sh
    - bash -n regression-test/pipeline/cloud_p0/run.sh
    - git diff --check
---
 regression-test/pipeline/cloud_p0/run.sh       |  3 ++
 regression-test/pipeline/common/doris-utils.sh | 75 ++++++++++++++++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/regression-test/pipeline/cloud_p0/run.sh 
b/regression-test/pipeline/cloud_p0/run.sh
index 7728cd94698..c659af89141 100644
--- a/regression-test/pipeline/cloud_p0/run.sh
+++ b/regression-test/pipeline/cloud_p0/run.sh
@@ -49,6 +49,8 @@ need_collect_log=false
 
 # monitoring the log files in "${DORIS_HOME}"/regression-test/log/ for keyword 
'Reach limit of connections'
 _monitor_regression_log &
+start_system_resource_monitor
+trap 'stop_system_resource_monitor' EXIT
 
 # shellcheck disable=SC2329
 run() {
@@ -114,6 +116,7 @@ timeout_minutes=$((${repeat_times_from_trigger:-1} * 
${BUILD_TIMEOUT_MINUTES:-18
 timeout "${timeout_minutes}" bash -cx run
 exit_flag="$?"
 if print_running_pipeline_tasks; then :; fi
+stop_system_resource_monitor
 # shellcheck source=/dev/null
 source "$(cd "${teamcity_build_checkoutDir}" && bash 
"${teamcity_build_checkoutDir}"/regression-test/pipeline/common/get-or-set-tmp-env.sh
 'get')"
 if get_jstack_and_jmap_of_fe; then echo "INFO: get_jstack_and_jmap_of_fe 
done."; fi
diff --git a/regression-test/pipeline/common/doris-utils.sh 
b/regression-test/pipeline/common/doris-utils.sh
index 00b6bc9f40a..aa3ee55dc39 100644
--- a/regression-test/pipeline/common/doris-utils.sh
+++ b/regression-test/pipeline/common/doris-utils.sh
@@ -662,6 +662,81 @@ _monitor_regression_log() {
 
 }
 
+_append_system_resource_snapshot() {  # Append one timestamped system resource snapshot to the file given as $1.
+    local output_file="$1"  # destination file; opened in append mode by the redirection at the end
+    local top_n="${SYSTEM_RESOURCE_MONITOR_TOP_N:-10}"  # process rows per listing; defaults to 10
+    {  # group the commands so one redirection covers all of them
+        echo "==================== $(date '+%Y-%m-%d %H:%M:%S %z') 
===================="
+        echo "[uptime]"
+        uptime || true  # '|| true' forces a zero status when the probe fails
+        echo
+        echo "[free -h]"
+        free -h || true
+        echo
+        echo "[/proc/meminfo]"
+        grep -E 
'^(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree|Slab|SReclaimable|SUnreclaim):'
 /proc/meminfo || true
+        echo
+        echo "[ps sorted by rss]"
+        ps -eo pid,ppid,rss,vsz,%mem,%cpu,comm,args --sort=-rss | head -n 
"$((top_n + 1))" || true  # +1 keeps the ps header row in addition to top_n processes
+        echo
+        echo "[ps sorted by cpu]"
+        ps -eo pid,ppid,rss,vsz,%mem,%cpu,comm,args --sort=-%cpu | head -n 
"$((top_n + 1))" || true  # +1 keeps the ps header row in addition to top_n processes
+        echo
+        if command -v top >/dev/null; then  # the [top] section is skipped when top is not on PATH
+            echo "[top]"
+            COLUMNS=240 top -b -n 1 -w 240 | head -n "$((top_n + 7))" || true  # NOTE(review): +7 presumably covers top's summary/header lines - confirm
+            echo
+        fi
+    } >>"${output_file}" 2>&1  # append both stdout and stderr of every command in the group
+}
+
+start_system_resource_monitor() {  # Start a background loop that appends a resource snapshot every interval seconds.
+    if [[ ! -d "${DORIS_HOME:-}" ]]; then return 1; fi  # returns 1 unless DORIS_HOME is an existing directory
+    if [[ -n "${SYSTEM_RESOURCE_MONITOR_PID:-}" ]] && kill -0 
"${SYSTEM_RESOURCE_MONITOR_PID}" 2>/dev/null; then  # kill -0 only probes whether the recorded pid can be signalled
+        echo "INFO: system resource monitor already started, 
pid=${SYSTEM_RESOURCE_MONITOR_PID}"
+        return 0  # a loop is already running: do not start a second one
+    fi
+
+    local interval="${SYSTEM_RESOURCE_MONITOR_INTERVAL_SECONDS:-60}"  # seconds between snapshots; defaults to 60
+    local output_file="${DORIS_HOME}/be/log/system_resource_usage.log"  # NOTE(review): commit description names output/regression-test/log/ - confirm intended directory
+    mkdir -p "$(dirname "${output_file}")"
+    echo "INFO: start system resource monitor, interval=${interval}s, 
output=${output_file}"
+    (  # subshell; sent to the background by the '&' after the closing parenthesis
+        set +e  # a failing command must not terminate the loop
+        monitor_sleep_pid=""  # pid of the in-flight sleep; empty when none is pending
+        monitor_cleanup() {  # TERM/INT handler: stop and reap the pending sleep, then exit 0
+            if [[ -n "${monitor_sleep_pid}" ]]; then
+                kill "${monitor_sleep_pid}" 2>/dev/null || true
+                wait "${monitor_sleep_pid}" 2>/dev/null || true
+            fi
+            exit 0
+        }
+        trap monitor_cleanup TERM INT
+        while true; do
+            _append_system_resource_snapshot "${output_file}"  # first snapshot is taken immediately, before any sleep
+            sleep "${interval}" &  # sleep runs in the background so the wait below can be interrupted by the trap
+            monitor_sleep_pid="$!"
+            wait "${monitor_sleep_pid}"
+            monitor_sleep_pid=""
+        done
+    ) &
+    SYSTEM_RESOURCE_MONITOR_PID="$!"  # pid of the background subshell; read by stop_system_resource_monitor
+    export SYSTEM_RESOURCE_MONITOR_PID
+}
+
+stop_system_resource_monitor() {  # Stop the recorded monitor loop, if any, and append one final snapshot.
+    if [[ -z "${SYSTEM_RESOURCE_MONITOR_PID:-}" ]]; then return 0; fi  # no pid recorded: no-op, and no final snapshot is written
+    if kill -0 "${SYSTEM_RESOURCE_MONITOR_PID}" 2>/dev/null; then  # only signal the pid if it can still be signalled
+        kill "${SYSTEM_RESOURCE_MONITOR_PID}" 2>/dev/null || true  # kill without a signal name sends TERM
+        wait "${SYSTEM_RESOURCE_MONITOR_PID}" 2>/dev/null || true  # errors ignored, e.g. when the pid is not a child of this shell
+        echo "INFO: stop system resource monitor, 
pid=${SYSTEM_RESOURCE_MONITOR_PID}"
+    fi
+    if [[ -d "${DORIS_HOME:-}" ]]; then  # final snapshot is written even when the loop had already exited
+        _append_system_resource_snapshot 
"${DORIS_HOME}/be/log/system_resource_usage.log"
+    fi
+    unset SYSTEM_RESOURCE_MONITOR_PID  # cleared so a repeated call returns early at the top
+}
+
 _redact_creds() {
     local expr="" v escaped
     for v in "${hwYunAk:-}" "${hwYunSk:-}" "${s3SourceAk:-}" "${s3SourceSk:-}" 
"${txYunAk:-}" "${txYunSk:-}"; do


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to