This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new b6a1d7cf591 branch-4.0: [fix](fe) fix host not match if start fe in metadata_failure_recovery (#62748) (#63360)
b6a1d7cf591 is described below

commit b6a1d7cf591c6574effd8d5f2c888b1ca4d8b3c7
Author: meiyi <[email protected]>
AuthorDate: Thu May 28 12:09:36 2026 +0800

    branch-4.0: [fix](fe) fix host not match if start fe in metadata_failure_recovery (#62748) (#63360)
    
    pick: https://github.com/apache/doris/pull/62748
---
 cloud/src/common/config.h                          |  2 +-
 docker/runtime/doris-compose/cluster.py            |  6 +++
 docker/runtime/doris-compose/command.py            | 45 +++++++++++++++++--
 docker/runtime/doris-compose/resource/fdb.conf     |  4 +-
 docker/runtime/doris-compose/resource/init_fdb.sh  | 19 ++++++++
 docker/runtime/doris-compose/resource/init_fe.sh   | 29 +++++++++++-
 .../main/java/org/apache/doris/catalog/Env.java    | 51 ++++++++++++++++++++++
 .../doris/regression/suite/SuiteCluster.groovy     | 33 ++++++++++++++
 8 files changed, 181 insertions(+), 8 deletions(-)

diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h
index 35d38a4d416..51fef04ee2a 100644
--- a/cloud/src/common/config.h
+++ b/cloud/src/common/config.h
@@ -390,7 +390,7 @@ CONF_Bool(enable_split_rowset_meta_pb, "true");
 CONF_Int32(split_rowset_meta_pb_size, "10000"); // split rowset meta pb size, default is 10K
 CONF_Bool(enable_split_tablet_schema_pb, "false");
 CONF_Int32(split_tablet_schema_pb_size, "10000"); // split tablet schema pb size, default is 10K
-CONF_Bool(enable_check_fe_drop_in_safe_time, "true");
+CONF_mBool(enable_check_fe_drop_in_safe_time, "true");
 
 CONF_Bool(enable_logging_for_single_version_reading, "false");
 CONF_mBool(enable_logging_conflict_keys, "false");
diff --git a/docker/runtime/doris-compose/cluster.py b/docker/runtime/doris-compose/cluster.py
index bfa8eb65a4c..517f92ab38c 100644
--- a/docker/runtime/doris-compose/cluster.py
+++ b/docker/runtime/doris-compose/cluster.py
@@ -809,6 +809,12 @@ class FDB(Node):
     def node_type(self):
         return Node.TYPE_FDB
 
+    def docker_env(self):
+        envs = super().docker_env()
+        for key, value in self.cluster.cloud_store_config.items():
+            envs[key] = value
+        return envs
+
     def expose_sub_dirs(self):
         return super().expose_sub_dirs() + ["data"]
 
diff --git a/docker/runtime/doris-compose/command.py b/docker/runtime/doris-compose/command.py
index db8b403b4a0..c7883454899 100644
--- a/docker/runtime/doris-compose/command.py
+++ b/docker/runtime/doris-compose/command.py
@@ -438,6 +438,14 @@ class UpCommand(Command):
             "Add custom host-to-IP mappings (host:ip). For example: 
--extra-hosts myhost1:192.168.10.1 myhost2:192.168.10.2 . Only use when 
creating new cluster."
         )
 
+        parser.add_argument(
+            "--cloud-config",
+            nargs="*",
+            type=str,
+            help=
+            "Override cloud store config values. For example: --cloud-config 
DORIS_CLOUD_AK=xxx DORIS_CLOUD_BUCKET=yyy. Only use when creating new cloud 
cluster."
+        )
+
         parser.add_argument("--coverage-dir",
                             default="",
                             help="Set code coverage output directory")
@@ -490,7 +498,7 @@ class UpCommand(Command):
         parser.add_argument(
             "--fdb-version",
             type=str,
-            default="7.1.26",
+            default="7.3.69",
             help="fdb image version. Only use in cloud cluster.")
         parser.add_argument(
             "--fdb-image",
@@ -595,7 +603,9 @@ class UpCommand(Command):
                     args.add_recycle_num = 1
                 if not args.be_cluster:
                     args.be_cluster = "compute_cluster"
-                cloud_store_config = self._get_cloud_store_config()
+                cloud_store_config = self._merge_cloud_store_config(
+                    self._get_cloud_store_config(), args.cloud_config
+                )
             else:
                 args.add_ms_num = 0
                 args.add_recycle_num = 0
@@ -807,8 +817,7 @@ class UpCommand(Command):
                     except Exception as e:
                         LOG.error(f"Failed to add BE {be_endpoint}: {str(e)}")
                 if is_new_cluster:
-                    cloud_store_config = self._get_cloud_store_config()
-                    db_mgr.create_default_storage_vault(cloud_store_config)
+                    db_mgr.create_default_storage_vault(cluster.cloud_store_config)
 
             if not cluster.is_host_network():
                 wait_service(True, args.wait_timeout, cluster, add_fe_ids,
@@ -884,6 +893,34 @@ class UpCommand(Command):
                     format(key, CLUSTER.CLOUD_CFG_FILE))
         return config
 
+    @staticmethod
+    def _merge_cloud_store_config(base_config, overrides):
+        if not overrides:
+            return base_config
+
+        merged = dict(base_config)
+        for item in overrides:
+            pos = item.find('=')
+            if pos <= 0:
+                raise Exception(
+                    "cloud config override '{}' error format, should be like 
'name=value'".
+                    format(item)
+                )
+            key = item[:pos].strip()
+            value = item[pos + 1:].strip()
+            if not key or not value:
+                raise Exception(
+                    "cloud config override '{}' error format, should be like 
'name=value'".
+                    format(item)
+                )
+            if key not in merged:
+                raise Exception(
+                    "Unknown cloud config override '{}', available keys: {}".
+                    format(key, ", ".join(sorted(merged.keys())))
+                )
+            merged[key] = value
+        return merged
+
 
 class DownCommand(Command):
 
diff --git a/docker/runtime/doris-compose/resource/fdb.conf b/docker/runtime/doris-compose/resource/fdb.conf
index 14c9976e4fc..5d8d4f4ef60 100644
--- a/docker/runtime/doris-compose/resource/fdb.conf
+++ b/docker/runtime/doris-compose/resource/fdb.conf
@@ -41,5 +41,7 @@ cache-memory = 200MiB
 [fdbserver.4500]
 
 [backup_agent]
-command = /usr/bin/backup_agent
+command = /usr/bin/backup_agent --blob-credentials /opt/apache-doris/fdb/conf/blob_creds.json
 logdir = /opt/apache-doris/fdb/log
+
+[backup_agent.1]
\ No newline at end of file
diff --git a/docker/runtime/doris-compose/resource/init_fdb.sh b/docker/runtime/doris-compose/resource/init_fdb.sh
index b3b22f44ee4..41267af5c5e 100644
--- a/docker/runtime/doris-compose/resource/init_fdb.sh
+++ b/docker/runtime/doris-compose/resource/init_fdb.sh
@@ -27,6 +27,25 @@ init_db() {
         return
     fi
 
+    if [ -n "$DORIS_CLOUD_AK" ] && [ -n "$DORIS_CLOUD_SK" ] && [ -n "$DORIS_CLOUD_ENDPOINT" ]; then
+        BLOB_CREDS_FILE="${DORIS_HOME}/conf/blob_creds.json"
+        if [ "$DORIS_CLOUD_PROVIDER" = "OSS" ] || [ "$DORIS_CLOUD_PROVIDER" = "COS" ]; then
+            AK_WITH_ENDPOINT="${DORIS_CLOUD_AK}@${DORIS_CLOUD_BUCKET}.${DORIS_CLOUD_ENDPOINT}"
+        else
+            AK_WITH_ENDPOINT="${DORIS_CLOUD_AK}@${DORIS_CLOUD_ENDPOINT}"
+        fi
+        cat > $BLOB_CREDS_FILE << EOF
+{
+  "accounts" : {
+    "${AK_WITH_ENDPOINT}" : {
+      "secret" : "${DORIS_CLOUD_SK}"
+    }
+  }
+}
+EOF
+        health_log "Created blob_creds.json"
+    fi
+
     for ((i = 0; i < 10; i++)); do
         /usr/bin/fdbcli -C ${DORIS_HOME}/conf/fdb.cluster --exec 'configure new single ssd'
         if [ $? -eq 0 ]; then
diff --git a/docker/runtime/doris-compose/resource/init_fe.sh b/docker/runtime/doris-compose/resource/init_fe.sh
index 4e846ed182f..051711c8004 100755
--- a/docker/runtime/doris-compose/resource/init_fe.sh
+++ b/docker/runtime/doris-compose/resource/init_fe.sh
@@ -97,9 +97,34 @@ run_fe() {
 }
 
 start_cloud_fe() {
-    if [ -f "$REGISTER_FILE" ]; then
+    RECOVERY_SCRIPT="${DORIS_HOME}/conf/restore_snapshot.sh"
+    RECOVERY_ARGS=""
+    if [ -f "$RECOVERY_SCRIPT" ]; then
+        JOURNAL_ID=$(grep '^JOURNAL_ID=' "$RECOVERY_SCRIPT" | head -1 | cut -d= -f2)
+        if [ -z "$JOURNAL_ID" ]; then
+            health_log "ERROR: Could not extract JOURNAL_ID from recovery script"
+            exit 1
+        fi
+        health_log "Found recovery script with JOURNAL_ID=$JOURNAL_ID, executing..."
+        bash "$RECOVERY_SCRIPT"
+        RECOVERY_RES=$?
+        if [ $RECOVERY_RES -ne 0 ]; then
+            health_log "ERROR: Recovery script failed with exit code $RECOVERY_RES"
+            exit $RECOVERY_RES
+        fi
+        mv "$RECOVERY_SCRIPT" "${RECOVERY_SCRIPT}.bak"
+        MV_RES=$?
+        if [ $MV_RES -ne 0 ]; then
+            health_log "ERROR: Failed to rename recovery script to ${RECOVERY_SCRIPT}.bak"
+            exit $MV_RES
+        fi
+        health_log "Recovery script executed and renamed to ${RECOVERY_SCRIPT}.bak"
+        RECOVERY_ARGS="--metadata_failure_recovery --recovery_journal_id $JOURNAL_ID"
+    fi
+
+    if [ -f "$REGISTER_FILE" ] || [ -n "$RECOVERY_ARGS" ]; then
         fe_daemon &
-        run_fe
+        run_fe $RECOVERY_ARGS
         return
     fi
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
index a9c6277863a..cc43996eee9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
@@ -2095,6 +2095,7 @@ public class Env {
     private void checkCurrentNodeExist() {
         boolean metadataFailureRecovery = null != System.getProperty(FeConstants.METADATA_FAILURE_RECOVERY_KEY);
         if (metadataFailureRecovery) {
+            updateRecoveryFrontendHostIfNeeded();
             return;
         }
 
@@ -2111,6 +2112,56 @@ public class Env {
         }
     }
 
+    // During backup-restore recovery, the restored metadata may contain FE entries with old IP
+    // addresses from the source cluster. Find the FE entry by node name and update its host to
+    // the current node's actual address. This must run before CloudClusterChecker starts to
+    // prevent it from dropping the only FE and leaving the BDB group empty.
+    private void updateRecoveryFrontendHostIfNeeded() {
+        if (Config.isNotCloudMode()) {
+            return;
+        }
+        Frontend selfFe = frontends.get(nodeName);
+        if (selfFe == null) {
+            LOG.error("Recovery mode: frontend with node name '{}' not found 
in metadata. "
+                    + "Available frontends: {}. Will exit.", nodeName, 
frontends.keySet());
+            System.exit(-1);
+        }
+
+        if (selfFe.getRole() != role) {
+            LOG.error("Recovery mode: role mismatch for frontend '{}': 
expected={}, actual={}. Will exit.",
+                    nodeName, role, selfFe.getRole());
+            System.exit(-1);
+        }
+
+        if (selfFe.getHost().equals(selfNode.getHost()) && 
selfFe.getEditLogPort() == selfNode.getPort()) {
+            LOG.info("Recovery mode: frontend '{}' already has correct address 
{}:{}",
+                    nodeName, selfNode.getHost(), selfNode.getPort());
+            return;
+        }
+
+        if (selfFe.getEditLogPort() != selfNode.getPort()) {
+            LOG.error("Recovery mode: edit_log_port mismatch for frontend 
'{}': restored={}, current={}. "
+                    + "Port migration during recovery is not supported. Will 
exit.",
+                    nodeName, selfFe.getEditLogPort(), selfNode.getPort());
+            System.exit(-1);
+        }
+
+        Frontend conflicting = checkFeExist(selfNode.getHost(), 
selfNode.getPort());
+        if (conflicting != null && 
!conflicting.getNodeName().equals(nodeName)) {
+            LOG.error("Recovery mode: another frontend '{}' already has 
address {}:{}. "
+                    + "Cannot update frontend '{}' to this address. Will 
exit.",
+                    conflicting.getNodeName(), selfNode.getHost(), 
selfNode.getPort(), nodeName);
+            System.exit(-1);
+        }
+
+        LOG.info("Recovery mode: updating frontend '{}' host from {} to {} to 
match current node",
+                nodeName, selfFe.getHost(), selfNode.getHost());
+        selfFe.setHost(selfNode.getHost());
+
+        editLog.logModifyFrontend(selfFe);
+        LOG.info("Recovery mode: frontend host update persisted to edit log");
+    }
+
     private void checkBeExecVersion() {
         if (Config.be_exec_version < Config.min_be_exec_version
                 || Config.be_exec_version > Config.max_be_exec_version) {
diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
index 513c0a5b3bd..da4617ff5d3 100644
--- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
+++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
@@ -71,6 +71,9 @@ class ClusterOptions {
     // just as `docker run --add-host myhost:192.168.10.10` do.
     List<String> extraHosts = []
 
+    // cloud store overrides for cloud docker clusters, each item should be 
'name=value'
+    List<String> cloudStoreConfigs = []
+
     boolean connectToFollower = false
 
     // 1. cloudMode = true, only create cloud cluster.
@@ -353,6 +356,10 @@ class SuiteCluster {
             cmd += ['--extra-hosts']
             cmd += options.extraHosts
         }
+        if (!options.cloudStoreConfigs.isEmpty()) {
+            cmd += ['--cloud-config']
+            cmd += options.cloudStoreConfigs
+        }
         if (config.dockerCoverageOutputDir != null && 
config.dockerCoverageOutputDir != '') {
             cmd += ['--coverage-dir', config.dockerCoverageOutputDir]
         }
@@ -604,6 +611,18 @@ class SuiteCluster {
         runBackendsCmd(START_WAIT_TIMEOUT + 5, "start  --wait-timeout 
${START_WAIT_TIMEOUT}".toString(), indices)
     }
 
+    // indices start from 1, not 0
+    // if not specific meta-service indices, then start all meta services
+    void startMetaServices(int... indices) {
+        runMsCmd(START_WAIT_TIMEOUT + 5, "start  --wait-timeout 
${START_WAIT_TIMEOUT}".toString(), indices)
+    }
+
+    // indices start from 1, not 0
+    // if not specific recycler indices, then start all recyclers
+    void startRecyclers(int... indices) {
+        runRecyclerCmd(START_WAIT_TIMEOUT + 5, "start  --wait-timeout 
${START_WAIT_TIMEOUT}".toString(), indices)
+    }
+
     // indices start from 1, not 0
     // if not specific fe indices, then stop all frontends
     void stopFrontends(int... indices) {
@@ -618,6 +637,20 @@ class SuiteCluster {
         waitHbChanged()
     }
 
+    // indices start from 1, not 0
+    // if not specific meta-service indices, then stop all meta services
+    void stopMetaServices(int... indices) {
+        runMsCmd(STOP_WAIT_TIMEOUT + 5, "stop --wait-timeout 
${STOP_WAIT_TIMEOUT}".toString(), indices)
+        waitHbChanged()
+    }
+
+    // indices start from 1, not 0
+    // if not specific recycler indices, then stop all recyclers
+    void stopRecyclers(int... indices) {
+        runRecyclerCmd(STOP_WAIT_TIMEOUT + 5, "stop --wait-timeout 
${STOP_WAIT_TIMEOUT}".toString(), indices)
+        waitHbChanged()
+    }
+
     // indices start from 1, not 0
     // if not specific fe indices, then restart all frontends
     void restartFrontends(int... indices) {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to