(doris) 02/02: [opt](memory) Intern cluster ID strings in CloudReplica

dataroaring Thu, 12 Mar 2026 06:19:12 -0700

This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch feature/cloud-replica-memory-opt
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 0d10541ed9a4cb873c33e8f46085271a14f361ca
Author: Yongqiang YANG <[email protected]>
AuthorDate: Thu Mar 12 06:18:16 2026 -0700

    [opt](memory) Intern cluster ID strings in CloudReplica
    
    Add a static intern pool for cluster ID strings to eliminate millions
    of duplicate String instances across CloudReplica objects. Each replica
    stores cluster IDs as map keys; without interning, Gson deserialization
    creates a separate String instance per replica (~40-70 bytes each).
    
    - Add ConcurrentHashMap-based intern pool (heap-safe, unlike String.intern)
    - Intern strings at all write points (updateClusterToPrimaryBe/Secondary)
    - Intern keys during gsonPostProcess for deserialized maps
    - For 1M replicas with 3 clusters: saves ~40-70 MB of duplicate Strings
    
    Co-Authored-By: Claude Opus 4.6 <[email protected]>
---
 .../apache/doris/cloud/catalog/CloudReplica.java   | 27 +++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java
index b5f119cc803..6948ef2f49c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudReplica.java
@@ -73,6 +73,18 @@ public class CloudReplica extends Replica implements GsonPostProcessable {
 
     private static final Random rand = new Random();
 
+    // Intern pool for cluster ID strings to avoid millions of duplicate String instances.
+    // Typically only a handful of distinct cluster IDs exist in the system.
+    private static final ConcurrentHashMap<String, String> CLUSTER_ID_POOL = new ConcurrentHashMap<>();
+
+    private static String internClusterId(String clusterId) {
+        if (clusterId == null) {
+            return null;
+        }
+        String existing = CLUSTER_ID_POOL.putIfAbsent(clusterId, clusterId);
+        return existing != null ? existing : clusterId;
+    }
+
     private Map<String, List<Long>> memClusterToBackends = null;
 
     // clusterId, secondaryBe, changeTimestamp
@@ -608,7 +620,7 @@ public class CloudReplica extends Replica implements GsonPostProcessable {
     }
 
     public void updateClusterToPrimaryBe(String cluster, long beId) {
-        getOrCreatePrimaryMap().put(cluster, beId);
+        getOrCreatePrimaryMap().put(internClusterId(cluster), beId);
         Map<String, Pair<Long, Long>> secMap = secondaryClusterToBackends;
         if (secMap != null) {
             secMap.remove(cluster);
@@ -626,7 +638,7 @@ public class CloudReplica extends Replica implements GsonPostProcessable {
            LOG.debug("add to secondary clusterId {}, beId {}, changeTimestamp {}, replica info {}",
                     cluster, beId, changeTimestamp, this);
         }
-        getOrCreateSecondaryMap().put(cluster, Pair.of(beId, changeTimestamp));
+        getOrCreateSecondaryMap().put(internClusterId(cluster), Pair.of(beId, changeTimestamp));
     }
 
     public void clearClusterToBe(String cluster) {
@@ -709,10 +721,19 @@ public class CloudReplica extends Replica implements GsonPostProcessable {
                 String clusterId = entry.getKey();
                 List<Long> beIds = entry.getValue();
                 if (beIds != null && !beIds.isEmpty()) {
-                    map.put(clusterId, beIds.get(0));
+                    map.put(internClusterId(clusterId), beIds.get(0));
                 }
             }
             this.primaryClusterToBackends = null;
         }
+        // Intern cluster ID keys in deserialized primary map to share String instances
+        // across millions of CloudReplica objects
+        if (primaryClusterToBackend != null) {
+            ConcurrentHashMap<String, Long> interned = new ConcurrentHashMap<>(primaryClusterToBackend.size());
+            for (Map.Entry<String, Long> entry : primaryClusterToBackend.entrySet()) {
+                interned.put(internClusterId(entry.getKey()), entry.getValue());
+            }
+            primaryClusterToBackend = interned;
+        }
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) 02/02: [opt](memory) Intern cluster ID strings in CloudReplica

Reply via email to