This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 49196804f8e74db346001d951b44777d6e0b2bfe
Author: Arnab Karmakar <[email protected]>
AuthorDate: Fri Feb 6 02:13:49 2026 -0800

    IMPALA-14623: Optimize memory usage for Iceberg file path hashes
    
    Use THash128 Thrift struct (16 bytes) instead of String (64 bytes)
    for storing 128-bit XXH128 hashes of Iceberg file paths,
    achieving 4x memory reduction.
    
    Key Changes:
    - Added THash128 Thrift struct with two i64 fields (high/low)
    - Updated TIcebergContentFileStore to use THash128 as map keys
    - Created Hash128 Java class with Thrift serialization support
    - Migrated from Murmur3 to XXH128 for better performance
    - Added C++ comparison operators for THash128
    
    Testing:
    - Added comprehensive JUnit tests for Hash128 class
    
    Change-Id: Ie0de793de2434dae3b60c3aa4f87dba203eee3c1
    Reviewed-on: http://gerrit.cloudera.org:8080/23946
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Zoltan Borok-Nagy <[email protected]>
---
 be/src/catalog/catalog-server.cc                   |   1 +
 be/src/catalog/catalog.cc                          |   1 +
 be/src/rpc/hs2-http-test.cc                        |   1 +
 be/src/runtime/descriptors.cc                      |   1 +
 be/src/service/frontend.cc                         |   1 +
 be/src/util/container-util.h                       |   1 +
 be/src/util/thash128-util.h                        |  38 +++++
 common/thrift/CatalogObjects.thrift                |  17 +-
 fe/pom.xml                                         |   6 +
 .../org/apache/impala/catalog/FeIcebergTable.java  |   3 +-
 .../impala/catalog/IcebergContentFileStore.java    |  36 ++--
 .../apache/impala/planner/IcebergScanPlanner.java  |   3 +-
 .../main/java/org/apache/impala/util/Hash128.java  |  83 +++++++++
 .../java/org/apache/impala/util/IcebergUtil.java   |  13 +-
 .../impala/catalog/local/LocalCatalogTest.java     |   3 +-
 .../java/org/apache/impala/util/Hash128Test.java   | 185 +++++++++++++++++++++
 .../org/apache/impala/util/IcebergUtilTest.java    |   4 +-
 17 files changed, 364 insertions(+), 33 deletions(-)

diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc
index 88a0eb571..077ec22cb 100644
--- a/be/src/catalog/catalog-server.cc
+++ b/be/src/catalog/catalog-server.cc
@@ -28,6 +28,7 @@
 #include "gen-cpp/CatalogService_types.h"
 #include "statestore/statestore-subscriber.h"
 #include "util/collection-metrics.h"
+#include "util/container-util.h"
 #include "util/debug-util.h"
 #include "util/event-metrics.h"
 #include "util/json-util.h"
diff --git a/be/src/catalog/catalog.cc b/be/src/catalog/catalog.cc
index 08d8fad6a..4f88d6767 100644
--- a/be/src/catalog/catalog.cc
+++ b/be/src/catalog/catalog.cc
@@ -24,6 +24,7 @@
 #include "common/thread-debug-info.h"
 #include "rpc/jni-thrift-util.h"
 #include "util/backend-gflag-util.h"
+#include "util/container-util.h"
 
 #include "common/names.h"
 
diff --git a/be/src/rpc/hs2-http-test.cc b/be/src/rpc/hs2-http-test.cc
index b3db0b0ba..35ffbef30 100644
--- a/be/src/rpc/hs2-http-test.cc
+++ b/be/src/rpc/hs2-http-test.cc
@@ -21,6 +21,7 @@
 
 #include "gen-cpp/ImpalaHiveServer2Service.h"
 #include "rpc/authentication.h"
+#include "util/container-util.h"
 #include "util/kudu-status-util.h"
 #include "util/network-util.h"
 #include "util/os-util.h"
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index d7de726d1..e018f7a9d 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -35,6 +35,7 @@
 #include "gen-cpp/PlanNodes_types.h"
 #include "rpc/thrift-util.h"
 #include "runtime/runtime-state.h"
+#include "util/container-util.h"
 #include "util/hash-util.h"
 
 #include "common/names.h"
diff --git a/be/src/service/frontend.cc b/be/src/service/frontend.cc
index 6255f8fc8..94aa8f140 100644
--- a/be/src/service/frontend.cc
+++ b/be/src/service/frontend.cc
@@ -24,6 +24,7 @@
 #include "common/logging.h"
 #include "rpc/jni-thrift-util.h"
 #include "util/backend-gflag-util.h"
+#include "util/container-util.h"
 #include "util/jni-util.h"
 #include "util/test-info.h"
 #include "util/time.h"
diff --git a/be/src/util/container-util.h b/be/src/util/container-util.h
index 01f586b0f..f472ba48e 100644
--- a/be/src/util/container-util.h
+++ b/be/src/util/container-util.h
@@ -32,6 +32,7 @@
 #include "gen-cpp/Types_types.h"
 #include "gen-cpp/StatestoreService_types.h"
 #include "gen-cpp/common.pb.h"
+#include "util/thash128-util.h"
 
 /// Comparators for types that we commonly use in containers.
 namespace impala {
diff --git a/be/src/util/thash128-util.h b/be/src/util/thash128-util.h
new file mode 100644
index 000000000..e92af5898
--- /dev/null
+++ b/be/src/util/thash128-util.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "gen-cpp/CatalogObjects_types.h"
+
+namespace impala {
+
+/// Comparison operators for THash128 (used as key in std::map)
+inline bool operator==(const THash128& lhs, const THash128& rhs) {
+  return lhs.high == rhs.high && lhs.low == rhs.low;
+}
+
+inline bool operator!=(const THash128& lhs, const THash128& rhs) {
+  return !(lhs == rhs);
+}
+
+/// Required for using THash128 as std::map key
+inline bool operator<(const THash128& lhs, const THash128& rhs) {
+  return std::tie(lhs.high, lhs.low) < std::tie(rhs.high, rhs.low);
+}
+
+} // namespace impala
diff --git a/common/thrift/CatalogObjects.thrift 
b/common/thrift/CatalogObjects.thrift
index 7b0fd2f88..60009e523 100644
--- a/common/thrift/CatalogObjects.thrift
+++ b/common/thrift/CatalogObjects.thrift
@@ -657,12 +657,19 @@ struct TIcebergPartitionStats {
   3: required i64 file_size_in_bytes;
 }
 
-// Contains maps from 128-bit Murmur3 hash of file path to its file descriptor
+// Represents a 128-bit hash value (XXH128 hash of file path)
+// Stored as two 64-bit longs for memory efficiency
+struct THash128 {
+  1: required i64 high
+  2: required i64 low
+}
+
+// Contains maps from 128-bit XXH128 hash of file path to its file descriptor
 struct TIcebergContentFileStore {
-  1: optional map<string, THdfsFileDesc> path_hash_to_data_file_without_deletes
-  2: optional map<string, THdfsFileDesc> path_hash_to_data_file_with_deletes
-  3: optional map<string, THdfsFileDesc> path_hash_to_position_delete_file
-  4: optional map<string, THdfsFileDesc> path_hash_to_equality_delete_file
+  1: optional map<THash128, THdfsFileDesc> 
path_hash_to_data_file_without_deletes
+  2: optional map<THash128, THdfsFileDesc> path_hash_to_data_file_with_deletes
+  3: optional map<THash128, THdfsFileDesc> path_hash_to_position_delete_file
+  4: optional map<THash128, THdfsFileDesc> path_hash_to_equality_delete_file
   5: optional bool has_avro
   6: optional bool has_orc
   7: optional bool has_parquet
diff --git a/fe/pom.xml b/fe/pom.xml
index b280ca87a..25aa141fb 100644
--- a/fe/pom.xml
+++ b/fe/pom.xml
@@ -585,6 +585,12 @@ under the License.
               </exclusion>
           </exclusions>
       </dependency>
+
+    <dependency>
+      <groupId>net.openhft</groupId>
+      <artifactId>zero-allocation-hashing</artifactId>
+      <version>0.16</version>
+    </dependency>
   </dependencies>
 
   <reporting>
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java 
b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index 637613f74..dc2a2b988 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -84,6 +84,7 @@ import org.apache.impala.thrift.TIcebergTable;
 import org.apache.impala.thrift.TNetworkAddress;
 import org.apache.impala.thrift.TResultSet;
 import org.apache.impala.thrift.TResultSetMetadata;
+import org.apache.impala.util.Hash128;
 import org.apache.impala.util.HdfsCachingUtil;
 import org.apache.impala.util.IcebergSchemaConverter;
 import org.apache.impala.util.IcebergUtil;
@@ -496,7 +497,7 @@ public interface FeIcebergTable extends FeFsTable {
         rowBuilder.add(filePath);
 
         // Try to get actual file size from content file store
-        String pathHash = IcebergUtil.getFilePathHash(filePath);
+        Hash128 pathHash = IcebergUtil.getFilePathHash(filePath);
         FileDescriptor fd = contentFileStore.getDataFileDescriptor(pathHash);
         if (fd == null) {
           fd = contentFileStore.getDeleteFileDescriptor(pathHash);
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java 
b/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java
index 9baf60ece..95434cea2 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergContentFileStore.java
@@ -44,10 +44,12 @@ import org.apache.impala.fb.FbFileDesc;
 import org.apache.impala.fb.FbFileMetadata;
 import org.apache.impala.fb.FbIcebergDataFileFormat;
 import org.apache.impala.fb.FbIcebergMetadata;
+import org.apache.impala.thrift.THash128;
 import org.apache.impala.thrift.THdfsFileDesc;
 import org.apache.impala.thrift.TIcebergContentFileStore;
 import org.apache.impala.thrift.TIcebergPartition;
 import org.apache.impala.thrift.TNetworkAddress;
+import org.apache.impala.util.Hash128;
 import org.apache.impala.util.IcebergUtil;
 import org.apache.impala.util.ListMap;
 
@@ -97,12 +99,12 @@ public class IcebergContentFileStore {
   // Auxiliary class for holding file descriptors in both a map and a list.
   private static class MapListContainer {
     // Key is the ContentFile path hash, value is FileDescriptor transformed 
from DataFile
-    private final Map<String, EncodedFileDescriptor> fileDescMap_ = new 
HashMap<>();
+    private final Map<Hash128, EncodedFileDescriptor> fileDescMap_ = new 
HashMap<>();
     private final List<EncodedFileDescriptor> fileDescList_ = new 
ArrayList<>();
 
     // Adds a file to the map. If this is a new entry, then add it to the list 
as well.
     // Return true if 'desc' was a new entry.
-    public boolean add(String pathHash, EncodedFileDescriptor desc) {
+    public boolean add(Hash128 pathHash, EncodedFileDescriptor desc) {
       if (fileDescMap_.put(pathHash, desc) == null) {
         fileDescList_.add(desc);
         return true;
@@ -110,7 +112,7 @@ public class IcebergContentFileStore {
       return false;
     }
 
-    public IcebergFileDescriptor get(String pathHash) {
+    public IcebergFileDescriptor get(Hash128 pathHash) {
       if (!fileDescMap_.containsKey(pathHash)) return null;
       return decode(fileDescMap_.get(pathHash));
     }
@@ -124,27 +126,27 @@ public class IcebergContentFileStore {
     }
 
     // It's enough to only convert the map part to thrift.
-    Map<String, THdfsFileDesc> toThrift() {
-      Map<String, THdfsFileDesc> ret = new HashMap<>();
-      for (Map.Entry<String, EncodedFileDescriptor> entry : 
fileDescMap_.entrySet()) {
+    Map<THash128, THdfsFileDesc> toThrift() {
+      Map<THash128, THdfsFileDesc> ret = new HashMap<>();
+      for (Map.Entry<Hash128, EncodedFileDescriptor> entry : 
fileDescMap_.entrySet()) {
         ret.put(
-            entry.getKey(),
+            entry.getKey().toThrift(),
             decode(entry.getValue()).toThrift());
       }
       return ret;
     }
 
-    static MapListContainer fromThrift(Map<String, THdfsFileDesc> thriftMap,
+    static MapListContainer fromThrift(Map<THash128, THdfsFileDesc> thriftMap,
         List<TNetworkAddress> networkAddresses, ListMap<TNetworkAddress> 
hostIndex) {
       MapListContainer ret = new MapListContainer();
-      for (Map.Entry<String, THdfsFileDesc> entry : thriftMap.entrySet()) {
+      for (Map.Entry<THash128, THdfsFileDesc> entry : thriftMap.entrySet()) {
         IcebergFileDescriptor fd = 
IcebergFileDescriptor.fromThrift(entry.getValue());
         Preconditions.checkNotNull(fd);
         if (networkAddresses != null) {
           Preconditions.checkNotNull(hostIndex);
           fd = fd.cloneWithNewHostIndex(networkAddresses, hostIndex);
         }
-        ret.add(entry.getKey(), encode(fd));
+        ret.add(Hash128.fromThrift(entry.getKey()), encode(fd));
       }
       return ret;
     }
@@ -161,7 +163,7 @@ public class IcebergContentFileStore {
   private Map<TIcebergPartition, Integer> partitions_;
 
   // Caches file descriptors loaded during time-travel queries.
-  private final ConcurrentMap<String, EncodedFileDescriptor> oldFileDescMap_ =
+  private final ConcurrentMap<Hash128, EncodedFileDescriptor> oldFileDescMap_ =
       new ConcurrentHashMap<>();
   // Caches the partitions of file descriptors loaded during time-travel 
queries.
   private final ConcurrentMap<TIcebergPartition, Integer> oldPartitionMap_ =
@@ -206,7 +208,7 @@ public class IcebergContentFileStore {
 
   private void storeFile(ContentFile<?> contentFile,
       Map<String, IcebergFileDescriptor> fileDescMap, MapListContainer 
container) {
-    Pair<String, EncodedFileDescriptor> pathHashAndFd =
+    Pair<Hash128, EncodedFileDescriptor> pathHashAndFd =
         getPathHashAndFd(contentFile, fileDescMap);
     if (pathHashAndFd.second != null) {
       container.add(pathHashAndFd.first, pathHashAndFd.second);
@@ -217,7 +219,7 @@ public class IcebergContentFileStore {
 
   // This is only invoked during time travel, when we are querying a snapshot 
that has
   // data files which have been removed since.
-  public void addOldFileDescriptor(String pathHash, IcebergFileDescriptor 
desc) {
+  public void addOldFileDescriptor(Hash128 pathHash, IcebergFileDescriptor 
desc) {
     oldFileDescMap_.put(pathHash, encode(desc));
   }
 
@@ -227,19 +229,19 @@ public class IcebergContentFileStore {
     oldPartitionMap_.put(partition, id);
   }
 
-  public IcebergFileDescriptor getDataFileDescriptor(String pathHash) {
+  public IcebergFileDescriptor getDataFileDescriptor(Hash128 pathHash) {
     IcebergFileDescriptor desc = dataFilesWithoutDeletes_.get(pathHash);
     if (desc != null) return desc;
     return dataFilesWithDeletes_.get(pathHash);
   }
 
-  public IcebergFileDescriptor getDeleteFileDescriptor(String pathHash) {
+  public IcebergFileDescriptor getDeleteFileDescriptor(Hash128 pathHash) {
     IcebergFileDescriptor ret = positionDeleteFiles_.get(pathHash);
     if (ret != null) return ret;
     return equalityDeleteFiles_.get(pathHash);
   }
 
-  public IcebergFileDescriptor getOldFileDescriptor(String pathHash) {
+  public IcebergFileDescriptor getOldFileDescriptor(Hash128 pathHash) {
     if (!oldFileDescMap_.containsKey(pathHash)) return null;
     return decode(oldFileDescMap_.get(pathHash));
   }
@@ -332,7 +334,7 @@ public class IcebergContentFileStore {
     }
   }
 
-  private Pair<String, EncodedFileDescriptor> getPathHashAndFd(
+  private Pair<Hash128, EncodedFileDescriptor> getPathHashAndFd(
       ContentFile<?> contentFile, Map<String, IcebergFileDescriptor> 
fileDescMap) {
     return new Pair<>(
         IcebergUtil.getFilePathHash(contentFile),
diff --git a/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java 
b/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java
index 76b16f166..2d0bbd87f 100644
--- a/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/IcebergScanPlanner.java
@@ -88,6 +88,7 @@ import org.apache.impala.thrift.TIcebergPartition;
 import org.apache.impala.thrift.TIcebergPartitionTransformType;
 import org.apache.impala.thrift.TQueryOptions;
 import org.apache.impala.thrift.TVirtualColumnType;
+import org.apache.impala.util.Hash128;
 import org.apache.impala.util.IcebergUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -801,7 +802,7 @@ public class IcebergScanPlanner {
   private Pair<IcebergFileDescriptor, Boolean> 
getFileDescriptor(ContentFile<?> cf,
       IcebergContentFileStore fileStore)
       throws ImpalaRuntimeException {
-    String pathHash = IcebergUtil.getFilePathHash(cf);
+    Hash128 pathHash = IcebergUtil.getFilePathHash(cf);
 
     IcebergFileDescriptor iceFileDesc = cf.content() == FileContent.DATA ?
         fileStore.getDataFileDescriptor(pathHash) :
diff --git a/fe/src/main/java/org/apache/impala/util/Hash128.java 
b/fe/src/main/java/org/apache/impala/util/Hash128.java
new file mode 100644
index 000000000..718281f00
--- /dev/null
+++ b/fe/src/main/java/org/apache/impala/util/Hash128.java
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.util;
+
+import com.google.common.base.Preconditions;
+import java.util.Objects;
+import org.apache.impala.thrift.THash128;
+
+/**
+ * Represents a 128-bit hash value using two longs (high and low 64 bits).
+ *
+ * Usage:
+ * - Create: Hash128 hash = IcebergUtil.getFilePathHash(filePath);
+ * - As map key: map.put(hash, value);
+ * - Thrift: THash128 t = hash.toThrift(); Hash128 h = Hash128.fromThrift(t);
+ * - Logging: LOG.debug("Hash: {}", hash.toString());
+ */
+public class Hash128 {
+  private final long high_;
+  private final long low_;
+
+  public Hash128(long high, long low) {
+    high_ = high;
+    low_ = low;
+  }
+
+  public long getHigh() {
+    return high_;
+  }
+
+  public long getLow() {
+    return low_;
+  }
+
+  /**
+   * Converts this hash to a Thrift THash128 struct for serialization.
+   */
+  public THash128 toThrift() {
+    return new THash128(high_, low_);
+  }
+
+  /**
+   * Creates a Hash128 from a Thrift THash128 struct.
+   */
+  public static Hash128 fromThrift(THash128 thrift) {
+    Preconditions.checkNotNull(thrift);
+    return new Hash128(thrift.getHigh(), thrift.getLow());
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) return true;
+    if (o == null || getClass() != o.getClass()) return false;
+    Hash128 hash128 = (Hash128) o;
+    return high_ == hash128.high_ && low_ == hash128.low_;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(high_, low_);
+  }
+
+  @Override
+  public String toString() {
+    // Return hexadecimal representation for debugging
+    return String.format("%016x%016x", high_, low_);
+  }
+}
diff --git a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java 
b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
index 8c8736cbd..38f10a20d 100644
--- a/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
+++ b/fe/src/main/java/org/apache/impala/util/IcebergUtil.java
@@ -734,16 +734,17 @@ public class IcebergUtil {
   }
 
   /**
-   * Use ContentFile path to generate 128-bit Murmur3 hash as map key, cached 
in memory
+   * Use ContentFile path to generate 128-bit XXH128 hash as map key, cached 
in memory
    */
-  public static String getFilePathHash(ContentFile contentFile) {
+  public static Hash128 getFilePathHash(ContentFile contentFile) {
     return getFilePathHash(contentFile.path().toString());
   }
 
-  public static String getFilePathHash(String path) {
-    Hasher hasher = Hashing.murmur3_128().newHasher();
-    hasher.putUnencodedChars(path);
-    return hasher.hash().toString();
+  public static Hash128 getFilePathHash(String path) {
+    net.openhft.hashing.LongTupleHashFunction hasher =
+        net.openhft.hashing.LongTupleHashFunction.xx128();
+    long[] result = hasher.hashChars(path);
+    return new Hash128(result[0], result[1]);
   }
 
   /**
diff --git 
a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java 
b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
index deb8bd647..a213336c3 100644
--- a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
@@ -52,6 +52,7 @@ import org.apache.impala.thrift.TMetadataOpcode;
 import org.apache.impala.thrift.TNetworkAddress;
 import org.apache.impala.thrift.TPartialTableInfo;
 import org.apache.impala.thrift.TResultSet;
+import org.apache.impala.util.Hash128;
 import org.apache.impala.util.IcebergUtil;
 import org.apache.impala.util.ListMap;
 import org.apache.impala.util.MetaStoreUtil;
@@ -292,7 +293,7 @@ public class LocalCatalogTest {
       // For this test table the manifest files contain data paths without 
FS-scheme, so
       // they are loaded to the file content store without them.
       path = path.substring(path.indexOf("/test-warehouse"));
-      String pathHash = IcebergUtil.getFilePathHash(path);
+      Hash128 pathHash = IcebergUtil.getFilePathHash(path);
       FileDescriptor catalogFd = 
catalogFileStore.getDataFileDescriptor(pathHash);
       assertEquals(localFd.getNumFileBlocks(), 1);
       FbFileBlock localBlock = localFd.getFbFileBlock(0);
diff --git a/fe/src/test/java/org/apache/impala/util/Hash128Test.java 
b/fe/src/test/java/org/apache/impala/util/Hash128Test.java
new file mode 100644
index 000000000..a70a3f6ad
--- /dev/null
+++ b/fe/src/test/java/org/apache/impala/util/Hash128Test.java
@@ -0,0 +1,185 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.util;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNull;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.impala.thrift.THash128;
+import org.junit.Test;
+
+/**
+ * Unit tests for the Hash128 class which stores 128-bit hash values
+ * as two longs for memory efficiency.
+ */
+public class Hash128Test {
+
+  @Test
+  public void testBasicConstruction() {
+    Hash128 hash = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    assertEquals(0x1234567890ABCDEFL, hash.getHigh());
+    assertEquals(0xFEDCBA0987654321L, hash.getLow());
+  }
+
+  @Test
+  public void testEquality() {
+    Hash128 hash1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    Hash128 hash2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    Hash128 hash3 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654320L);
+    Hash128 hash4 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+
+    // Test equals
+    assertEquals(hash1, hash2);
+    assertEquals(hash1, hash4);
+    assertNotEquals(hash1, hash3);
+    assertNotEquals(hash1, null);
+    assertNotEquals(hash1, "not a hash");
+
+    // Test hashCode consistency
+    assertEquals(hash1.hashCode(), hash2.hashCode());
+    assertEquals(hash1.hashCode(), hash4.hashCode());
+  }
+
+  @Test
+  public void testHashMapUsage() {
+    // Hash128 should work correctly as a HashMap key
+    Map<Hash128, String> map = new HashMap<>();
+    Hash128 key1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    Hash128 key2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    Hash128 key3 = new Hash128(0xAAAAAAAAAAAAAAAAL, 0xBBBBBBBBBBBBBBBBL);
+
+    map.put(key1, "value1");
+    map.put(key3, "value3");
+
+    // key2 should retrieve the same value as key1 since they're equal
+    assertEquals("value1", map.get(key2));
+    assertEquals("value3", map.get(key3));
+    assertNull(map.get(new Hash128(0x0L, 0x0L)));
+  }
+
+  @Test
+  public void testToString() {
+    Hash128 hash = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    String str = hash.toString();
+
+    // Should be 32 hex characters (16 for each long)
+    assertEquals(32, str.length());
+    assertEquals("1234567890abcdeffedcba0987654321", str);
+
+    // Test with zero values
+    Hash128 zeroHash = new Hash128(0x0L, 0x0L);
+    assertEquals("00000000000000000000000000000000", zeroHash.toString());
+
+    // Test with max values
+    Hash128 maxHash = new Hash128(0xFFFFFFFFFFFFFFFFL, 0xFFFFFFFFFFFFFFFFL);
+    assertEquals("ffffffffffffffffffffffffffffffff", maxHash.toString());
+  }
+
+  @Test
+  public void testThriftRoundTrip() {
+    Hash128 original = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+
+    // Convert to Thrift and back
+    THash128 thrift = original.toThrift();
+    Hash128 restored = Hash128.fromThrift(thrift);
+
+    assertEquals(original, restored);
+    assertEquals(original.getHigh(), restored.getHigh());
+    assertEquals(original.getLow(), restored.getLow());
+  }
+
+  @Test
+  public void testThriftStructFields() {
+    Hash128 hash = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    THash128 thrift = hash.toThrift();
+
+    // Verify Thrift struct has correct values
+    assertEquals(0x1234567890ABCDEFL, thrift.getHigh());
+    assertEquals(0xFEDCBA0987654321L, thrift.getLow());
+  }
+
+  @Test
+  public void testThriftWithEdgeCases() {
+    // Test with zero
+    Hash128 zero = new Hash128(0x0L, 0x0L);
+    THash128 thriftZero = zero.toThrift();
+    Hash128 restoredZero = Hash128.fromThrift(thriftZero);
+    assertEquals(zero, restoredZero);
+
+    // Test with all ones
+    Hash128 ones = new Hash128(0xFFFFFFFFFFFFFFFFL, 0xFFFFFFFFFFFFFFFFL);
+    THash128 thriftOnes = ones.toThrift();
+    Hash128 restoredOnes = Hash128.fromThrift(thriftOnes);
+    assertEquals(ones, restoredOnes);
+
+    // Test with high bit patterns
+    Hash128 pattern = new Hash128(0x8000000000000000L, 0x8000000000000001L);
+    THash128 thriftPattern = pattern.toThrift();
+    Hash128 restoredPattern = Hash128.fromThrift(thriftPattern);
+    assertEquals(pattern, restoredPattern);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullThriftInput() {
+    Hash128.fromThrift(null);
+  }
+
+  @Test
+  public void testConsistentHashing() {
+    // Multiple Hash128 objects with same values should have consistent 
hashCode
+    Hash128 hash1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    Hash128 hash2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+
+    // Create and recreate through serialization
+    Hash128 hash3 = Hash128.fromThrift(hash1.toThrift());
+
+    assertEquals(hash1.hashCode(), hash2.hashCode());
+    assertEquals(hash1.hashCode(), hash3.hashCode());
+  }
+
+  @Test
+  public void testDifferentHashesDifferentHashCodes() {
+    // Different Hash128 objects should (likely) have different hashCodes
+    Hash128 hash1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    Hash128 hash2 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654320L);
+    Hash128 hash3 = new Hash128(0x1234567890ABCDEEL, 0xFEDCBA0987654321L);
+
+    // Note: hashCode collision is possible but unlikely
+    assertNotEquals(hash1.hashCode(), hash2.hashCode());
+    assertNotEquals(hash1.hashCode(), hash3.hashCode());
+  }
+
+  @Test
+  public void testThriftAsMapKey() {
+    // Verify that Hash128 can be used as a map key through Thrift round-trip
+    Map<Hash128, String> map = new HashMap<>();
+    Hash128 key1 = new Hash128(0x1234567890ABCDEFL, 0xFEDCBA0987654321L);
+    map.put(key1, "value1");
+
+    // Serialize and deserialize
+    THash128 thriftKey = key1.toThrift();
+    Hash128 key2 = Hash128.fromThrift(thriftKey);
+
+    // Should retrieve the same value
+    assertEquals("value1", map.get(key2));
+  }
+}
diff --git a/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java 
b/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java
index 15e8c9879..d00a127ae 100644
--- a/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java
+++ b/fe/src/test/java/org/apache/impala/util/IcebergUtilTest.java
@@ -259,9 +259,9 @@ public class IcebergUtilTest {
    */
   @Test
   public void testGetDataFilePathHash() {
-    String hash = getFilePathHash(FILE_A);
+    Hash128 hash = getFilePathHash(FILE_A);
     assertNotNull(hash);
-    String hash2 = getFilePathHash(FILE_A);
+    Hash128 hash2 = getFilePathHash(FILE_A);
     assertEquals(hash, hash2);
   }
 

Reply via email to