This is an automated email from the ASF dual-hosted git repository.

sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git


The following commit(s) were added to refs/heads/master by this push:
     new c50f5604e8b8 fix(storage-format)!: Fix HFile writer to properly encode checksum for compatibility (#13897)
c50f5604e8b8 is described below

commit c50f5604e8b8176fa6295560a911f36174cddda4
Author: Y Ethan Guo <[email protected]>
AuthorDate: Fri Sep 19 18:00:16 2025 -0700

    fix(storage-format)!: Fix HFile writer to properly encode checksum for compatibility (#13897)
---
 .../java/org/apache/hudi/io/hfile/HFileBlock.java  |  22 ++++++++++-----------
 .../org/apache/hudi/io/hfile/TestHFileWriter.java  |   2 +-
 .../org/apache/hudi/io/hfile/TestHfileBlock.java   |   5 -----
 .../src/test/resources/hfile/hudi-generated.hfile  | Bin 4654 -> 4674 bytes
 4 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
index 4c61e6b3b8be..95033be6dd25 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
@@ -189,9 +189,6 @@ public abstract class HFileBlock {
    * @return the number of checksum chunks.
    */
   static int numChecksumChunks(long numBytes, int bytesPerChecksum) {
-    if (bytesPerChecksum == 0) {
-      return 0;
-    }
     long numChunks = numBytes / bytesPerChecksum;
     if (numBytes % bytesPerChecksum != 0) {
       numChunks++;
@@ -278,7 +275,10 @@ public abstract class HFileBlock {
     // 1. Magic is always 8 bytes.
     buf.put(blockType.getMagic(), 0, 8);
     // 2. onDiskSizeWithoutHeader.
-    buf.putInt(compressedBlockData.limit());
+    int compressedDataSize = compressedBlockData.limit();
+    int onDiskDataSizeWithHeader = HFileBlock.HFILEBLOCK_HEADER_SIZE + compressedDataSize;
+    int numChecksumBytes = numChecksumBytes(onDiskDataSizeWithHeader, DEFAULT_BYTES_PER_CHECKSUM);
+    buf.putInt(compressedDataSize + numChecksumBytes);
     // 3. uncompressedSizeWithoutHeader.
     buf.putInt(uncompressedBlockData.limit());
     // 4. Previous block offset.
@@ -288,17 +288,15 @@ public abstract class HFileBlock {
     // 6. Bytes covered per checksum.
     // Note that: Default value is 16K. There is a check on
     // onDiskSizeWithoutHeader = uncompressedSizeWithoutHeader + Checksum.
-    // In order to pass this check, either we make isUseHBaseChecksum false in HFileContext (hbase),
-    // or we set this value to zero.
-    buf.putInt(0);
+    // For compatibility with both HBase and native reader, the size of checksum bytes is
+    // calculated based on this and the checksum is appended at the end of the block
+    buf.putInt(DEFAULT_BYTES_PER_CHECKSUM);
     // 7. onDiskDataSizeWithHeader
-    int onDiskDataSizeWithHeader =
-        HFileBlock.HFILEBLOCK_HEADER_SIZE + compressedBlockData.limit();
     buf.putInt(onDiskDataSizeWithHeader);
     // 8. Payload.
     buf.put(compressedBlockData);
     // 9. Checksum.
-    buf.put(generateChecksumBytes(context.getChecksumType()));
+    buf.put(generateChecksumBytes(context.getChecksumType(), numChecksumBytes));
 
     // Update sizes
     buf.flip();
@@ -323,9 +321,9 @@ public abstract class HFileBlock {
    * Returns checksum bytes if checksum type is not NULL.
    * Note that current HFileReaderImpl does not support non-NULL checksum.
    */
-  private byte[] generateChecksumBytes(ChecksumType type) {
+  private byte[] generateChecksumBytes(ChecksumType type, int numChecksumBytes) {
     if (type == ChecksumType.NULL) {
-      return EMPTY_BYTE_ARRAY;
+      return new byte[numChecksumBytes];
     }
     throw new HoodieException("Only NULL checksum type is supported");
   }
diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
index da87dddf1ae1..2ecfebb6d0c0 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
@@ -174,7 +174,7 @@ class TestHFileWriter {
   private static void validateHFileSize() throws IOException {
     Path path = Paths.get(TEST_FILE);
     long actualSize = Files.size(path);
-    long expectedSize = 4521;
+    long expectedSize = 4537;
     assertEquals(expectedSize, actualSize);
   }
 
diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
index 93f90141b155..9bb57d11cacc 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
@@ -30,11 +30,6 @@ class TestHfileBlock {
     Assertions.assertEquals(0, HFileBlock.numChecksumChunks(0L, 512));
   }
 
-  @Test
-  void testNumChecksumChunksZeroBytesPerChecksum() {
-    Assertions.assertEquals(0, HFileBlock.numChecksumChunks(100L, 0));
-  }
-
   @Test
   void testNumChecksumChunksExactDivision() {
     Assertions.assertEquals(2, HFileBlock.numChecksumChunks(1024L, 512));
diff --git a/hudi-io/src/test/resources/hfile/hudi-generated.hfile b/hudi-io/src/test/resources/hfile/hudi-generated.hfile
index 27d5f9af1345..8ef2bd95d4a4 100644
Binary files a/hudi-io/src/test/resources/hfile/hudi-generated.hfile and b/hudi-io/src/test/resources/hfile/hudi-generated.hfile differ

Reply via email to