This is an automated email from the ASF dual-hosted git repository.
sivabalan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hudi.git
The following commit(s) were added to refs/heads/master by this push:
new c50f5604e8b8 fix(storage-format)!: Fix HFile writer to properly encode
checksum for compatibility (#13897)
c50f5604e8b8 is described below
commit c50f5604e8b8176fa6295560a911f36174cddda4
Author: Y Ethan Guo <[email protected]>
AuthorDate: Fri Sep 19 18:00:16 2025 -0700
fix(storage-format)!: Fix HFile writer to properly encode checksum for
compatibility (#13897)
---
.../java/org/apache/hudi/io/hfile/HFileBlock.java | 22 ++++++++++-----------
.../org/apache/hudi/io/hfile/TestHFileWriter.java | 2 +-
.../org/apache/hudi/io/hfile/TestHfileBlock.java | 5 -----
.../src/test/resources/hfile/hudi-generated.hfile | Bin 4654 -> 4674 bytes
4 files changed, 11 insertions(+), 18 deletions(-)
diff --git a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
index 4c61e6b3b8be..95033be6dd25 100644
--- a/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
+++ b/hudi-io/src/main/java/org/apache/hudi/io/hfile/HFileBlock.java
@@ -189,9 +189,6 @@ public abstract class HFileBlock {
* @return the number of checksum chunks.
*/
static int numChecksumChunks(long numBytes, int bytesPerChecksum) {
- if (bytesPerChecksum == 0) {
- return 0;
- }
long numChunks = numBytes / bytesPerChecksum;
if (numBytes % bytesPerChecksum != 0) {
numChunks++;
@@ -278,7 +275,10 @@ public abstract class HFileBlock {
// 1. Magic is always 8 bytes.
buf.put(blockType.getMagic(), 0, 8);
// 2. onDiskSizeWithoutHeader.
- buf.putInt(compressedBlockData.limit());
+ int compressedDataSize = compressedBlockData.limit();
+ int onDiskDataSizeWithHeader = HFileBlock.HFILEBLOCK_HEADER_SIZE +
compressedDataSize;
+ int numChecksumBytes = numChecksumBytes(onDiskDataSizeWithHeader,
DEFAULT_BYTES_PER_CHECKSUM);
+ buf.putInt(compressedDataSize + numChecksumBytes);
// 3. uncompressedSizeWithoutHeader.
buf.putInt(uncompressedBlockData.limit());
// 4. Previous block offset.
@@ -288,17 +288,15 @@ public abstract class HFileBlock {
// 6. Bytes covered per checksum.
// Note that: Default value is 16K. There is a check on
// onDiskSizeWithoutHeader = uncompressedSizeWithoutHeader + Checksum.
- // In order to pass this check, either we make isUseHBaseChecksum false in
HFileContext (hbase),
- // or we set this value to zero.
- buf.putInt(0);
+ // For compatibility with both the HBase reader and the native reader, the
+ // size of the checksum bytes is calculated based on this value, and the
+ // checksum is appended at the end of the block.
+ buf.putInt(DEFAULT_BYTES_PER_CHECKSUM);
// 7. onDiskDataSizeWithHeader
- int onDiskDataSizeWithHeader =
- HFileBlock.HFILEBLOCK_HEADER_SIZE + compressedBlockData.limit();
buf.putInt(onDiskDataSizeWithHeader);
// 8. Payload.
buf.put(compressedBlockData);
// 9. Checksum.
- buf.put(generateChecksumBytes(context.getChecksumType()));
+ buf.put(generateChecksumBytes(context.getChecksumType(),
numChecksumBytes));
// Update sizes
buf.flip();
@@ -323,9 +321,9 @@ public abstract class HFileBlock {
* Returns zero-filled checksum bytes when the checksum type is NULL.
* Note that the current HFileReaderImpl does not support non-NULL checksums.
*/
- private byte[] generateChecksumBytes(ChecksumType type) {
+ private byte[] generateChecksumBytes(ChecksumType type, int
numChecksumBytes) {
if (type == ChecksumType.NULL) {
- return EMPTY_BYTE_ARRAY;
+ return new byte[numChecksumBytes];
}
throw new HoodieException("Only NULL checksum type is supported");
}
diff --git
a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
index da87dddf1ae1..2ecfebb6d0c0 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHFileWriter.java
@@ -174,7 +174,7 @@ class TestHFileWriter {
private static void validateHFileSize() throws IOException {
Path path = Paths.get(TEST_FILE);
long actualSize = Files.size(path);
- long expectedSize = 4521;
+ long expectedSize = 4537;
assertEquals(expectedSize, actualSize);
}
diff --git a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
index 93f90141b155..9bb57d11cacc 100644
--- a/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
+++ b/hudi-io/src/test/java/org/apache/hudi/io/hfile/TestHfileBlock.java
@@ -30,11 +30,6 @@ class TestHfileBlock {
Assertions.assertEquals(0, HFileBlock.numChecksumChunks(0L, 512));
}
- @Test
- void testNumChecksumChunksZeroBytesPerChecksum() {
- Assertions.assertEquals(0, HFileBlock.numChecksumChunks(100L, 0));
- }
-
@Test
void testNumChecksumChunksExactDivision() {
Assertions.assertEquals(2, HFileBlock.numChecksumChunks(1024L, 512));
diff --git a/hudi-io/src/test/resources/hfile/hudi-generated.hfile
b/hudi-io/src/test/resources/hfile/hudi-generated.hfile
index 27d5f9af1345..8ef2bd95d4a4 100644
Binary files a/hudi-io/src/test/resources/hfile/hudi-generated.hfile and
b/hudi-io/src/test/resources/hfile/hudi-generated.hfile differ