This is an automated email from the ASF dual-hosted git repository.
Fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new 2346fdbed GH-3464: Improve `DeltaByteArrayWriter.writeBytes` (#3465)
2346fdbed is described below
commit 2346fdbed480a362d9aa5242955ae85fac656e4e
Author: André Rouél <[email protected]>
AuthorDate: Wed May 6 21:30:32 2026 +0200
GH-3464: Improve `DeltaByteArrayWriter.writeBytes` (#3465)
* GH-3464 Improve `DeltaByteArrayWriter.writeBytes` to avoid unnecessary
allocation and scalar prefix comparison
* GH-3464 Add regression test
* Update DeltaByteArrayWriter.java
---
.../values/deltastrings/DeltaByteArrayWriter.java | 15 +++++++++------
.../values/deltastrings/TestDeltaByteArray.java | 21 +++++++++++++++++++++
2 files changed, 30 insertions(+), 6 deletions(-)
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java
index c23410861..5496ed194 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/values/deltastrings/DeltaByteArrayWriter.java
@@ -18,6 +18,7 @@
*/
package org.apache.parquet.column.values.deltastrings;
+import java.util.Arrays;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.Encoding;
@@ -88,12 +89,14 @@ public class DeltaByteArrayWriter extends ValuesWriter {
@Override
public void writeBytes(Binary v) {
- int i = 0;
- byte[] vb = v.getBytes();
- int length = previous.length < vb.length ? previous.length : vb.length;
- // find the number of matching prefix bytes between this value and the previous one
- for (i = 0; (i < length) && (previous[i] == vb[i]); i++)
- ;
+ byte[] vb = v.isBackingBytesReused() ? v.getBytes() : v.getBytesUnsafe();
+ int length = Math.min(previous.length, vb.length);
+ // Find the number of matching prefix bytes between this value and the previous one.
+ // Arrays.mismatch is intrinsified by the JVM to use SIMD instructions.
+ int i = Arrays.mismatch(previous, 0, length, vb, 0, length);
+ if (i < 0) {
+ i = length; // all bytes in the common range matched
+ }
prefixLengthWriter.writeInteger(i);
suffixWriter.writeBytes(v.slice(i, vb.length - i));
previous = vb;
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java b/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java
index 5ce6adbdf..b73e5562d 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/deltastrings/TestDeltaByteArray.java
@@ -19,6 +19,7 @@
package org.apache.parquet.column.values.deltastrings;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.DirectByteBufferAllocator;
import org.apache.parquet.column.values.Utils;
@@ -128,4 +129,24 @@ public class TestDeltaByteArray {
assertReadWrite(writer, new DeltaByteArrayReader(), values);
}
+
+ @Test
+ public void testReusedBackingArrayRegression() throws Exception {
+ DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 *
1024, new DirectByteBufferAllocator());
+ DeltaByteArrayReader reader = new DeltaByteArrayReader();
+
+ byte[] buffer = "parquet-000".getBytes(StandardCharsets.UTF_8);
+ writer.writeBytes(Binary.fromReusedByteArray(buffer));
+
+ System.arraycopy("parquet-111".getBytes(StandardCharsets.UTF_8), 0,
buffer, 0, buffer.length);
+ writer.writeBytes(Binary.fromReusedByteArray(buffer));
+
+ System.arraycopy("parquet-222".getBytes(StandardCharsets.UTF_8), 0,
buffer, 0, buffer.length);
+ writer.writeBytes(Binary.fromReusedByteArray(buffer));
+
+ Binary[] decoded = Utils.readData(reader,
writer.getBytes().toInputStream(), 3);
+ Assert.assertEquals(Binary.fromString("parquet-000"), decoded[0]);
+ Assert.assertEquals(Binary.fromString("parquet-111"), decoded[1]);
+ Assert.assertEquals(Binary.fromString("parquet-222"), decoded[2]);
+ }
}