This is an automated email from the ASF dual-hosted git repository.
Fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git
The following commit(s) were added to refs/heads/master by this push:
new b8f333085 GH-3573: Fix VariantUtil string decoding to use explicit UTF-8 charset (#3576)
b8f333085 is described below
commit b8f33308534e990003e48a2e66036ccc83fd5db4
Author: Mikhail Melnik <[email protected]>
AuthorDate: Wed May 27 21:11:18 2026 +0200
GH-3573: Fix VariantUtil string decoding to use explicit UTF-8 charset (#3576)
* Fix VariantUtil string decoding to use explicit UTF-8 charset
* Apply Spotless formatting
* Add tests for non-ASCII string values, object keys and metadata map
---
.../org/apache/parquet/variant/VariantUtil.java | 18 ++++++++++------
.../apache/parquet/variant/TestVariantObject.java | 25 ++++++++++++++++++++++
.../parquet/variant/TestVariantParseJson.java | 9 ++++++++
3 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
index 7ad867e0f..f50a0f316 100644
--- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
@@ -20,6 +20,7 @@ import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import org.apache.parquet.Preconditions;
@@ -657,12 +658,12 @@ class VariantUtil {
checkIndex(start + length - 1, value.limit());
if (value.hasArray()) {
// If the buffer is backed by an array, we can use the array directly.
- return new String(value.array(), value.arrayOffset() + start, length);
+ return new String(value.array(), value.arrayOffset() + start, length, StandardCharsets.UTF_8);
} else {
// If the buffer is not backed by an array, we need to copy the bytes into a new array.
byte[] valueArray = new byte[length];
slice(value, start).get(valueArray);
- return new String(valueArray);
+ return new String(valueArray, StandardCharsets.UTF_8);
}
}
throw unexpectedType(Variant.Type.STRING, value);
@@ -825,12 +826,16 @@ class VariantUtil {
}
checkIndex(dataPos + nextOffset - 1, metadata.limit());
if (metadata.hasArray() && !metadata.isReadOnly()) {
- return new String(metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset);
+ return new String(
+ metadata.array(),
+ metadata.arrayOffset() + dataPos + offset,
+ nextOffset - offset,
+ StandardCharsets.UTF_8);
} else {
// ByteBuffer does not have an array, so we need to use the `get` method to read the bytes.
byte[] metadataArray = new byte[nextOffset - offset];
slice(metadata, dataPos + offset).get(metadataArray);
- return new String(metadataArray);
+ return new String(metadataArray, StandardCharsets.UTF_8);
}
}
@@ -861,13 +866,14 @@ class VariantUtil {
new String(
metadata.array(),
metadata.arrayOffset() + pos + stringStart + offset,
- nextOffset - offset),
+ nextOffset - offset,
+ StandardCharsets.UTF_8),
id);
} else {
// ByteBuffer does not have an array, so we need to use the `get` method to read the bytes.
byte[] metadataArray = new byte[nextOffset - offset];
slice(metadata, pos + stringStart + offset).get(metadataArray);
- result.put(new String(metadataArray), id);
+ result.put(new String(metadataArray, StandardCharsets.UTF_8), id);
}
offset = nextOffset;
}
diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java
index 1c823bd76..1c5dc4c5e 100644
--- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java
+++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java
@@ -406,4 +406,29 @@ public class TestVariantObject {
Assert.assertEquals(0, immutableMetadata.getOrInsert("name"));
Assert.assertEquals(1, immutableMetadata.getOrInsert("age"));
}
+
+ @Test
+ public void testMetadataMapWithUnicodeKeys() {
+ // Build a variant whose metadata dictionary contains non-ASCII keys.
+ VariantBuilder vb = new VariantBuilder();
+ VariantObjectBuilder obj = vb.startObject();
+ obj.appendKey("élève");
+ obj.appendInt(1);
+ obj.appendKey("中文");
+ obj.appendInt(2);
+ vb.endObject();
+ Variant variant = vb.build();
+
+ ByteBuffer metaBuf = variant.getMetadataBuffer();
+
+ // hasArray branch
+ ImmutableMetadata writable = new ImmutableMetadata(metaBuf);
+ Assert.assertEquals(0, writable.getOrInsert("élève"));
+ Assert.assertEquals(1, writable.getOrInsert("中文"));
+
+ // read-only branch (else path in getMetadataMap): asReadOnlyBuffer() makes isReadOnly() true
+ ImmutableMetadata readOnly = new ImmutableMetadata(metaBuf.asReadOnlyBuffer());
+ Assert.assertEquals(0, readOnly.getOrInsert("élève"));
+ Assert.assertEquals(1, readOnly.getOrInsert("中文"));
+ }
}
diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java
index f2697a00f..fc1a24ba2 100644
--- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java
+++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java
@@ -241,6 +241,15 @@ public class TestVariantParseJson {
Assert.assertEquals("\u00e9l\u00e8ve", v.getString());
}
+ @Test
+ public void testParseUnicodeKey() throws IOException {
+ Variant v = VariantJsonParser.parseJson("{\"\\u00e9l\\u00e8ve\": 42}");
+ Assert.assertEquals(Variant.Type.OBJECT, v.getType());
+ Variant value = v.getFieldByKey("élève");
+ Assert.assertNotNull(value);
+ Assert.assertEquals(42, value.getInt());
+ }
+
@Test
public void testParseEscapedString() throws IOException {
Variant v = VariantJsonParser.parseJson("\"hello\\nworld\"");