This is an automated email from the ASF dual-hosted git repository.

Fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new b8f333085 GH-3573: Fix VariantUtil string decoding to use explicit UTF-8 charset (#3576)
b8f333085 is described below

commit b8f33308534e990003e48a2e66036ccc83fd5db4
Author: Mikhail Melnik <[email protected]>
AuthorDate: Wed May 27 21:11:18 2026 +0200

    GH-3573: Fix VariantUtil string decoding to use explicit UTF-8 charset (#3576)
    
    * Fix VariantUtil string decoding to use explicit UTF-8 charset
    
    * Apply Spotless formatting
    
    * Add tests for non-ASCII string values, object keys and metadata map
---
 .../org/apache/parquet/variant/VariantUtil.java    | 18 ++++++++++------
 .../apache/parquet/variant/TestVariantObject.java  | 25 ++++++++++++++++++++++
 .../parquet/variant/TestVariantParseJson.java      |  9 ++++++++
 3 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
index 7ad867e0f..f50a0f316 100644
--- a/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
+++ b/parquet-variant/src/main/java/org/apache/parquet/variant/VariantUtil.java
@@ -20,6 +20,7 @@ import java.math.BigDecimal;
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.HashMap;
 import org.apache.parquet.Preconditions;
@@ -657,12 +658,12 @@ class VariantUtil {
       checkIndex(start + length - 1, value.limit());
       if (value.hasArray()) {
         // If the buffer is backed by an array, we can use the array directly.
-        return new String(value.array(), value.arrayOffset() + start, length);
+        return new String(value.array(), value.arrayOffset() + start, length, StandardCharsets.UTF_8);
       } else {
         // If the buffer is not backed by an array, we need to copy the bytes into a new array.
         byte[] valueArray = new byte[length];
         slice(value, start).get(valueArray);
-        return new String(valueArray);
+        return new String(valueArray, StandardCharsets.UTF_8);
       }
     }
     throw unexpectedType(Variant.Type.STRING, value);
@@ -825,12 +826,16 @@ class VariantUtil {
     }
     checkIndex(dataPos + nextOffset - 1, metadata.limit());
     if (metadata.hasArray() && !metadata.isReadOnly()) {
-      return new String(metadata.array(), metadata.arrayOffset() + dataPos + offset, nextOffset - offset);
+      return new String(
+          metadata.array(),
+          metadata.arrayOffset() + dataPos + offset,
+          nextOffset - offset,
+          StandardCharsets.UTF_8);
     } else {
       // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes.
       byte[] metadataArray = new byte[nextOffset - offset];
       slice(metadata, dataPos + offset).get(metadataArray);
-      return new String(metadataArray);
+      return new String(metadataArray, StandardCharsets.UTF_8);
     }
   }
 
@@ -861,13 +866,14 @@ class VariantUtil {
             new String(
                 metadata.array(),
                 metadata.arrayOffset() + pos + stringStart + offset,
-                nextOffset - offset),
+                nextOffset - offset,
+                StandardCharsets.UTF_8),
             id);
       } else {
         // ByteBuffer does not have an array, so we need to use the `get` method to read the bytes.
         byte[] metadataArray = new byte[nextOffset - offset];
         slice(metadata, pos + stringStart + offset).get(metadataArray);
-        result.put(new String(metadataArray), id);
+        result.put(new String(metadataArray, StandardCharsets.UTF_8), id);
       }
       offset = nextOffset;
     }
diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java
index 1c823bd76..1c5dc4c5e 100644
--- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java
+++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantObject.java
@@ -406,4 +406,29 @@ public class TestVariantObject {
     Assert.assertEquals(0, immutableMetadata.getOrInsert("name"));
     Assert.assertEquals(1, immutableMetadata.getOrInsert("age"));
   }
+
+  @Test
+  public void testMetadataMapWithUnicodeKeys() {
+    // Build a variant whose metadata dictionary contains non-ASCII keys.
+    VariantBuilder vb = new VariantBuilder();
+    VariantObjectBuilder obj = vb.startObject();
+    obj.appendKey("élève");
+    obj.appendInt(1);
+    obj.appendKey("中文");
+    obj.appendInt(2);
+    vb.endObject();
+    Variant variant = vb.build();
+
+    ByteBuffer metaBuf = variant.getMetadataBuffer();
+
+    // hasArray branch
+    ImmutableMetadata writable = new ImmutableMetadata(metaBuf);
+    Assert.assertEquals(0, writable.getOrInsert("élève"));
+    Assert.assertEquals(1, writable.getOrInsert("中文"));
+
+    // read-only branch (else path in getMetadataMap): asReadOnlyBuffer() makes isReadOnly() true
+    ImmutableMetadata readOnly = new ImmutableMetadata(metaBuf.asReadOnlyBuffer());
+    Assert.assertEquals(0, readOnly.getOrInsert("élève"));
+    Assert.assertEquals(1, readOnly.getOrInsert("中文"));
+  }
 }
diff --git a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java
index f2697a00f..fc1a24ba2 100644
--- a/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java
+++ b/parquet-variant/src/test/java/org/apache/parquet/variant/TestVariantParseJson.java
@@ -241,6 +241,15 @@ public class TestVariantParseJson {
     Assert.assertEquals("\u00e9l\u00e8ve", v.getString());
   }
 
+  @Test
+  public void testParseUnicodeKey() throws IOException {
+    Variant v = VariantJsonParser.parseJson("{\"\\u00e9l\\u00e8ve\": 42}");
+    Assert.assertEquals(Variant.Type.OBJECT, v.getType());
+    Variant value = v.getFieldByKey("élève");
+    Assert.assertNotNull(value);
+    Assert.assertEquals(42, value.getInt());
+  }
+
   @Test
   public void testParseEscapedString() throws IOException {
     Variant v = VariantJsonParser.parseJson("\"hello\\nworld\"");

Reply via email to