This is an automated email from the ASF dual-hosted git repository.

jtao pushed a commit to branch hotfix-unicode
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/hotfix-unicode by this push:
     new d3dbd5829f9 Enforce UTF8 when decoding byte[] to string in ValueReader (#16608)
d3dbd5829f9 is described below

commit d3dbd5829f98a419c9f896d06b28d88ecc1b4626
Author: Jiapeng Tao <[email protected]>
AuthorDate: Fri Aug 15 14:04:21 2025 -0700

    Enforce UTF8 when decoding byte[] to string in ValueReader (#16608)
---
 .../pinot/segment/local/io/util/ValueReader.java   |  3 +-
 .../FixedByteValueReaderWriterTest.java            | 36 ++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
index 9aa9382e319..acf3d4fea2a 100644
--- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
+++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/io/util/ValueReader.java
@@ -20,6 +20,7 @@ package org.apache.pinot.segment.local.io.util;
 
 import java.io.Closeable;
 import java.math.BigDecimal;
+import java.nio.charset.StandardCharsets;
 import org.apache.pinot.spi.utils.BigDecimalUtils;
 import org.apache.pinot.spi.utils.hash.MurmurHashFunctions;
 
@@ -63,7 +64,7 @@ public interface ValueReader extends Closeable {
    */
   default String getUnpaddedString(int index, int numBytesPerValue, byte[] buffer) {
     int length = readUnpaddedBytes(index, numBytesPerValue, buffer);
-    return new String(buffer, 0, length);
+    return new String(buffer, 0, length, StandardCharsets.UTF_8);
   }
 
   /**
diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java
index 65b09df5644..5e47acf8bca 100644
--- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java
+++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/segment/index/readerwriter/FixedByteValueReaderWriterTest.java
@@ -74,4 +74,40 @@ public class FixedByteValueReaderWriterTest implements PinotBuffersAfterMethodCh
       }
     }
   }
+
+  @Test(dataProvider = "params")
+  public void testFixedByteValueReaderWriterNonAscii(int maxStringLength, int configuredMaxLength, ByteOrder byteOrder)
+      throws IOException {
+    byte[] bytes = new byte[configuredMaxLength];
+    // Use a multi-byte UTF-8 character (é = 0xC3 0xA9)
+    byte[] nonAsciiChar = "é".getBytes(StandardCharsets.UTF_8);
+
+    try (PinotDataBuffer buffer = PinotDataBuffer.allocateDirect(configuredMaxLength * 1000L, byteOrder,
+        "testFixedByteValueReaderWriterNonAscii")) {
+      FixedByteValueReaderWriter readerWriter = new FixedByteValueReaderWriter(buffer);
+      List<String> inputs = new ArrayList<>(1000);
+
+      for (int i = 0; i < 1000; i++) {
+        // number of *characters* to write
+        int charCount = ThreadLocalRandom.current().nextInt(maxStringLength);
+        int byteCount = charCount * nonAsciiChar.length;
+        if (byteCount > configuredMaxLength) {
+          byteCount = configuredMaxLength - (configuredMaxLength % nonAsciiChar.length); // fit whole chars
+          charCount = byteCount / nonAsciiChar.length;
+        }
+
+        Arrays.fill(bytes, (byte) 0);
+        for (int pos = 0; pos < byteCount; pos += nonAsciiChar.length) {
+          System.arraycopy(nonAsciiChar, 0, bytes, pos, nonAsciiChar.length);
+        }
+
+        readerWriter.writeBytes(i, configuredMaxLength, bytes);
+        inputs.add("é".repeat(charCount));
+      }
+
+      for (int i = 0; i < 1000; i++) {
+        assertEquals(readerWriter.getUnpaddedString(i, configuredMaxLength, bytes), inputs.get(i));
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to