(impala) 04/05: IMPALA-14237: Fix Iceberg partition values encoding

stigahuang Sat, 20 Sep 2025 18:15:36 -0700

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 321429eac6400fd8c1b22c8aabb2a11fee381437
Author: Daniel Vanko <[email protected]>
AuthorDate: Fri Jul 18 15:56:17 2025 +0200

    IMPALA-14237: Fix Iceberg partition values encoding
    
    This patch modifies the string overload of
    IcebergFunctions::TruncatePartitionTransform so that it always handles
    strings as UTF-8-encoded ones, because the Iceberg specification states
    that that strings are UTF-8 encoded.
    
    Also, for an Iceberg table UrlEncode is called in not the
    Hive-compatible way, rather than the standard way, similar to Java's
    URLEncoder.encode() (which the Iceberg API also uses) to conform with
    existing practices by Hive, Spark and Trino. This included a change in
    the set of characters which are not escaped to follow the URL Standard's
    application/x-www-form-urlencoded format. [1] Also renamed it from
    ShouldNotEscape to IsUrlSafe for better readability.
    
    Testing:
     * add and extend e2e tests to check partitions with Unicode characters
     * add be tests to coding-util-test.cc
    
    [1]: 
https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set
    
    Change-Id: Iabb39727f6dd49b76c918bcd6b3ec62532555755
    Reviewed-on: http://gerrit.cloudera.org:8080/23190
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/table-sink-base.cc                     |  7 ++-
 be/src/exprs/iceberg-functions-ir.cc               |  5 +-
 be/src/util/coding-util-test.cc                    | 41 ++++++++++---
 be/src/util/coding-util.cc                         | 27 ++++++---
 .../iceberg-partition-transform-insert.test        | 68 ++++++++++++++++++++--
 .../queries/QueryTest/unicode-column-name.test     | 41 +++++++++++++
 tests/query_test/test_insert.py                    |  2 +-
 7 files changed, 165 insertions(+), 26 deletions(-)

diff --git a/be/src/exec/table-sink-base.cc b/be/src/exec/table-sink-base.cc
index c21ed002a..598c50492 100644
--- a/be/src/exec/table-sink-base.cc
+++ b/be/src/exec/table-sink-base.cc
@@ -107,7 +107,12 @@ string TableSinkBase::GetPartitionName(int i) {
 
 string TableSinkBase::UrlEncodePartitionValue(const string& raw_str) {
   string encoded_str;
-  UrlEncode(raw_str, &encoded_str, true);
+  if (IsIceberg()) {
+    // Iceberg partition values should be URL encoded, but not Hive compatible 
way.
+    UrlEncode(raw_str, &encoded_str, false);
+  } else {
+    UrlEncode(raw_str, &encoded_str, true);
+  }
   return encoded_str.empty() ? table_desc_->null_partition_key_value() : 
encoded_str;
 }
 
diff --git a/be/src/exprs/iceberg-functions-ir.cc 
b/be/src/exprs/iceberg-functions-ir.cc
index 2194e6f5a..2029537c5 100644
--- a/be/src/exprs/iceberg-functions-ir.cc
+++ b/be/src/exprs/iceberg-functions-ir.cc
@@ -22,6 +22,7 @@
 
 #include "common/compiler-util.h"
 #include "common/logging.h"
+#include "exprs/string-functions.h"
 #include "runtime/timestamp-value.inline.h"
 #include "thirdparty/murmurhash/MurmurHash3.h"
 #include "udf/udf-internal.h"
@@ -102,7 +103,9 @@ StringVal 
IcebergFunctions::TruncatePartitionTransform(FunctionContext* ctx,
     const StringVal& input, const IntVal& width) {
   if (!CheckInputsAndSetError(ctx, input, width)) return StringVal::null();
   if (input.len <= width.val) return input;
-  return StringVal::CopyFrom(ctx, input.ptr, width.val);
+  // String handled as UTF8 regardless of utf8_mode, because Iceberg spec 
states that
+  // character strings must be stored as UTF-8 encoded byte arrays.
+  return StringFunctions::Utf8Substring(ctx, input, 1, width.val);
 }
 
 template<typename T, typename W>
diff --git a/be/src/util/coding-util-test.cc b/be/src/util/coding-util-test.cc
index 8a2f855c8..793ea259e 100644
--- a/be/src/util/coding-util-test.cc
+++ b/be/src/util/coding-util-test.cc
@@ -117,14 +117,39 @@ TEST(UrlCodingTest, PathSeparators) {
   string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F";
   TestUrl(test_path, encoded_test_path, false);
   TestUrl(test_path, encoded_test_path, true);
-  string test = 
"SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10"
-                
"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'"
-                "*/:=?\\{[]^";
-  string encoded_test = 
"SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
-                        
"%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25"
-                        "%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E";
-  TestUrl(test, encoded_test, false);
-  TestUrl(test, encoded_test, true);
+}
+
+// Test URL encoding of the ASCII table, character values from 1 to 127.
+TEST(UrlCodingTest, AsciiCharacters) {
+  string raw = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D"
+                "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19"
+                "\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./"
+                "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F";
+  string hive_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
+                        "%0E%0F%10%11%12%13%14%15%16%17%18%19"
+                        "%1A%1B%1C%1D%1E%1F !%22%23$%25&%27()%2A+,-.%2F"
+                        "0123456789%3A;<%3D>%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                        "%5B%5C%5D%5E_`abcdefghijklmnopqrstuvwxyz%7B|}~%7F";
+  TestUrl(raw, hive_encoded, true);
+  string url_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D"
+                       "%0E%0F%10%11%12%13%14%15%16%17%18%19"
+                       
"%1A%1B%1C%1D%1E%1F+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F"
+                       
"0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                       
"%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F";
+  TestUrl(raw, url_encoded, false);
+}
+
+// Test a few unicode characters that are not in the ASCII table.
+TEST(UrlCodingTest, UnicodeCharacters) {
+  string raw = "árvíztűrőtükörfúrógép 你们好 აბგ";
+  string hive_encoded = "árvíztűrőtükörfúrógép 你们好 აბგ";
+  TestUrl(raw, hive_encoded, true);
+  string url_encoded = "%C3%A1rv%C3%ADzt%C5%B1r%C5%91"
+                       "t%C3%BCk%C3%B6rf%C3%BAr%C3%B3g%C3%A9p"
+                       "+%E4%BD%A0%E4%BB%AC%E5%A5%BD"
+                       "+%E1%83%90%E1%83%91%E1%83%92";
+  TestUrl(raw, url_encoded, false);
 }
 
 TEST(Base64Test, Basic) {
diff --git a/be/src/util/coding-util.cc b/be/src/util/coding-util.cc
index f4578e4d4..3228dc33e 100644
--- a/be/src/util/coding-util.cc
+++ b/be/src/util/coding-util.cc
@@ -38,9 +38,9 @@ using std::uppercase;
 
 namespace impala {
 
-// It is more convenient to maintain the complement of the set of
-// characters to escape when not in Hive-compat mode.
-static function<bool (char)> ShouldNotEscape = is_any_of("-_.~");
+// It is more convenient to maintain the set of characters that are safe to use
+// directly in URLs without escaping
+static function<bool (char)> IsUrlSafe = is_any_of(".-*_");
 
 // Hive selectively encodes characters. This is the whitelist of
 // characters it will encode.
@@ -53,18 +53,27 @@ static const std::unordered_set<char> SpecialCharacters = {
     '\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', 
'[', ']',
     '^'};
 
+// Encodes the input string as a URL-encoded string based on UTF-8.
+// If 'hive_compat' is set to true, the string is encoded in a Hive-compatible 
way;
+// otherwise, a more standard URL encoding is used, similar to the 
URLEncoder.encode()
+// method in Java.
 static inline void UrlEncode(const char* in, int in_len, string* out, bool 
hive_compat) {
   stringstream ss;
   // "uppercase" and "hex" only affect the insertion of integers, not that of 
char values.
   ss << uppercase << hex << setfill('0');
   for (char ch : std::string_view(in, in_len)) {
-    // Escape the character iff a) we are in Hive-compat mode and the
-    // character is in the Hive whitelist or b) we are not in Hive-compat mode 
and
-    // the character is not alphanumeric and it is not one of the characters 
specifically
-    // excluded from escaping (see ShouldNotEscape()).
+    // Escape the character iff
+    //   a) we are in Hive-compat mode and the character is in the Hive 
whitelist or
+    //   b) we are not in Hive-compat mode and the character is not 
alphanumeric
+    //      and it is not safe to use in URLs (see IsUrlSafe()).
     if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat &&
-            !isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) 
{
-      ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned 
char>(ch));
+            !isalnum(static_cast<unsigned char>(ch)) && !IsUrlSafe(ch))) {
+      // Iff we are not in Hive-compat mode, we encode space as '+'.
+      if (!hive_compat && ch == ' ') {
+        ss << '+';
+      } else {
+        ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned 
char>(ch));
+      }
     } else {
       ss << ch;
     }
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
index 7a871358c..a5a349fe7 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
@@ -427,11 +427,11 @@ INT,BIGINT,DECIMAL,STRING
 ---- QUERY
 show files in multi_col_truncate;
 ---- RESULTS
-row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the
 quick 
brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY'
-row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the
 quick 
brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY'
-row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the
 quick 
brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY'
-row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the
 quick fox 
b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY'
-row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the
 quick 
impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+fox\+b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY'
 
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=__HIVE_DEFAULT_PARTITION__/i_trunc=__HIVE_DEFAULT_PARTITION__/b_trunc=__HIVE_DEFAULT_PARTITION__/d_trunc=__HIVE_DEFAULT_PARTITION__/.*.parq','.*','','$ERASURECODE_POLICY'
 ---- TYPES
 STRING, STRING, STRING, STRING
@@ -1010,7 +1010,7 @@ STRING,BIGINT,DECIMAL,TIMESTAMP,DATE
 show files in mixed_and_shuffled;
 ---- RESULTS
 
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1971-07-01/da_year=1971/s_trunc=green/b_bucket=1/de_trunc=71.00/.*.parq','.*','','$ERASURECODE_POLICY'
-row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink
 /b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink\+/b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY'
 
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2020-01-06/da_year=2020/s_trunc=quick/b_bucket=1/de_trunc=3333.00/.*.parq','.*','','$ERASURECODE_POLICY'
 
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=1/de_trunc=543.00/.*.parq','.*','','$ERASURECODE_POLICY'
 
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=2/de_trunc=123.00/.*.parq','.*','','$ERASURECODE_POLICY'
@@ -1088,3 +1088,59 @@ where da is not null;
 ---- TYPES
 STRING,BIGINT,DECIMAL,TIMESTAMP,DATE
 ====
+---- QUERY
+# Test truncate partition transform with Unicode strings
+create table unicode_truncate (s string)
+partitioned by spec (truncate(5, s))
+stored as iceberg;
+====
+---- QUERY
+insert into unicode_truncate values
+('impala'),
+('árvíztűrőtükörfúrógép'),
+('árvíztűrő'),
+('űűű'),
+('你们好hello');
+select * from unicode_truncate;
+---- RESULTS: RAW_STRING
+'impala'
+'árvíztűrőtükörfúrógép'
+'árvíztűrő'
+'űűű'
+'你们好hello'
+---- TYPES
+STRING
+====
+---- QUERY
+show files in unicode_truncate;
+---- RESULTS
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=impal/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C3%A1rv%C3%ADz/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C5%B1%C5%B1%C5%B1/.*.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%E4%BD%A0%E4%BB%AC%E5%A5%BDhe/.*.parq','.*','','$ERASURECODE_POLICY'
+---- TYPES
+STRING, STRING, STRING, STRING
+====
+---- QUERY
+select * from unicode_truncate
+where s like "árvíz%";
+---- RESULTS: RAW_STRING
+'árvíztűrőtükörfúrógép'
+'árvíztűrő'
+---- TYPES
+STRING
+---- RUNTIME_PROFILE
+aggregation(SUM, RowsRead): 2
+aggregation(SUM, NumRowGroups): 4
+====
+---- QUERY
+select * from unicode_truncate
+where s = "űűű";
+---- RESULTS: RAW_STRING
+'űűű'
+---- TYPES
+STRING
+---- RUNTIME_PROFILE
+aggregation(SUM, RowsRead): 1
+aggregation(SUM, NumRowGroups): 1
+====
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
 
b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
index 9d9f304da..6c750c3bb 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test
@@ -311,6 +311,25 @@ select * from unicode_partition_values;
 INT, STRING
 ====
 ---- QUERY
+show files in unicode_partition_values;
+---- RESULTS: RAW_STRING
+row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运/.*\.parq','.*','p=运','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D/.*\.parq','.*','p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据/.*\.parq','.*','p=运营业务数据','$ERASURECODE_POLICY'
+---- TYPES
+STRING, STRING, STRING, STRING
+====
+---- QUERY
+select * from unicode_partition_values
+where p like '运%';
+---- RESULTS: RAW_STRING
+0,'运'
+0,'运营业务数据'
+0,'运营业务数据1234567890!@#$%^&*(){}[]'
+---- TYPES
+INT, STRING
+====
+---- QUERY
 create table unicode_partition_values_iceberg (id int, p string) partitioned 
by spec (identity(p)) stored by iceberg;
 ---- RESULTS
 'Table has been created.'
@@ -326,4 +345,26 @@ select * from unicode_partition_values_iceberg;
 0,'运营业务数据1234567890!@#$%^&*(){}[]'
 ---- TYPES
 INT, STRING
+====
+---- QUERY
+show files in unicode_partition_values_iceberg;
+---- RESULTS
+row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90/.*\.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE1234567890%21%40%23%24%25%5E%26\*%28%29%7B%7D%5B%5D/.*\.parq','.*','','$ERASURECODE_POLICY'
+row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE/.*\.parq','.*','','$ERASURECODE_POLICY'
+---- TYPES
+STRING, STRING, STRING, STRING
+====
+---- QUERY
+insert into unicode_partition_values_iceberg values (2, '运营业务数据');
+select * from unicode_partition_values_iceberg
+where p = '运营业务数据';
+---- RESULTS: RAW_STRING
+0,'运营业务数据'
+2,'运营业务数据'
+---- TYPES
+INT, STRING
+---- RUNTIME_PROFILE
+aggregation(SUM, RowsRead): 2
+aggregation(SUM, NumRowGroups): 2
 ====
\ No newline at end of file
diff --git a/tests/query_test/test_insert.py b/tests/query_test/test_insert.py
index f5b0af57b..5c71e7da0 100644
--- a/tests/query_test/test_insert.py
+++ b/tests/query_test/test_insert.py
@@ -333,7 +333,7 @@ class TestInsertPartKey(ImpalaTestSuite):
                  "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \
                  "\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^"
     part_dir = 
"p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \
-               
"10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \
+               "10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25*" \
                "%2F%3A%3D%3F%5C%7B%5B%5D%23%5E"
     show_part_value = 
"SpecialCharacters\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006" \
                       "\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010" \

(impala) 04/05: IMPALA-14237: Fix Iceberg partition values encoding

Reply via email to