This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 321429eac6400fd8c1b22c8aabb2a11fee381437 Author: Daniel Vanko <[email protected]> AuthorDate: Fri Jul 18 15:56:17 2025 +0200 IMPALA-14237: Fix Iceberg partition values encoding This patch modifies the string overload of IcebergFunctions::TruncatePartitionTransform so that it always handles strings as UTF-8-encoded ones, because the Iceberg specification states that that strings are UTF-8 encoded. Also, for an Iceberg table UrlEncode is called in not the Hive-compatible way, rather than the standard way, similar to Java's URLEncoder.encode() (which the Iceberg API also uses) to conform with existing practices by Hive, Spark and Trino. This included a change in the set of characters which are not escaped to follow the URL Standard's application/x-www-form-urlencoded format. [1] Also renamed it from ShouldNotEscape to IsUrlSafe for better readability. Testing: * add and extend e2e tests to check partitions with Unicode characters * add be tests to coding-util-test.cc [1]: https://url.spec.whatwg.org/#application-x-www-form-urlencoded-percent-encode-set Change-Id: Iabb39727f6dd49b76c918bcd6b3ec62532555755 Reviewed-on: http://gerrit.cloudera.org:8080/23190 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/table-sink-base.cc | 7 ++- be/src/exprs/iceberg-functions-ir.cc | 5 +- be/src/util/coding-util-test.cc | 41 ++++++++++--- be/src/util/coding-util.cc | 27 ++++++--- .../iceberg-partition-transform-insert.test | 68 ++++++++++++++++++++-- .../queries/QueryTest/unicode-column-name.test | 41 +++++++++++++ tests/query_test/test_insert.py | 2 +- 7 files changed, 165 insertions(+), 26 deletions(-) diff --git a/be/src/exec/table-sink-base.cc b/be/src/exec/table-sink-base.cc index c21ed002a..598c50492 100644 --- a/be/src/exec/table-sink-base.cc +++ b/be/src/exec/table-sink-base.cc @@ -107,7 +107,12 @@ string TableSinkBase::GetPartitionName(int i) { string TableSinkBase::UrlEncodePartitionValue(const string& raw_str) { string encoded_str; - UrlEncode(raw_str, &encoded_str, true); + if (IsIceberg()) { + // Iceberg partition values should be URL encoded, but not Hive compatible way. + UrlEncode(raw_str, &encoded_str, false); + } else { + UrlEncode(raw_str, &encoded_str, true); + } return encoded_str.empty() ? table_desc_->null_partition_key_value() : encoded_str; } diff --git a/be/src/exprs/iceberg-functions-ir.cc b/be/src/exprs/iceberg-functions-ir.cc index 2194e6f5a..2029537c5 100644 --- a/be/src/exprs/iceberg-functions-ir.cc +++ b/be/src/exprs/iceberg-functions-ir.cc @@ -22,6 +22,7 @@ #include "common/compiler-util.h" #include "common/logging.h" +#include "exprs/string-functions.h" #include "runtime/timestamp-value.inline.h" #include "thirdparty/murmurhash/MurmurHash3.h" #include "udf/udf-internal.h" @@ -102,7 +103,9 @@ StringVal IcebergFunctions::TruncatePartitionTransform(FunctionContext* ctx, const StringVal& input, const IntVal& width) { if (!CheckInputsAndSetError(ctx, input, width)) return StringVal::null(); if (input.len <= width.val) return input; - return StringVal::CopyFrom(ctx, input.ptr, width.val); + // String handled as UTF8 regardless of utf8_mode, because Iceberg spec states that + // character strings must be stored as UTF-8 encoded byte arrays. + return StringFunctions::Utf8Substring(ctx, input, 1, width.val); } template<typename T, typename W> diff --git a/be/src/util/coding-util-test.cc b/be/src/util/coding-util-test.cc index 8a2f855c8..793ea259e 100644 --- a/be/src/util/coding-util-test.cc +++ b/be/src/util/coding-util-test.cc @@ -117,14 +117,39 @@ TEST(UrlCodingTest, PathSeparators) { string encoded_test_path = "%2Fhome%2Fimpala%2Fdirectory%2F"; TestUrl(test_path, encoded_test_path, false); TestUrl(test_path, encoded_test_path, true); - string test = "SpecialCharacters\x01\x02\x03\x04\x05\x06\x07\b\t\n\v\f\r\x0E\x0F\x10" - "\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\"#%'" - "*/:=?\\{[]^"; - string encoded_test = "SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" - "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%7F%22%23%25" - "%27%2A%2F%3A%3D%3F%5C%7B%5B%5D%5E"; - TestUrl(test, encoded_test, false); - TestUrl(test, encoded_test, true); +} + +// Test URL encoding of the ASCII table, character values from 1 to 127. +TEST(UrlCodingTest, AsciiCharacters) { + string raw = "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D" + "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19" + "\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./" + "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F"; + string hive_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D" + "%0E%0F%10%11%12%13%14%15%16%17%18%19" + "%1A%1B%1C%1D%1E%1F !%22%23$%25&%27()%2A+,-.%2F" + "0123456789%3A;<%3D>%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "%5B%5C%5D%5E_`abcdefghijklmnopqrstuvwxyz%7B|}~%7F"; + TestUrl(raw, hive_encoded, true); + string url_encoded = "%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D" + "%0E%0F%10%11%12%13%14%15%16%17%18%19" + "%1A%1B%1C%1D%1E%1F+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F" + "0123456789%3A%3B%3C%3D%3E%3F%40ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D%7E%7F"; + TestUrl(raw, url_encoded, false); +} + +// Test a few unicode characters that are not in the ASCII table. +TEST(UrlCodingTest, UnicodeCharacters) { + string raw = "árvíztűrőtükörfúrógép 你们好 აბგ"; + string hive_encoded = "árvíztűrőtükörfúrógép 你们好 აბგ"; + TestUrl(raw, hive_encoded, true); + string url_encoded = "%C3%A1rv%C3%ADzt%C5%B1r%C5%91" + "t%C3%BCk%C3%B6rf%C3%BAr%C3%B3g%C3%A9p" + "+%E4%BD%A0%E4%BB%AC%E5%A5%BD" + "+%E1%83%90%E1%83%91%E1%83%92"; + TestUrl(raw, url_encoded, false); } TEST(Base64Test, Basic) { diff --git a/be/src/util/coding-util.cc b/be/src/util/coding-util.cc index f4578e4d4..3228dc33e 100644 --- a/be/src/util/coding-util.cc +++ b/be/src/util/coding-util.cc @@ -38,9 +38,9 @@ using std::uppercase; namespace impala { -// It is more convenient to maintain the complement of the set of -// characters to escape when not in Hive-compat mode. -static function<bool (char)> ShouldNotEscape = is_any_of("-_.~"); +// It is more convenient to maintain the set of characters that are safe to use +// directly in URLs without escaping +static function<bool (char)> IsUrlSafe = is_any_of(".-*_"); // Hive selectively encodes characters. This is the whitelist of // characters it will encode. @@ -53,18 +53,27 @@ static const std::unordered_set<char> SpecialCharacters = { '\x1F', '\x7F', '"', '#', '%', '\'', '*', '/', ':', '=', '?', '\\', '{', '[', ']', '^'}; +// Encodes the input string as a URL-encoded string based on UTF-8. +// If 'hive_compat' is set to true, the string is encoded in a Hive-compatible way; +// otherwise, a more standard URL encoding is used, similar to the URLEncoder.encode() +// method in Java. static inline void UrlEncode(const char* in, int in_len, string* out, bool hive_compat) { stringstream ss; // "uppercase" and "hex" only affect the insertion of integers, not that of char values. ss << uppercase << hex << setfill('0'); for (char ch : std::string_view(in, in_len)) { - // Escape the character iff a) we are in Hive-compat mode and the - // character is in the Hive whitelist or b) we are not in Hive-compat mode and - // the character is not alphanumeric and it is not one of the characters specifically - // excluded from escaping (see ShouldNotEscape()). + // Escape the character iff + // a) we are in Hive-compat mode and the character is in the Hive whitelist or + // b) we are not in Hive-compat mode and the character is not alphanumeric + // and it is not safe to use in URLs (see IsUrlSafe()). if ((hive_compat && SpecialCharacters.count(ch) > 0) || (!hive_compat && - !isalnum(static_cast<unsigned char>(ch)) && !ShouldNotEscape(ch))) { - ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch)); + !isalnum(static_cast<unsigned char>(ch)) && !IsUrlSafe(ch))) { + // Iff we are not in Hive-compat mode, we encode space as '+'. + if (!hive_compat && ch == ' ') { + ss << '+'; + } else { + ss << '%' << setw(2) << static_cast<uint32_t>(static_cast<unsigned char>(ch)); + } } else { ss << ch; } diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test index 7a871358c..a5a349fe7 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test +++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test @@ -427,11 +427,11 @@ INT,BIGINT,DECIMAL,STRING ---- QUERY show files in multi_col_truncate; ---- RESULTS -row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY' -row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY' -row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY' -row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick fox b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY' -row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the quick impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=11/d_trunc=11111.100000/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=0/b_trunc=220/d_trunc=421.000000/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+brown/i_trunc=5/b_trunc=330/d_trunc=113211.200000/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+fox\+b/i_trunc=5/b_trunc=440/d_trunc=1111154.100000/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=the\+quick\+impal/i_trunc=15/b_trunc=550/d_trunc=9999913.200000/.*.parq','.*','','$ERASURECODE_POLICY' row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/multi_col_truncate/data/s_trunc=__HIVE_DEFAULT_PARTITION__/i_trunc=__HIVE_DEFAULT_PARTITION__/b_trunc=__HIVE_DEFAULT_PARTITION__/d_trunc=__HIVE_DEFAULT_PARTITION__/.*.parq','.*','','$ERASURECODE_POLICY' ---- TYPES STRING, STRING, STRING, STRING @@ -1010,7 +1010,7 @@ STRING,BIGINT,DECIMAL,TIMESTAMP,DATE show files in mixed_and_shuffled; ---- RESULTS row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1971-07-01/da_year=1971/s_trunc=green/b_bucket=1/de_trunc=71.00/.*.parq','.*','','$ERASURECODE_POLICY' -row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink /b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=1999-09-09/da_year=1999/s_trunc=pink\+/b_bucket=1/de_trunc=9.00/.*.parq','.*','','$ERASURECODE_POLICY' row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2020-01-06/da_year=2020/s_trunc=quick/b_bucket=1/de_trunc=3333.00/.*.parq','.*','','$ERASURECODE_POLICY' row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=1/de_trunc=543.00/.*.parq','.*','','$ERASURECODE_POLICY' row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/mixed_and_shuffled/data/t_day=2021-01-01/da_year=2021/s_trunc=quick/b_bucket=2/de_trunc=123.00/.*.parq','.*','','$ERASURECODE_POLICY' @@ -1088,3 +1088,59 @@ where da is not null; ---- TYPES STRING,BIGINT,DECIMAL,TIMESTAMP,DATE ==== +---- QUERY +# Test truncate partition transform with Unicode strings +create table unicode_truncate (s string) +partitioned by spec (truncate(5, s)) +stored as iceberg; +==== +---- QUERY +insert into unicode_truncate values +('impala'), +('árvíztűrőtükörfúrógép'), +('árvíztűrő'), +('űűű'), +('你们好hello'); +select * from unicode_truncate; +---- RESULTS: RAW_STRING +'impala' +'árvíztűrőtükörfúrógép' +'árvíztűrő' +'űűű' +'你们好hello' +---- TYPES +STRING +==== +---- QUERY +show files in unicode_truncate; +---- RESULTS +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=impal/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C3%A1rv%C3%ADz/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%C5%B1%C5%B1%C5%B1/.*.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/unicode_truncate/data/s_trunc=%E4%BD%A0%E4%BB%AC%E5%A5%BDhe/.*.parq','.*','','$ERASURECODE_POLICY' +---- TYPES +STRING, STRING, STRING, STRING +==== +---- QUERY +select * from unicode_truncate +where s like "árvíz%"; +---- RESULTS: RAW_STRING +'árvíztűrőtükörfúrógép' +'árvíztűrő' +---- TYPES +STRING +---- RUNTIME_PROFILE +aggregation(SUM, RowsRead): 2 +aggregation(SUM, NumRowGroups): 4 +==== +---- QUERY +select * from unicode_truncate +where s = "űűű"; +---- RESULTS: RAW_STRING +'űűű' +---- TYPES +STRING +---- RUNTIME_PROFILE +aggregation(SUM, RowsRead): 1 +aggregation(SUM, NumRowGroups): 1 +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test index 9d9f304da..6c750c3bb 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test +++ b/testdata/workloads/functional-query/queries/QueryTest/unicode-column-name.test @@ -311,6 +311,25 @@ select * from unicode_partition_values; INT, STRING ==== ---- QUERY +show files in unicode_partition_values; +---- RESULTS: RAW_STRING +row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运/.*\.parq','.*','p=运','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D/.*\.parq','.*','p=运营业务数据1234567890!@%23\$%25%5E&%2A\(\)%7B}%5B%5D','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values/p=运营业务数据/.*\.parq','.*','p=运营业务数据','$ERASURECODE_POLICY' +---- TYPES +STRING, STRING, STRING, STRING +==== +---- QUERY +select * from unicode_partition_values +where p like '运%'; +---- RESULTS: RAW_STRING +0,'运' +0,'运营业务数据' +0,'运营业务数据1234567890!@#$%^&*(){}[]' +---- TYPES +INT, STRING +==== +---- QUERY create table unicode_partition_values_iceberg (id int, p string) partitioned by spec (identity(p)) stored by iceberg; ---- RESULTS 'Table has been created.' @@ -326,4 +345,26 @@ select * from unicode_partition_values_iceberg; 0,'运营业务数据1234567890!@#$%^&*(){}[]' ---- TYPES INT, STRING +==== +---- QUERY +show files in unicode_partition_values_iceberg; +---- RESULTS +row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90/.*\.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE1234567890%21%40%23%24%25%5E%26\*%28%29%7B%7D%5B%5D/.*\.parq','.*','','$ERASURECODE_POLICY' +row_regex:'$NAMENODE/test-warehouse/$DATABASE\.db/unicode_partition_values_iceberg/data/p=%E8%BF%90%E8%90%A5%E4%B8%9A%E5%8A%A1%E6%95%B0%E6%8D%AE/.*\.parq','.*','','$ERASURECODE_POLICY' +---- TYPES +STRING, STRING, STRING, STRING +==== +---- QUERY +insert into unicode_partition_values_iceberg values (2, '运营业务数据'); +select * from unicode_partition_values_iceberg +where p = '运营业务数据'; +---- RESULTS: RAW_STRING +0,'运营业务数据' +2,'运营业务数据' +---- TYPES +INT, STRING +---- RUNTIME_PROFILE +aggregation(SUM, RowsRead): 2 +aggregation(SUM, NumRowGroups): 2 ==== \ No newline at end of file diff --git a/tests/query_test/test_insert.py b/tests/query_test/test_insert.py index f5b0af57b..5c71e7da0 100644 --- a/tests/query_test/test_insert.py +++ b/tests/query_test/test_insert.py @@ -333,7 +333,7 @@ class TestInsertPartKey(ImpalaTestSuite): "\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B" \ "\x1C\x1D\x1E\x1F\"\x7F'%*/:=?\\{[]#^" part_dir = "p=SpecialCharacters%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%" \ - "10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25%2A" \ + "10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F%22%7F%27%25*" \ "%2F%3A%3D%3F%5C%7B%5B%5D%23%5E" show_part_value = "SpecialCharacters\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006" \ "\\u0007\\b\\t\\n\\u000B\\f\\r\\u000E\\u000F\\u0010" \
