This is an automated email from the ASF dual-hosted git repository. wzhou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit fb45c786e9527d00844dfe986bf624ec5181cb31 Author: stiga-huang <[email protected]> AuthorDate: Sat Jan 25 10:26:19 2025 +0800 IMPALA-13691: Partition values from HMS events don't need URL decoding Hive uses URL encoding to format the partition strings when creating the partition folders, e.g. "00:00:00" will be encoded into "00%3A00%3A00". When you create a partition of string type partition column "p" and using "00:00:00" as the partition value, the underlying partition folder is "p=00%3A00%3A00". When parsing the partition folders, Impala will URL-decode the partition folder names to get the correct partition values. This is correct in ALTER TABLE RECOVER PARTITIONS command that gets the partition strings from the file paths. However, for partition strings come from HMS events, Impala shouldn't URL-decode them since they are not URL encoded and are the original partition values. This causes HMS events on partitions that have percent signs in the value strings being matched to wrong partitions. This patch fixes the issue by only URL-decoding the partition strings that come from file paths. Tests: - Ran tests/metadata/test_recover_partitions.py - Added custom-cluster test. Change-Id: I7ba7fbbed47d39b02fa0b1b86d27dcda5468e344 Reviewed-on: http://gerrit.cloudera.org:8080/22388 Reviewed-by: Wenzhe Zhou <[email protected]> Reviewed-by: Csaba Ringhofer <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../java/org/apache/impala/catalog/HdfsTable.java | 16 +++++----- tests/custom_cluster/test_events_custom_configs.py | 35 ++++++++++++++++++++++ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java index 59b12ce75..32c58a590 100644 --- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java +++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java @@ -58,10 +58,8 @@ import org.apache.impala.analysis.NullLiteral; import org.apache.impala.analysis.NumericLiteral; import org.apache.impala.analysis.PartitionKeyValue; import org.apache.impala.catalog.events.MetastoreEventsProcessor; -import org.apache.impala.catalog.CatalogObject.ThriftObjectType; import org.apache.impala.catalog.HdfsPartition.FileBlock; import org.apache.impala.catalog.HdfsPartition.FileDescriptor; -import org.apache.impala.catalog.HdfsTable.FileMetadataStats; import org.apache.impala.catalog.iceberg.GroupedContentFiles; import org.apache.impala.common.FileSystemUtil; import org.apache.impala.common.ImpalaException; @@ -2646,7 +2644,7 @@ public class HdfsTable extends Table implements FeFsTable { FileStatus status = statuses.next(); if (!status.isDirectory()) continue; Pair<String, LiteralExpr> keyValues = - getTypeCompatibleValue(status.getPath(), partitionKeys.get(depth)); + getPartValueFromPath(status.getPath(), partitionKeys.get(depth)); if (keyValues == null) continue; List<String> currentPartitionValues = Lists.newArrayList(partitionValues); @@ -2692,7 +2690,7 @@ public class HdfsTable extends Table implements FeFsTable { * original value, the second element is the LiteralExpr created from the original * value. */ - private Pair<String, LiteralExpr> getTypeCompatibleValue( + private Pair<String, LiteralExpr> getPartValueFromPath( Path path, String partitionKey) { String partName[] = path.getName().split("="); if (partName.length != 2 || !partName[0].equals(partitionKey)) return null; @@ -2701,22 +2699,22 @@ public class HdfsTable extends Table implements FeFsTable { Column column = getColumn(partName[0]); Preconditions.checkNotNull(column); Type type = column.getType(); - return getPartitionExprFromValue(partName[1], type); + // URL decode the partition value since it may contain encoded URL. + String partValue = FileUtils.unescapePathName(partName[1]); + return getPartitionExprFromValue(partValue, type); } /** * Converts a given partition value to a {@link LiteralExpr} based on the type of the * partition column. - * @param partValue Value of the partition column + * @param value Value of the partition column * @param type Type of the partition column * @return Pair which contains the partition value and its equivalent * {@link LiteralExpr} according to the type provided. */ private Pair<String, LiteralExpr> getPartitionExprFromValue( - String partValue, Type type) { + String value, Type type) { LiteralExpr expr; - // URL decode the partition value since it may contain encoded URL. - String value = FileUtils.unescapePathName(partValue); if (!value.equals(getNullPartitionKeyValue())) { try { expr = LiteralExpr.createFromUnescapedStr(value, type); diff --git a/tests/custom_cluster/test_events_custom_configs.py b/tests/custom_cluster/test_events_custom_configs.py index 3b2d60c21..31a6b7b33 100644 --- a/tests/custom_cluster/test_events_custom_configs.py +++ b/tests/custom_cluster/test_events_custom_configs.py @@ -1382,6 +1382,41 @@ class TestEventProcessingCustomConfigs(TestEventProcessingCustomConfigsBase): # Make sure the table doesn't change after this test self.execute_query("alter table {} drop if exists partition(j=-1)".format(tbl)) + @CustomClusterTestSuite.with_args( + catalogd_args="--invalidate_metadata_on_event_processing_failure=false") + def test_encoded_partition_values(self, unique_database): + # Create a table with URL encoded partition values + tbl = unique_database + ".tbl" + self.execute_query("create table {}(i int) partitioned by(s string)".format(tbl)) + part_values = [ + "00:00:00", + "00%3A00%3A00", + "00%253A00%253A00", + ] + for p in part_values: + self.execute_query("alter table {} add partition (s='{}')".format(tbl, p)) + + # Generate INSERT event on a URL encoded partition which could be incorrectly decoded + # to another partition, e.g. '00%3A00%3A00' -> '00:00:00' + self.run_stmt_in_hive( + "insert into table {} partition(s='00%3A00%3A00') values (0)".format(tbl)) + EventProcessorUtils.wait_for_event_processing(self) + res = self.execute_query("select * from " + tbl) + assert len(res.data) == 1 + assert res.data[0] == '0\t00%3A00%3A00' + + # Test on processing batch of such partition events + self.run_stmt_in_hive( + "set hive.exec.dynamic.partition.mode=nonstrict; " + "insert into table {} values (1, '00%3A00%3A00'), (2, '00%253A00%253A00')" + .format(tbl)) + EventProcessorUtils.wait_for_event_processing(self) + res = self.execute_query("select * from " + tbl) + assert len(res.data) == 3 + assert '0\t00%3A00%3A00' in res.data + assert '1\t00%3A00%3A00' in res.data + assert '2\t00%253A00%253A00' in res.data + @SkipIfFS.hive class TestEventProcessingWithImpala(TestEventProcessingCustomConfigsBase):
