This is an automated email from the ASF dual-hosted git repository.

wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit fb45c786e9527d00844dfe986bf624ec5181cb31
Author: stiga-huang <[email protected]>
AuthorDate: Sat Jan 25 10:26:19 2025 +0800

    IMPALA-13691: Partition values from HMS events don't need URL decoding
    
    Hive uses URL encoding to format the partition strings when creating the
    partition folders, e.g. "00:00:00" will be encoded into "00%3A00%3A00".
    When you create a partition of string type partition column "p" and
    using "00:00:00" as the partition value, the underlying partition folder
    is "p=00%3A00%3A00".
    
    When parsing the partition folders, Impala will URL-decode the partition
    folder names to get the correct partition values. This is correct in
    ALTER TABLE RECOVER PARTITIONS command that gets the partition strings
    from the file paths. However, for partition strings come from HMS
    events, Impala shouldn't URL-decode them since they are not URL encoded
    and are the original partition values. This causes HMS events on
    partitions that have percent signs in the value strings being matched to
    wrong partitions.
    
    This patch fixes the issue by only URL-decoding the partition strings
    that come from file paths.
    
    Tests:
     - Ran tests/metadata/test_recover_partitions.py
     - Added custom-cluster test.
    
    Change-Id: I7ba7fbbed47d39b02fa0b1b86d27dcda5468e344
    Reviewed-on: http://gerrit.cloudera.org:8080/22388
    Reviewed-by: Wenzhe Zhou <[email protected]>
    Reviewed-by: Csaba Ringhofer <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../java/org/apache/impala/catalog/HdfsTable.java  | 16 +++++-----
 tests/custom_cluster/test_events_custom_configs.py | 35 ++++++++++++++++++++++
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java 
b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 59b12ce75..32c58a590 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -58,10 +58,8 @@ import org.apache.impala.analysis.NullLiteral;
 import org.apache.impala.analysis.NumericLiteral;
 import org.apache.impala.analysis.PartitionKeyValue;
 import org.apache.impala.catalog.events.MetastoreEventsProcessor;
-import org.apache.impala.catalog.CatalogObject.ThriftObjectType;
 import org.apache.impala.catalog.HdfsPartition.FileBlock;
 import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
-import org.apache.impala.catalog.HdfsTable.FileMetadataStats;
 import org.apache.impala.catalog.iceberg.GroupedContentFiles;
 import org.apache.impala.common.FileSystemUtil;
 import org.apache.impala.common.ImpalaException;
@@ -2646,7 +2644,7 @@ public class HdfsTable extends Table implements FeFsTable 
{
       FileStatus status = statuses.next();
       if (!status.isDirectory()) continue;
       Pair<String, LiteralExpr> keyValues =
-          getTypeCompatibleValue(status.getPath(), partitionKeys.get(depth));
+          getPartValueFromPath(status.getPath(), partitionKeys.get(depth));
       if (keyValues == null) continue;
 
       List<String> currentPartitionValues = 
Lists.newArrayList(partitionValues);
@@ -2692,7 +2690,7 @@ public class HdfsTable extends Table implements FeFsTable 
{
    * original value, the second element is the LiteralExpr created from the 
original
    * value.
    */
-  private Pair<String, LiteralExpr> getTypeCompatibleValue(
+  private Pair<String, LiteralExpr> getPartValueFromPath(
       Path path, String partitionKey) {
     String partName[] = path.getName().split("=");
     if (partName.length != 2 || !partName[0].equals(partitionKey)) return null;
@@ -2701,22 +2699,22 @@ public class HdfsTable extends Table implements 
FeFsTable {
     Column column = getColumn(partName[0]);
     Preconditions.checkNotNull(column);
     Type type = column.getType();
-    return getPartitionExprFromValue(partName[1], type);
+    // URL decode the partition value since it may contain encoded URL.
+    String partValue = FileUtils.unescapePathName(partName[1]);
+    return getPartitionExprFromValue(partValue, type);
   }
 
   /**
    * Converts a given partition value to a {@link LiteralExpr} based on the 
type of the
    * partition column.
-   * @param partValue Value of the partition column
+   * @param value Value of the partition column
    * @param type Type of the partition column
    * @return Pair which contains the partition value and its equivalent
    * {@link LiteralExpr} according to the type provided.
    */
   private Pair<String, LiteralExpr> getPartitionExprFromValue(
-      String partValue, Type type) {
+      String value, Type type) {
     LiteralExpr expr;
-    // URL decode the partition value since it may contain encoded URL.
-    String value = FileUtils.unescapePathName(partValue);
     if (!value.equals(getNullPartitionKeyValue())) {
       try {
         expr = LiteralExpr.createFromUnescapedStr(value, type);
diff --git a/tests/custom_cluster/test_events_custom_configs.py 
b/tests/custom_cluster/test_events_custom_configs.py
index 3b2d60c21..31a6b7b33 100644
--- a/tests/custom_cluster/test_events_custom_configs.py
+++ b/tests/custom_cluster/test_events_custom_configs.py
@@ -1382,6 +1382,41 @@ class 
TestEventProcessingCustomConfigs(TestEventProcessingCustomConfigsBase):
       # Make sure the table doesn't change after this test
       self.execute_query("alter table {} drop if exists 
partition(j=-1)".format(tbl))
 
+  @CustomClusterTestSuite.with_args(
+    catalogd_args="--invalidate_metadata_on_event_processing_failure=false")
+  def test_encoded_partition_values(self, unique_database):
+    # Create a table with URL encoded partition values
+    tbl = unique_database + ".tbl"
+    self.execute_query("create table {}(i int) partitioned by(s 
string)".format(tbl))
+    part_values = [
+      "00:00:00",
+      "00%3A00%3A00",
+      "00%253A00%253A00",
+    ]
+    for p in part_values:
+      self.execute_query("alter table {} add partition (s='{}')".format(tbl, 
p))
+
+    # Generate INSERT event on a URL encoded partition which could be 
incorrectly decoded
+    # to another partition, e.g. '00%3A00%3A00' -> '00:00:00'
+    self.run_stmt_in_hive(
+        "insert into table {} partition(s='00%3A00%3A00') values 
(0)".format(tbl))
+    EventProcessorUtils.wait_for_event_processing(self)
+    res = self.execute_query("select * from " + tbl)
+    assert len(res.data) == 1
+    assert res.data[0] == '0\t00%3A00%3A00'
+
+    # Test on processing batch of such partition events
+    self.run_stmt_in_hive(
+        "set hive.exec.dynamic.partition.mode=nonstrict; "
+        "insert into table {} values (1, '00%3A00%3A00'), (2, 
'00%253A00%253A00')"
+        .format(tbl))
+    EventProcessorUtils.wait_for_event_processing(self)
+    res = self.execute_query("select * from " + tbl)
+    assert len(res.data) == 3
+    assert '0\t00%3A00%3A00' in res.data
+    assert '1\t00%3A00%3A00' in res.data
+    assert '2\t00%253A00%253A00' in res.data
+
 
 @SkipIfFS.hive
 class TestEventProcessingWithImpala(TestEventProcessingCustomConfigsBase):

Reply via email to