(impala) branch master updated: IMPALA-14737 Part1: Push down LIKE predicates to Iceberg

michaelsmith Tue, 24 Feb 2026 14:32:08 -0800

This is an automated email from the ASF dual-hosted git repository.

michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new 540a3784e IMPALA-14737 Part1: Push down LIKE predicates to Iceberg
540a3784e is described below

commit 540a3784e0e3cad1a962c368ddbcb14b05de6832
Author: Arnab Karmakar <[email protected]>
AuthorDate: Wed Feb 18 02:46:00 2026 -0800

    IMPALA-14737 Part1: Push down LIKE predicates to Iceberg
    
    This patch adds support for pushing down LIKE predicates to Iceberg using
    startsWith() and equal() expressions. When a LIKE predicate has a pattern
    starting with non-wildcard characters, Impala analyzes the pattern and
    pushes it down to Iceberg for efficient file-level filtering.
    
    Supported patterns:
    - 'abc%' -> pushes down startsWith('abc')
    - 'pre_fix%' -> pushes down startsWith('pre') (underscore is wildcard)
    - 'a_b%' -> pushes down startsWith('a')
    - 'exact' -> pushes down equal('exact') (no wildcards)
    - 'asd\%' -> pushes down equal('asd%') (escaped wildcard treated as literal)
    
    Unsupported patterns (not pushed down):
    - '%suffix' - starts with wildcard
    - '_prefix%' - starts with wildcard
    - 'prefix%suffix' - has literal content after wildcard
    
    Benefits:
    - File-level filtering using Iceberg metadata
    - Partition pruning when LIKE is on partition columns
    - Works with UTF-8 strings
    
    Testing:
    - Added iceberg-like-pushdown.test with comprehensive test coverage
    - Tests include prefix patterns, underscore wildcards, exact matches,
      partition pruning comparison, UTF-8 strings and cases where the pattern
      cannot be pushed down
    
    Change-Id: I548834126540bcc8d22efc872c2571293b8b7ec4
    Reviewed-on: http://gerrit.cloudera.org:8080/24001
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../org/apache/impala/analysis/LikePredicate.java  |   2 +-
 .../impala/common/IcebergPredicateConverter.java   | 168 ++++++++++
 .../common/IcebergPredicateConverterTest.java      | 197 ++++++++++++
 .../queries/PlannerTest/iceberg-predicates.test    |  25 +-
 .../queries/PlannerTest/iceberg-v2-update.test     |  42 +--
 .../queries/QueryTest/iceberg-like-pushdown.test   | 356 +++++++++++++++++++++
 .../iceberg-partition-transform-insert.test        |   2 +-
 tests/query_test/test_iceberg.py                   |   3 +
 8 files changed, 756 insertions(+), 39 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java 
b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
index e17917bff..84e560fe1 100644
--- a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
+++ b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
@@ -32,7 +32,7 @@ import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 
 public class LikePredicate extends Predicate {
-  enum Operator {
+  public enum Operator {
     LIKE("LIKE"),
     ILIKE("ILIKE"),
     RLIKE("RLIKE"),
diff --git 
a/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java 
b/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
index c42f96f4d..946fcf41e 100644
--- a/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
+++ b/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
@@ -38,6 +38,7 @@ import org.apache.impala.analysis.DateLiteral;
 import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.InPredicate;
 import org.apache.impala.analysis.IsNullPredicate;
+import org.apache.impala.analysis.LikePredicate;
 import org.apache.impala.analysis.LiteralExpr;
 import org.apache.impala.analysis.NumericLiteral;
 import org.apache.impala.analysis.SlotDescriptor;
@@ -75,6 +76,8 @@ public class IcebergPredicateConverter {
       return convert((IsNullPredicate) expr);
     } else if (expr instanceof CompoundPredicate) {
       return convert((CompoundPredicate) expr);
+    } else if (expr instanceof LikePredicate) {
+      return convert((LikePredicate) expr);
     } else {
       throw new ImpalaRuntimeException(String.format(
           "Unsupported expression: %s", expr.toSql()));
@@ -150,6 +153,171 @@ public class IcebergPredicateConverter {
         Expressions.or(left, right);
   }
 
+  /**
+   * Checks if a wildcard character at the given position is escaped by 
counting
+   * preceding backslashes. An odd number of backslashes means the wildcard is 
escaped.
+   */
+  static boolean isWildcardEscaped(String pattern, int wildcardPos) {
+    int backslashCount = 0;
+    int j = wildcardPos - 1;
+    while (j >= 0 && pattern.charAt(j) == '\\') {
+      backslashCount++;
+      j--;
+    }
+    return backslashCount % 2 == 1;
+  }
+
+  /**
+   * Checks if there's any literal (non-wildcard) content after the given 
position.
+   * This determines if a pattern like 'd%d' has content after the first 
wildcard.
+   * Only unescaped % and _ are considered non-literal.
+   *
+   * @param pattern The pattern string from StringLiteral.getUnescapedValue()
+   * @param afterPos Position to check after (exclusive)
+   * @return True if there's any literal content after afterPos
+   */
+  static boolean hasLiteralContentAfterWildcard(String pattern, int afterPos) {
+    for (int i = afterPos + 1; i < pattern.length(); i++) {
+      char c = pattern.charAt(i);
+
+      // Check if this is a wildcard
+      if (c == '%' || c == '_') {
+        // If escaped, it's literal content
+        if (isWildcardEscaped(pattern, i)) {
+          return true;
+        }
+        // Unescaped wildcard, continue checking
+        continue;
+      }
+
+      // Any non-wildcard character is literal content
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Finds the position of the first unescaped wildcard (% or _) in the 
pattern.
+   * Wildcards can be escaped with '\', e.g., 'asd\%' has no unescaped 
wildcard.
+   *
+   * @return Position of first unescaped wildcard, or -1 if none found
+   */
+  static int findFirstUnescapedWildcard(String pattern) {
+    for (int i = 0; i < pattern.length(); i++) {
+      char c = pattern.charAt(i);
+
+      // Check if this character is a wildcard
+      if (c == '%' || c == '_') {
+        // Return position if not escaped
+        if (!isWildcardEscaped(pattern, i)) {
+          return i;
+        }
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Removes backslash escapes from LIKE wildcards in the pattern.
+   *
+   * Background: When Impala parses SQL LIKE 'test\%value',
+   * StringLiteral.getUnescapedValue() preserves LIKE-specific escape sequences
+   * and returns the Java string "test\\%value" (backslash followed by %). This
+   * method converts it to "test%value" (literal %) for Iceberg, which doesn't
+   * use backslash escaping.
+   *
+   * Example:
+   *   Input: "test\\%value" (from SQL: LIKE 'test\%value')
+   *   Output: "test%value" (literal % for Iceberg equal())
+   *
+   * @param pattern The pattern string from StringLiteral.getUnescapedValue()
+   * @param endPos Position to stop processing, or -1 to process entire string
+   * @return Pattern with LIKE escape sequences removed
+   */
+  static String unescapeLikePattern(String pattern, int endPos) {
+    int len = (endPos == -1) ? pattern.length() : endPos;
+    StringBuilder sb = new StringBuilder(len);
+
+    for (int i = 0; i < len; i++) {
+      char c = pattern.charAt(i);
+
+      if (c == '\\' && i + 1 < pattern.length()) {
+        char next = pattern.charAt(i + 1);
+        // Unescape LIKE wildcards: \% -> %, \_ -> _
+        // Also handle escaped backslash: \\ -> \
+        if (next == '%' || next == '_' || next == '\\') {
+          sb.append(next);
+          i++; // Skip the next character as we've already processed it
+          continue;
+        }
+      }
+
+      sb.append(c);
+    }
+
+    return sb.toString();
+  }
+
+  protected Expression convert(LikePredicate predicate)
+      throws ImpalaRuntimeException {
+    // Only LIKE operator is supported, not RLIKE, REGEXP, etc.
+    if (predicate.getOp() != LikePredicate.Operator.LIKE) {
+      throw new ImpalaRuntimeException(String.format(
+          "Only LIKE operator is supported for Iceberg pushdown, got: %s",
+          predicate.getOp()));
+    }
+
+    Term term = getTerm(predicate.getChild(0));
+    IcebergColumn column = term.referencedColumn_;
+
+    // Check if the column is a string type
+    if (!column.getType().isStringType()) {
+      throw new ImpalaRuntimeException(String.format(
+          "LIKE predicate pushdown only supports string columns, got: %s",
+          column.getType()));
+    }
+
+    LiteralExpr literal = getSecondChildAsLiteralExpr(predicate);
+    checkNullLiteral(literal);
+
+    if (!(literal instanceof StringLiteral)) {
+      throw new ImpalaRuntimeException(String.format(
+          "LIKE pattern must be a string literal, got: %s", literal.toSql()));
+    }
+
+    String pattern = ((StringLiteral) literal).getUnescapedValue();
+    if (pattern == null || pattern.isEmpty()) {
+      throw new ImpalaRuntimeException("LIKE pattern cannot be null or empty");
+    }
+
+    // Find first unescaped wildcard position
+    int firstWildcard = findFirstUnescapedWildcard(pattern);
+
+    // Case 1: Pattern starts with wildcard - cannot push down
+    if (firstWildcard == 0) {
+      throw new ImpalaRuntimeException(String.format(
+          "LIKE pattern '%s' cannot be pushed down to Iceberg. Patterns must 
start "
+          + "with at least one literal character.", pattern));
+    }
+
+    // Case 2: No wildcards - exact match
+    if (firstWildcard == -1) {
+      String unescapedPattern = unescapeLikePattern(pattern, -1);
+      return Expressions.equal(column.getName(), unescapedPattern);
+    }
+
+    // Case 3: Wildcard in middle with literal content after - cannot push down
+    if (hasLiteralContentAfterWildcard(pattern, firstWildcard)) {
+      throw new ImpalaRuntimeException(String.format(
+          "LIKE pattern '%s' cannot be pushed down to Iceberg. Only prefix 
patterns "
+          + "(e.g., 'prefix%%') are supported.", pattern));
+    }
+
+    // Case 4: Pure prefix pattern (wildcard only at end) - use startsWith
+    String unescapedPattern = unescapeLikePattern(pattern, firstWildcard);
+    return Expressions.startsWith(column.getName(), unescapedPattern);
+  }
+
   protected void checkNullLiteral(LiteralExpr literal) throws 
ImpalaRuntimeException {
     if (Expr.IS_NULL_LITERAL.apply(literal)) {
       throw new ImpalaRuntimeException("Expression can't be NULL literal: " + 
literal);
diff --git 
a/fe/src/test/java/org/apache/impala/common/IcebergPredicateConverterTest.java 
b/fe/src/test/java/org/apache/impala/common/IcebergPredicateConverterTest.java
new file mode 100644
index 000000000..41f8e942f
--- /dev/null
+++ 
b/fe/src/test/java/org/apache/impala/common/IcebergPredicateConverterTest.java
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.common;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for IcebergPredicateConverter helper methods.
+ */
+public class IcebergPredicateConverterTest {
+
+  @Test
+  public void testIsWildcardEscaped() {
+    // Test case 1: No backslashes - not escaped
+    assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("test%", 
4));
+    assertEquals(false, 
IcebergPredicateConverter.isWildcardEscaped("test_value", 4));
+    assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("%test", 
0));
+
+    // Test case 2: Single backslash - escaped (odd count)
+    assertEquals(true, IcebergPredicateConverter.isWildcardEscaped("test\\%", 
5));
+    assertEquals(true, 
IcebergPredicateConverter.isWildcardEscaped("test\\_value", 5));
+
+    // Test case 3: Double backslash - not escaped (even count)
+    assertEquals(false, 
IcebergPredicateConverter.isWildcardEscaped("test\\\\%", 6));
+    assertEquals(false, 
IcebergPredicateConverter.isWildcardEscaped("test\\\\_value", 6));
+
+    // Test case 4: Triple backslash - escaped (odd count)
+    assertEquals(true, 
IcebergPredicateConverter.isWildcardEscaped("test\\\\\\%", 7));
+
+    // Test case 5: Multiple wildcards with mixed escaping
+    // Pattern: "test\%_" - first wildcard escaped, second not
+    assertEquals(true, IcebergPredicateConverter.isWildcardEscaped("test\\%_", 
5));
+    assertEquals(false, 
IcebergPredicateConverter.isWildcardEscaped("test\\%_", 6));
+
+    // Test case 6: Edge case - wildcard at position 0
+    assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("%test", 
0));
+    assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("_test", 
0));
+  }
+
+  @Test
+  public void testHasLiteralContentAfterWildcard() {
+    // Test case 1: No content after first wildcard (only unescaped wildcards 
or nothing)
+    assertEquals(false,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%", 4));
+    assertEquals(false,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%%", 4));
+    assertEquals(false,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%_%", 
4));
+    assertEquals(false,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("test_", 4));
+    assertEquals(false,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("prefix%%%", 
6));
+
+    // Test case 2: Literal content after first wildcard
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("d%d", 1));
+    assertEquals(true,
+        
IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%suffix", 4));
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("a%b%c", 1));
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("pre_%fix", 
3));
+
+    // Test case 3: Escaped wildcards after first wildcard are literal content
+    // "test%\\%" has escaped % after the unescaped %, which is literal content
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%\\%", 
4));
+    // "test%\\_" has escaped _ after the first %, which is literal content
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%\\_", 
4));
+
+    // Test case 4: Mix of wildcards and escaped content
+    // "pre%_%\\%" has unescaped _, then escaped %, so has literal content
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("pre%_%\\%", 
3));
+    // Obviously has literal 'x' at the end
+    assertEquals(true,
+        IcebergPredicateConverter.hasLiteralContentAfterWildcard("pre%_%\\%x", 
3));
+  }
+
+  @Test
+  public void testFindFirstUnescapedWildcard() {
+    // Test case 1: No wildcards
+    assertEquals(-1, 
IcebergPredicateConverter.findFirstUnescapedWildcard("exact"));
+    assertEquals(-1, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test"));
+
+    // Test case 2: Unescaped wildcards
+    assertEquals(0, 
IcebergPredicateConverter.findFirstUnescapedWildcard("%test"));
+    assertEquals(0, 
IcebergPredicateConverter.findFirstUnescapedWildcard("_test"));
+    assertEquals(4, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test%"));
+    assertEquals(4, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test_suffix"));
+    assertEquals(3, 
IcebergPredicateConverter.findFirstUnescapedWildcard("abc%def"));
+
+    // Test case 3: Escaped wildcards (what getUnescapedValue() returns for 
SQL \%)
+    // SQL 'test\%' → getUnescapedValue() returns "test\\%"
+    assertEquals(-1, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\%"));
+    assertEquals(-1,
+        IcebergPredicateConverter.findFirstUnescapedWildcard("test\\_value"));
+    assertEquals(-1, 
IcebergPredicateConverter.findFirstUnescapedWildcard("abc\\%def"));
+
+    // Test case 4: Mixed escaped and unescaped wildcards
+    // "wild\\%card%" in memory: w,i,l,d,\,%,c,a,r,d,%
+    // First % at 5 is escaped, second % at 10 is unescaped
+    assertEquals(10,
+        IcebergPredicateConverter.findFirstUnescapedWildcard("wild\\%card%"));
+    assertEquals(10,
+        
IcebergPredicateConverter.findFirstUnescapedWildcard("wild\\_card_mix"));
+    // "test\\%%more" in memory: t,e,s,t,\,%,%,m,o,r,e
+    // First % at 5 is escaped, second % at 6 is unescaped
+    assertEquals(6, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\%%more"));
+
+    // Test case 5: Double backslash (escaped backslash + unescaped wildcard)
+    // "test\\\\%" in Java source = "test\\%" in memory
+    // % at index 6 has even (2) backslashes, so it's unescaped
+    assertEquals(6, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\\\%"));
+    assertEquals(6, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\\\_"));
+
+    // Test case 6: Triple backslash (escaped backslash + escaped wildcard)
+    // "test\\\\\\%" in Java source = "test\\\\%" in memory
+    // % at index 7 has odd (3) backslashes, so it's escaped
+    assertEquals(-1, 
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\\\\\%"));
+  }
+
+  @Test
+  public void testUnescapeLikePattern() {
+    // Test case 1: No escapes
+    assertEquals("exact", 
IcebergPredicateConverter.unescapeLikePattern("exact", -1));
+    assertEquals("prefix", 
IcebergPredicateConverter.unescapeLikePattern("prefix", 6));
+
+    // Test case 2: Escaped % (what getUnescapedValue() returns for SQL 
'test\%value')
+    // SQL: 'test\%value' → getUnescapedValue() returns "test\\%value" 
(backslash + %)
+    // After unescaping → "test%value" (literal %)
+    assertEquals("test%value",
+        IcebergPredicateConverter.unescapeLikePattern("test\\%value", -1));
+    assertEquals("abc%def",
+        IcebergPredicateConverter.unescapeLikePattern("abc\\%def", -1));
+
+    // Test case 3: Escaped _
+    assertEquals("test_value",
+        IcebergPredicateConverter.unescapeLikePattern("test\\_value", -1));
+    assertEquals("abc_def",
+        IcebergPredicateConverter.unescapeLikePattern("abc\\_def", -1));
+
+    // Test case 4: Mixed escaped wildcards
+    assertEquals("wild%card_mix",
+        IcebergPredicateConverter.unescapeLikePattern("wild\\%card\\_mix", 
-1));
+
+    // Test case 5: Prefix extraction (endPos specified)
+    // "wild\\%card%suffix" in memory: w,i,l,d,\,%,c,a,r,d,%,s,u,f,f,i,x
+    // Unescaped % is at index 10, extract up to (not including) that position
+    assertEquals("wild%card",
+        IcebergPredicateConverter.unescapeLikePattern("wild\\%card%suffix", 
10));
+    // "test\\%%more" in memory: t,e,s,t,\,%,%,m,o,r,e
+    // Unescaped % is at index 6, extract up to that position
+    assertEquals("test%",
+        IcebergPredicateConverter.unescapeLikePattern("test\\%%more", 6));
+
+    // Test case 6: Escaped backslash (\\)
+    // SQL: 'test\\value' → getUnescapedValue() returns "test\\\\value" (2 
backslashes)
+    // After unescaping \\ → \ (single backslash)
+    assertEquals("test\\value",
+        IcebergPredicateConverter.unescapeLikePattern("test\\\\value", -1));
+    assertEquals("path\\to\\file",
+        IcebergPredicateConverter.unescapeLikePattern("path\\\\to\\\\file", 
-1));
+
+    // Test case 7: Multiple consecutive escapes
+    // "test\\\\\\%" in Java = "test\\\\%" in memory
+    // First pair of backslashes (4-5) unescapes to single \
+    // Third backslash (6) escapes the %, so result is test\ followed by 
literal %
+    assertEquals("test\\%",
+        IcebergPredicateConverter.unescapeLikePattern("test\\\\\\%", -1));
+    assertEquals("a%b_c", 
IcebergPredicateConverter.unescapeLikePattern("a\\%b\\_c", -1));
+
+    // Test case 8: Empty and edge cases
+    assertEquals("", IcebergPredicateConverter.unescapeLikePattern("", -1));
+    assertEquals("\\", IcebergPredicateConverter.unescapeLikePattern("\\\\", 
-1));
+    assertEquals("%", IcebergPredicateConverter.unescapeLikePattern("\\%", 
-1));
+    assertEquals("_", IcebergPredicateConverter.unescapeLikePattern("\\_", 
-1));
+  }
+}
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
index 3946ccada..1180e6690 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
@@ -118,30 +118,41 @@ PLAN-ROOT SINK
    skipped Iceberg predicates: event_time < TIMESTAMP '2022-01-01 00:00:00'
    row-size=44B cardinality=2
 ====
-# List of predicates contains an untranslated expression (action LIKE "d%") 
and a redisual
-# expression after Iceberg's filtering
+# LIKE predicate with prefix pattern is pushed down to Iceberg
 select * from iceberg_partitioned where action like "d%" and event_time < 
"2022-01-01" and id < 10
 ---- PLAN
 PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional_parquet.iceberg_partitioned]
-   HDFS partitions=3/3 files=9 size=10.33KB
-   predicates: id < 10, action LIKE 'd%'
+   HDFS partitions=1/3 files=4 size=4.65KB
+   predicates: id < 10
    Iceberg snapshot id: 8270633197658268308
-   skipped Iceberg predicates: event_time < TIMESTAMP '2022-01-01 00:00:00'
+   skipped Iceberg predicates: action LIKE 'd%', event_time < TIMESTAMP 
'2022-01-01 00:00:00'
    row-size=44B cardinality=1
 ====
-# Compound expression partially evaluated by Iceberg, and cannot be mapped 
back to Impala expression
+# LIKE predicate with prefix pattern is pushed down to Iceberg
 select * from iceberg_partitioned where action like "d%" and (event_time < 
"2020-01-01" or id > 10)
 ---- PLAN
 PLAN-ROOT SINK
 |
 00:SCAN HDFS [functional_parquet.iceberg_partitioned]
-   HDFS partitions=3/3 files=10 size=11.42KB
+   HDFS partitions=1/3 files=2 size=2.32KB
    predicates: (event_time < TIMESTAMP '2020-01-01 00:00:00' OR id > 10), 
action LIKE 'd%'
    Iceberg snapshot id: 8270633197658268308
    row-size=44B cardinality=1
 ====
+# LIKE with wildcard in middle (d%d) cannot be pushed down - stays in 
predicates
+select * from iceberg_partitioned where action like "d%d" and event_time < 
"2022-01-01" and id < 10
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN HDFS [functional_parquet.iceberg_partitioned]
+   HDFS partitions=3/3 files=9 size=10.33KB
+   predicates: id < 10, action LIKE 'd%d'
+   Iceberg snapshot id: 8270633197658268308
+   skipped Iceberg predicates: event_time < TIMESTAMP '2022-01-01 00:00:00'
+   row-size=44B cardinality=1
+====
 # Predicate on a partition introduced by partition evolution pushed down to 
the scan node and
 # predicate on a partition that existed before partition evolution skipped.
 select * from iceberg_partition_evolution where month = 12 and year = 2010
diff --git 
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
 
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
index f7355217f..a1781b937 100644
--- 
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
+++ 
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
@@ -196,49 +196,31 @@ MULTI DATA SINK
 |->WRITE TO HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes, 
OVERWRITE=false, PARTITION-KEYS=(action)]
 |->BUFFERED DELETE FROM ICEBERG 
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE]
 |
-03:SORT
+01:SORT
 |  order by: action ASC NULLS LAST
-|  row-size=76B cardinality=1
-|
-02:DELETE EVENTS ICEBERG DELETE [ICEBERG DELETE JOIN]
-|  row-size=80B cardinality=1
-|
-|--01:SCAN HDFS 
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE-01 
functional_parquet.iceberg_v2_partitioned_position_deletes-position-delete]
-|     HDFS partitions=3/3 files=3 size=9.47KB
-|     Iceberg snapshot id: 8885697082976537578
-|     row-size=204B cardinality=10
+|  row-size=76B cardinality=0
 |
 00:SCAN HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes]
-   HDFS partitions=3/3 files=3 size=3.48KB
-   predicates: id IS DISTINCT FROM length(action), `user` LIKE 'impala'
+   HDFS partitions=0/3 files=0 size=0B
+   predicates: id IS DISTINCT FROM length(action)
    Iceberg snapshot id: 8885697082976537578
-   row-size=80B cardinality=2
+   skipped Iceberg predicates: `user` LIKE 'impala'
+   row-size=80B cardinality=0
 ---- DISTRIBUTEDPLAN
 MULTI DATA SINK
 |->WRITE TO HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes, 
OVERWRITE=false, PARTITION-KEYS=(action)]
 |->BUFFERED DELETE FROM ICEBERG 
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE]
 |
-05:SORT
+01:SORT
 |  order by: action ASC NULLS LAST
-|  row-size=76B cardinality=1
-|
-04:EXCHANGE 
[HASH(functional_parquet.iceberg_v2_partitioned_position_deletes.action)]
-|
-02:DELETE EVENTS ICEBERG DELETE [ICEBERG DELETE JOIN, DIRECTED]
-|  row-size=80B cardinality=1
-|
-|--03:EXCHANGE [DIRECTED]
-|  |
-|  01:SCAN HDFS 
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE-01 
functional_parquet.iceberg_v2_partitioned_position_deletes-position-delete]
-|     HDFS partitions=3/3 files=3 size=9.47KB
-|     Iceberg snapshot id: 8885697082976537578
-|     row-size=204B cardinality=10
+|  row-size=76B cardinality=0
 |
 00:SCAN HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes]
-   HDFS partitions=3/3 files=3 size=3.48KB
-   predicates: id IS DISTINCT FROM length(action), `user` LIKE 'impala'
+   HDFS partitions=0/3 files=0 size=0B
+   predicates: id IS DISTINCT FROM length(action)
    Iceberg snapshot id: 8885697082976537578
-   row-size=80B cardinality=2
+   skipped Iceberg predicates: `user` LIKE 'impala'
+   row-size=80B cardinality=0
 ====
 UPDATE iceberg_v2_partitioned_position_deletes set id = length(action) where 
action = 'click'
 ---- PLAN
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-like-pushdown.test
 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-like-pushdown.test
new file mode 100644
index 000000000..01454ec48
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-like-pushdown.test
@@ -0,0 +1,356 @@
+====
+---- QUERY
+# IMPALA-14737: Test LIKE predicate pushdown to Iceberg
+# This test demonstrates that LIKE predicates with prefix patterns enable 
partition
+# pruning, while patterns starting with wildcards do not.
+#
+# KEY COMPARISON:
+# - LIKE 'prefix%' -> extracts prefix, enables pruning -> partitions=1/3
+# - LIKE '%suffix' -> no prefix, cannot prune -> partitions=3/3
+#
+# The 'action' column is a partition column in iceberg_partitioned table.
+# Positive Cases: LIKE with prefix enables partition pruning
+
+# Test 1: LIKE with prefix 'c%' - ENABLES partition pruning
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE 
action LIKE 'c%';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=1/3.*
+row_regex:.*Iceberg snapshot id.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'c%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 2: LIKE with longer prefix 'cli%' - still enables pruning
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'cli%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 3: LIKE with underscore wildcard 'c____' - extracts 'c' prefix
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'c____';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 4: LIKE with OR - prunes to 2 out of 3 partitions
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'c%' OR action LIKE 'd%';
+---- TYPES
+BIGINT
+---- RESULTS
+12
+---- RUNTIME_PROFILE
+row_regex: partitions=2/.* files=.*
+====
+---- QUERY
+# Test 5: LIKE on different partition value 'v%'
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'v%';
+---- TYPES
+BIGINT
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Negative Cases: Patterns that CANNOT be pushed down
+#
+# The following LIKE patterns cannot be optimized with Iceberg pushdown:
+# 1. Patterns starting with wildcards: '%suffix', '_prefix'
+# 2. Patterns with literal content after wildcards: 'prefix%suffix', 'd%d'
+#    (Note: startsWith('prefix') would incorrectly match 'prefix123' for 
'prefix%suffix')
+
+# Test 1: LIKE starting with '%' wildcard - NO partition pruning
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE 
action LIKE '%lick';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=3/3.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'%lick';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=3/.* files=.*
+====
+---- QUERY
+# Test 2: LIKE starting with '_' wildcard - NO partition pruning
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE 
action LIKE '_lick';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=3/3.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'_lick';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=3/.* files=.*
+====
+---- QUERY
+# Test 3: Pattern with only '%' - scans all partitions
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'%';
+---- TYPES
+BIGINT
+---- RESULTS
+20
+---- RUNTIME_PROFILE
+row_regex: partitions=3/.* files=.*
+====
+---- QUERY
+# Comparison Tests: Demonstrating the benefit
+# Comparison: LIKE 'c%' vs LIKE '%lick' on same data
+# Both return same result (6 rows), but LIKE 'c%' scans fewer partitions
+
+# With prefix (efficient)
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE 
action LIKE 'c%';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=1/3.*
+====
+---- QUERY
+# Without prefix (inefficient)
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE 
action LIKE '%ick';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=3/3.*
+====
+---- QUERY
+# Tests on non-partition columns
+SELECT id, user FROM functional_parquet.iceberg_partitioned WHERE user LIKE 
'L%' ORDER BY id;
+---- TYPES
+INT, STRING
+---- RESULTS
+2,'Lisa'
+5,'Lisa'
+7,'Lisa'
+8,'Lisa'
+14,'Lisa'
+16,'Lisa'
+====
+---- QUERY
+# LIKE with underscore on non-partition column - extracts 'Al' prefix
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE user LIKE 
'Al_%';
+---- TYPES
+BIGINT
+---- RESULTS
+14
+====
+---- QUERY
+# Compound predicate: partition column LIKE + non-partition column LIKE
+# Partition column enables pruning
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'c%' AND user LIKE 'Al%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Edge Cases
+
+# Case sensitivity test
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE user LIKE 
'lisa%';
+---- TYPES
+BIGINT
+---- RESULTS
+0
+====
+---- QUERY
+# Empty result with valid prefix
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'z%';
+---- TYPES
+BIGINT
+---- RESULTS
+0
+====
+---- QUERY
+# Tests on non-partitioned table
+# LIKE works correctly on non-partitioned table
+SELECT count(*) FROM functional_parquet.iceberg_non_partitioned WHERE user 
LIKE 'L%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+====
+---- QUERY
+# LIKE with underscore on non-partitioned table
+SELECT count(*) FROM functional_parquet.iceberg_non_partitioned WHERE user 
LIKE 'Al_%';
+---- TYPES
+BIGINT
+---- RESULTS
+14
+====
+---- QUERY
+# UTF-8 Character Tests
+
+# Test table with UTF-8 strings
+CREATE TABLE ice_utf8_test (s STRING)
+PARTITIONED BY SPEC (truncate(5, s))
+STORED AS ICEBERG;
+====
+---- QUERY
+INSERT INTO ice_utf8_test VALUES ('impala'), ('árvíztűrőtükörfúrógép'), 
('árvíztűrő'), ('űűű'), ('你好hello'), ('你好world'), ('test%value'), 
('test_value'), ('wild%card_mix');
+====
+---- QUERY
+# Test: LIKE with UTF-8 prefix pattern - should use startsWith pushdown
+SELECT s FROM ice_utf8_test WHERE s LIKE 'árvíz%' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'árvíztűrő'
+'árvíztűrőtükörfúrógép'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test: LIKE with exact UTF-8 match - should use equality pushdown
+SELECT s FROM ice_utf8_test WHERE s LIKE 'űűű';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'űűű'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test: LIKE with Chinese characters prefix
+SELECT s FROM ice_utf8_test WHERE s LIKE '你好%' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'你好hello'
+'你好world'
+---- RUNTIME_PROFILE
+row_regex: partitions=2/.* files=.*
+====
+---- QUERY
+# Escaped Wildcard Tests
+# When wildcards are escaped with '\', they are treated as literal characters
+
+# Test 1: Escaped % at end - exact match using Expressions.equal()
+SELECT s FROM ice_utf8_test WHERE s LIKE 'test\%value';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'test%value'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 2: Escaped _ in middle - exact match
+SELECT s FROM ice_utf8_test WHERE s LIKE 'test\_value';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'test_value'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 3: Escaped wildcard followed by unescaped wildcard - extracts prefix 
including literal wildcard
+# Pattern 'wild\%card%' means: starts with 'wild%card', then any suffix
+# Should match 'wild%card_mix'
+SELECT s FROM ice_utf8_test WHERE s LIKE 'wild\%card%';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'wild%card_mix'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 4: Multiple escaped wildcards - exact match
+SELECT s FROM ice_utf8_test WHERE s LIKE 'wild\%card\_mix';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'wild%card_mix'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 5: Unescaped wildcard at the beginning of partitioned table - no pruning
+SELECT count(*) FROM ice_utf8_test WHERE s LIKE '%value';
+---- TYPES
+BIGINT
+---- RESULTS
+2
+---- RUNTIME_PROFILE
+row_regex: partitions=8/8.* files=.*
+====
+---- QUERY
+# Exact Match Cases: LIKE without wildcards
+# Test: LIKE with exact match 'click' (no wildcards) - uses equality pushdown
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE 
action LIKE 'click';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=1/3.*
+row_regex:.*Iceberg snapshot id.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE 
'click';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test: LIKE with exact match on non-partition column
+SELECT id, user FROM functional_parquet.iceberg_partitioned WHERE user LIKE 
'Alex' ORDER BY id;
+---- TYPES
+INT, STRING
+---- RESULTS
+1,'Alex'
+4,'Alex'
+6,'Alex'
+11,'Alex'
+15,'Alex'
+17,'Alex'
+19,'Alex'
+20,'Alex'
+====
+---- QUERY
+# Verify árvíz%p returns correct results (should only match 
'árvíztűrőtükörfúrógép' and not 'árvíztűrő')
+# Checks pattern not being pushed down, else startsWith('árvíz') would 
incorrectly match 'árvíztűrő'
+SELECT s FROM ice_utf8_test WHERE s LIKE 'árvíz%p';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'árvíztűrőtükörfúrógép'
+====
+---- QUERY
+# Negative Test: LIKE with suffix after wildcard - cannot be pushed down
+# Using escaped wildcard test table to verify patterns with literal content 
after wildcard
+SELECT s FROM ice_utf8_test WHERE s LIKE 'test%value' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS
+'test%value'
+'test_value'
+====
+---- QUERY
+# Negative Test: Complex pattern with wildcards and literal content
+# Pattern 'i%a' requires 'i' at start and 'a' at end - cannot use 
startsWith('i')
+SELECT s FROM ice_utf8_test WHERE s LIKE 'i%a' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS
+'impala'
+====
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
index a5a349fe7..7e5e72393 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
@@ -1131,7 +1131,7 @@ where s like "árvíz%";
 STRING
 ---- RUNTIME_PROFILE
 aggregation(SUM, RowsRead): 2
-aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumRowGroups): 1
 ====
 ---- QUERY
 select * from unicode_truncate
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index c3ab38f58..be3eae8bb 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -2250,6 +2250,9 @@ class TestIcebergV2Table(IcebergTestSuite):
     self.run_test_case('QueryTest/iceberg-predicate-push-down-hint', vector,
                        use_db=unique_database)
 
+  def test_like_pushdown(self, vector, unique_database):
+    self.run_test_case('QueryTest/iceberg-like-pushdown', vector, 
use_db=unique_database)
+
   def test_partitions(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-partitions', vector, unique_database)
     tbl_name = unique_database + ".ice_num_partitions"

(impala) branch master updated: IMPALA-14737 Part1: Push down LIKE predicates to Iceberg

Reply via email to