This is an automated email from the ASF dual-hosted git repository.
michaelsmith pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new 540a3784e IMPALA-14737 Part1: Push down LIKE predicates to Iceberg
540a3784e is described below
commit 540a3784e0e3cad1a962c368ddbcb14b05de6832
Author: Arnab Karmakar <[email protected]>
AuthorDate: Wed Feb 18 02:46:00 2026 -0800
IMPALA-14737 Part1: Push down LIKE predicates to Iceberg
This patch adds support for pushing down LIKE predicates to Iceberg using
startsWith() and equal() expressions. When a LIKE predicate has a pattern
starting with non-wildcard characters, Impala analyzes the pattern and
pushes it down to Iceberg for efficient file-level filtering.
Supported patterns:
- 'abc%' -> pushes down startsWith('abc')
- 'pre_fix%' -> pushes down startsWith('pre') (underscore is wildcard)
- 'a_b%' -> pushes down startsWith('a')
- 'exact' -> pushes down equal('exact') (no wildcards)
- 'asd\%' -> pushes down equal('asd%') (escaped wildcard treated as literal)
Unsupported patterns (not pushed down):
- '%suffix' - starts with wildcard
- '_prefix%' - starts with wildcard
- 'prefix%suffix' - has literal content after wildcard
Benefits:
- File-level filtering using Iceberg metadata
- Partition pruning when LIKE is on partition columns
- Works with UTF-8 strings
Testing:
- Added iceberg-like-pushdown.test with comprehensive test coverage
- Tests include prefix patterns, underscore wildcards, exact matches,
partition pruning comparison, UTF-8 strings and cases where the pattern
cannot be pushed down
Change-Id: I548834126540bcc8d22efc872c2571293b8b7ec4
Reviewed-on: http://gerrit.cloudera.org:8080/24001
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../org/apache/impala/analysis/LikePredicate.java | 2 +-
.../impala/common/IcebergPredicateConverter.java | 168 ++++++++++
.../common/IcebergPredicateConverterTest.java | 197 ++++++++++++
.../queries/PlannerTest/iceberg-predicates.test | 25 +-
.../queries/PlannerTest/iceberg-v2-update.test | 42 +--
.../queries/QueryTest/iceberg-like-pushdown.test | 356 +++++++++++++++++++++
.../iceberg-partition-transform-insert.test | 2 +-
tests/query_test/test_iceberg.py | 3 +
8 files changed, 756 insertions(+), 39 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
index e17917bff..84e560fe1 100644
--- a/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
+++ b/fe/src/main/java/org/apache/impala/analysis/LikePredicate.java
@@ -32,7 +32,7 @@ import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
public class LikePredicate extends Predicate {
- enum Operator {
+ public enum Operator {
LIKE("LIKE"),
ILIKE("ILIKE"),
RLIKE("RLIKE"),
diff --git
a/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
b/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
index c42f96f4d..946fcf41e 100644
--- a/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
+++ b/fe/src/main/java/org/apache/impala/common/IcebergPredicateConverter.java
@@ -38,6 +38,7 @@ import org.apache.impala.analysis.DateLiteral;
import org.apache.impala.analysis.Expr;
import org.apache.impala.analysis.InPredicate;
import org.apache.impala.analysis.IsNullPredicate;
+import org.apache.impala.analysis.LikePredicate;
import org.apache.impala.analysis.LiteralExpr;
import org.apache.impala.analysis.NumericLiteral;
import org.apache.impala.analysis.SlotDescriptor;
@@ -75,6 +76,8 @@ public class IcebergPredicateConverter {
return convert((IsNullPredicate) expr);
} else if (expr instanceof CompoundPredicate) {
return convert((CompoundPredicate) expr);
+ } else if (expr instanceof LikePredicate) {
+ return convert((LikePredicate) expr);
} else {
throw new ImpalaRuntimeException(String.format(
"Unsupported expression: %s", expr.toSql()));
@@ -150,6 +153,171 @@ public class IcebergPredicateConverter {
Expressions.or(left, right);
}
+ /**
+ * Checks if a wildcard character at the given position is escaped by
counting
+ * preceding backslashes. An odd number of backslashes means the wildcard is
escaped.
+ */
+ static boolean isWildcardEscaped(String pattern, int wildcardPos) {
+ int backslashCount = 0;
+ int j = wildcardPos - 1;
+ while (j >= 0 && pattern.charAt(j) == '\\') {
+ backslashCount++;
+ j--;
+ }
+ return backslashCount % 2 == 1;
+ }
+
+ /**
+ * Checks if there's any literal (non-wildcard) content after the given
position.
+ * This determines if a pattern like 'd%d' has content after the first
wildcard.
+ * Only unescaped % and _ are considered non-literal.
+ *
+ * @param pattern The pattern string from StringLiteral.getUnescapedValue()
+ * @param afterPos Position to check after (exclusive)
+ * @return True if there's any literal content after afterPos
+ */
+ static boolean hasLiteralContentAfterWildcard(String pattern, int afterPos) {
+ for (int i = afterPos + 1; i < pattern.length(); i++) {
+ char c = pattern.charAt(i);
+
+ // Check if this is a wildcard
+ if (c == '%' || c == '_') {
+ // If escaped, it's literal content
+ if (isWildcardEscaped(pattern, i)) {
+ return true;
+ }
+ // Unescaped wildcard, continue checking
+ continue;
+ }
+
+ // Any non-wildcard character is literal content
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Finds the position of the first unescaped wildcard (% or _) in the
pattern.
+ * Wildcards can be escaped with '\', e.g., 'asd\%' has no unescaped
wildcard.
+ *
+ * @return Position of first unescaped wildcard, or -1 if none found
+ */
+ static int findFirstUnescapedWildcard(String pattern) {
+ for (int i = 0; i < pattern.length(); i++) {
+ char c = pattern.charAt(i);
+
+ // Check if this character is a wildcard
+ if (c == '%' || c == '_') {
+ // Return position if not escaped
+ if (!isWildcardEscaped(pattern, i)) {
+ return i;
+ }
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * Removes backslash escapes from LIKE wildcards in the pattern.
+ *
+ * Background: When Impala parses SQL LIKE 'test\%value',
+ * StringLiteral.getUnescapedValue() preserves LIKE-specific escape sequences
+ * and returns the Java string "test\\%value" (backslash followed by %). This
+ * method converts it to "test%value" (literal %) for Iceberg, which doesn't
+ * use backslash escaping.
+ *
+ * Example:
+ * Input: "test\\%value" (from SQL: LIKE 'test\%value')
+ * Output: "test%value" (literal % for Iceberg equal())
+ *
+ * @param pattern The pattern string from StringLiteral.getUnescapedValue()
+ * @param endPos Position to stop processing, or -1 to process entire string
+ * @return Pattern with LIKE escape sequences removed
+ */
+ static String unescapeLikePattern(String pattern, int endPos) {
+ int len = (endPos == -1) ? pattern.length() : endPos;
+ StringBuilder sb = new StringBuilder(len);
+
+ for (int i = 0; i < len; i++) {
+ char c = pattern.charAt(i);
+
+ if (c == '\\' && i + 1 < pattern.length()) {
+ char next = pattern.charAt(i + 1);
+ // Unescape LIKE wildcards: \% -> %, \_ -> _
+ // Also handle escaped backslash: \\ -> \
+ if (next == '%' || next == '_' || next == '\\') {
+ sb.append(next);
+ i++; // Skip the next character as we've already processed it
+ continue;
+ }
+ }
+
+ sb.append(c);
+ }
+
+ return sb.toString();
+ }
+
+ protected Expression convert(LikePredicate predicate)
+ throws ImpalaRuntimeException {
+ // Only LIKE operator is supported, not RLIKE, REGEXP, etc.
+ if (predicate.getOp() != LikePredicate.Operator.LIKE) {
+ throw new ImpalaRuntimeException(String.format(
+ "Only LIKE operator is supported for Iceberg pushdown, got: %s",
+ predicate.getOp()));
+ }
+
+ Term term = getTerm(predicate.getChild(0));
+ IcebergColumn column = term.referencedColumn_;
+
+ // Check if the column is a string type
+ if (!column.getType().isStringType()) {
+ throw new ImpalaRuntimeException(String.format(
+ "LIKE predicate pushdown only supports string columns, got: %s",
+ column.getType()));
+ }
+
+ LiteralExpr literal = getSecondChildAsLiteralExpr(predicate);
+ checkNullLiteral(literal);
+
+ if (!(literal instanceof StringLiteral)) {
+ throw new ImpalaRuntimeException(String.format(
+ "LIKE pattern must be a string literal, got: %s", literal.toSql()));
+ }
+
+ String pattern = ((StringLiteral) literal).getUnescapedValue();
+ if (pattern == null || pattern.isEmpty()) {
+ throw new ImpalaRuntimeException("LIKE pattern cannot be null or empty");
+ }
+
+ // Find first unescaped wildcard position
+ int firstWildcard = findFirstUnescapedWildcard(pattern);
+
+ // Case 1: Pattern starts with wildcard - cannot push down
+ if (firstWildcard == 0) {
+ throw new ImpalaRuntimeException(String.format(
+ "LIKE pattern '%s' cannot be pushed down to Iceberg. Patterns must
start "
+ + "with at least one literal character.", pattern));
+ }
+
+ // Case 2: No wildcards - exact match
+ if (firstWildcard == -1) {
+ String unescapedPattern = unescapeLikePattern(pattern, -1);
+ return Expressions.equal(column.getName(), unescapedPattern);
+ }
+
+ // Case 3: Wildcard in middle with literal content after - cannot push down
+ if (hasLiteralContentAfterWildcard(pattern, firstWildcard)) {
+ throw new ImpalaRuntimeException(String.format(
+ "LIKE pattern '%s' cannot be pushed down to Iceberg. Only prefix
patterns "
+ + "(e.g., 'prefix%%') are supported.", pattern));
+ }
+
+ // Case 4: Pure prefix pattern (wildcard only at end) - use startsWith
+ String unescapedPattern = unescapeLikePattern(pattern, firstWildcard);
+ return Expressions.startsWith(column.getName(), unescapedPattern);
+ }
+
protected void checkNullLiteral(LiteralExpr literal) throws
ImpalaRuntimeException {
if (Expr.IS_NULL_LITERAL.apply(literal)) {
throw new ImpalaRuntimeException("Expression can't be NULL literal: " +
literal);
diff --git
a/fe/src/test/java/org/apache/impala/common/IcebergPredicateConverterTest.java
b/fe/src/test/java/org/apache/impala/common/IcebergPredicateConverterTest.java
new file mode 100644
index 000000000..41f8e942f
--- /dev/null
+++
b/fe/src/test/java/org/apache/impala/common/IcebergPredicateConverterTest.java
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.common;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for IcebergPredicateConverter helper methods.
+ */
+public class IcebergPredicateConverterTest {
+
+ @Test
+ public void testIsWildcardEscaped() {
+ // Test case 1: No backslashes - not escaped
+ assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("test%",
4));
+ assertEquals(false,
IcebergPredicateConverter.isWildcardEscaped("test_value", 4));
+ assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("%test",
0));
+
+ // Test case 2: Single backslash - escaped (odd count)
+ assertEquals(true, IcebergPredicateConverter.isWildcardEscaped("test\\%",
5));
+ assertEquals(true,
IcebergPredicateConverter.isWildcardEscaped("test\\_value", 5));
+
+ // Test case 3: Double backslash - not escaped (even count)
+ assertEquals(false,
IcebergPredicateConverter.isWildcardEscaped("test\\\\%", 6));
+ assertEquals(false,
IcebergPredicateConverter.isWildcardEscaped("test\\\\_value", 6));
+
+ // Test case 4: Triple backslash - escaped (odd count)
+ assertEquals(true,
IcebergPredicateConverter.isWildcardEscaped("test\\\\\\%", 7));
+
+ // Test case 5: Multiple wildcards with mixed escaping
+ // Pattern: "test\%_" - first wildcard escaped, second not
+ assertEquals(true, IcebergPredicateConverter.isWildcardEscaped("test\\%_",
5));
+ assertEquals(false,
IcebergPredicateConverter.isWildcardEscaped("test\\%_", 6));
+
+ // Test case 6: Edge case - wildcard at position 0
+ assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("%test",
0));
+ assertEquals(false, IcebergPredicateConverter.isWildcardEscaped("_test",
0));
+ }
+
+ @Test
+ public void testHasLiteralContentAfterWildcard() {
+ // Test case 1: No content after first wildcard (only unescaped wildcards
or nothing)
+ assertEquals(false,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%", 4));
+ assertEquals(false,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%%", 4));
+ assertEquals(false,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%_%",
4));
+ assertEquals(false,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("test_", 4));
+ assertEquals(false,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("prefix%%%",
6));
+
+ // Test case 2: Literal content after first wildcard
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("d%d", 1));
+ assertEquals(true,
+
IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%suffix", 4));
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("a%b%c", 1));
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("pre_%fix",
3));
+
+ // Test case 3: Escaped wildcards after first wildcard are literal content
+ // "test%\\%" has escaped % after the unescaped %, which is literal content
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%\\%",
4));
+ // "test%\\_" has escaped _ after the first %, which is literal content
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("test%\\_",
4));
+
+ // Test case 4: Mix of wildcards and escaped content
+ // "pre%_%\\%" has unescaped _, then escaped %, so has literal content
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("pre%_%\\%",
3));
+ // Obviously has literal 'x' at the end
+ assertEquals(true,
+ IcebergPredicateConverter.hasLiteralContentAfterWildcard("pre%_%\\%x",
3));
+ }
+
+ @Test
+ public void testFindFirstUnescapedWildcard() {
+ // Test case 1: No wildcards
+ assertEquals(-1,
IcebergPredicateConverter.findFirstUnescapedWildcard("exact"));
+ assertEquals(-1,
IcebergPredicateConverter.findFirstUnescapedWildcard("test"));
+
+ // Test case 2: Unescaped wildcards
+ assertEquals(0,
IcebergPredicateConverter.findFirstUnescapedWildcard("%test"));
+ assertEquals(0,
IcebergPredicateConverter.findFirstUnescapedWildcard("_test"));
+ assertEquals(4,
IcebergPredicateConverter.findFirstUnescapedWildcard("test%"));
+ assertEquals(4,
IcebergPredicateConverter.findFirstUnescapedWildcard("test_suffix"));
+ assertEquals(3,
IcebergPredicateConverter.findFirstUnescapedWildcard("abc%def"));
+
+ // Test case 3: Escaped wildcards (what getUnescapedValue() returns for
SQL \%)
+ // SQL 'test\%' → getUnescapedValue() returns "test\\%"
+ assertEquals(-1,
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\%"));
+ assertEquals(-1,
+ IcebergPredicateConverter.findFirstUnescapedWildcard("test\\_value"));
+ assertEquals(-1,
IcebergPredicateConverter.findFirstUnescapedWildcard("abc\\%def"));
+
+ // Test case 4: Mixed escaped and unescaped wildcards
+ // "wild\\%card%" in memory: w,i,l,d,\,%,c,a,r,d,%
+ // First % at 5 is escaped, second % at 10 is unescaped
+ assertEquals(10,
+ IcebergPredicateConverter.findFirstUnescapedWildcard("wild\\%card%"));
+ assertEquals(10,
+
IcebergPredicateConverter.findFirstUnescapedWildcard("wild\\_card_mix"));
+ // "test\\%%more" in memory: t,e,s,t,\,%,%,m,o,r,e
+ // First % at 5 is escaped, second % at 6 is unescaped
+ assertEquals(6,
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\%%more"));
+
+ // Test case 5: Double backslash (escaped backslash + unescaped wildcard)
+ // "test\\\\%" in Java source = "test\\%" in memory
+ // % at index 6 has even (2) backslashes, so it's unescaped
+ assertEquals(6,
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\\\%"));
+ assertEquals(6,
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\\\_"));
+
+ // Test case 6: Triple backslash (escaped backslash + escaped wildcard)
+ // "test\\\\\\%" in Java source = "test\\\\%" in memory
+ // % at index 7 has odd (3) backslashes, so it's escaped
+ assertEquals(-1,
IcebergPredicateConverter.findFirstUnescapedWildcard("test\\\\\\%"));
+ }
+
+ @Test
+ public void testUnescapeLikePattern() {
+ // Test case 1: No escapes
+ assertEquals("exact",
IcebergPredicateConverter.unescapeLikePattern("exact", -1));
+ assertEquals("prefix",
IcebergPredicateConverter.unescapeLikePattern("prefix", 6));
+
+ // Test case 2: Escaped % (what getUnescapedValue() returns for SQL
'test\%value')
+ // SQL: 'test\%value' → getUnescapedValue() returns "test\\%value"
(backslash + %)
+ // After unescaping → "test%value" (literal %)
+ assertEquals("test%value",
+ IcebergPredicateConverter.unescapeLikePattern("test\\%value", -1));
+ assertEquals("abc%def",
+ IcebergPredicateConverter.unescapeLikePattern("abc\\%def", -1));
+
+ // Test case 3: Escaped _
+ assertEquals("test_value",
+ IcebergPredicateConverter.unescapeLikePattern("test\\_value", -1));
+ assertEquals("abc_def",
+ IcebergPredicateConverter.unescapeLikePattern("abc\\_def", -1));
+
+ // Test case 4: Mixed escaped wildcards
+ assertEquals("wild%card_mix",
+ IcebergPredicateConverter.unescapeLikePattern("wild\\%card\\_mix",
-1));
+
+ // Test case 5: Prefix extraction (endPos specified)
+ // "wild\\%card%suffix" in memory: w,i,l,d,\,%,c,a,r,d,%,s,u,f,f,i,x
+ // Unescaped % is at index 10, extract up to (not including) that position
+ assertEquals("wild%card",
+ IcebergPredicateConverter.unescapeLikePattern("wild\\%card%suffix",
10));
+ // "test\\%%more" in memory: t,e,s,t,\,%,%,m,o,r,e
+ // Unescaped % is at index 6, extract up to that position
+ assertEquals("test%",
+ IcebergPredicateConverter.unescapeLikePattern("test\\%%more", 6));
+
+ // Test case 6: Escaped backslash (\\)
+ // SQL: 'test\\value' → getUnescapedValue() returns "test\\\\value" (2
backslashes)
+ // After unescaping \\ → \ (single backslash)
+ assertEquals("test\\value",
+ IcebergPredicateConverter.unescapeLikePattern("test\\\\value", -1));
+ assertEquals("path\\to\\file",
+ IcebergPredicateConverter.unescapeLikePattern("path\\\\to\\\\file",
-1));
+
+ // Test case 7: Multiple consecutive escapes
+ // "test\\\\\\%" in Java = "test\\\\%" in memory
+ // First pair of backslashes (4-5) unescapes to single \
+ // Third backslash (6) escapes the %, so result is test\ followed by
literal %
+ assertEquals("test\\%",
+ IcebergPredicateConverter.unescapeLikePattern("test\\\\\\%", -1));
+ assertEquals("a%b_c",
IcebergPredicateConverter.unescapeLikePattern("a\\%b\\_c", -1));
+
+ // Test case 8: Empty and edge cases
+ assertEquals("", IcebergPredicateConverter.unescapeLikePattern("", -1));
+ assertEquals("\\", IcebergPredicateConverter.unescapeLikePattern("\\\\",
-1));
+ assertEquals("%", IcebergPredicateConverter.unescapeLikePattern("\\%",
-1));
+ assertEquals("_", IcebergPredicateConverter.unescapeLikePattern("\\_",
-1));
+ }
+}
diff --git
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
index 3946ccada..1180e6690 100644
---
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
+++
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-predicates.test
@@ -118,30 +118,41 @@ PLAN-ROOT SINK
skipped Iceberg predicates: event_time < TIMESTAMP '2022-01-01 00:00:00'
row-size=44B cardinality=2
====
-# List of predicates contains an untranslated expression (action LIKE "d%")
and a redisual
-# expression after Iceberg's filtering
+# LIKE predicate with prefix pattern is pushed down to Iceberg
select * from iceberg_partitioned where action like "d%" and event_time <
"2022-01-01" and id < 10
---- PLAN
PLAN-ROOT SINK
|
00:SCAN HDFS [functional_parquet.iceberg_partitioned]
- HDFS partitions=3/3 files=9 size=10.33KB
- predicates: id < 10, action LIKE 'd%'
+ HDFS partitions=1/3 files=4 size=4.65KB
+ predicates: id < 10
Iceberg snapshot id: 8270633197658268308
- skipped Iceberg predicates: event_time < TIMESTAMP '2022-01-01 00:00:00'
+ skipped Iceberg predicates: action LIKE 'd%', event_time < TIMESTAMP
'2022-01-01 00:00:00'
row-size=44B cardinality=1
====
-# Compound expression partially evaluated by Iceberg, and cannot be mapped
back to Impala expression
+# LIKE predicate with prefix pattern is pushed down to Iceberg
select * from iceberg_partitioned where action like "d%" and (event_time <
"2020-01-01" or id > 10)
---- PLAN
PLAN-ROOT SINK
|
00:SCAN HDFS [functional_parquet.iceberg_partitioned]
- HDFS partitions=3/3 files=10 size=11.42KB
+ HDFS partitions=1/3 files=2 size=2.32KB
predicates: (event_time < TIMESTAMP '2020-01-01 00:00:00' OR id > 10),
action LIKE 'd%'
Iceberg snapshot id: 8270633197658268308
row-size=44B cardinality=1
====
+# LIKE with wildcard in middle (d%d) cannot be pushed down - stays in
predicates
+select * from iceberg_partitioned where action like "d%d" and event_time <
"2022-01-01" and id < 10
+---- PLAN
+PLAN-ROOT SINK
+|
+00:SCAN HDFS [functional_parquet.iceberg_partitioned]
+ HDFS partitions=3/3 files=9 size=10.33KB
+ predicates: id < 10, action LIKE 'd%d'
+ Iceberg snapshot id: 8270633197658268308
+ skipped Iceberg predicates: event_time < TIMESTAMP '2022-01-01 00:00:00'
+ row-size=44B cardinality=1
+====
# Predicate on a partition introduced by partition evolution pushed down to
the scan node and
# predicate on a partition that existed before partition evolution skipped.
select * from iceberg_partition_evolution where month = 12 and year = 2010
diff --git
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
index f7355217f..a1781b937 100644
---
a/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
+++
b/testdata/workloads/functional-planner/queries/PlannerTest/iceberg-v2-update.test
@@ -196,49 +196,31 @@ MULTI DATA SINK
|->WRITE TO HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes,
OVERWRITE=false, PARTITION-KEYS=(action)]
|->BUFFERED DELETE FROM ICEBERG
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE]
|
-03:SORT
+01:SORT
| order by: action ASC NULLS LAST
-| row-size=76B cardinality=1
-|
-02:DELETE EVENTS ICEBERG DELETE [ICEBERG DELETE JOIN]
-| row-size=80B cardinality=1
-|
-|--01:SCAN HDFS
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE-01
functional_parquet.iceberg_v2_partitioned_position_deletes-position-delete]
-| HDFS partitions=3/3 files=3 size=9.47KB
-| Iceberg snapshot id: 8885697082976537578
-| row-size=204B cardinality=10
+| row-size=76B cardinality=0
|
00:SCAN HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes]
- HDFS partitions=3/3 files=3 size=3.48KB
- predicates: id IS DISTINCT FROM length(action), `user` LIKE 'impala'
+ HDFS partitions=0/3 files=0 size=0B
+ predicates: id IS DISTINCT FROM length(action)
Iceberg snapshot id: 8885697082976537578
- row-size=80B cardinality=2
+ skipped Iceberg predicates: `user` LIKE 'impala'
+ row-size=80B cardinality=0
---- DISTRIBUTEDPLAN
MULTI DATA SINK
|->WRITE TO HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes,
OVERWRITE=false, PARTITION-KEYS=(action)]
|->BUFFERED DELETE FROM ICEBERG
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE]
|
-05:SORT
+01:SORT
| order by: action ASC NULLS LAST
-| row-size=76B cardinality=1
-|
-04:EXCHANGE
[HASH(functional_parquet.iceberg_v2_partitioned_position_deletes.action)]
-|
-02:DELETE EVENTS ICEBERG DELETE [ICEBERG DELETE JOIN, DIRECTED]
-| row-size=80B cardinality=1
-|
-|--03:EXCHANGE [DIRECTED]
-| |
-| 01:SCAN HDFS
[functional_parquet.iceberg_v2_partitioned_position_deletes-POSITION-DELETE-01
functional_parquet.iceberg_v2_partitioned_position_deletes-position-delete]
-| HDFS partitions=3/3 files=3 size=9.47KB
-| Iceberg snapshot id: 8885697082976537578
-| row-size=204B cardinality=10
+| row-size=76B cardinality=0
|
00:SCAN HDFS [functional_parquet.iceberg_v2_partitioned_position_deletes]
- HDFS partitions=3/3 files=3 size=3.48KB
- predicates: id IS DISTINCT FROM length(action), `user` LIKE 'impala'
+ HDFS partitions=0/3 files=0 size=0B
+ predicates: id IS DISTINCT FROM length(action)
Iceberg snapshot id: 8885697082976537578
- row-size=80B cardinality=2
+ skipped Iceberg predicates: `user` LIKE 'impala'
+ row-size=80B cardinality=0
====
UPDATE iceberg_v2_partitioned_position_deletes set id = length(action) where
action = 'click'
---- PLAN
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-like-pushdown.test
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-like-pushdown.test
new file mode 100644
index 000000000..01454ec48
--- /dev/null
+++
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-like-pushdown.test
@@ -0,0 +1,356 @@
+====
+---- QUERY
+# IMPALA-14737: Test LIKE predicate pushdown to Iceberg
+# This test demonstrates that LIKE predicates with prefix patterns enable
partition
+# pruning, while patterns starting with wildcards do not.
+#
+# KEY COMPARISON:
+# - LIKE 'prefix%' -> extracts prefix, enables pruning -> partitions=1/3
+# - LIKE '%suffix' -> no prefix, cannot prune -> partitions=3/3
+#
+# The 'action' column is a partition column in iceberg_partitioned table.
+# Positive Cases: LIKE with prefix enables partition pruning
+
+# Test 1: LIKE with prefix 'c%' - ENABLES partition pruning
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE
action LIKE 'c%';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=1/3.*
+row_regex:.*Iceberg snapshot id.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'c%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 2: LIKE with longer prefix 'cli%' - still enables pruning
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'cli%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 3: LIKE with underscore wildcard 'c____' - extracts 'c' prefix
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'c____';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 4: LIKE with OR - prunes to 2 out of 3 partitions
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'c%' OR action LIKE 'd%';
+---- TYPES
+BIGINT
+---- RESULTS
+12
+---- RUNTIME_PROFILE
+row_regex: partitions=2/.* files=.*
+====
+---- QUERY
+# Test 5: LIKE on different partition value 'v%'
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'v%';
+---- TYPES
+BIGINT
+---- RESULTS
+8
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Negative Cases: Patterns that CANNOT be pushed down
+#
+# The following LIKE patterns cannot be optimized with Iceberg pushdown:
+# 1. Patterns starting with wildcards: '%suffix', '_prefix'
+# 2. Patterns with literal content after wildcards: 'prefix%suffix', 'd%d'
+# (Note: startsWith('prefix') would incorrectly match 'prefix123' for
'prefix%suffix')
+
+# Test 1: LIKE starting with '%' wildcard - NO partition pruning
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE
action LIKE '%lick';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=3/3.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'%lick';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=3/.* files=.*
+====
+---- QUERY
+# Test 2: LIKE starting with '_' wildcard - NO partition pruning
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE
action LIKE '_lick';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=3/3.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'_lick';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=3/.* files=.*
+====
+---- QUERY
+# Test 3: Pattern with only '%' - scans all partitions
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'%';
+---- TYPES
+BIGINT
+---- RESULTS
+20
+---- RUNTIME_PROFILE
+row_regex: partitions=3/.* files=.*
+====
+---- QUERY
+# Comparison Tests: Demonstrating the benefit
+# Comparison: LIKE 'c%' vs LIKE '%lick' on same data
+# Both return same result (6 rows), but LIKE 'c%' scans fewer partitions
+
+# With prefix (efficient)
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE
action LIKE 'c%';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=1/3.*
+====
+---- QUERY
+# Without prefix (inefficient)
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE
action LIKE '%ick';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=3/3.*
+====
+---- QUERY
+# Tests on non-partition columns
+SELECT id, user FROM functional_parquet.iceberg_partitioned WHERE user LIKE
'L%' ORDER BY id;
+---- TYPES
+INT, STRING
+---- RESULTS
+2,'Lisa'
+5,'Lisa'
+7,'Lisa'
+8,'Lisa'
+14,'Lisa'
+16,'Lisa'
+====
+---- QUERY
+# LIKE with underscore on non-partition column - extracts 'Al' prefix
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE user LIKE
'Al_%';
+---- TYPES
+BIGINT
+---- RESULTS
+14
+====
+---- QUERY
+# Compound predicate: partition column LIKE + non-partition column LIKE
+# Partition column enables pruning
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'c%' AND user LIKE 'Al%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Edge Cases
+
+# Case sensitivity test
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE user LIKE
'lisa%';
+---- TYPES
+BIGINT
+---- RESULTS
+0
+====
+---- QUERY
+# Empty result with valid prefix
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'z%';
+---- TYPES
+BIGINT
+---- RESULTS
+0
+====
+---- QUERY
+# Tests on non-partitioned table
+# LIKE works correctly on non-partitioned table
+SELECT count(*) FROM functional_parquet.iceberg_non_partitioned WHERE user
LIKE 'L%';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+====
+---- QUERY
+# LIKE with underscore on non-partitioned table
+SELECT count(*) FROM functional_parquet.iceberg_non_partitioned WHERE user
LIKE 'Al_%';
+---- TYPES
+BIGINT
+---- RESULTS
+14
+====
+---- QUERY
+# UTF-8 Character Tests
+
+# Test table with UTF-8 strings
+CREATE TABLE ice_utf8_test (s STRING)
+PARTITIONED BY SPEC (truncate(5, s))
+STORED AS ICEBERG;
+====
+---- QUERY
+INSERT INTO ice_utf8_test VALUES ('impala'), ('árvíztűrőtükörfúrógép'),
('árvíztűrő'), ('űűű'), ('你好hello'), ('你好world'), ('test%value'),
('test_value'), ('wild%card_mix');
+====
+---- QUERY
+# Test: LIKE with UTF-8 prefix pattern - should use startsWith pushdown
+SELECT s FROM ice_utf8_test WHERE s LIKE 'árvíz%' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'árvíztűrő'
+'árvíztűrőtükörfúrógép'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test: LIKE with exact UTF-8 match - should use equality pushdown
+SELECT s FROM ice_utf8_test WHERE s LIKE 'űűű';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'űűű'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test: LIKE with Chinese characters prefix
+SELECT s FROM ice_utf8_test WHERE s LIKE '你好%' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'你好hello'
+'你好world'
+---- RUNTIME_PROFILE
+row_regex: partitions=2/.* files=.*
+====
+---- QUERY
+# Escaped Wildcard Tests
+# When wildcards are escaped with '\', they are treated as literal characters
+
+# Test 1: Escaped % at end - exact match using Expressions.equal()
+SELECT s FROM ice_utf8_test WHERE s LIKE 'test\%value';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'test%value'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 2: Escaped _ in middle - exact match
+SELECT s FROM ice_utf8_test WHERE s LIKE 'test\_value';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'test_value'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 3: Escaped wildcard followed by unescaped wildcard - extracts prefix
including literal wildcard
+# Pattern 'wild\%card%' means: starts with 'wild%card', then any suffix
+# Should match 'wild%card_mix'
+SELECT s FROM ice_utf8_test WHERE s LIKE 'wild\%card%';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'wild%card_mix'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 4: Multiple escaped wildcards - exact match
+SELECT s FROM ice_utf8_test WHERE s LIKE 'wild\%card\_mix';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'wild%card_mix'
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test 5: Unescaped wildcard at the beginning of partitioned table - no pruning
+SELECT count(*) FROM ice_utf8_test WHERE s LIKE '%value';
+---- TYPES
+BIGINT
+---- RESULTS
+2
+---- RUNTIME_PROFILE
+row_regex: partitions=8/8.* files=.*
+====
+---- QUERY
+# Exact Match Cases: LIKE without wildcards
+# Test: LIKE with exact match 'click' (no wildcards) - uses equality pushdown
+EXPLAIN SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE
action LIKE 'click';
+---- RESULTS: VERIFY_IS_SUBSET
+row_regex:.*partitions=1/3.*
+row_regex:.*Iceberg snapshot id.*
+====
+---- QUERY
+SELECT count(*) FROM functional_parquet.iceberg_partitioned WHERE action LIKE
'click';
+---- TYPES
+BIGINT
+---- RESULTS
+6
+---- RUNTIME_PROFILE
+row_regex: partitions=1/.* files=.*
+====
+---- QUERY
+# Test: LIKE with exact match on non-partition column
+SELECT id, user FROM functional_parquet.iceberg_partitioned WHERE user LIKE
'Alex' ORDER BY id;
+---- TYPES
+INT, STRING
+---- RESULTS
+1,'Alex'
+4,'Alex'
+6,'Alex'
+11,'Alex'
+15,'Alex'
+17,'Alex'
+19,'Alex'
+20,'Alex'
+====
+---- QUERY
+# Verify árvíz%p returns correct results (should only match
'árvíztűrőtükörfúrógép' and not 'árvíztűrő')
+# Checks pattern not being pushed down, else startsWith('árvíz') would
incorrectly match 'árvíztűrő'
+SELECT s FROM ice_utf8_test WHERE s LIKE 'árvíz%p';
+---- TYPES
+STRING
+---- RESULTS: RAW_STRING
+'árvíztűrőtükörfúrógép'
+====
+---- QUERY
+# Negative Test: LIKE with suffix after wildcard - cannot be pushed down
+# Using escaped wildcard test table to verify patterns with literal content
after wildcard
+SELECT s FROM ice_utf8_test WHERE s LIKE 'test%value' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS
+'test%value'
+'test_value'
+====
+---- QUERY
+# Negative Test: Complex pattern with wildcards and literal content
+# Pattern 'i%a' requires 'i' at start and 'a' at end - cannot use
startsWith('i')
+SELECT s FROM ice_utf8_test WHERE s LIKE 'i%a' ORDER BY s;
+---- TYPES
+STRING
+---- RESULTS
+'impala'
+====
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
index a5a349fe7..7e5e72393 100644
---
a/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
+++
b/testdata/workloads/functional-query/queries/QueryTest/iceberg-partition-transform-insert.test
@@ -1131,7 +1131,7 @@ where s like "árvíz%";
STRING
---- RUNTIME_PROFILE
aggregation(SUM, RowsRead): 2
-aggregation(SUM, NumRowGroups): 4
+aggregation(SUM, NumRowGroups): 1
====
---- QUERY
select * from unicode_truncate
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index c3ab38f58..be3eae8bb 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -2250,6 +2250,9 @@ class TestIcebergV2Table(IcebergTestSuite):
self.run_test_case('QueryTest/iceberg-predicate-push-down-hint', vector,
use_db=unique_database)
+ def test_like_pushdown(self, vector, unique_database):
+ self.run_test_case('QueryTest/iceberg-like-pushdown', vector,
use_db=unique_database)
+
def test_partitions(self, vector, unique_database):
self.run_test_case('QueryTest/iceberg-partitions', vector, unique_database)
tbl_name = unique_database + ".ice_num_partitions"