[ https://issues.apache.org/jira/browse/HIVE-24962?focusedWorklogId=586557&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-586557 ]
ASF GitHub Bot logged work on HIVE-24962:
-----------------------------------------

Author: ASF GitHub Bot
Created on: 21/Apr/21 13:23
Start Date: 21/Apr/21 13:23
Worklog Time Spent: 10m

Work Description: lcspinter commented on a change in pull request #2137:
URL: https://github.com/apache/hive/pull/2137#discussion_r617511852

##########
File path: iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java
##########
@@ -126,7 +127,7 @@
     // skip HiveCatalog tests as they are added before
     for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
       if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
-        testParams.add(new Object[]{FileFormat.PARQUET, "mr", testTableType});
+        testParams.add(new Object[]{FileFormat.PARQUET, "tez", testTableType});

Review comment: The comment above the loop is not in sync with the logic.

##########
File path: iceberg/iceberg-handler/src/test/java/org/apache/iceberg/mr/hive/TestHiveIcebergStorageHandlerWithEngine.java
##########
@@ -126,7 +127,7 @@
     // skip HiveCatalog tests as they are added before
     for (TestTables.TestTableType testTableType : TestTables.ALL_TABLE_TYPES) {
       if (!TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
-        testParams.add(new Object[]{FileFormat.PARQUET, "mr", testTableType});
+        testParams.add(new Object[]{FileFormat.PARQUET, "tez", testTableType});

Review comment: Anyway, this change is already present on master, so you might want to revert it.

##########
File path: iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
##########
@@ -194,6 +210,54 @@ public boolean canProvideBasicStatistics() {
     return stats;
   }
+
+  public boolean addDynamicSplitPruningEdge(org.apache.hadoop.hive.ql.metadata.Table table,
+      ExprNodeDesc syntheticFilterPredicate) {
+    try {
+      Collection<String> partitionColumns = ((HiveIcebergSerDe) table.getDeserializer()).partitionColumns();
+      if (partitionColumns.size() > 0) {
+        // Collect the column names from the predicate
+        Set<String> filterColumns = Sets.newHashSet();
+        columns(syntheticFilterPredicate, filterColumns);
+
+        // While Iceberg could handle multiple columns the current pruning only able to handle filters for a
+        // single column. We keep the logic below to handle multiple columns so if pruning is available on executor
+        // side the we can easily adapt to it as well.
+        if (filterColumns.size() > 1) {

Review comment: If we know that we don't support multi-column filtering, wouldn't it be possible to get a rough estimate of the filter size (whether it is > 1) before collecting every column name?
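For illustration, a minimal sketch of the early-exit idea raised in the last review comment above, written as a method that would live in HiveIcebergStorageHandler (imports it would need are listed first). The helper name collectColumnsEarlyExit and its boolean return value are hypothetical and not part of the patch; the sketch only assumes the Hive ExprNode classes already used in the diff:

    import java.util.List;
    import java.util.Set;

    import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;

    // Hypothetical variant of the column collection that stops traversing as soon as a
    // second distinct column is seen, since multi-column filters are rejected anyway.
    // Returns false once more than one column has been collected so the caller can bail out.
    private boolean collectColumnsEarlyExit(ExprNodeDesc node, Set<String> columns) {
      if (node instanceof ExprNodeColumnDesc) {
        columns.add(((ExprNodeColumnDesc) node).getColumn());
        return columns.size() <= 1;
      }
      List<ExprNodeDesc> children = node.getChildren();
      if (children != null) {
        for (ExprNodeDesc child : children) {
          if (!collectColumnsEarlyExit(child, columns)) {
            return false;
          }
        }
      }
      return true;
    }

With such a helper, addDynamicSplitPruningEdge could stop walking the predicate as soon as a second column shows up, instead of collecting every column name first and then checking the set size.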
##########
File path: iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
##########
@@ -50,9 +61,14 @@
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.base.Splitter;
 import org.apache.iceberg.relocated.com.google.common.collect.Maps;
+import org.apache.iceberg.relocated.com.google.common.collect.Sets;
 import org.apache.iceberg.util.SerializationUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, HiveStorageHandler {
+  private static final Logger LOG = LoggerFactory.getLogger(HiveIcebergStorageHandler.class);
+  private static final Splitter TABLE_NAME_SPLITTER = Splitter.on("..");

Review comment: nit: Splitter.on(TABLE_NAME_SEPARATOR)

##########
File path: iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
##########
@@ -275,4 +339,74 @@ static void overlayTableProperties(Configuration configuration, TableDesc tableD
     // this is an exception to the interface documentation, but it's a safe operation to add this property
     props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
   }
+
+  /**
+   * Recursively collects the column names from the predicate.
+   * @param node The node we are traversing
+   * @param columns The already collected column names
+   */
+  private void columns(ExprNodeDesc node, Set<String> columns) {
+    if (node instanceof ExprNodeColumnDesc) {
+      columns.add(((ExprNodeColumnDesc) node).getColumn());
+    } else {
+      List<ExprNodeDesc> children = node.getChildren();
+      if (children != null && !children.isEmpty()) {
+        children.forEach(child -> columns(child, columns));
+      }
+    }
+  }
+
+  /**
+   * Recursively replaces the ExprNodeDynamicListDesc nodes by a dummy ExprNodeConstantDesc so we can test if we can
+   * convert the predicate to an Iceberg predicate when pruning the partitions later. Please make sure that it is ok
+   * to change the input node (clone if needed)
+   * @param node The node we are traversing
+   */
+  private void replaceWithDummyValues(ExprNodeDesc node) {
+    List<ExprNodeDesc> children = node.getChildren();
+    if (children != null && !children.isEmpty()) {
+      ListIterator<ExprNodeDesc> iterator = node.getChildren().listIterator();

Review comment: nit: No need for the iterator, since we already have a reference to the children list. A simple for loop could do the trick.

##########
File path: iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
##########
@@ -275,4 +339,74 @@ static void overlayTableProperties(Configuration configuration, TableDesc tableD
     // this is an exception to the interface documentation, but it's a safe operation to add this property
     props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
   }
+
+  /**
+   * Recursively collects the column names from the predicate.
+   * @param node The node we are traversing
+   * @param columns The already collected column names
+   */
+  private void columns(ExprNodeDesc node, Set<String> columns) {
+    if (node instanceof ExprNodeColumnDesc) {
+      columns.add(((ExprNodeColumnDesc) node).getColumn());
+    } else {
+      List<ExprNodeDesc> children = node.getChildren();
+      if (children != null && !children.isEmpty()) {

Review comment: nit: unnecessary isEmpty check
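For illustration, a minimal sketch of the indexed loop suggested in the "No need for the iterator" comment above, written as it might appear in HiveIcebergStorageHandler (needed imports listed first). The dummy-constant replacement is an assumption here, since the method body is cut off in the quoted diff; the sketch only demonstrates the iterator-free traversal over the existing children reference:

    import java.util.List;

    import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
    import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc;

    // Hypothetical iterator-free version: reuse the children reference and replace
    // dynamic-list nodes in place with a typed dummy constant, recursing into the rest.
    private void replaceWithDummyValues(ExprNodeDesc node) {
      List<ExprNodeDesc> children = node.getChildren();
      if (children == null) {
        return;
      }
      for (int i = 0; i < children.size(); i++) {
        ExprNodeDesc child = children.get(i);
        if (child instanceof ExprNodeDynamicListDesc) {
          // stand-in constant so the predicate can be test-converted to an Iceberg expression
          children.set(i, new ExprNodeConstantDesc(child.getTypeInfo(), null));
        } else {
          replaceWithDummyValues(child);
        }
      }
    }

The same loop shape also makes the isEmpty() check flagged in the last comment unnecessary: an empty children list simply never enters the loop.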
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Issue Time Tracking
-------------------

Worklog Id: (was: 586557)
Time Spent: 4h 10m (was: 4h)

> Enable partition pruning for Iceberg tables
> -------------------------------------------
>
>                 Key: HIVE-24962
>                 URL: https://issues.apache.org/jira/browse/HIVE-24962
>             Project: Hive
>          Issue Type: Improvement
>            Reporter: Peter Vary
>            Assignee: Peter Vary
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 4h 10m
>  Remaining Estimate: 0h
>
> We should enable partition pruning above iceberg tables

--
This message was sent by Atlassian Jira
(v8.3.4#803005)