pltbkd commented on code in PR #20415:
URL: https://github.com/apache/flink/pull/20415#discussion_r937485324


##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceDynamicFileEnumerator.java:
##########
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connectors.hive;
+
+import org.apache.flink.connector.file.src.FileSourceSplit;
+import org.apache.flink.connector.file.src.enumerate.DynamicFileEnumerator;
+import org.apache.flink.connectors.hive.util.JobConfUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.connector.source.DynamicFilteringData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.types.logical.LogicalTypeFamily;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * A {@link DynamicFileEnumerator} implementation for hive source. It uses 
{@link
+ * HiveSourceFileEnumerator#createInputSplits} to generate splits like 
HiveSourceFileEnumerator, but
+ * only enumerates {@link HiveTablePartition}s that exist in the {@link 
DynamicFilteringData} if a
+ * DynamicFilteringData is provided.
+ */
+public class HiveSourceDynamicFileEnumerator implements DynamicFileEnumerator {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HiveSourceDynamicFileEnumerator.class);
+
+    private final String table;
+    private final List<String> dynamicPartitionKeys;
+    // For a non-partitioned hive table, partitions contains only a single 
partition whose
+    // partitionValues is empty.
+    private final List<HiveTablePartition> allPartitions;
+    private final int threadNum;
+    private final JobConf jobConf;
+
+    private transient List<HiveTablePartition> finalPartitions;
+
+    public HiveSourceDynamicFileEnumerator(
+            String table,
+            List<String> dynamicPartitionKeys,
+            List<HiveTablePartition> allPartitions,
+            int threadNum,
+            JobConf jobConf) {
+        this.table = checkNotNull(table);
+        this.dynamicPartitionKeys = checkNotNull(dynamicPartitionKeys);
+        this.allPartitions = checkNotNull(allPartitions);
+        this.threadNum = threadNum;
+        this.jobConf = checkNotNull(jobConf);
+
+        this.finalPartitions = this.allPartitions;
+    }
+
+    public void setDynamicFilteringData(DynamicFilteringData data) {
+        LOG.debug("Filtering partitions of table {} based on the data: {}", 
table, data);
+        try {
+            finalPartitions = new ArrayList<>();
+            if (!data.isFiltering()) {
+                finalPartitions = allPartitions;
+                return;
+            }
+            RowType rowType = data.getRowType();
+            Preconditions.checkArgument(rowType.getFieldCount() == 
dynamicPartitionKeys.size());
+            for (HiveTablePartition partition : allPartitions) {
+                RowData partitionRow = createRowData(rowType, partition);
+                if (data.contains(partitionRow)) {
+                    finalPartitions.add(partition);
+                }
+            }
+            LOG.info(
+                    "Dynamic filtering table {}, original partition number is 
{}, remaining partition number {}",
+                    table,
+                    allPartitions.size(),
+                    finalPartitions.size());
+        } catch (Exception e) {
+            LOG.error("Failed to set partition data, will use all partitions", 
e);
+            finalPartitions = allPartitions;
+            throw new RuntimeException("Failed to apply dynamic filtering", e);

Review Comment:
   It's removed now. see above.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to