manojpec commented on a change in pull request #4352: URL: https://github.com/apache/hudi/pull/4352#discussion_r793419549
########## File path: hudi-client/hudi-client-common/src/main/java/org/apache/hudi/index/bloom/HoodieBloomIndex.java ########## @@ -133,30 +144,89 @@ public HoodieBloomIndex(HoodieWriteConfig config, BaseHoodieBloomIndexHelper blo /** * Load all involved files as <Partition, filename> pair List. */ - List<Pair<String, BloomIndexFileInfo>> loadInvolvedFiles( + List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromFiles( List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { // Obtain the latest data files from all the partitions. List<Pair<String, String>> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream() .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) .collect(toList()); - if (config.getBloomIndexPruneByRanges()) { - // also obtain file ranges, if range pruning is enabled - context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); - return context.map(partitionPathFileIDList, pf -> { - try { - HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); - String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); - return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); - } catch (MetadataNotFoundException me) { - LOG.warn("Unable to find range metadata in file :" + pf); - return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); + context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)"); + return context.map(partitionPathFileIDList, pf -> { + try { + HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf); + String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys(); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1])); + } catch (MetadataNotFoundException me) { + LOG.warn("Unable to find range metadata in file :" + pf); + return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue())); + } + }, Math.max(partitionPathFileIDList.size(), 1)); + } + + /** + * Get BloomIndexFileInfo for all the latest base files for the requested partitions. + * + * @param partitions - List of partitions to get the base files for + * @param context - Engine context + * @param hoodieTable - Hoodie Table + * @return List of partition and file column range info pairs + */ + private List<Pair<String, BloomIndexFileInfo>> getFileInfoForLatestBaseFiles( + List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + List<Pair<String, String>> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, + hoodieTable).stream() + .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())) + .collect(toList()); + return partitionPathFileIDList.stream() + .map(pf -> Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()))).collect(toList()); + } + + /** + * Load the column stats index as BloomIndexFileInfo for all the involved files in the partition. + * + * @param partitions - List of partitions for which column stats need to be loaded + * @param context - Engine context + * @param hoodieTable - Hoodie table + * @return List of partition and file column range info pairs + */ + protected List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromMetaIndex( + List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) { + // also obtain file ranges, if range pruning is enabled + context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices"); + + final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp(); + return context.flatMap(partitions, partitionName -> { + List<String> partitionFileNameList = HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, Review comment: yes, this is from the metadata table -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org