[ https://issues.apache.org/jira/browse/HIVE-24350?focusedWorklogId=509607&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-509607 ]
ASF GitHub Bot logged work on HIVE-24350: ----------------------------------------- Author: ASF GitHub Bot Created on: 10/Nov/20 09:32 Start Date: 10/Nov/20 09:32 Worklog Time Spent: 10m Work Description: rbalamohan commented on a change in pull request #1645: URL: https://github.com/apache/hive/pull/1645#discussion_r520415139 ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java ########## @@ -93,94 +91,109 @@ private String getAliasForTableScanOperator(MapWork work, return null; } - private PartitionDesc changePartitionToMetadataOnly(PartitionDesc desc, - Path path) { - if (desc == null) { - return null; - } - FileStatus[] filesFoundInPartitionDir = null; + private void lookupAndProcessPath(MapWork work, Path path, + Collection<String> aliasesToOptimize) { try { - filesFoundInPartitionDir = Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path); + boolean isEmpty = Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path).length == 0; + processPath(work, path, aliasesToOptimize, isEmpty); } catch (IOException e) { - LOG.error("Cannot determine if the table is empty", e); - } - if (!isMetadataOnlyAllowed(filesFoundInPartitionDir)) { - return desc; + LOG.warn("Could not determine if path {} was empty." + + "Cannot use null scan optimization for this path.", path, e); } - - boolean isEmpty = filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 0; - desc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class); - desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class); - desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB, - NullStructSerDe.class.getName()); - return desc; } - private boolean isMetadataOnlyAllowed(FileStatus[] filesFoundInPartitionDir) { - if (filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 0) { - return true; // empty folders are safe to convert to metadata-only - } - for (FileStatus f : filesFoundInPartitionDir) { - if (AcidUtils.isDeleteDelta(f.getPath())) { - /* - * as described in HIVE-23712, an acid partition is not a safe subject of metadata-only - * optimization, because there is a chance that it contains no data but contains folders - * (e.g: delta_0000002_0000002_0000, delete_delta_0000003_0000003_0000), without scanning - * the underlying file contents, we cannot tell whether this partition contains data or not - */ - return false; - } + private void processPath(MapWork work, Path path, Collection<String> aliasesToOptimize, + boolean isEmpty) { + PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone(); + partDesc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class); + partDesc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class); + partDesc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB, + NullStructSerDe.class.getName()); + Path fakePath = + new Path(NullScanFileSystem.getBase() + partDesc.getTableName() + + "/part" + encode(partDesc.getPartSpec())); + StringInternUtils.internUriStringsInPath(fakePath); + work.addPathToPartitionInfo(fakePath, partDesc); + work.addPathToAlias(fakePath, new ArrayList<>(aliasesToOptimize)); + Collection<String> aliasesContainingPath = work.getPathToAliases().get(path); + aliasesContainingPath.removeAll(aliasesToOptimize); + if (aliasesContainingPath.isEmpty()) { + work.removePathToAlias(path); + work.removePathToPartitionInfo(path); } - return true; } - private void processAlias(MapWork work, Path path, - Collection<String> aliasesAffected, Set<String> aliases) { - // the aliases that are allowed to map to a null scan. - Collection<String> allowed = aliasesAffected.stream() - .filter(a -> aliases.contains(a)).collect(Collectors.toList()); - if (!allowed.isEmpty()) { - PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone(); - PartitionDesc newPartition = - changePartitionToMetadataOnly(partDesc, path); - // Prefix partition with something to avoid it being a hidden file. - Path fakePath = - new Path(NullScanFileSystem.getBase() + newPartition.getTableName() - + "/part" + encode(newPartition.getPartSpec())); - StringInternUtils.internUriStringsInPath(fakePath); - work.addPathToPartitionInfo(fakePath, newPartition); - work.addPathToAlias(fakePath, new ArrayList<>(allowed)); - aliasesAffected.removeAll(allowed); - if (aliasesAffected.isEmpty()) { - work.removePathToAlias(path); - work.removePathToPartitionInfo(path); - } - } + private void addCandidatePath(Map<Path, Collection<String>> aliasesForPath, Path path, String alias) { + Collection<String> aliases = aliasesForPath.computeIfAbsent(path, k -> new ArrayList<>()); + aliases.add(alias); } - private void processAlias(MapWork work, Set<TableScanOperator> tableScans) { - Set<String> aliases = new HashSet<>(); + private void processTableScans(MapWork work, Set<TableScanOperator> tableScans) { + Map<Path, Boolean> managedEmptyPathMap = new HashMap<>(); + Map<Path, Collection<String>> candidatePathsToAliases = new HashMap<>(); + Map<String, Boolean> aliasTypeMap = new HashMap<>(); + Map<String, Map<Path, Partition>> allowedAliasesToPartitions = new HashMap<>(); for (TableScanOperator tso : tableScans) { // use LinkedHashMap<String, Operator<? extends OperatorDesc>> // getAliasToWork() should not apply this for non-native table if (tso.getConf().getTableMetadata().getStorageHandler() != null) { continue; } String alias = getAliasForTableScanOperator(work, tso); - aliases.add(alias); + boolean isManagedTable = !MetaStoreUtils.isExternalTable(tso.getConf().getTableMetadata().getTTable()); + aliasTypeMap.put(alias, isManagedTable); + allowedAliasesToPartitions.put(alias, getPathToPartitionMap(alias, tso)); tso.getConf().setIsMetadataOnly(true); } - // group path alias according to work - Map<Path, List<String>> candidates = new HashMap<>(); + for (Path path : work.getPaths()) { - List<String> aliasesAffected = work.getPathToAliases().get(path); - if (CollectionUtils.isNotEmpty(aliasesAffected)) { - candidates.put(path, aliasesAffected); + List<String> aliases = work.getPathToAliases().get(path); + for (String alias: aliases) { + Map<Path, Partition> pathToPartitionMap = allowedAliasesToPartitions.get(alias); + if (pathToPartitionMap == null) { + continue; + } + addCandidatePath(candidatePathsToAliases, path, alias); + Partition partitionObject = pathToPartitionMap.get(path); + Map<String, String> partitionParameters = partitionObject != null ? partitionObject.getParameters() : null; + boolean isManagedTable = aliasTypeMap.get(alias); + if (isManagedTable && partitionParameters != null && StatsSetupConst.areBasicStatsUptoDate(partitionParameters)) { + long rwCount = Long.parseLong(partitionParameters.get(StatsSetupConst.ROW_COUNT)); + if (rwCount == 0) { + managedEmptyPathMap.put(path, true); + } else { + managedEmptyPathMap.put(path, false); + } + } + } + } + + for (Entry<Path, Collection<String>> entry : candidatePathsToAliases.entrySet()) { Review comment: Wouldn't this have the same issue for external tables (e.g text?). For ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java ########## @@ -93,94 +91,109 @@ private String getAliasForTableScanOperator(MapWork work, return null; } - private PartitionDesc changePartitionToMetadataOnly(PartitionDesc desc, - Path path) { - if (desc == null) { - return null; - } - FileStatus[] filesFoundInPartitionDir = null; + private void lookupAndProcessPath(MapWork work, Path path, + Collection<String> aliasesToOptimize) { try { - filesFoundInPartitionDir = Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path); + boolean isEmpty = Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path).length == 0; + processPath(work, path, aliasesToOptimize, isEmpty); } catch (IOException e) { - LOG.error("Cannot determine if the table is empty", e); - } - if (!isMetadataOnlyAllowed(filesFoundInPartitionDir)) { - return desc; + LOG.warn("Could not determine if path {} was empty." + + "Cannot use null scan optimization for this path.", path, e); } - - boolean isEmpty = filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 0; - desc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class); - desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class); - desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB, - NullStructSerDe.class.getName()); - return desc; } - private boolean isMetadataOnlyAllowed(FileStatus[] filesFoundInPartitionDir) { - if (filesFoundInPartitionDir == null || filesFoundInPartitionDir.length == 0) { - return true; // empty folders are safe to convert to metadata-only - } - for (FileStatus f : filesFoundInPartitionDir) { - if (AcidUtils.isDeleteDelta(f.getPath())) { - /* - * as described in HIVE-23712, an acid partition is not a safe subject of metadata-only - * optimization, because there is a chance that it contains no data but contains folders - * (e.g: delta_0000002_0000002_0000, delete_delta_0000003_0000003_0000), without scanning - * the underlying file contents, we cannot tell whether this partition contains data or not - */ - return false; - } + private void processPath(MapWork work, Path path, Collection<String> aliasesToOptimize, + boolean isEmpty) { + PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone(); + partDesc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class); + partDesc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class); + partDesc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB, + NullStructSerDe.class.getName()); + Path fakePath = + new Path(NullScanFileSystem.getBase() + partDesc.getTableName() + + "/part" + encode(partDesc.getPartSpec())); + StringInternUtils.internUriStringsInPath(fakePath); + work.addPathToPartitionInfo(fakePath, partDesc); + work.addPathToAlias(fakePath, new ArrayList<>(aliasesToOptimize)); + Collection<String> aliasesContainingPath = work.getPathToAliases().get(path); + aliasesContainingPath.removeAll(aliasesToOptimize); + if (aliasesContainingPath.isEmpty()) { + work.removePathToAlias(path); + work.removePathToPartitionInfo(path); } - return true; } - private void processAlias(MapWork work, Path path, - Collection<String> aliasesAffected, Set<String> aliases) { - // the aliases that are allowed to map to a null scan. - Collection<String> allowed = aliasesAffected.stream() - .filter(a -> aliases.contains(a)).collect(Collectors.toList()); - if (!allowed.isEmpty()) { - PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone(); - PartitionDesc newPartition = - changePartitionToMetadataOnly(partDesc, path); - // Prefix partition with something to avoid it being a hidden file. - Path fakePath = - new Path(NullScanFileSystem.getBase() + newPartition.getTableName() - + "/part" + encode(newPartition.getPartSpec())); - StringInternUtils.internUriStringsInPath(fakePath); - work.addPathToPartitionInfo(fakePath, newPartition); - work.addPathToAlias(fakePath, new ArrayList<>(allowed)); - aliasesAffected.removeAll(allowed); - if (aliasesAffected.isEmpty()) { - work.removePathToAlias(path); - work.removePathToPartitionInfo(path); - } - } + private void addCandidatePath(Map<Path, Collection<String>> aliasesForPath, Path path, String alias) { + Collection<String> aliases = aliasesForPath.computeIfAbsent(path, k -> new ArrayList<>()); + aliases.add(alias); } - private void processAlias(MapWork work, Set<TableScanOperator> tableScans) { - Set<String> aliases = new HashSet<>(); + private void processTableScans(MapWork work, Set<TableScanOperator> tableScans) { + Map<Path, Boolean> managedEmptyPathMap = new HashMap<>(); + Map<Path, Collection<String>> candidatePathsToAliases = new HashMap<>(); + Map<String, Boolean> aliasTypeMap = new HashMap<>(); + Map<String, Map<Path, Partition>> allowedAliasesToPartitions = new HashMap<>(); for (TableScanOperator tso : tableScans) { // use LinkedHashMap<String, Operator<? extends OperatorDesc>> // getAliasToWork() should not apply this for non-native table if (tso.getConf().getTableMetadata().getStorageHandler() != null) { continue; } String alias = getAliasForTableScanOperator(work, tso); - aliases.add(alias); + boolean isManagedTable = !MetaStoreUtils.isExternalTable(tso.getConf().getTableMetadata().getTTable()); + aliasTypeMap.put(alias, isManagedTable); + allowedAliasesToPartitions.put(alias, getPathToPartitionMap(alias, tso)); tso.getConf().setIsMetadataOnly(true); } - // group path alias according to work - Map<Path, List<String>> candidates = new HashMap<>(); + for (Path path : work.getPaths()) { - List<String> aliasesAffected = work.getPathToAliases().get(path); - if (CollectionUtils.isNotEmpty(aliasesAffected)) { - candidates.put(path, aliasesAffected); + List<String> aliases = work.getPathToAliases().get(path); + for (String alias: aliases) { + Map<Path, Partition> pathToPartitionMap = allowedAliasesToPartitions.get(alias); + if (pathToPartitionMap == null) { + continue; + } + addCandidatePath(candidatePathsToAliases, path, alias); Review comment: Can be candidatePathsToAliases.computeIfAbsent(path, k -> new ArrayList<>()).add(alias) ? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 509607) Time Spent: 20m (was: 10m) > NullScanTaskDispatcher should use stats > --------------------------------------- > > Key: HIVE-24350 > URL: https://issues.apache.org/jira/browse/HIVE-24350 > Project: Hive > Issue Type: Sub-task > Reporter: Mustafa Iman > Assignee: Mustafa Iman > Priority: Major > Labels: pull-request-available > Time Spent: 20m > Remaining Estimate: 0h > > NullScanTaskDispatcher manually checks each partition directory to see if > they are empty. While this is necessary in external tables, we can just use > stats for managed tables. -- This message was sent by Atlassian Jira (v8.3.4#803005)