[ https://issues.apache.org/jira/browse/HIVE-25453?focusedWorklogId=641011&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-641011 ]
ASF GitHub Bot logged work on HIVE-25453:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 24/Aug/21 09:20
            Start Date: 24/Aug/21 09:20
    Worklog Time Spent: 10m
      Work Description: pvary commented on a change in pull request #2586:
URL: https://github.com/apache/hive/pull/2586#discussion_r694674515


##########
File path: ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
##########
@@ -2693,6 +2696,77 @@ public static TypeDescription getDesiredRowTypeDescr(Configuration conf,
     return result;
   }
 
+  /**
+   * Based on the file schema and the low level file includes provided in the SchemaEvolution instance, this method
+   * calculates which top level columns should be included, i.e. if any of the nested columns inside complex types is
+   * required, then its relevant top level parent column will be considered as required (and thus the full subtree).
+   * Hive and LLAP currently only support column pruning on the first level, thus we need to calculate this ourselves.
+   * @param evolution schema evolution instance holding the file schema and the leaf level includes
+   * @return bool array of include values, where the 0th element is the root struct, and any Nth element is a first
+   *         level column within that
+   */
+  public static boolean[] firstLevelFileIncludes(SchemaEvolution evolution) {
+    // This is the leaf level type description include bool array
+    boolean[] lowLevelIncludes = evolution.getFileIncluded();
+    Map<Integer, TypeDescription> idMap = new HashMap<>();
+    Map<Integer, Integer> parentIdMap = new HashMap<>();
+    idToFieldSchemaMap(evolution.getFileSchema(), idMap, parentIdMap);
+
+    // Root + N top level columns...
+    boolean[] result = new boolean[evolution.getFileSchema().getChildren().size() + 1];
+
+    Set<Integer> requiredTopLevelSchemaIds = new HashSet<>();
+    for (int i = 1; i < lowLevelIncludes.length; ++i) {
+      if (lowLevelIncludes[i]) {
+        int topLevelParentId = getTopLevelParentId(i, parentIdMap);
+        if (!requiredTopLevelSchemaIds.contains(topLevelParentId)) {
+          requiredTopLevelSchemaIds.add(topLevelParentId);
+        }
+      }
+    }
+
+    List<TypeDescription> topLevelFields = evolution.getFileSchema().getChildren();
+
+    for (int typeDescriptionId : requiredTopLevelSchemaIds) {
+      result[IntStream.range(0, topLevelFields.size()).filter(
+          i -> typeDescriptionId == topLevelFields.get(i).getId()).findFirst().getAsInt() + 1] = true;
+    }
+
+    return result;
+  }
+
+  /**
+   * Recursively builds 2 maps:
+   * ID to type description
+   * child to parent type description

Review comment:
       child to parent id?
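The hunk above calls two private helpers, idToFieldSchemaMap and getTopLevelParentId, that sit outside the quoted diff context. Below is a minimal sketch of how they could look, assuming ORC's usual pre-order column numbering (the root struct gets id 0 and every nested field its own id); the method names come from the hunk, but these bodies are illustrative and may differ from the PR's actual implementation.

import java.util.HashMap;
import java.util.Map;

import org.apache.orc.TypeDescription;

public class FirstLevelIncludesSketch {

  /**
   * Recursively builds two maps over the schema tree: column id to its
   * TypeDescription, and child column id to its direct parent's column id.
   */
  static void idToFieldSchemaMap(TypeDescription schema,
      Map<Integer, TypeDescription> idMap, Map<Integer, Integer> parentIdMap) {
    idMap.put(schema.getId(), schema);
    if (schema.getChildren() != null) {
      for (TypeDescription child : schema.getChildren()) {
        parentIdMap.put(child.getId(), schema.getId());
        idToFieldSchemaMap(child, idMap, parentIdMap);
      }
    }
  }

  /**
   * Walks up the parent chain until the next step would reach the root (id 0),
   * i.e. returns the id of the first level column that contains the given id.
   */
  static int getTopLevelParentId(int id, Map<Integer, Integer> parentIdMap) {
    int current = id;
    while (parentIdMap.containsKey(current) && parentIdMap.get(current) != 0) {
      current = parentIdMap.get(current);
    }
    return current;
  }

  public static void main(String[] args) {
    // Hypothetical toy schema; ORC assigns column ids in pre-order:
    // 0: root, 1: id, 2: payload, 3: payload.a, 4: payload.b
    TypeDescription schema = TypeDescription.fromString(
        "struct<id:int,payload:struct<a:int,b:string>>");
    Map<Integer, TypeDescription> idMap = new HashMap<>();
    Map<Integer, Integer> parentIdMap = new HashMap<>();
    idToFieldSchemaMap(schema, idMap, parentIdMap);
    // If only payload.b (id 4) is included at the leaf level, its first level
    // parent payload (id 2) has to be read in full, so firstLevelFileIncludes
    // would yield [false, false, true] for (root, id, payload).
    System.out.println(getTopLevelParentId(4, parentIdMap)); // prints 2
  }
}

With helpers of this shape, the IntStream lookup in firstLevelFileIncludes only has to translate each required top level id into its position among the root's children, shifted by one so that index 0 stays reserved for the root struct itself.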
Issue Time Tracking
-------------------

    Worklog Id:     (was: 641011)
    Time Spent: 1h 40m  (was: 1.5h)

> Add LLAP IO support for Iceberg ORC tables
> ------------------------------------------
>
>                 Key: HIVE-25453
>                 URL: https://issues.apache.org/jira/browse/HIVE-25453
>             Project: Hive
>          Issue Type: New Feature
>            Reporter: Ádám Szita
>            Assignee: Ádám Szita
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 1h 40m
>  Remaining Estimate: 0h
>