This is an automated email from the ASF dual-hosted git repository.
zivanfi pushed a commit to branch column-indexes
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/column-indexes by this push:
new 6165a0c PARQUET-1212: Column indexes: Show indexes in tools (#479)
6165a0c is described below
commit 6165a0c4ee695708562b7787d484d48fdd6eb074
Author: Gabor Szadovszky <[email protected]>
AuthorDate: Tue May 22 15:19:12 2018 +0200
PARQUET-1212: Column indexes: Show indexes in tools (#479)
---
.../src/main/java/org/apache/parquet/cli/Main.java | 2 +
.../cli/commands/ShowColumnIndexCommand.java | 166 ++++++++++++++++++
.../parquet/tools/command/ColumnIndexCommand.java | 190 +++++++++++++++++++++
.../org/apache/parquet/tools/command/Registry.java | 1 +
4 files changed, 359 insertions(+)
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
index 990193c..fa69ce7 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/Main.java
@@ -32,6 +32,7 @@ import org.apache.parquet.cli.commands.ConvertCSVCommand;
import org.apache.parquet.cli.commands.ConvertCommand;
import org.apache.parquet.cli.commands.ParquetMetadataCommand;
import org.apache.parquet.cli.commands.SchemaCommand;
+import org.apache.parquet.cli.commands.ShowColumnIndexCommand;
import org.apache.parquet.cli.commands.ShowDictionaryCommand;
import org.apache.parquet.cli.commands.ShowPagesCommand;
import org.apache.parquet.cli.commands.ToAvroCommand;
@@ -87,6 +88,7 @@ public class Main extends Configured implements Tool {
jc.addCommand("to-avro", new ToAvroCommand(console));
jc.addCommand("cat", new CatCommand(console, 0));
jc.addCommand("head", new CatCommand(console, 10));
+ jc.addCommand("column-index", new ShowColumnIndexCommand(console));
}
@Override
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java
new file mode 100644
index 0000000..0407a8d
--- /dev/null
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ShowColumnIndexCommand.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.cli.commands;
+
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.cli.BaseCommand;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.io.InputFile;
+import org.slf4j.Logger;
+
+import com.beust.jcommander.Parameter;
+import com.beust.jcommander.Parameters;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/**
+ * parquet-cli command to print column and offset indexes.
+ *
+ * <p>Exactly one Parquet file must be given. The output can be narrowed to
+ * specific row groups ({@code -b/--block}) and columns ({@code -c/--column}).
+ * {@code -i/--column-index} and {@code -o/--offset-index} select which kind of
+ * index is printed; when neither flag is given, both are printed.
+ */
+@Parameters(commandDescription = "Prints the column and offset indexes of a Parquet file")
+public class ShowColumnIndexCommand extends BaseCommand {
+  public ShowColumnIndexCommand(Logger console) {
+    super(console);
+  }
+
+  @Parameter(description = "<parquet path>")
+  List<String> files;
+
+  @Parameter(names = { "-c", "--column" },
+      description = "Shows the column/offset indexes for the given column only")
+  List<String> ColumnPaths;
+
+  @Parameter(names = { "-b", "--block" },
+      description = "Shows the column/offset indexes for the given block (row-group) only; "
+          + "blocks are referenced by their indexes from 0")
+  List<String> blockIndexes;
+
+  @Parameter(names = { "-i", "--column-index" },
+      description = "Shows the column indexes; "
+          + "active by default unless -o is used")
+  boolean showColumnIndex;
+
+  @Parameter(names = { "-o", "--offset-index" },
+      description = "Shows the offset indexes; "
+          + "active by default unless -i is used")
+  boolean showOffsetIndex;
+
+  @Override
+  public List<String> getExamples() {
+    return Lists.newArrayList(
+        "# Show only column indexes for column 'col' from a Parquet file",
+        "-c col -i sample.parquet");
+  }
+
+  /**
+   * Prints the requested indexes of the single input file to the console.
+   *
+   * @return 0 once all selected row groups and columns have been printed
+   * @throws IOException if the file cannot be opened or an index cannot be read
+   * @throws IllegalArgumentException if not exactly one file is given, or if a
+   *         {@code --block} value is not a valid row-group index of the file
+   */
+  @Override
+  public int run() throws IOException {
+    Preconditions.checkArgument(files != null && files.size() >= 1,
+        "A Parquet file is required.");
+    Preconditions.checkArgument(files.size() == 1,
+        "Cannot process multiple Parquet files.");
+
+    InputFile in = HadoopInputFile.fromPath(new Path(files.get(0)), new Configuration());
+    // Neither -i nor -o given: show both kinds of indexes.
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      // getBlocks() validates every requested index up front, so a bad --block
+      // value fails before anything is printed.
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter())) {
+        // Separate consecutive row groups with an empty line.
+        if (!firstBlock) {
+          console.info("");
+        }
+        firstBlock = false;
+        console.info("row group {}:", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue())) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            console.info("column index for column {}:", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(columnIndex.toString());
+            }
+          }
+          if (showOffsetIndex) {
+            console.info("offset index for column {}:", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              console.info("NONE");
+            } else {
+              console.info(offsetIndex.toString());
+            }
+          }
+        }
+      }
+    }
+    return 0;
+  }
+
+  /**
+   * Returns the index-block pairs based on the arguments of --block.
+   *
+   * <p>Without any --block argument every block of the file is returned, paired
+   * with its position in the footer. Otherwise the blocks are returned in the
+   * order the indexes were given on the command line.
+   *
+   * @throws IllegalArgumentException if a --block value is not an integer or
+   *         does not reference an existing block of the file
+   */
+  private List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (blockIndexes == null || blockIndexes.isEmpty()) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : blockIndexes) {
+        int index;
+        try {
+          index = Integer.parseInt(indexStr.trim());
+        } catch (NumberFormatException e) {
+          throw new IllegalArgumentException("Invalid block index: '" + indexStr + "'", e);
+        }
+        // Report a readable error instead of a raw IndexOutOfBoundsException.
+        Preconditions.checkArgument(index >= 0 && index < blocks.size(),
+            "Block index %s is out of range; the file has %s block(s)", index, blocks.size());
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  /**
+   * Returns the column chunks of the block selected by the arguments of --column.
+   *
+   * <p>Without any --column argument all column chunks of the block are returned.
+   * Otherwise the chunks are returned in the order the dot-separated paths were
+   * given; paths that do not match any column of the block are silently skipped.
+   */
+  private List<ColumnChunkMetaData> getColumns(BlockMetaData block) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    if (ColumnPaths == null || ColumnPaths.isEmpty()) {
+      return columns;
+    }
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : ColumnPaths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java
new file mode 100644
index 0000000..f31599a
--- /dev/null
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/ColumnIndexCommand.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.tools.command;
+
+import java.io.PrintWriter;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.internal.column.columnindex.ColumnIndex;
+import org.apache.parquet.internal.column.columnindex.OffsetIndex;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.InputFile;
+import org.apache.parquet.tools.Main;
+
+/**
+ * parquet-tools command to print column and offset indexes.
+ *
+ * <p>Takes a single input file. {@code -b/--block} and {@code -c/--column}
+ * accept comma-separated lists that narrow the output to the given row groups
+ * and columns. {@code -i/--column-index} and {@code -o/--offset-index} select
+ * which kind of index is printed; when neither is given, both are printed.
+ */
+public class ColumnIndexCommand extends ArgsOnlyCommand {
+  public static final String[] USAGE = new String[] {
+    "<input>",
+    "where <input> is the parquet file to print the column and offset indexes for"
+  };
+
+  public static final Options OPTIONS;
+  static {
+    OPTIONS = new Options();
+    OPTIONS.addOption(Option.builder("c")
+        .longOpt("column")
+        .desc("Shows the column/offset indexes for the given column only; "
+            + "multiple columns shall be separated by commas")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("b")
+        .longOpt("block")
+        .desc("Shows the column/offset indexes for the given block (row-group) only; "
+            + "multiple blocks shall be separated by commas; "
+            + "blocks are referenced by their indexes from 0")
+        .hasArg()
+        .build());
+    OPTIONS.addOption(Option.builder("i")
+        .longOpt("column-index")
+        .desc("Shows the column indexes; "
+            + "active by default unless -o is used")
+        .hasArg(false)
+        .build());
+    OPTIONS.addOption(Option.builder("o")
+        .longOpt("offset-index")
+        .desc("Shows the offset indexes; "
+            + "active by default unless -i is used")
+        .hasArg(false)
+        .build());
+  }
+
+  public ColumnIndexCommand() {
+    super(1, 1);
+  }
+
+  @Override
+  public String[] getUsageDescription() {
+    return USAGE;
+  }
+
+  @Override
+  public String getCommandDescription() {
+    return "Prints the column and offset indexes of a Parquet file.";
+  }
+
+  @Override
+  public Options getOptions() {
+    return OPTIONS;
+  }
+
+  /**
+   * Prints the requested indexes of the input file to {@code Main.out}.
+   *
+   * @param options the parsed command line; its first positional argument is
+   *        used as the path of the Parquet file
+   * @throws IllegalArgumentException if a {@code -b} value is not a valid
+   *         row-group index of the file
+   * @throws Exception if the file cannot be opened or an index cannot be read
+   */
+  @Override
+  public void execute(CommandLine options) throws Exception {
+    super.execute(options);
+
+    String[] args = options.getArgs();
+    InputFile in = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
+    // Auto-flushing writer; intentionally not closed because it wraps Main.out,
+    // which this command does not create (presumably shared; confirm in Main).
+    PrintWriter out = new PrintWriter(Main.out, true);
+    String blockValue = options.getOptionValue("b");
+    // Trim first so surrounding whitespace does not end up in the first/last element.
+    String[] indexes = blockValue == null ? null : blockValue.trim().split("\\s*,\\s*");
+    boolean showColumnIndex = options.hasOption("i");
+    boolean showOffsetIndex = options.hasOption("o");
+    // Neither -i nor -o given: show both kinds of indexes.
+    if (!showColumnIndex && !showOffsetIndex) {
+      showColumnIndex = showOffsetIndex = true;
+    }
+
+    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
+      boolean firstBlock = true;
+      // getBlocks() validates every requested index up front, so a bad -b
+      // value fails before anything is printed.
+      for (Entry<Integer, BlockMetaData> entry : getBlocks(reader.getFooter(), indexes)) {
+        // Separate consecutive row groups with an empty line.
+        if (!firstBlock) {
+          out.println();
+        }
+        firstBlock = false;
+        out.format("row group %d:%n", entry.getKey());
+        for (ColumnChunkMetaData column : getColumns(entry.getValue(), options)) {
+          String path = column.getPath().toDotString();
+          if (showColumnIndex) {
+            out.format("column index for column %s:%n", path);
+            ColumnIndex columnIndex = reader.readColumnIndex(column);
+            if (columnIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(columnIndex);
+            }
+          }
+          if (showOffsetIndex) {
+            out.format("offset index for column %s:%n", path);
+            OffsetIndex offsetIndex = reader.readOffsetIndex(column);
+            if (offsetIndex == null) {
+              out.println("NONE");
+            } else {
+              out.println(offsetIndex);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the index-block pairs based on the arguments of --block.
+   *
+   * <p>When {@code indexes} is {@code null} every block of the file is returned,
+   * paired with its position in the footer. Otherwise the blocks are returned in
+   * the order the indexes were given.
+   *
+   * @throws IllegalArgumentException if an index is not an integer or does not
+   *         reference an existing block of the file
+   */
+  private static List<Entry<Integer, BlockMetaData>> getBlocks(ParquetMetadata meta, String[] indexes) {
+    List<BlockMetaData> blocks = meta.getBlocks();
+    List<Entry<Integer, BlockMetaData>> pairs = new ArrayList<>();
+    if (indexes == null) {
+      int index = 0;
+      for (BlockMetaData block : blocks) {
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index++, block));
+      }
+    } else {
+      for (String indexStr : indexes) {
+        int index;
+        try {
+          index = Integer.parseInt(indexStr.trim());
+        } catch (NumberFormatException e) {
+          throw new IllegalArgumentException("Invalid block index: '" + indexStr + "'", e);
+        }
+        // Report a readable error instead of a raw IndexOutOfBoundsException.
+        if (index < 0 || index >= blocks.size()) {
+          throw new IllegalArgumentException(
+              "Block index " + index + " is out of range; the file has " + blocks.size() + " block(s)");
+        }
+        pairs.add(new AbstractMap.SimpleImmutableEntry<>(index, blocks.get(index)));
+      }
+    }
+    return pairs;
+  }
+
+  /**
+   * Returns the column chunks of the block selected by the value of --column.
+   *
+   * <p>Without a --column value all column chunks of the block are returned.
+   * Otherwise the chunks are returned in the order the comma-separated,
+   * dot-separated paths were given; paths that do not match any column of the
+   * block are silently skipped.
+   */
+  private static List<ColumnChunkMetaData> getColumns(BlockMetaData block, CommandLine options) {
+    List<ColumnChunkMetaData> columns = block.getColumns();
+    String pathValue = options.getOptionValue("c");
+    if (pathValue == null) {
+      return columns;
+    }
+    // Trim first so surrounding whitespace does not end up in the first/last path.
+    String[] paths = pathValue.trim().split("\\s*,\\s*");
+    Map<String, ColumnChunkMetaData> pathMap = new HashMap<>();
+    for (ColumnChunkMetaData column : columns) {
+      pathMap.put(column.getPath().toDotString(), column);
+    }
+
+    List<ColumnChunkMetaData> filtered = new ArrayList<>();
+    for (String path : paths) {
+      ColumnChunkMetaData column = pathMap.get(path);
+      if (column != null) {
+        filtered.add(column);
+      }
+    }
+    return filtered;
+  }
+
+}
diff --git a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
index 6df84be..399efb7 100644
--- a/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
+++ b/parquet-tools/src/main/java/org/apache/parquet/tools/command/Registry.java
@@ -34,6 +34,7 @@ public final class Registry {
registry.put("merge", MergeCommand.class);
registry.put("rowcount", RowCountCommand.class);
registry.put("size", SizeCommand.class);
+ registry.put("column-index", ColumnIndexCommand.class);
}
public static Map<String,Command> allCommands() {
--
To stop receiving notification emails like this one, please contact
[email protected].