This is an automated email from the ASF dual-hosted git repository. wzhou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 679d58fa6d970065f6c656ffcfd504794c2be516
Author: Daniel Becker <[email protected]>
AuthorDate: Thu Jun 22 18:05:19 2023 +0200

    IMPALA-12238: RandomNestedDataGenerator should take a seed argument

    RandomNestedDataGenerator can be used to produce parquet files with
    random data from Avro schemas. This change makes it possible to provide
    a seed value for the random generator so the generated files are
    reproducible.

    The seed can be given as the last (optional) command line argument. It
    is parsed as a Java 'long'.

    Testing:
     - manually verified that when run with the same arguments (including
       the seed), the data generator produces the same results

    Change-Id: Iee33604bbfe12895100afbd0f98ac302dee9a238
    Reviewed-on: http://gerrit.cloudera.org:8080/20136
    Reviewed-by: Csaba Ringhofer <[email protected]>
    Tested-by: Daniel Becker <[email protected]>
---
 .../datagenerator/RandomNestedDataGenerator.java | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java b/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java
index 9a3cb7894..fc68847c4 100644
--- a/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java
+++ b/java/datagenerator/src/main/java/org/apache/impala/datagenerator/RandomNestedDataGenerator.java
@@ -24,6 +24,7 @@ import java.lang.StringBuilder;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.HashMap;
+import java.util.Optional;
 import java.util.Random;
 import java.util.Date;
 
@@ -54,9 +55,10 @@ public class RandomNestedDataGenerator {
   public static final Double CHANCE_UNIQUE = 0.02;
 
   private static void generateDataToFile(
-      String schemaFile, int targetNumElements, String outputFile) throws IOException {
+      String schemaFile, int targetNumElements, String outputFile, Optional<Long> seed)
+      throws IOException {
     buildCache();
-    rand = new Random();
+    rand = seed.isPresent() ? new Random(seed.get()) : new Random();
     Schema schema = new Schema.Parser().parse(new File(schemaFile));
     Configuration conf = new Configuration();
     conf.set("parquet.avro.write-old-list-structure", "false");
@@ -251,8 +253,10 @@ public class RandomNestedDataGenerator {
   }
 
   public static void main(String[] args) throws Exception {
-    if (args.length != 4) {
-      System.err.println("Arguments: schema_file num_elements list_len output_file");
+    final int num_args = args.length;
+    if (num_args < 4 || num_args > 5) {
+      System.err.println(
+          "Arguments: schema_file num_elements list_len output_file [random_seed]");
       System.exit(1);
     }
     String schemaFile = args[0];
@@ -260,6 +264,13 @@ public class RandomNestedDataGenerator {
     numListItems = Integer.valueOf(args[2]);
     String outputFile = args[3];
 
-    generateDataToFile(schemaFile, numElements, outputFile);
+    Optional<Long> seed;
+    if (num_args > 4) {
+      seed = Optional.of(Long.valueOf(args[4]));
+    } else {
+      seed = Optional.empty();
+    }
+
+    generateDataToFile(schemaFile, numElements, outputFile, seed);
   }
 }
