allenerb commented on a change in pull request #1597:
URL: https://github.com/apache/incubator-hudi/pull/1597#discussion_r422266199



##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/keygen/MultiFormatTimestampBasedKeyGenerator.java
##########
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.utilities.keygen;
+
+import org.apache.avro.generic.GenericRecord;
+import org.apache.hudi.DataSourceUtils;
+import org.apache.hudi.common.model.HoodieKey;
+import org.apache.hudi.common.util.TypedProperties;
+import org.apache.hudi.exception.HoodieKeyException;
+import org.apache.hudi.exception.HoodieNotSupportedException;
+import org.apache.hudi.keygen.SimpleKeyGenerator;
+import org.apache.hudi.utilities.exception.HoodieDeltaStreamerException;
+
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.TimeZone;
+
+import org.joda.time.DateTime;
+import org.joda.time.DateTimeZone;
+import org.joda.time.format.DateTimeFormat;
+import org.joda.time.format.DateTimeFormatter;
+import org.joda.time.format.DateTimeFormatterBuilder;
+import org.joda.time.format.DateTimeParser;
+
+/**
+ * Key generator, that relies on timestamps for partitioning field. Still 
picks record key by name.
+ */
+public class MultiFormatTimestampBasedKeyGenerator extends SimpleKeyGenerator {
+
+  enum TimestampType implements Serializable {
+    UNIX_TIMESTAMP, DATE_STRING, MIXED, EPOCHMILLISECONDS
+  }
+
+  private final TimestampType timestampType;
+  private final String outputDateFormat;
+  private final String configInputDateFormatList;
+  private final String configInputDateFormatDelimiter;
+
+
+  // TimeZone detailed settings reference
+  // https://docs.oracle.com/javase/8/docs/api/java/util/TimeZone.html
+  private final DateTimeZone inputDateTimeZone;
+  private final DateTimeZone outputDateTimeZone;
+
+  /**
+   * Supported configs.
+   */
+  static class Config {
+    // One value from TimestampType above
+    private static final String TIMESTAMP_TYPE_FIELD_PROP                      
       = "hoodie.deltastreamer.keygen.timebased.timestamp.type";
+
+    private static final String TIMESTAMP_INPUT_DATE_FORMAT_LIST_PROP          
       = "hoodie.deltastreamer.keygen.timebased.input.dateformatlist";
+    private static final String 
TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMETER_REGEX_PROP = 
"hoodie.deltastreamer.keygen.timebased.input.dateformatlistdelimiterregex";
+    private static final String TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP           
       = "hoodie.deltastreamer.keygen.timebased.input.timezone";
+
+    private static final String TIMESTAMP_OUTPUT_DATE_FORMAT_PROP              
       = "hoodie.deltastreamer.keygen.timebased.output.dateformat";
+    private static final String TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP          
       = "hoodie.deltastreamer.keygen.timebased.output.timezone";
+  }
+
+  public MultiFormatTimestampBasedKeyGenerator(TypedProperties config) {
+    super(config);
+    DataSourceUtils.checkRequiredProperties(config,
+        Arrays.asList(Config.TIMESTAMP_TYPE_FIELD_PROP, 
Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP));
+
+    String inputTimeZone = 
config.getString(Config.TIMESTAMP_INPUT_TIMEZONE_FORMAT_PROP, "");
+    String outputTimeZone = 
config.getString(Config.TIMESTAMP_OUTPUT_TIMEZONE_FORMAT_PROP, "");
+
+    this.timestampType = 
TimestampType.valueOf(config.getString(Config.TIMESTAMP_TYPE_FIELD_PROP));
+    this.outputDateFormat = 
config.getString(Config.TIMESTAMP_OUTPUT_DATE_FORMAT_PROP);
+    this.configInputDateFormatList = 
config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_PROP, "");
+
+    String inputDateFormatDelimiter = 
this.config.getString(Config.TIMESTAMP_INPUT_DATE_FORMAT_LIST_DELIMETER_REGEX_PROP,
 ",").trim();
+    inputDateFormatDelimiter = inputDateFormatDelimiter.isEmpty() ? "," : 
inputDateFormatDelimiter;
+    this.configInputDateFormatDelimiter = inputDateFormatDelimiter;
+
+    if (inputTimeZone != null && !inputTimeZone.trim().isEmpty()) {

Review comment:
       Oh, btw @vinothchandar - regarding the existing 
TimestampBasedKeyGenerator - I too thought about extending it to make this 
happen. But while I was researching how I wanted to approach this, I found 
that because TimestampBasedKeyGenerator uses the SimpleDateFormat class, the 
only option would have been to create a list of the formats the user wanted to 
parse, and then basically do try/catch iterations around each provided 
formatter (which is an expensive process), and then throw at the end if none 
matched.  So, the primary reason I went the Joda-Time route is that it has a 
built-in ability to attempt multiple formats, supposedly in a more performant 
way.  I have not taken the time to verify whether it's actually faster, but 
with the time classes used by the existing timestamp key generator, the 
options were limited.  
   
   Just wanted to throw that out there to give some background on why I chose 
to split this out into a new class rather than extend the old one - basically, 
extending the old class would likely have meant rewriting it (which may have 
been the right path).




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to