zenfenan commented on a change in pull request #11307: [FLINK-16371] [BulkWriter] Fix Hadoop Compression BulkWriter
URL: https://github.com/apache/flink/pull/11307#discussion_r389804308
 ##########
 File path: flink-formats/flink-compress/src/main/java/org/apache/flink/formats/compress/CompressWriterFactory.java
 ##########
 @@ -42,39 +47,57 @@
 	private Extractor<IN> extractor;
 	private CompressionCodec hadoopCodec;
+	private String hadoopCodecName;
+	private Map<String, String> hadoopConfigurationMap;
+	private String codecExtension;

 	public CompressWriterFactory(Extractor<IN> extractor) {
-		this.extractor = Preconditions.checkNotNull(extractor, "extractor cannot be null");
+		this.extractor = checkNotNull(extractor, "Extractor cannot be null");
+		this.hadoopConfigurationMap = new HashMap<>();
 	}

 	public CompressWriterFactory<IN> withHadoopCompression(String hadoopCodecName) {
 		return withHadoopCompression(hadoopCodecName, new Configuration());
 	}

 	public CompressWriterFactory<IN> withHadoopCompression(String hadoopCodecName, Configuration hadoopConfiguration) {
-		return withHadoopCompression(new CompressionCodecFactory(hadoopConfiguration).getCodecByName(hadoopCodecName));
-	}
+		CompressionCodec codec = new CompressionCodecFactory(hadoopConfiguration).getCodecByName(hadoopCodecName);
+		this.codecExtension = checkNotNull(codec, "Unable to load the provided Hadoop codec [" + hadoopCodecName + "]")
+			.getDefaultExtension();
+
+		this.hadoopCodecName = hadoopCodecName;
+
+		for (Map.Entry<String, String> entry : hadoopConfiguration) {
+			hadoopConfigurationMap.put(entry.getKey(), entry.getValue());
+		}

-	public CompressWriterFactory<IN> withHadoopCompression(CompressionCodec hadoopCodec) {
-		this.hadoopCodec = Preconditions.checkNotNull(hadoopCodec, "hadoopCodec cannot be null");
 		return this;
 	}

 	@Override
 	public BulkWriter<IN> create(FSDataOutputStream out) throws IOException {
-		try {
-			return (hadoopCodec != null)
-				? new HadoopCompressionBulkWriter<>(out, extractor, hadoopCodec)
-				: new NoCompressionBulkWriter<>(out, extractor);
-		} catch (Exception e) {
-			throw new IOException(e.getLocalizedMessage(), e);
+		if (hadoopCodecName == null || hadoopCodecName.length() == 0) {
+			return new NoCompressionBulkWriter<>(out, extractor);
 		}
+
+		initializeCompressionCodec();
+
+		return new HadoopCompressionBulkWriter<>(hadoopCodec.createOutputStream(out), extractor);
 	}

-	public String codecExtension() {
-		return (hadoopCodec != null)
-			? hadoopCodec.getDefaultExtension()
-			: "";
+	public String getExtension() {
+		return (hadoopCodecName != null) ? this.codecExtension : "";
 	}

+	private void initializeCompressionCodec() {
+		if (hadoopCodec == null) {
+			Configuration conf = new Configuration();
+
+			for (Map.Entry<String, String> entry : hadoopConfigurationMap.entrySet()) {
+				conf.set(entry.getKey(), entry.getValue());
+			}
+
+			hadoopCodec = new CompressionCodecFactory(conf).getCodecByName(this.hadoopCodecName);
+		}
+	}

Review comment:
   I'm all for making the code more concise and clear. However, I didn't want to spend much time on the Map-to-Configuration conversion (and vice versa), since I plan to get rid of this Map-based approach altogether in a follow-up patch, depending on the outcome of the ML thread on SerializableHadoopConfiguration. Do you still want me to go ahead with these separate static methods?

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services
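For readers following the thread: the Map round-trip in the diff exists because Hadoop's `Configuration` is not `Serializable`, while the writer factory has to be serialized and shipped to the task managers. A minimal sketch of what the "separate static methods" under discussion could look like; the class and method names are hypothetical and not part of the patch:

```java
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;

/**
 * Hypothetical helpers for the Configuration <-> Map round-trip discussed
 * above. Names are illustrative only; the Map-based approach itself may be
 * dropped in favor of a SerializableHadoopConfiguration wrapper.
 */
final class HadoopConfigMaps {

	private HadoopConfigMaps() {
	}

	/** Copies every entry of a Hadoop Configuration into a serializable Map. */
	static Map<String, String> toMap(Configuration conf) {
		Map<String, String> map = new HashMap<>();
		// Configuration implements Iterable<Map.Entry<String, String>>.
		for (Map.Entry<String, String> entry : conf) {
			map.put(entry.getKey(), entry.getValue());
		}
		return map;
	}

	/** Rebuilds a Hadoop Configuration from a previously extracted Map. */
	static Configuration fromMap(Map<String, String> map) {
		Configuration conf = new Configuration();
		for (Map.Entry<String, String> entry : map.entrySet()) {
			conf.set(entry.getKey(), entry.getValue());
		}
		return conf;
	}
}
```

With helpers like these, `withHadoopCompression` would reduce to `this.hadoopConfigurationMap = HadoopConfigMaps.toMap(hadoopConfiguration)`, and `initializeCompressionCodec` to a single `new CompressionCodecFactory(HadoopConfigMaps.fromMap(hadoopConfigurationMap))` call; as the review comment notes, this may be moot if the ML thread settles on wrapping the Configuration instead.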