(impala) 02/02: IMPALA-12927: Support specifying format for reading JSON BINARY columns

joemcdonnell Tue, 29 Apr 2025 10:00:34 -0700

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit faf322dd414c7d93bf5d6bd8e1a83113c19e9c7d
Author: Eyizoha <[email protected]>
AuthorDate: Thu Jan 2 16:39:46 2025 +0800

    IMPALA-12927: Support specifying format for reading JSON BINARY columns
    
    Currently, Impala always assumes that the data in the binary columns of
    JSON tables is base64 encoded. However, before HIVE-21240, Hive wrote
    binary data to JSON tables without base64 encoding it, instead writing
    it as escaped strings. After HIVE-21240, Hive defaults to base64
    encoding binary data when writing to JSON tables and introduces the
    serde property 'json.binary.format' to indicate the encoding method of
    binary data in JSON tables.
    
    To maintain consistency with Hive and avoid correctness issues caused by
    reading data in an incorrect manner, this patch also introduces the
    serde property 'json.binary.format' to specify the reading method for
    binary data in JSON tables. Currently, this property supports reading in
    either base64 or rawstring formats, same as Hive.
    
    Additionally, this patch introduces a query option 'json_binary_format'
    to achieve the same effect. This query option will only take effect for
    JSON tables where the serde property 'json.binary.format' is not set.
    The reading format of binary columns in JSON tables can be configured
    globally by setting the 'default_query_options'. It should be noted that
    the default value of 'json_binary_format' is 'NONE', and impala will
    prohibit reading binary columns of JSON tables that either have
    "no 'json.binary.format' set and 'json_binary_format' is 'NONE'" or
    "an invalid 'json.binary.format' value set", and will provide an error
    message to avoid using an incorrect format without the user noticing.
    
    Testing:
      - Enabled existing binary type E2E tests for JSON tables
      - Added new E2E test for 'json.binary.format'
    
    Change-Id: Idf61fa3afc0f33caa63fbc05393e975733165e82
    Reviewed-on: http://gerrit.cloudera.org:8080/22289
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/json/hdfs-json-scanner.cc              |   9 +-
 be/src/exec/text-converter.cc                      |   5 +-
 be/src/exec/text-converter.h                       |   6 +-
 be/src/exec/text-converter.inline.h                |   2 +-
 be/src/runtime/descriptors.cc                      |   1 +
 be/src/runtime/descriptors.h                       |   4 +
 be/src/service/query-options.cc                    |   7 ++
 be/src/service/query-options.h                     |   3 +-
 common/thrift/CatalogObjects.thrift                |   8 ++
 common/thrift/ImpalaService.thrift                 |   7 ++
 common/thrift/Query.thrift                         |   4 +
 .../impala/catalog/HdfsStorageDescriptor.java      |  37 +++++--
 .../org/apache/impala/planner/HdfsScanNode.java    |  59 ++++++++++-
 testdata/bin/generate-schema-statements.py         |   9 +-
 testdata/datasets/README                           |   1 +
 .../functional/functional_schema_template.sql      |  15 +++
 .../queries/QueryTest/json-binary-format.test      | 110 +++++++++++++++++++++
 tests/query_test/test_scanners.py                  |  12 ++-
 18 files changed, 278 insertions(+), 21 deletions(-)

diff --git a/be/src/exec/json/hdfs-json-scanner.cc 
b/be/src/exec/json/hdfs-json-scanner.cc
index 46d26cc7e..ac8cdc1fb 100644
--- a/be/src/exec/json/hdfs-json-scanner.cc
+++ b/be/src/exec/json/hdfs-json-scanner.cc
@@ -125,7 +125,14 @@ Status HdfsJsonScanner::InitNewRange() {
     schema.push_back(scan_node_->hdfs_table()->GetColumnDesc(slot).name());
   }
 
-  text_converter_.reset(new TextConverter('\\', "", false, 
state_->strict_mode()));
+  auto json_binary_format = 
context_->partition_descriptor()->json_binary_format();
+  if (json_binary_format == TJsonBinaryFormat::NONE) {
+    json_binary_format = state_->query_options().json_binary_format;
+  }
+  bool decode_binary = json_binary_format == TJsonBinaryFormat::BASE64;
+
+  text_converter_.reset(new TextConverter('\\', "", false, 
state_->strict_mode(),
+      decode_binary));
   json_parser_.reset(new JsonParser<HdfsJsonScanner>(schema, this));
   json_parser_->ResetParser();
   return Status::OK();
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index ca4fb0fc8..30bd0511d 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -35,11 +35,12 @@
 using namespace impala;
 
 TextConverter::TextConverter(char escape_char, const string& null_col_val,
-    bool check_null, bool strict_mode)
+    bool check_null, bool strict_mode, bool decode_binary)
   : escape_char_(escape_char),
     null_col_val_(null_col_val),
     check_null_(check_null),
-    strict_mode_(strict_mode) {
+    strict_mode_(strict_mode),
+    decode_binary_(decode_binary) {
 }
 
 void TextConverter::UnescapeString(const char* src, char* dest, int* len,
diff --git a/be/src/exec/text-converter.h b/be/src/exec/text-converter.h
index f8b43f89b..da83895c9 100644
--- a/be/src/exec/text-converter.h
+++ b/be/src/exec/text-converter.h
@@ -49,7 +49,7 @@ class TextConverter {
   /// strict_mode: If set, numerical overflow/underflow are considered to be 
parse
   /// errors.
   TextConverter(char escape_char, const std::string& null_col_val,
-      bool check_null = true, bool strict_mode = false);
+      bool check_null = true, bool strict_mode = false, bool decode_binary = 
true);
 
   /// Converts slot data, of length 'len',  into type of slot_desc,
   /// and writes the result into the tuples's slot.
@@ -101,6 +101,10 @@ class TextConverter {
   /// Indicates whether numerical overflow/underflow are considered to be parse
   /// errors.
   bool strict_mode_;
+  /// Indicates whether we should use base64 decoding for binary data.
+  /// Currently, this is only set to false when reading JSON tables with 
rawstring format
+  /// binary columns.
+  bool decode_binary_;
 };
 
 }
diff --git a/be/src/exec/text-converter.inline.h 
b/be/src/exec/text-converter.inline.h
index 6455126be..6e1474607 100644
--- a/be/src/exec/text-converter.inline.h
+++ b/be/src/exec/text-converter.inline.h
@@ -67,7 +67,7 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* 
slot_desc, Tuple* tup
           !(len != 0 && (copy_string || need_escape));
 
       bool base64_decode = false;
-      if (type.IsBinaryType()) {
+      if (type.IsBinaryType() && decode_binary_) {
         base64_decode = true;
         reuse_data = false;
         int64_t out_len;
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index a1002b342..eda1bfacc 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -219,6 +219,7 @@ HdfsPartitionDescriptor::HdfsPartitionDescriptor(
   escape_char_ = sd.escapeChar;
   block_size_ = sd.blockSize;
   file_format_ = sd.fileFormat;
+  json_binary_format_ = sd.jsonBinaryFormat;
   DecompressLocation(thrift_table, thrift_partition, &location_);
 }
 
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 10c42c664..2e432d8bc 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -401,6 +401,7 @@ class HdfsPartitionDescriptor {
   int block_size() const { return block_size_; }
   const std::string& location() const { return location_; }
   int64_t id() const { return id_; }
+  TJsonBinaryFormat::type json_binary_format() const { return 
json_binary_format_; }
   std::string DebugString() const;
 
   /// It is safe to call the returned expr evaluators concurrently from 
multiple
@@ -437,6 +438,9 @@ class HdfsPartitionDescriptor {
 
   /// The format (e.g. text, sequence file etc.) of data in the files in this 
partition
   THdfsFileFormat::type file_format_;
+
+  /// The format for reading JSON binary columns, used only for JSON tables.
+  TJsonBinaryFormat::type json_binary_format_;
 };
 
 class HdfsTableDescriptor : public TableDescriptor {
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index d71dbc6b9..a409cd0e2 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -1375,6 +1375,13 @@ Status impala::SetQueryOption(TImpalaQueryOptions::type 
option, const string& va
         query_options->__set_skip_unneeded_updates_col_limit(int32_t_val);
         break;
       }
+      case TImpalaQueryOptions::JSON_BINARY_FORMAT: {
+        TJsonBinaryFormat::type enum_type;
+        RETURN_IF_ERROR(GetThriftEnum(value, "Json binary format",
+            _TJsonBinaryFormat_VALUES_TO_NAMES, &enum_type));
+        query_options->__set_json_binary_format(enum_type);
+        break;
+      }
       case TImpalaQueryOptions::MEM_ESTIMATE_SCALE_FOR_SPILLING_OPERATOR: {
         double double_val = 0.0f;
         RETURN_IF_ERROR(QueryOptionParser::ParseAndCheckInclusiveRange<double>(
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 5e21229e1..68c7737a8 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -51,7 +51,7 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
 // plus one. Thus, the second argument to the DCHECK has to be updated every
 // time we add or remove a query option to/from the enum TImpalaQueryOptions.
 constexpr unsigned NUM_QUERY_OPTIONS =
-    TImpalaQueryOptions::USE_CALCITE_PLANNER + 1;
+    TImpalaQueryOptions::JSON_BINARY_FORMAT + 1;
 #define QUERY_OPTS_TABLE                                                       
          \
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(), NUM_QUERY_OPTIONS);   
          \
   REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, 
ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \
@@ -376,6 +376,7 @@ constexpr unsigned NUM_QUERY_OPTIONS =
       MEM_ESTIMATE_SCALE_FOR_SPILLING_OPERATOR, 
TQueryOptionLevel::DEVELOPMENT)          \
   QUERY_OPT_FN(use_calcite_planner, USE_CALCITE_PLANNER,                       
          \
       TQueryOptionLevel::ADVANCED)                                             
          \
+  QUERY_OPT_FN(json_binary_format, JSON_BINARY_FORMAT, 
TQueryOptionLevel::REGULAR)       \
   ;
 
 /// Enforce practical limits on some query options to avoid undesired query 
state.
diff --git a/common/thrift/CatalogObjects.thrift 
b/common/thrift/CatalogObjects.thrift
index 57b1ddf18..03ebdfdfa 100644
--- a/common/thrift/CatalogObjects.thrift
+++ b/common/thrift/CatalogObjects.thrift
@@ -178,6 +178,13 @@ enum TBucketType {
   HASH = 1
 }
 
+// Options for JSON binary format to determine how binary data is encoded in 
JSON.
+enum TJsonBinaryFormat {
+  NONE = 0
+  BASE64 = 1
+  RAWSTRING = 2
+}
+
 struct TCompressionCodec {
   // Compression codec
   1: required THdfsCompression codec
@@ -357,6 +364,7 @@ struct THdfsStorageDescriptor {
   6: required byte quoteChar
   7: required THdfsFileFormat fileFormat
   8: required i32 blockSize
+  9: optional TJsonBinaryFormat jsonBinaryFormat
 }
 
 // Represents an HDFS partition
diff --git a/common/thrift/ImpalaService.thrift 
b/common/thrift/ImpalaService.thrift
index 43886c001..ef6a7efbc 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -1026,6 +1026,13 @@ enum TImpalaQueryOptions {
 
   // If True, use the Calcite planner for compilation
   USE_CALCITE_PLANNER = 191
+
+  // The default format for reading JSON binary columns, can be overridden by 
table
+  // property 'json.binary.format' (if set). The valid values are:
+  //   NONE - default value, means unspecified format, depends on the table 
property.
+  //   BASE64 - the json binary data is read as base64 encoded string.
+  //   RAWSTRING - the json binary data is read as raw string.
+  JSON_BINARY_FORMAT = 192
 }
 
 // The summary of a DML statement.
diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift
index 8e3fc5969..bd2181348 100644
--- a/common/thrift/Query.thrift
+++ b/common/thrift/Query.thrift
@@ -778,6 +778,10 @@ struct TQueryOptions {
 
   // See comment in ImpalaService.thrift
   192: optional bool use_calcite_planner = false;
+
+  // See comment in ImpalaService.thrift
+  193: optional CatalogObjects.TJsonBinaryFormat json_binary_format =
+      TJsonBinaryFormat.NONE;
 }
 
 // Impala currently has three types of sessions: Beeswax, HiveServer2 and 
external
diff --git 
a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java 
b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
index 5402fae11..6ce5cc871 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.impala.thrift.THdfsPartition;
 import org.apache.impala.thrift.THdfsStorageDescriptor;
+import org.apache.impala.thrift.TJsonBinaryFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -54,6 +55,10 @@ public class HdfsStorageDescriptor {
   // Serde parameters that are recognized by table writers.
   private static final String BLOCK_SIZE = "blocksize";
 
+  // Hive JSON SerDe constants 'JsonSerDe.BINARY_FORMAT'.
+  // 
https://github.com/apache/hive/blob/63e6aa519273342eb75740d960ed3d42167326ea/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java#L56
+  public static final String JSON_BINARY_FORMAT = "json.binary.format";
+
   // Important: don't change the ordering of these keys - if e.g. FIELD_DELIM 
is not
   // found, the value of LINE_DELIM is used, so LINE_DELIM must be found first.
   // Package visible for testing.
@@ -89,6 +94,7 @@ public class HdfsStorageDescriptor {
   private final byte escapeChar_;
   private final byte quoteChar_;
   private final int blockSize_;
+  private final TJsonBinaryFormat jsonBinaryFormat_;
 
   /**
    * Returns a map from delimiter key to a single delimiter character,
@@ -165,7 +171,7 @@ public class HdfsStorageDescriptor {
 
   private HdfsStorageDescriptor(String tblName, HdfsFileFormat fileFormat, 
byte lineDelim,
       byte fieldDelim, byte collectionDelim, byte mapKeyDelim, byte escapeChar,
-      byte quoteChar, int blockSize) {
+      byte quoteChar, int blockSize, TJsonBinaryFormat jsonBinaryFormat) {
     this.fileFormat_ = fileFormat;
     this.lineDelim_ = lineDelim;
     this.fieldDelim_ = fieldDelim;
@@ -173,6 +179,7 @@ public class HdfsStorageDescriptor {
     this.mapKeyDelim_ = mapKeyDelim;
     this.quoteChar_ = quoteChar;
     this.blockSize_ = blockSize;
+    this.jsonBinaryFormat_ = jsonBinaryFormat;
 
     // You can set the escape character as a tuple or row delim.  Empirically,
     // this is ignored by hive.
@@ -226,6 +233,20 @@ public class HdfsStorageDescriptor {
       blockSize = Integer.parseInt(blockValue);
     }
 
+    TJsonBinaryFormat jsonBinaryFormat;
+    // TODO: IMPALA-13748, also consider table properties and table level 
serde properties
+    String specificFormat = parameters.get(JSON_BINARY_FORMAT);
+    if (specificFormat == null) {
+      jsonBinaryFormat = TJsonBinaryFormat.NONE;
+    } else if ("base64".equalsIgnoreCase(specificFormat)) {
+      jsonBinaryFormat = TJsonBinaryFormat.BASE64;
+    } else if ("rawstring".equalsIgnoreCase(specificFormat)) {
+      jsonBinaryFormat = TJsonBinaryFormat.RAWSTRING;
+    } else {
+      // Use null to indicate an invalid format.
+      jsonBinaryFormat = null;
+    }
+
     try {
       return INTERNER.intern(new HdfsStorageDescriptor(tblName,
           HdfsFileFormat.fromJavaClassName(
@@ -236,7 +257,7 @@ public class HdfsStorageDescriptor {
           delimMap.get(serdeConstants.MAPKEY_DELIM),
           delimMap.get(serdeConstants.ESCAPE_CHAR),
           delimMap.get(serdeConstants.QUOTE_CHAR),
-          blockSize));
+          blockSize, jsonBinaryFormat));
     } catch (IllegalArgumentException ex) {
       // Thrown by fromJavaClassName
       throw new InvalidStorageDescriptorException(ex);
@@ -248,18 +269,20 @@ public class HdfsStorageDescriptor {
     return INTERNER.intern(new HdfsStorageDescriptor(tableName,
         HdfsFileFormat.fromThrift(tDesc.getFileFormat()), tDesc.lineDelim,
         tDesc.fieldDelim, tDesc.collectionDelim, tDesc.mapKeyDelim, 
tDesc.escapeChar,
-        tDesc.quoteChar, tDesc.blockSize));
+        tDesc.quoteChar, tDesc.blockSize, tDesc.isSetJsonBinaryFormat() ?
+            tDesc.getJsonBinaryFormat() : null));
   }
 
   public THdfsStorageDescriptor toThrift() {
     return new THdfsStorageDescriptor(lineDelim_, fieldDelim_, 
collectionDelim_,
-        mapKeyDelim_, escapeChar_, quoteChar_, fileFormat_.toThrift(), 
blockSize_);
+        mapKeyDelim_, escapeChar_, quoteChar_, fileFormat_.toThrift(), 
blockSize_)
+        .setJsonBinaryFormat(jsonBinaryFormat_);
   }
 
   public HdfsStorageDescriptor cloneWithChangedFileFormat(HdfsFileFormat 
newFormat) {
     return INTERNER.intern(new HdfsStorageDescriptor(
         "<unknown>", newFormat, lineDelim_, fieldDelim_, collectionDelim_, 
mapKeyDelim_,
-        escapeChar_, quoteChar_, blockSize_));
+        escapeChar_, quoteChar_, blockSize_, jsonBinaryFormat_));
   }
 
   public byte getLineDelim() { return lineDelim_; }
@@ -269,11 +292,12 @@ public class HdfsStorageDescriptor {
   public byte getEscapeChar() { return escapeChar_; }
   public HdfsFileFormat getFileFormat() { return fileFormat_; }
   public int getBlockSize() { return blockSize_; }
+  public TJsonBinaryFormat getJsonBinaryFormat() { return jsonBinaryFormat_; }
 
   @Override
   public int hashCode() {
     return Objects.hash(blockSize_, collectionDelim_, escapeChar_, fieldDelim_,
-        fileFormat_, lineDelim_, mapKeyDelim_, quoteChar_);
+        fileFormat_, lineDelim_, mapKeyDelim_, quoteChar_, jsonBinaryFormat_);
   }
 
   @Override
@@ -290,6 +314,7 @@ public class HdfsStorageDescriptor {
     if (lineDelim_ != other.lineDelim_) return false;
     if (mapKeyDelim_ != other.mapKeyDelim_) return false;
     if (quoteChar_ != other.quoteChar_) return false;
+    if (jsonBinaryFormat_ != other.jsonBinaryFormat_) return false;
     return true;
   }
 }
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java 
b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index b16f48a14..eb026b717 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -61,6 +61,7 @@ import org.apache.impala.catalog.FileDescriptor;
 import org.apache.impala.catalog.HdfsCompression;
 import org.apache.impala.catalog.HdfsFileFormat;
 import org.apache.impala.catalog.IcebergFileDescriptor;
+import org.apache.impala.catalog.HdfsStorageDescriptor;
 import org.apache.impala.catalog.PrimitiveType;
 import org.apache.impala.catalog.ScalarType;
 import org.apache.impala.catalog.Table;
@@ -80,6 +81,7 @@ import org.apache.impala.service.BackendConfig;
 import org.apache.impala.thrift.TExplainLevel;
 import org.apache.impala.thrift.TExpr;
 import org.apache.impala.thrift.TFileSplitGeneratorSpec;
+import org.apache.impala.thrift.TJsonBinaryFormat;
 import org.apache.impala.thrift.THdfsFileSplit;
 import org.apache.impala.thrift.THdfsScanNode;
 import org.apache.impala.thrift.TNetworkAddress;
@@ -218,7 +220,7 @@ public class HdfsScanNode extends ScanNode {
   private static final double COST_COEFFICIENT_COLUMNAR_PREDICATE_EVAL = 
0.0281;
   private static final double COST_COEFFICIENT_NONCOLUMNAR_PREDICATE_EVAL = 
0.0549;
 
-  //An estimate of the width of a row when the information is not available.
+  // An estimate of the width of a row when the information is not available.
   private double DEFAULT_ROW_WIDTH_ESTIMATE = 1.0;
 
   private final FeFsTable tbl_;
@@ -432,6 +434,13 @@ public class HdfsScanNode extends ScanNode {
     return fileFormats.contains(HdfsFileFormat.ORC);
   }
 
+  /**
+   * Returns true if this scan node contains JSON.
+   */
+  private boolean hasJson(Set<HdfsFileFormat> fileFormats) {
+    return fileFormats.contains(HdfsFileFormat.JSON);
+  }
+
   /**
    * Returns true if the count(*) optimization can be applied to the query 
block
    * of this scan node.
@@ -476,6 +485,10 @@ public class HdfsScanNode extends ScanNode {
       computeDictionaryFilterConjuncts(analyzer);
     }
 
+    if (hasJson(fileFormats_)) {
+      checkJsonBinaryFormat(analyzer);
+    }
+
     computeMemLayout(analyzer);
 
     // This is towards the end, so that it can take all conjuncts, scan ranges 
and mem
@@ -535,6 +548,48 @@ public class HdfsScanNode extends ScanNode {
     }
   }
 
+  /**
+   * Check if there are any binary columns in the table, and if so,
+   * check json binary format could be determined from properties or query 
options.
+   */
+  private void checkJsonBinaryFormat(Analyzer analyzer) throws 
ImpalaRuntimeException {
+    boolean hasBinaryColumns = false;
+    for (SlotDescriptor slotDesc : desc_.getSlots()) {
+      if (slotDesc.getType().isBinary()) {
+        hasBinaryColumns = true;
+        break;
+      }
+    }
+    if (!hasBinaryColumns) return;
+
+    TJsonBinaryFormat defaultFormat = 
analyzer.getQueryOptions().getJson_binary_format();
+    for (FeFsPartition partition : partitions_) {
+      TJsonBinaryFormat specificFormat = partition.getInputFormatDescriptor()
+          .getJsonBinaryFormat();
+      if (specificFormat == null) {
+        // Null indicates that the property is invalid.
+        throw new ImpalaRuntimeException(String.format("Invalid serde property 
" +
+            "'%s' for scanning binary column of json table '%s'%s. Valid 
values are " +
+            "'base64' or 'rawstring'.",
+            HdfsStorageDescriptor.JSON_BINARY_FORMAT, tbl_.getFullName(),
+            partition.getPartitionName().isEmpty() ? "" :
+            " partition '" + partition.getPartitionName() + "'"));
+      }
+
+      // If the property is not set, use the query option.
+      if (specificFormat == TJsonBinaryFormat.NONE) {
+        if (defaultFormat != TJsonBinaryFormat.NONE) continue;
+        // If the query option is not set either, throw an error.
+        throw new ImpalaRuntimeException(String.format("No valid serde 
properties " +
+            "'%s' or query option 'json_binary_format' ('base64' or 
'rawstring') " +
+            "provided for scanning binary column of json table '%s'%s.",
+            HdfsStorageDescriptor.JSON_BINARY_FORMAT, tbl_.getFullName(),
+            partition.getPartitionName().isEmpty() ? "" :
+            " partition '" + partition.getPartitionName() + "'"));
+      }
+    }
+  }
+
   /**
    * Throws NotImplementedException if we do not support scanning the 
partition.
    * Specifically:
@@ -542,7 +597,7 @@ public class HdfsScanNode extends ScanNode {
    * a partition that has a format for which we do not support complex types,
    * regardless of whether a complex-typed column is actually referenced
    * in the query.
-   * 2) if we are scanning compressed json file or the json scanner is 
disabled.
+   * 2) if the json scanner is disabled.
    */
   @Override
   protected void checkForSupportedFileFormats() throws NotImplementedException 
{
diff --git a/testdata/bin/generate-schema-statements.py 
b/testdata/bin/generate-schema-statements.py
index 9a5bcc53f..8ceccbb2c 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -823,6 +823,10 @@ def generate_statements(output_name, test_vectors, 
sections,
         insert = None
         insert_hive = eval_section(section["DEPENDENT_LOAD_ACID"])
 
+      if file_format == 'json' and section["DEPENDENT_LOAD_JSON"]:
+        insert = None
+        insert_hive = eval_section(section["DEPENDENT_LOAD_JSON"])
+
       columns = eval_section(section['COLUMNS']).strip()
       partition_columns = section['PARTITION_COLUMNS'].strip()
       row_format = section['ROW_FORMAT'].strip()
@@ -1012,8 +1016,9 @@ def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 
'PARTITION_COLUMNS',
                          'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU', 
'COMMENT',
                          'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 
'DEPENDENT_LOAD_HIVE',
-                         'DEPENDENT_LOAD_ACID', 'LOAD', 'ALTER', 
'HBASE_COLUMN_FAMILIES',
-                         'TABLE_PROPERTIES', 'HBASE_REGION_SPLITS', 
'HIVE_MAJOR_VERSION']
+                         'DEPENDENT_LOAD_ACID', 'DEPENDENT_LOAD_JSON', 'LOAD', 
'ALTER',
+                         'HBASE_COLUMN_FAMILIES', 'TABLE_PROPERTIES',
+                         'HBASE_REGION_SPLITS', 'HIVE_MAJOR_VERSION']
   return parse_test_file(file_name, VALID_SECTION_NAMES, 
skip_unknown_sections=False)
 
 
diff --git a/testdata/datasets/README b/testdata/datasets/README
index fe9ea7364..3eaea7d6a 100644
--- a/testdata/datasets/README
+++ b/testdata/datasets/README
@@ -73,6 +73,7 @@ The schema template SQL files have the following format:
   DEPENDENT_LOAD_KUDU
   DEPENDENT_LOAD_HIVE
   DEPENDENT_LOAD_ACID
+  DEPENDENT_LOAD_JSON
       Statements to be executed during the "dependent load" phase. These 
statements
       are run after the initial (base table) load is complete.
 
diff --git a/testdata/datasets/functional/functional_schema_template.sql 
b/testdata/datasets/functional/functional_schema_template.sql
index 641045ea7..2db189abc 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -4489,6 +4489,14 @@ LOAD DATA LOCAL INPATH 
'{impala_home}/testdata/data/binary_tbl/000000_0.txt' OVE
 ---- DEPENDENT_LOAD
 insert overwrite table {db_name}{db_suffix}.{table_name}
 select id, string_col, binary_col from functional.{table_name};
+---- DEPENDENT_LOAD_JSON
+-- The hive version we currently depend on (without HIVE-21240) does not 
support writing
+-- binary fields to json files in base64 format by default, so we need to 
convert it
+-- manually. For the same reason, we also need to manually set the
+-- 'json.binary.format' property.
+alter table {db_name}{db_suffix}.{table_name} set serdeproperties 
('json.binary.format'='base64');
+insert overwrite table {db_name}{db_suffix}.{table_name}
+select id, string_col, base64(binary_col) from functional.{table_name};
 ---- CREATE_KUDU
 DROP TABLE IF EXISTS {db_name}{db_suffix}.{table_name};
 CREATE TABLE {db_name}{db_suffix}.{table_name} (
@@ -4527,6 +4535,13 @@ select id, int_col, cast(string_col as binary),
        cast(case when id % 2 = 0 then date_string_col else NULL end as binary),
        year, month
     from functional.alltypes;
+---- DEPENDENT_LOAD_JSON
+insert overwrite table {db_name}{db_suffix}.{table_name} partition(year, month)
+select id, int_col, base64(cast(string_col as binary)),
+       base64(cast(case when id % 2 = 0 then date_string_col else NULL end as 
binary)),
+       year, month
+    from functional.alltypes;
+alter table {db_name}{db_suffix}.{table_name} partition(year) set 
serdeproperties ('json.binary.format'='base64');
 ---- CREATE_KUDU
 DROP TABLE IF EXISTS {db_name}{db_suffix}.{table_name};
 CREATE TABLE {db_name}{db_suffix}.{table_name} (
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test 
b/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test
new file mode 100644
index 000000000..34d78d4c9
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test
@@ -0,0 +1,110 @@
+====
+---- QUERY
+# No property or query option set, scanning binary columns will throw an 
exception.
+# Refresh is needed for serdeproperties changes to take effect, see 
IMPALA-13748.
+alter table binary_tbl unset serdeproperties ('json.binary.format');
+refresh binary_tbl;
+set json_binary_format=none;
+select id, string_col, cast(binary_col as string) from binary_tbl
+---- CATCH
+No valid serde properties 'json.binary.format' or query option 
'json_binary_format' ('base64' or 'rawstring') provided for scanning binary 
column of json table '$DATABASE.binary_tbl'.
+====
+---- QUERY
+# No binary column scanned, no exception thrown.
+set json_binary_format=none;
+select id, string_col from binary_tbl
+---- TYPES
+INT, STRING
+---- RESULTS:
+1,'ascii'
+2,'ascii'
+3,'null'
+4,'empty'
+5,'valid utf8'
+6,'valid utf8'
+7,'invalid utf8'
+8,'invalid utf8'
+====
+---- QUERY
+# No property set but query option set, scanning binary columns will use the 
query option.
+set json_binary_format=rawstring;
+select id, string_col, cast(binary_col as string) from binary_tbl
+---- TYPES
+INT, STRING, STRING
+---- RESULTS:
+1,'ascii','YmluYXJ5MQ=='
+2,'ascii','YmluYXJ5Mg=='
+3,'null','NULL'
+4,'empty',''
+5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
+6,'valid utf8','5L2g5aW9aGVsbG8='
+7,'invalid utf8','AP8A/w=='
+8,'invalid utf8','/0QzIhEA'
+====
+---- QUERY
+# If the property is set, it takes precedence over the query option, even if 
the value is
+# invalid.
+alter table binary_tbl set serdeproperties ('json.binary.format'='foobar');
+refresh binary_tbl;
+set json_binary_format=rawstring;
+select id, string_col, cast(binary_col as string) from binary_tbl
+---- CATCH
+Invalid serde property 'json.binary.format' for scanning binary column of json 
table '$DATABASE.binary_tbl'. Valid values are 'base64' or 'rawstring'.
+====
+---- QUERY
+# Setting the property to 'base64', scanning binary columns will use base64 
encoding,
+# rather than the query option 'rawstring'.
+alter table binary_tbl set serdeproperties ('json.binary.format'='base64');
+refresh binary_tbl;
+set json_binary_format=rawstring;
+select id, string_col, base64encode(cast(binary_col as string)) from binary_tbl
+---- TYPES
+INT, STRING, STRING
+---- RESULTS:
+1,'ascii','YmluYXJ5MQ=='
+2,'ascii','YmluYXJ5Mg=='
+3,'null','NULL'
+4,'empty',''
+5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
+6,'valid utf8','5L2g5aW9aGVsbG8='
+7,'invalid utf8','AP8A/w=='
+8,'invalid utf8','/0QzIhEA'
+====
+---- QUERY
+# Unsetting the property and setting the query option to 'base64' will have 
the same
+# effect.
+alter table binary_tbl unset serdeproperties ('json.binary.format');
+refresh binary_tbl;
+set json_binary_format=base64;
+select id, string_col, base64encode(cast(binary_col as string)) from binary_tbl
+---- TYPES
+INT, STRING, STRING
+---- RESULTS:
+1,'ascii','YmluYXJ5MQ=='
+2,'ascii','YmluYXJ5Mg=='
+3,'null','NULL'
+4,'empty',''
+5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
+6,'valid utf8','5L2g5aW9aGVsbG8='
+7,'invalid utf8','AP8A/w=='
+8,'invalid utf8','/0QzIhEA'
+====
+---- QUERY
+# Test scanning multiple json tables with different binary column formats
+# ('functional_json.binary_tbl' has 'base64').
+alter table binary_tbl set serdeproperties ('json.binary.format'='rawstring');
+refresh binary_tbl;
+select r.id, cast(r.binary_col as string), base64encode(cast(b.binary_col as 
string))
+from binary_tbl r join functional_json.binary_tbl b using (id)
+---- TYPES
+INT, STRING, STRING
+---- RESULTS:
+1,'YmluYXJ5MQ==','YmluYXJ5MQ=='
+2,'YmluYXJ5Mg==','YmluYXJ5Mg=='
+3,'NULL','NULL'
+4,'',''
+5,'w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M='
+6,'5L2g5aW9aGVsbG8=','5L2g5aW9aGVsbG8='
+7,'AP8A/w==','AP8A/w=='
+8,'/0QzIhEA','/0QzIhEA'
+====
\ No newline at end of file
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index 410980d8f..c044cf0a5 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1940,15 +1940,17 @@ class TestErasureCoding(ImpalaTestSuite):
 
 
 class TestBinaryType(ImpalaTestSuite):
-  @classmethod
-  def add_test_dimensions(cls):
-    super(TestBinaryType, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_constraint(
-      lambda v: v.get_value('table_format').file_format != 'json')
 
   def test_binary_type(self, vector):
     self.run_test_case('QueryTest/binary-type', vector)
 
+  def test_json_binary_format(self, vector, unique_database):
+    if vector.get_value('table_format').file_format != 'json':
+      pytest.skip()
+    test_tbl = unique_database + '.binary_tbl'
+    self.clone_table('functional_json.binary_tbl', test_tbl, False, vector)
+    self.run_test_case('QueryTest/json-binary-format', vector, unique_database)
+
 
 class TestBinaryInComplexType(ImpalaTestSuite):
   @classmethod

(impala) 02/02: IMPALA-12927: Support specifying format for reading JSON BINARY columns

Reply via email to