This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit faf322dd414c7d93bf5d6bd8e1a83113c19e9c7d Author: Eyizoha <[email protected]> AuthorDate: Thu Jan 2 16:39:46 2025 +0800 IMPALA-12927: Support specifying format for reading JSON BINARY columns Currently, Impala always assumes that the data in the binary columns of JSON tables is base64 encoded. However, before HIVE-21240, Hive wrote binary data to JSON tables without base64 encoding it, instead writing it as escaped strings. After HIVE-21240, Hive defaults to base64 encoding binary data when writing to JSON tables and introduces the serde property 'json.binary.format' to indicate the encoding method of binary data in JSON tables. To maintain consistency with Hive and avoid correctness issues caused by reading data in an incorrect manner, this patch also introduces the serde property 'json.binary.format' to specify the reading method for binary data in JSON tables. Currently, this property supports reading in either base64 or rawstring formats, same as Hive. Additionally, this patch introduces a query option 'json_binary_format' to achieve the same effect. This query option will only take effect for JSON tables where the serde property 'json.binary.format' is not set. The reading format of binary columns in JSON tables can be configured globally by setting the 'default_query_options'. It should be noted that the default value of 'json_binary_format' is 'NONE', and impala will prohibit reading binary columns of JSON tables that either have "no 'json.binary.format' set and 'json_binary_format' is 'NONE'" or "an invalid 'json.binary.format' value set", and will provide an error message to avoid using an incorrect format without the user noticing. Testing: - Enabled existing binary type E2E tests for JSON tables - Added new E2E test for 'json.binary.format' Change-Id: Idf61fa3afc0f33caa63fbc05393e975733165e82 Reviewed-on: http://gerrit.cloudera.org:8080/22289 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/exec/json/hdfs-json-scanner.cc | 9 +- be/src/exec/text-converter.cc | 5 +- be/src/exec/text-converter.h | 6 +- be/src/exec/text-converter.inline.h | 2 +- be/src/runtime/descriptors.cc | 1 + be/src/runtime/descriptors.h | 4 + be/src/service/query-options.cc | 7 ++ be/src/service/query-options.h | 3 +- common/thrift/CatalogObjects.thrift | 8 ++ common/thrift/ImpalaService.thrift | 7 ++ common/thrift/Query.thrift | 4 + .../impala/catalog/HdfsStorageDescriptor.java | 37 +++++-- .../org/apache/impala/planner/HdfsScanNode.java | 59 ++++++++++- testdata/bin/generate-schema-statements.py | 9 +- testdata/datasets/README | 1 + .../functional/functional_schema_template.sql | 15 +++ .../queries/QueryTest/json-binary-format.test | 110 +++++++++++++++++++++ tests/query_test/test_scanners.py | 12 ++- 18 files changed, 278 insertions(+), 21 deletions(-) diff --git a/be/src/exec/json/hdfs-json-scanner.cc b/be/src/exec/json/hdfs-json-scanner.cc index 46d26cc7e..ac8cdc1fb 100644 --- a/be/src/exec/json/hdfs-json-scanner.cc +++ b/be/src/exec/json/hdfs-json-scanner.cc @@ -125,7 +125,14 @@ Status HdfsJsonScanner::InitNewRange() { schema.push_back(scan_node_->hdfs_table()->GetColumnDesc(slot).name()); } - text_converter_.reset(new TextConverter('\\', "", false, state_->strict_mode())); + auto json_binary_format = context_->partition_descriptor()->json_binary_format(); + if (json_binary_format == TJsonBinaryFormat::NONE) { + json_binary_format = state_->query_options().json_binary_format; + } + bool decode_binary = json_binary_format == TJsonBinaryFormat::BASE64; + + text_converter_.reset(new TextConverter('\\', "", false, state_->strict_mode(), + decode_binary)); json_parser_.reset(new JsonParser<HdfsJsonScanner>(schema, this)); json_parser_->ResetParser(); return Status::OK(); diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc index ca4fb0fc8..30bd0511d 100644 --- a/be/src/exec/text-converter.cc +++ b/be/src/exec/text-converter.cc @@ -35,11 +35,12 @@ using namespace impala; TextConverter::TextConverter(char escape_char, const string& null_col_val, - bool check_null, bool strict_mode) + bool check_null, bool strict_mode, bool decode_binary) : escape_char_(escape_char), null_col_val_(null_col_val), check_null_(check_null), - strict_mode_(strict_mode) { + strict_mode_(strict_mode), + decode_binary_(decode_binary) { } void TextConverter::UnescapeString(const char* src, char* dest, int* len, diff --git a/be/src/exec/text-converter.h b/be/src/exec/text-converter.h index f8b43f89b..da83895c9 100644 --- a/be/src/exec/text-converter.h +++ b/be/src/exec/text-converter.h @@ -49,7 +49,7 @@ class TextConverter { /// strict_mode: If set, numerical overflow/underflow are considered to be parse /// errors. TextConverter(char escape_char, const std::string& null_col_val, - bool check_null = true, bool strict_mode = false); + bool check_null = true, bool strict_mode = false, bool decode_binary = true); /// Converts slot data, of length 'len', into type of slot_desc, /// and writes the result into the tuples's slot. @@ -101,6 +101,10 @@ class TextConverter { /// Indicates whether numerical overflow/underflow are considered to be parse /// errors. bool strict_mode_; + /// Indicates whether we should use base64 decoding for binary data. + /// Currently, this is only set to false when reading JSON tables with rawstring format + /// binary columns. + bool decode_binary_; }; } diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h index 6455126be..6e1474607 100644 --- a/be/src/exec/text-converter.inline.h +++ b/be/src/exec/text-converter.inline.h @@ -67,7 +67,7 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup !(len != 0 && (copy_string || need_escape)); bool base64_decode = false; - if (type.IsBinaryType()) { + if (type.IsBinaryType() && decode_binary_) { base64_decode = true; reuse_data = false; int64_t out_len; diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc index a1002b342..eda1bfacc 100644 --- a/be/src/runtime/descriptors.cc +++ b/be/src/runtime/descriptors.cc @@ -219,6 +219,7 @@ HdfsPartitionDescriptor::HdfsPartitionDescriptor( escape_char_ = sd.escapeChar; block_size_ = sd.blockSize; file_format_ = sd.fileFormat; + json_binary_format_ = sd.jsonBinaryFormat; DecompressLocation(thrift_table, thrift_partition, &location_); } diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h index 10c42c664..2e432d8bc 100644 --- a/be/src/runtime/descriptors.h +++ b/be/src/runtime/descriptors.h @@ -401,6 +401,7 @@ class HdfsPartitionDescriptor { int block_size() const { return block_size_; } const std::string& location() const { return location_; } int64_t id() const { return id_; } + TJsonBinaryFormat::type json_binary_format() const { return json_binary_format_; } std::string DebugString() const; /// It is safe to call the returned expr evaluators concurrently from multiple @@ -437,6 +438,9 @@ class HdfsPartitionDescriptor { /// The format (e.g. text, sequence file etc.) of data in the files in this partition THdfsFileFormat::type file_format_; + + /// The format for reading JSON binary columns, used only for JSON tables. + TJsonBinaryFormat::type json_binary_format_; }; class HdfsTableDescriptor : public TableDescriptor { diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index d71dbc6b9..a409cd0e2 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -1375,6 +1375,13 @@ Status impala::SetQueryOption(TImpalaQueryOptions::type option, const string& va query_options->__set_skip_unneeded_updates_col_limit(int32_t_val); break; } + case TImpalaQueryOptions::JSON_BINARY_FORMAT: { + TJsonBinaryFormat::type enum_type; + RETURN_IF_ERROR(GetThriftEnum(value, "Json binary format", + _TJsonBinaryFormat_VALUES_TO_NAMES, &enum_type)); + query_options->__set_json_binary_format(enum_type); + break; + } case TImpalaQueryOptions::MEM_ESTIMATE_SCALE_FOR_SPILLING_OPERATOR: { double double_val = 0.0f; RETURN_IF_ERROR(QueryOptionParser::ParseAndCheckInclusiveRange<double>( diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index 5e21229e1..68c7737a8 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -51,7 +51,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> // plus one. Thus, the second argument to the DCHECK has to be updated every // time we add or remove a query option to/from the enum TImpalaQueryOptions. constexpr unsigned NUM_QUERY_OPTIONS = - TImpalaQueryOptions::USE_CALCITE_PLANNER + 1; + TImpalaQueryOptions::JSON_BINARY_FORMAT + 1; #define QUERY_OPTS_TABLE \ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(), NUM_QUERY_OPTIONS); \ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \ @@ -376,6 +376,7 @@ constexpr unsigned NUM_QUERY_OPTIONS = MEM_ESTIMATE_SCALE_FOR_SPILLING_OPERATOR, TQueryOptionLevel::DEVELOPMENT) \ QUERY_OPT_FN(use_calcite_planner, USE_CALCITE_PLANNER, \ TQueryOptionLevel::ADVANCED) \ + QUERY_OPT_FN(json_binary_format, JSON_BINARY_FORMAT, TQueryOptionLevel::REGULAR) \ ; /// Enforce practical limits on some query options to avoid undesired query state. diff --git a/common/thrift/CatalogObjects.thrift b/common/thrift/CatalogObjects.thrift index 57b1ddf18..03ebdfdfa 100644 --- a/common/thrift/CatalogObjects.thrift +++ b/common/thrift/CatalogObjects.thrift @@ -178,6 +178,13 @@ enum TBucketType { HASH = 1 } +// Options for JSON binary format to determine how binary data is encoded in JSON. +enum TJsonBinaryFormat { + NONE = 0 + BASE64 = 1 + RAWSTRING = 2 +} + struct TCompressionCodec { // Compression codec 1: required THdfsCompression codec @@ -357,6 +364,7 @@ struct THdfsStorageDescriptor { 6: required byte quoteChar 7: required THdfsFileFormat fileFormat 8: required i32 blockSize + 9: optional TJsonBinaryFormat jsonBinaryFormat } // Represents an HDFS partition diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index 43886c001..ef6a7efbc 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -1026,6 +1026,13 @@ enum TImpalaQueryOptions { // If True, use the Calcite planner for compilation USE_CALCITE_PLANNER = 191 + + // The default format for reading JSON binary columns, can be overridden by table + // property 'json.binary.format' (if set). The valid values are: + // NONE - default value, means unspecified format, depends on the table property. + // BASE64 - the json binary data is read as base64 encoded string. + // RAWSTRING - the json binary data is read as raw string. + JSON_BINARY_FORMAT = 192 } // The summary of a DML statement. diff --git a/common/thrift/Query.thrift b/common/thrift/Query.thrift index 8e3fc5969..bd2181348 100644 --- a/common/thrift/Query.thrift +++ b/common/thrift/Query.thrift @@ -778,6 +778,10 @@ struct TQueryOptions { // See comment in ImpalaService.thrift 192: optional bool use_calcite_planner = false; + + // See comment in ImpalaService.thrift + 193: optional CatalogObjects.TJsonBinaryFormat json_binary_format = + TJsonBinaryFormat.NONE; } // Impala currently has three types of sessions: Beeswax, HiveServer2 and external diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java index 5402fae11..6ce5cc871 100644 --- a/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java +++ b/fe/src/main/java/org/apache/impala/catalog/HdfsStorageDescriptor.java @@ -26,6 +26,7 @@ import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.impala.thrift.THdfsPartition; import org.apache.impala.thrift.THdfsStorageDescriptor; +import org.apache.impala.thrift.TJsonBinaryFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -54,6 +55,10 @@ public class HdfsStorageDescriptor { // Serde parameters that are recognized by table writers. private static final String BLOCK_SIZE = "blocksize"; + // Hive JSON SerDe constants 'JsonSerDe.BINARY_FORMAT'. + // https://github.com/apache/hive/blob/63e6aa519273342eb75740d960ed3d42167326ea/serde/src/java/org/apache/hadoop/hive/serde2/JsonSerDe.java#L56 + public static final String JSON_BINARY_FORMAT = "json.binary.format"; + // Important: don't change the ordering of these keys - if e.g. FIELD_DELIM is not // found, the value of LINE_DELIM is used, so LINE_DELIM must be found first. // Package visible for testing. @@ -89,6 +94,7 @@ public class HdfsStorageDescriptor { private final byte escapeChar_; private final byte quoteChar_; private final int blockSize_; + private final TJsonBinaryFormat jsonBinaryFormat_; /** * Returns a map from delimiter key to a single delimiter character, @@ -165,7 +171,7 @@ public class HdfsStorageDescriptor { private HdfsStorageDescriptor(String tblName, HdfsFileFormat fileFormat, byte lineDelim, byte fieldDelim, byte collectionDelim, byte mapKeyDelim, byte escapeChar, - byte quoteChar, int blockSize) { + byte quoteChar, int blockSize, TJsonBinaryFormat jsonBinaryFormat) { this.fileFormat_ = fileFormat; this.lineDelim_ = lineDelim; this.fieldDelim_ = fieldDelim; @@ -173,6 +179,7 @@ public class HdfsStorageDescriptor { this.mapKeyDelim_ = mapKeyDelim; this.quoteChar_ = quoteChar; this.blockSize_ = blockSize; + this.jsonBinaryFormat_ = jsonBinaryFormat; // You can set the escape character as a tuple or row delim. Empirically, // this is ignored by hive. @@ -226,6 +233,20 @@ public class HdfsStorageDescriptor { blockSize = Integer.parseInt(blockValue); } + TJsonBinaryFormat jsonBinaryFormat; + // TODO: IMPALA-13748, also consider table properties and table level serde properties + String specificFormat = parameters.get(JSON_BINARY_FORMAT); + if (specificFormat == null) { + jsonBinaryFormat = TJsonBinaryFormat.NONE; + } else if ("base64".equalsIgnoreCase(specificFormat)) { + jsonBinaryFormat = TJsonBinaryFormat.BASE64; + } else if ("rawstring".equalsIgnoreCase(specificFormat)) { + jsonBinaryFormat = TJsonBinaryFormat.RAWSTRING; + } else { + // Use null to indicate an invalid format. + jsonBinaryFormat = null; + } + try { return INTERNER.intern(new HdfsStorageDescriptor(tblName, HdfsFileFormat.fromJavaClassName( @@ -236,7 +257,7 @@ public class HdfsStorageDescriptor { delimMap.get(serdeConstants.MAPKEY_DELIM), delimMap.get(serdeConstants.ESCAPE_CHAR), delimMap.get(serdeConstants.QUOTE_CHAR), - blockSize)); + blockSize, jsonBinaryFormat)); } catch (IllegalArgumentException ex) { // Thrown by fromJavaClassName throw new InvalidStorageDescriptorException(ex); @@ -248,18 +269,20 @@ public class HdfsStorageDescriptor { return INTERNER.intern(new HdfsStorageDescriptor(tableName, HdfsFileFormat.fromThrift(tDesc.getFileFormat()), tDesc.lineDelim, tDesc.fieldDelim, tDesc.collectionDelim, tDesc.mapKeyDelim, tDesc.escapeChar, - tDesc.quoteChar, tDesc.blockSize)); + tDesc.quoteChar, tDesc.blockSize, tDesc.isSetJsonBinaryFormat() ? + tDesc.getJsonBinaryFormat() : null)); } public THdfsStorageDescriptor toThrift() { return new THdfsStorageDescriptor(lineDelim_, fieldDelim_, collectionDelim_, - mapKeyDelim_, escapeChar_, quoteChar_, fileFormat_.toThrift(), blockSize_); + mapKeyDelim_, escapeChar_, quoteChar_, fileFormat_.toThrift(), blockSize_) + .setJsonBinaryFormat(jsonBinaryFormat_); } public HdfsStorageDescriptor cloneWithChangedFileFormat(HdfsFileFormat newFormat) { return INTERNER.intern(new HdfsStorageDescriptor( "<unknown>", newFormat, lineDelim_, fieldDelim_, collectionDelim_, mapKeyDelim_, - escapeChar_, quoteChar_, blockSize_)); + escapeChar_, quoteChar_, blockSize_, jsonBinaryFormat_)); } public byte getLineDelim() { return lineDelim_; } @@ -269,11 +292,12 @@ public class HdfsStorageDescriptor { public byte getEscapeChar() { return escapeChar_; } public HdfsFileFormat getFileFormat() { return fileFormat_; } public int getBlockSize() { return blockSize_; } + public TJsonBinaryFormat getJsonBinaryFormat() { return jsonBinaryFormat_; } @Override public int hashCode() { return Objects.hash(blockSize_, collectionDelim_, escapeChar_, fieldDelim_, - fileFormat_, lineDelim_, mapKeyDelim_, quoteChar_); + fileFormat_, lineDelim_, mapKeyDelim_, quoteChar_, jsonBinaryFormat_); } @Override @@ -290,6 +314,7 @@ public class HdfsStorageDescriptor { if (lineDelim_ != other.lineDelim_) return false; if (mapKeyDelim_ != other.mapKeyDelim_) return false; if (quoteChar_ != other.quoteChar_) return false; + if (jsonBinaryFormat_ != other.jsonBinaryFormat_) return false; return true; } } diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java index b16f48a14..eb026b717 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java @@ -61,6 +61,7 @@ import org.apache.impala.catalog.FileDescriptor; import org.apache.impala.catalog.HdfsCompression; import org.apache.impala.catalog.HdfsFileFormat; import org.apache.impala.catalog.IcebergFileDescriptor; +import org.apache.impala.catalog.HdfsStorageDescriptor; import org.apache.impala.catalog.PrimitiveType; import org.apache.impala.catalog.ScalarType; import org.apache.impala.catalog.Table; @@ -80,6 +81,7 @@ import org.apache.impala.service.BackendConfig; import org.apache.impala.thrift.TExplainLevel; import org.apache.impala.thrift.TExpr; import org.apache.impala.thrift.TFileSplitGeneratorSpec; +import org.apache.impala.thrift.TJsonBinaryFormat; import org.apache.impala.thrift.THdfsFileSplit; import org.apache.impala.thrift.THdfsScanNode; import org.apache.impala.thrift.TNetworkAddress; @@ -218,7 +220,7 @@ public class HdfsScanNode extends ScanNode { private static final double COST_COEFFICIENT_COLUMNAR_PREDICATE_EVAL = 0.0281; private static final double COST_COEFFICIENT_NONCOLUMNAR_PREDICATE_EVAL = 0.0549; - //An estimate of the width of a row when the information is not available. + // An estimate of the width of a row when the information is not available. private double DEFAULT_ROW_WIDTH_ESTIMATE = 1.0; private final FeFsTable tbl_; @@ -432,6 +434,13 @@ public class HdfsScanNode extends ScanNode { return fileFormats.contains(HdfsFileFormat.ORC); } + /** + * Returns true if this scan node contains JSON. + */ + private boolean hasJson(Set<HdfsFileFormat> fileFormats) { + return fileFormats.contains(HdfsFileFormat.JSON); + } + /** * Returns true if the count(*) optimization can be applied to the query block * of this scan node. @@ -476,6 +485,10 @@ public class HdfsScanNode extends ScanNode { computeDictionaryFilterConjuncts(analyzer); } + if (hasJson(fileFormats_)) { + checkJsonBinaryFormat(analyzer); + } + computeMemLayout(analyzer); // This is towards the end, so that it can take all conjuncts, scan ranges and mem @@ -535,6 +548,48 @@ public class HdfsScanNode extends ScanNode { } } + /** + * Check if there are any binary columns in the table, and if so, + * check json binary format could be determined from properties or query options. + */ + private void checkJsonBinaryFormat(Analyzer analyzer) throws ImpalaRuntimeException { + boolean hasBinaryColumns = false; + for (SlotDescriptor slotDesc : desc_.getSlots()) { + if (slotDesc.getType().isBinary()) { + hasBinaryColumns = true; + break; + } + } + if (!hasBinaryColumns) return; + + TJsonBinaryFormat defaultFormat = analyzer.getQueryOptions().getJson_binary_format(); + for (FeFsPartition partition : partitions_) { + TJsonBinaryFormat specificFormat = partition.getInputFormatDescriptor() + .getJsonBinaryFormat(); + if (specificFormat == null) { + // Null indicates that the property is invalid. + throw new ImpalaRuntimeException(String.format("Invalid serde property " + + "'%s' for scanning binary column of json table '%s'%s. Valid values are " + + "'base64' or 'rawstring'.", + HdfsStorageDescriptor.JSON_BINARY_FORMAT, tbl_.getFullName(), + partition.getPartitionName().isEmpty() ? "" : + " partition '" + partition.getPartitionName() + "'")); + } + + // If the property is not set, use the query option. + if (specificFormat == TJsonBinaryFormat.NONE) { + if (defaultFormat != TJsonBinaryFormat.NONE) continue; + // If the query option is not set either, throw an error. + throw new ImpalaRuntimeException(String.format("No valid serde properties " + + "'%s' or query option 'json_binary_format' ('base64' or 'rawstring') " + + "provided for scanning binary column of json table '%s'%s.", + HdfsStorageDescriptor.JSON_BINARY_FORMAT, tbl_.getFullName(), + partition.getPartitionName().isEmpty() ? "" : + " partition '" + partition.getPartitionName() + "'")); + } + } + } + /** * Throws NotImplementedException if we do not support scanning the partition. * Specifically: @@ -542,7 +597,7 @@ public class HdfsScanNode extends ScanNode { * a partition that has a format for which we do not support complex types, * regardless of whether a complex-typed column is actually referenced * in the query. - * 2) if we are scanning compressed json file or the json scanner is disabled. + * 2) if the json scanner is disabled. */ @Override protected void checkForSupportedFileFormats() throws NotImplementedException { diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 9a5bcc53f..8ceccbb2c 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -823,6 +823,10 @@ def generate_statements(output_name, test_vectors, sections, insert = None insert_hive = eval_section(section["DEPENDENT_LOAD_ACID"]) + if file_format == 'json' and section["DEPENDENT_LOAD_JSON"]: + insert = None + insert_hive = eval_section(section["DEPENDENT_LOAD_JSON"]) + columns = eval_section(section['COLUMNS']).strip() partition_columns = section['PARTITION_COLUMNS'].strip() row_format = section['ROW_FORMAT'].strip() @@ -1012,8 +1016,9 @@ def parse_schema_template_file(file_name): VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS', 'ROW_FORMAT', 'CREATE', 'CREATE_HIVE', 'CREATE_KUDU', 'COMMENT', 'DEPENDENT_LOAD', 'DEPENDENT_LOAD_KUDU', 'DEPENDENT_LOAD_HIVE', - 'DEPENDENT_LOAD_ACID', 'LOAD', 'ALTER', 'HBASE_COLUMN_FAMILIES', - 'TABLE_PROPERTIES', 'HBASE_REGION_SPLITS', 'HIVE_MAJOR_VERSION'] + 'DEPENDENT_LOAD_ACID', 'DEPENDENT_LOAD_JSON', 'LOAD', 'ALTER', + 'HBASE_COLUMN_FAMILIES', 'TABLE_PROPERTIES', + 'HBASE_REGION_SPLITS', 'HIVE_MAJOR_VERSION'] return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False) diff --git a/testdata/datasets/README b/testdata/datasets/README index fe9ea7364..3eaea7d6a 100644 --- a/testdata/datasets/README +++ b/testdata/datasets/README @@ -73,6 +73,7 @@ The schema template SQL files have the following format: DEPENDENT_LOAD_KUDU DEPENDENT_LOAD_HIVE DEPENDENT_LOAD_ACID + DEPENDENT_LOAD_JSON Statements to be executed during the "dependent load" phase. These statements are run after the initial (base table) load is complete. diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 641045ea7..2db189abc 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -4489,6 +4489,14 @@ LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/binary_tbl/000000_0.txt' OVE ---- DEPENDENT_LOAD insert overwrite table {db_name}{db_suffix}.{table_name} select id, string_col, binary_col from functional.{table_name}; +---- DEPENDENT_LOAD_JSON +-- The hive version we currently depend on (without HIVE-21240) does not support writing +-- binary fields to json files in base64 format by default, so we need to convert it +-- manually. For the same reason, we also need to manually set the +-- 'json.binary.format' property. +alter table {db_name}{db_suffix}.{table_name} set serdeproperties ('json.binary.format'='base64'); +insert overwrite table {db_name}{db_suffix}.{table_name} +select id, string_col, base64(binary_col) from functional.{table_name}; ---- CREATE_KUDU DROP TABLE IF EXISTS {db_name}{db_suffix}.{table_name}; CREATE TABLE {db_name}{db_suffix}.{table_name} ( @@ -4527,6 +4535,13 @@ select id, int_col, cast(string_col as binary), cast(case when id % 2 = 0 then date_string_col else NULL end as binary), year, month from functional.alltypes; +---- DEPENDENT_LOAD_JSON +insert overwrite table {db_name}{db_suffix}.{table_name} partition(year, month) +select id, int_col, base64(cast(string_col as binary)), + base64(cast(case when id % 2 = 0 then date_string_col else NULL end as binary)), + year, month + from functional.alltypes; +alter table {db_name}{db_suffix}.{table_name} partition(year) set serdeproperties ('json.binary.format'='base64'); ---- CREATE_KUDU DROP TABLE IF EXISTS {db_name}{db_suffix}.{table_name}; CREATE TABLE {db_name}{db_suffix}.{table_name} ( diff --git a/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test b/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test new file mode 100644 index 000000000..34d78d4c9 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/json-binary-format.test @@ -0,0 +1,110 @@ +==== +---- QUERY +# No property or query option set, scanning binary columns will throw an exception. +# Refresh is needed for serdeproperties changes to take effect, see IMPALA-13748. +alter table binary_tbl unset serdeproperties ('json.binary.format'); +refresh binary_tbl; +set json_binary_format=none; +select id, string_col, cast(binary_col as string) from binary_tbl +---- CATCH +No valid serde properties 'json.binary.format' or query option 'json_binary_format' ('base64' or 'rawstring') provided for scanning binary column of json table '$DATABASE.binary_tbl'. +==== +---- QUERY +# No binary column scanned, no exception thrown. +set json_binary_format=none; +select id, string_col from binary_tbl +---- TYPES +INT, STRING +---- RESULTS: +1,'ascii' +2,'ascii' +3,'null' +4,'empty' +5,'valid utf8' +6,'valid utf8' +7,'invalid utf8' +8,'invalid utf8' +==== +---- QUERY +# No property set but query option set, scanning binary columns will use the query option. +set json_binary_format=rawstring; +select id, string_col, cast(binary_col as string) from binary_tbl +---- TYPES +INT, STRING, STRING +---- RESULTS: +1,'ascii','YmluYXJ5MQ==' +2,'ascii','YmluYXJ5Mg==' +3,'null','NULL' +4,'empty','' +5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=' +6,'valid utf8','5L2g5aW9aGVsbG8=' +7,'invalid utf8','AP8A/w==' +8,'invalid utf8','/0QzIhEA' +==== +---- QUERY +# If the property is set, it takes precedence over the query option, even if the value is +# invalid. +alter table binary_tbl set serdeproperties ('json.binary.format'='foobar'); +refresh binary_tbl; +set json_binary_format=rawstring; +select id, string_col, cast(binary_col as string) from binary_tbl +---- CATCH +Invalid serde property 'json.binary.format' for scanning binary column of json table '$DATABASE.binary_tbl'. Valid values are 'base64' or 'rawstring'. +==== +---- QUERY +# Setting the property to 'base64', scanning binary columns will use base64 encoding, +# rather than the query option 'rawstring'. +alter table binary_tbl set serdeproperties ('json.binary.format'='base64'); +refresh binary_tbl; +set json_binary_format=rawstring; +select id, string_col, base64encode(cast(binary_col as string)) from binary_tbl +---- TYPES +INT, STRING, STRING +---- RESULTS: +1,'ascii','YmluYXJ5MQ==' +2,'ascii','YmluYXJ5Mg==' +3,'null','NULL' +4,'empty','' +5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=' +6,'valid utf8','5L2g5aW9aGVsbG8=' +7,'invalid utf8','AP8A/w==' +8,'invalid utf8','/0QzIhEA' +==== +---- QUERY +# Unsetting the property and setting the query option to 'base64' will have the same +# effect. +alter table binary_tbl unset serdeproperties ('json.binary.format'); +refresh binary_tbl; +set json_binary_format=base64; +select id, string_col, base64encode(cast(binary_col as string)) from binary_tbl +---- TYPES +INT, STRING, STRING +---- RESULTS: +1,'ascii','YmluYXJ5MQ==' +2,'ascii','YmluYXJ5Mg==' +3,'null','NULL' +4,'empty','' +5,'valid utf8','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=' +6,'valid utf8','5L2g5aW9aGVsbG8=' +7,'invalid utf8','AP8A/w==' +8,'invalid utf8','/0QzIhEA' +==== +---- QUERY +# Test scanning multiple json tables with different binary column formats +# ('functional_json.binary_tbl' has 'base64'). +alter table binary_tbl set serdeproperties ('json.binary.format'='rawstring'); +refresh binary_tbl; +select r.id, cast(r.binary_col as string), base64encode(cast(b.binary_col as string)) +from binary_tbl r join functional_json.binary_tbl b using (id) +---- TYPES +INT, STRING, STRING +---- RESULTS: +1,'YmluYXJ5MQ==','YmluYXJ5MQ==' +2,'YmluYXJ5Mg==','YmluYXJ5Mg==' +3,'NULL','NULL' +4,'','' +5,'w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=','w6FydsOtenTFsXLFkXTDvGvDtnJmw7pyw7M=' +6,'5L2g5aW9aGVsbG8=','5L2g5aW9aGVsbG8=' +7,'AP8A/w==','AP8A/w==' +8,'/0QzIhEA','/0QzIhEA' +==== \ No newline at end of file diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 410980d8f..c044cf0a5 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -1940,15 +1940,17 @@ class TestErasureCoding(ImpalaTestSuite): class TestBinaryType(ImpalaTestSuite): - @classmethod - def add_test_dimensions(cls): - super(TestBinaryType, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_constraint( - lambda v: v.get_value('table_format').file_format != 'json') def test_binary_type(self, vector): self.run_test_case('QueryTest/binary-type', vector) + def test_json_binary_format(self, vector, unique_database): + if vector.get_value('table_format').file_format != 'json': + pytest.skip() + test_tbl = unique_database + '.binary_tbl' + self.clone_table('functional_json.binary_tbl', test_tbl, False, vector) + self.run_test_case('QueryTest/json-binary-format', vector, unique_database) + class TestBinaryInComplexType(ImpalaTestSuite): @classmethod
