This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 4f5205a3d87821220a8dea9164a122788322884a
Author: Daniel Becker <[email protected]>
AuthorDate: Fri Mar 15 17:52:26 2024 +0100

    IMPALA-12926: Refactor BINARY type handling in the backend
    
    Currently the STRING and BINARY types are not distinguished in most of
    the backend. In contrast to the frontend, PrimitiveType::TYPE_BINARY is
    not used there at all, TYPE_STRING being used instead. This is to ensure
    that everything that works for STRING also works for BINARY. So far only
    file readers and writers have had to handle them differently, and they
    have access to ColumnDescriptors which contain AuxColumnType fields that
    differentiate these two types.
    
    However, only top-level columns have ColumnDescriptors. Adding support
    for BINARYs within complex types (see IMPALA-11491 and IMPALA-12651)
    necessitates adding type information about STRING vs BINARY to embedded
    fields as well.
    
    Using PrimitiveType::TYPE_BINARY would probably be the cleanest solution
    but it would affect huge parts of the code as TYPE_BINARY would have to
    be added to hundreds of switch statements and this would be error prone.
    
    Instead, this change introduces a new field in ColumnType: 'is_binary',
    which is true if the type is a BINARY and false otherwise. We keep using
    TYPE_STRING as the PrimitiveType of the ColumnType for BINARYs. This way
    full type information is present in ColumnType but code that does not
    differentiate between STRING and BINARY will continue to work for
    BINARY.
    
    With this change, AuxColumnType is no longer needed and is removed.
    
    See also IMPALA-5323 - https://gerrit.cloudera.org/#/c/18868/. This
    change is a very similar implementation of the same refactoring, except
    for the Kudu-related parts.
    
    Testing:
     - added an extra test in binary-type.test
    
    Change-Id: Icedbad4e24a46e7731de11cc14218761d11fb86f
    Reviewed-on: http://gerrit.cloudera.org:8080/21157
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exec/file-metadata-utils.cc                 |  4 +--
 be/src/exec/hbase/hbase-scan-node.cc               |  5 ++-
 be/src/exec/hbase/hbase-table-writer.cc            |  2 +-
 be/src/exec/hdfs-scanner-ir.cc                     |  4 +--
 be/src/exec/hdfs-scanner.cc                        | 13 +++-----
 be/src/exec/hdfs-text-table-writer.cc              |  6 ++--
 be/src/exec/json/hdfs-json-scanner.cc              |  6 ++--
 be/src/exec/parquet/hdfs-parquet-table-writer.cc   |  2 +-
 be/src/exec/parquet/parquet-metadata-utils.cc      |  3 +-
 be/src/exec/parquet/parquet-metadata-utils.h       |  2 +-
 be/src/exec/rcfile/hdfs-rcfile-scanner.cc          |  6 ++--
 be/src/exec/text-converter.cc                      | 19 ++++-------
 be/src/exec/text-converter.h                       | 18 ++++------
 be/src/exec/text-converter.inline.h                |  5 ++-
 be/src/exec/text/hdfs-text-scanner.cc              |  6 ++--
 be/src/runtime/descriptors.cc                      |  3 +-
 be/src/runtime/descriptors.h                       |  3 --
 be/src/runtime/types.cc                            | 34 ++++++++++---------
 be/src/runtime/types.h                             | 39 +++++++++-------------
 be/src/service/hs2-util.cc                         |  3 +-
 .../queries/QueryTest/binary-type.test             |  8 +++++
 21 files changed, 80 insertions(+), 111 deletions(-)

diff --git a/be/src/exec/file-metadata-utils.cc 
b/be/src/exec/file-metadata-utils.cc
index a03024992..b63379abe 100644
--- a/be/src/exec/file-metadata-utils.cc
+++ b/be/src/exec/file-metadata-utils.cc
@@ -118,9 +118,7 @@ void FileMetadataUtils::AddIcebergColumns(MemPool* 
mem_pool, Tuple** template_tu
         continue;
       }
       if (field_id != transform->source_id()) continue;
-      const AuxColumnType& aux_type =
-          scan_node_->hdfs_table()->GetColumnDesc(slot_desc).auxType();
-      if (!text_converter.WriteSlot(slot_desc, &aux_type, *template_tuple,
+      if (!text_converter.WriteSlot(slot_desc, *template_tuple,
                                     (const 
char*)transform->transform_value()->data(),
                                     transform->transform_value()->size(),
                                     true, false,
diff --git a/be/src/exec/hbase/hbase-scan-node.cc 
b/be/src/exec/hbase/hbase-scan-node.cc
index 55d0ab584..41a96edfa 100644
--- a/be/src/exec/hbase/hbase-scan-node.cc
+++ b/be/src/exec/hbase/hbase-scan-node.cc
@@ -147,9 +147,8 @@ void HBaseScanNode::WriteTextSlot(
     const string& family, const string& qualifier,
     void* value, int value_length, SlotDescriptor* slot_desc,
     RuntimeState* state, MemPool* pool, Tuple* tuple, bool* error_in_row) {
-  const AuxColumnType& aux_type = 
hbase_table_->GetColumnDesc(slot_desc).auxType();
-  if (!text_converter_->WriteSlot(slot_desc, &aux_type, tuple,
-      reinterpret_cast<char*>(value), value_length, true, false, pool)) {
+  if (!text_converter_->WriteSlot(slot_desc, tuple, 
reinterpret_cast<char*>(value),
+        value_length, true, false, pool)) {
     *error_in_row = true;
     if (state->LogHasSpace()) {
       stringstream ss;
diff --git a/be/src/exec/hbase/hbase-table-writer.cc 
b/be/src/exec/hbase/hbase-table-writer.cc
index 0f1641c94..6c3d1163a 100644
--- a/be/src/exec/hbase/hbase-table-writer.cc
+++ b/be/src/exec/hbase/hbase-table-writer.cc
@@ -155,7 +155,7 @@ Status HBaseTableWriter::AppendRows(RowBatch* batch) {
             string_value.clear();
             output_expr_evals_[j]->PrintValue(value, &string_value);
             const ColumnDescriptor& col_desc = table_desc_->col_descs()[j];
-            if (col_desc.auxType().IsBinaryStringSubtype()) {
+            if (col_desc.type().IsBinaryType()) {
               Base64Encode(string_value , &base64_encoded_value);
               data = base64_encoded_value.data();
               data_len = base64_encoded_value.length();
diff --git a/be/src/exec/hdfs-scanner-ir.cc b/be/src/exec/hdfs-scanner-ir.cc
index 0fe3e05c2..a05d07dc4 100644
--- a/be/src/exec/hdfs-scanner-ir.cc
+++ b/be/src/exec/hdfs-scanner-ir.cc
@@ -125,9 +125,7 @@ bool 
HdfsScanner::TextConverterWriteSlotInterpretedIR(HdfsScanner* hdfs_scanner,
   }
 
   SlotDescriptor* slot_desc = 
hdfs_scanner->scan_node_->materialized_slots()[slot_idx];
-  const AuxColumnType& auxType =
-      
hdfs_scanner->scan_node_->hdfs_table_->GetColumnDesc(slot_desc).auxType();
-  return hdfs_scanner->text_converter_->WriteSlot(slot_desc, &auxType, tuple, 
data, len,
+  return hdfs_scanner->text_converter_->WriteSlot(slot_desc, tuple, data, len,
        copy_string, need_escape, pool);
 }
 
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index ff83ab616..76314c9b5 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -289,10 +289,8 @@ bool HdfsScanner::WriteCompleteTuple(MemPool* pool, 
FieldLocation* fields,
     }
 
     const SlotDescriptor* slot_desc = scan_node_->materialized_slots()[i];
-    const AuxColumnType& aux_type =
-        scan_node_->hdfs_table()->GetColumnDesc(slot_desc).auxType();
-    bool error = !text_converter_->WriteSlot(slot_desc, &aux_type, tuple,
-        fields[i].start, len, false, need_escape, pool);
+    bool error = !text_converter_->WriteSlot(slot_desc, tuple, 
fields[i].start, len,
+        false, need_escape, pool);
     error_fields[i] = error;
     *error_in_row |= error;
   }
@@ -374,12 +372,9 @@ Status HdfsScanner::CodegenWriteCompleteTuple(const 
HdfsScanPlanNode* node,
     // If the type is CHAR, WriteSlot for this slot cannot be codegen'd. To 
keep codegen
     // for other things, we call the interpreted code for this slot from the 
codegen'd
     // code instead of failing codegen. See IMPALA-9747.
-    const AuxColumnType& aux_type =
-        node->hdfs_table_->GetColumnDesc(slot_desc).auxType();
-    if (TextConverter::SupportsCodegenWriteSlot(
-        slot_desc->type(), aux_type)) {
+    if (TextConverter::SupportsCodegenWriteSlot(slot_desc->type())) {
       RETURN_IF_ERROR(TextConverter::CodegenWriteSlot(codegen, tuple_desc, 
slot_desc,
-          &aux_type, &fn, node->hdfs_table_->null_column_value().data(),
+          &fn, node->hdfs_table_->null_column_value().data(),
           node->hdfs_table_->null_column_value().size(), true,
           state->query_options().strict_mode));
       if (i >= LlvmCodeGen::CODEGEN_INLINE_EXPRS_THRESHOLD) 
codegen->SetNoInline(fn);
diff --git a/be/src/exec/hdfs-text-table-writer.cc 
b/be/src/exec/hdfs-text-table-writer.cc
index ddcbbf108..9ac4a7781 100644
--- a/be/src/exec/hdfs-text-table-writer.cc
+++ b/be/src/exec/hdfs-text-table-writer.cc
@@ -100,11 +100,9 @@ Status HdfsTextTableWriter::AppendRows(
             StringValue sv(val_ptr, StringValue::UnpaddedCharLength(val_ptr, 
type.len));
             PrintEscaped(&sv);
           } else if (type.IsVarLenStringType()) {
-            const ColumnDescriptor& col_desc =
-                table_desc_->col_descs()[num_partition_cols + j];
             const StringValue* string_value = reinterpret_cast<const 
StringValue*>(value);
-            if (col_desc.auxType().IsBinaryStringSubtype()) {
-              // TODO: try to find a more efficient imlementation
+            if (type.IsBinaryType()) {
+              // TODO: try to find a more efficient implementation
               Base64Encode(
                   string_value->Ptr() , string_value->Len(), 
&rowbatch_stringstream_);
             } else {
diff --git a/be/src/exec/json/hdfs-json-scanner.cc 
b/be/src/exec/json/hdfs-json-scanner.cc
index b0a5c5b16..90023130e 100644
--- a/be/src/exec/json/hdfs-json-scanner.cc
+++ b/be/src/exec/json/hdfs-json-scanner.cc
@@ -344,10 +344,8 @@ void HdfsJsonScanner::SubmitRow() {
 bool HdfsJsonScanner::InvokeWriteSlot(const SlotDescriptor* slot_desc, const 
char* data,
     int len) {
   // TODO: Support Invoke CodeGend WriteSlot
-  const AuxColumnType& aux_type =
-      scan_node_->hdfs_table()->GetColumnDesc(slot_desc).auxType();
-  if (LIKELY(text_converter_->WriteSlot(slot_desc, &aux_type, tuple_, data, 
len, true,
-      false, current_pool_))) {
+  if (LIKELY(text_converter_->WriteSlot(slot_desc, tuple_, data, len, true, 
false,
+      current_pool_))) {
     return true;
   }
   return HandleConvertError(slot_desc, data, len);
diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc 
b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
index 412002e07..60d43a77e 100644
--- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc
+++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc
@@ -1504,7 +1504,7 @@ Status HdfsParquetTableWriter::CreateSchema() {
     const int field_id = col_desc.field_id();
     if (field_id != -1) col_schema.__set_field_id(field_id);
     ParquetMetadataUtils::FillSchemaElement(col_type, string_utf8_, 
timestamp_type_,
-                                            col_desc.auxType(), &col_schema);
+        &col_schema);
   }
 
   return Status::OK();
diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc 
b/be/src/exec/parquet/parquet-metadata-utils.cc
index 99c2d2319..2d4ade20d 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -415,7 +415,6 @@ parquet::Type::type 
ParquetMetadataUtils::ConvertInternalToParquetType(
 
 void ParquetMetadataUtils::FillSchemaElement(const ColumnType& col_type,
     bool string_utf8, TParquetTimestampType::type timestamp_type,
-    const AuxColumnType& aux_type,
     parquet::SchemaElement* col_schema) {
   col_schema->__set_type(ConvertInternalToParquetType(col_type.type, 
timestamp_type));
   col_schema->__set_repetition_type(parquet::FieldRepetitionType::OPTIONAL);
@@ -431,7 +430,7 @@ void ParquetMetadataUtils::FillSchemaElement(const 
ColumnType& col_type,
     case TYPE_STRING:
       // By default STRING has no logical type, see IMPALA-5982.
       // VARCHAR and CHAR are always set to UTF8.
-      if (string_utf8 && !aux_type.IsBinaryStringSubtype()) {
+      if (string_utf8 && !col_type.IsBinaryType()) {
         SetUtf8ConvertedAndLogicalType(col_schema);
       }
       break;
diff --git a/be/src/exec/parquet/parquet-metadata-utils.h 
b/be/src/exec/parquet/parquet-metadata-utils.h
index 2981f7cef..74ab9e730 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.h
+++ b/be/src/exec/parquet/parquet-metadata-utils.h
@@ -66,7 +66,7 @@ class ParquetMetadataUtils {
   /// and this function's arguments.
   static void FillSchemaElement(const ColumnType& col_type,
       bool string_utf8, TParquetTimestampType::type timestamp_type,
-      const AuxColumnType& aux_type, parquet::SchemaElement* col_schema);
+      parquet::SchemaElement* col_schema);
 };
 
 struct ParquetFileVersion {
diff --git a/be/src/exec/rcfile/hdfs-rcfile-scanner.cc 
b/be/src/exec/rcfile/hdfs-rcfile-scanner.cc
index 8264faf2b..e169e744a 100644
--- a/be/src/exec/rcfile/hdfs-rcfile-scanner.cc
+++ b/be/src/exec/rcfile/hdfs-rcfile-scanner.cc
@@ -590,10 +590,8 @@ Status HdfsRCFileScanner::ProcessRange(RowBatch* 
row_batch) {
           return Status(ss.str());
         }
 
-        const AuxColumnType& aux_type =
-            scan_node_->hdfs_table()->GetColumnDesc(slot_desc).auxType();
-        if (!text_converter_->WriteSlot(slot_desc, &aux_type, tuple, col_start,
-             field_len, false, false, row_batch->tuple_data_pool())) {
+        if (!text_converter_->WriteSlot(slot_desc, tuple, col_start, 
field_len, false,
+              false, row_batch->tuple_data_pool())) {
           ReportColumnParseError(slot_desc, col_start, field_len);
           error_in_row = true;
         }
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index a709f0d80..ca4fb0fc8 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -106,12 +106,11 @@ void TextConverter::UnescapeString(const char* src, char* 
dest, int* len,
 //}
 // TODO: convert this function to use cross-compilation + constant 
substitution in whole
 // or part. It is currently too complex and doesn't implement the full 
functionality.
-Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
-    TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc,
-    const AuxColumnType* aux_type, llvm::Function** fn, const char* 
null_col_val,
-    int len, bool check_null, bool strict_mode) {
+Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen, TupleDescriptor* 
tuple_desc,
+    SlotDescriptor* slot_desc, llvm::Function** fn, const char* null_col_val, 
int len,
+    bool check_null, bool strict_mode) {
   DCHECK(fn != nullptr);
-  DCHECK(SupportsCodegenWriteSlot(slot_desc->type(), *aux_type));
+  DCHECK(SupportsCodegenWriteSlot(slot_desc->type()));
 
   // Codegen is_null_string
   bool is_default_null = (len == 2 && null_col_val[0] == '\\' && 
null_col_val[1] == 'N');
@@ -351,11 +350,7 @@ Status TextConverter::CodegenWriteSlot(LlvmCodeGen* 
codegen,
   return Status::OK();
 }
 
-bool TextConverter::SupportsCodegenWriteSlot(const ColumnType& col_type,
-    const AuxColumnType& auxType) {
-  if (col_type.type == TYPE_STRING) {
-    // TODO: implement codegen for binary (IMPALA-11475)
-    return auxType.string_subtype != AuxColumnType::StringSubtype::BINARY;
-  }
-  return col_type.type != TYPE_CHAR;
+bool TextConverter::SupportsCodegenWriteSlot(const ColumnType& col_type) {
+  // TODO: implement codegen for binary (IMPALA-11475)
+  return col_type.type != TYPE_CHAR && !col_type.IsBinaryType();
 }
diff --git a/be/src/exec/text-converter.h b/be/src/exec/text-converter.h
index 132e97741..f8b43f89b 100644
--- a/be/src/exec/text-converter.h
+++ b/be/src/exec/text-converter.h
@@ -53,7 +53,6 @@ class TextConverter {
 
   /// Converts slot data, of length 'len',  into type of slot_desc,
   /// and writes the result into the tuples's slot.
-  /// auxType is used to differentiate between BINARY and STRING types.
   /// copy_string indicates whether we need to make a separate copy of the 
string data:
   /// For regular unescaped strings, we point to the original data in the 
file_buf_.
   /// For regular escaped strings, we copy its unescaped string into a 
separate buffer
@@ -62,9 +61,8 @@ class TextConverter {
   /// 'pool' is unused.
   /// Unsuccessful conversions are turned into NULLs.
   /// Returns true if the value was written successfully.
-  bool WriteSlot(const SlotDescriptor* slot_desc, const AuxColumnType* auxType,
-      Tuple* tuple, const char* data, int len, bool copy_string, bool 
need_escape,
-      MemPool* pool);
+  bool WriteSlot(const SlotDescriptor* slot_desc, Tuple* tuple, const char* 
data, int len,
+      bool copy_string, bool need_escape, MemPool* pool);
 
   /// Removes escape characters from len characters of the null-terminated 
string src,
   /// and copies the unescaped string into dest, changing *len to the 
unescaped length.
@@ -81,21 +79,19 @@ class TextConverter {
   /// bool WriteSlot(Tuple* tuple, const char* data, int len);
   /// The codegen function returns true if the slot could be written and false
   /// otherwise.
-  /// auxType is used to differentiate between BINARY and STRING types.
   /// If check_null is set, then the codegen'd function sets the target slot 
to NULL
   /// if its input string matches null_vol_val.
   /// The codegenerated function does not support escape characters and should 
not
   /// be used for partitions that contain escapes.
   /// strict_mode: If set, numerical overflow/underflow are considered to be 
parse
   /// errors.
-  static Status CodegenWriteSlot(LlvmCodeGen* codegen,
-      TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc,
-      const AuxColumnType* auxType, llvm::Function** fn, const char* 
null_col_val,
-      int len, bool check_null, bool strict_mode = false);
+  static Status CodegenWriteSlot(LlvmCodeGen* codegen, TupleDescriptor* 
tuple_desc,
+      SlotDescriptor* slot_desc, llvm::Function** fn, const char* 
null_col_val, int len,
+      bool check_null, bool strict_mode = false);
 
   /// Returns whether codegen is supported for the given type.
-  static bool SupportsCodegenWriteSlot(const ColumnType& col_type,
-      const AuxColumnType& auxType);
+  static bool SupportsCodegenWriteSlot(const ColumnType& col_type);
+
  private:
   char escape_char_;
   /// Special string to indicate NULL column values.
diff --git a/be/src/exec/text-converter.inline.h 
b/be/src/exec/text-converter.inline.h
index 992029b25..6f4c74ff9 100644
--- a/be/src/exec/text-converter.inline.h
+++ b/be/src/exec/text-converter.inline.h
@@ -39,8 +39,7 @@ namespace impala {
 
 /// Note: this function has a codegen'd version.  Changing this function 
requires
 /// corresponding changes to CodegenWriteSlot().
-inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc,
-    const AuxColumnType* auxType, Tuple* tuple,
+inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* 
tuple,
     const char* data, int len, bool copy_string, bool need_escape, MemPool* 
pool) {
   if ((len == 0 && !slot_desc->type().IsStringType()) || data == NULL) {
     tuple->SetNull(slot_desc->null_indicator_offset());
@@ -68,7 +67,7 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* 
slot_desc,
           !(len != 0 && (copy_string || need_escape));
 
       bool base64_decode = false;
-      if (auxType->IsBinaryStringSubtype()) {
+      if (type.IsBinaryType()) {
         base64_decode = true;
         reuse_data = false;
         int64_t out_len;
diff --git a/be/src/exec/text/hdfs-text-scanner.cc 
b/be/src/exec/text/hdfs-text-scanner.cc
index 3429d3a88..db2fcf5f8 100644
--- a/be/src/exec/text/hdfs-text-scanner.cc
+++ b/be/src/exec/text/hdfs-text-scanner.cc
@@ -799,10 +799,8 @@ void HdfsTextScanner::WritePartialTuple(FieldLocation* 
fields, int num_fields) {
     }
 
     const SlotDescriptor* slot_desc = 
scan_node_->materialized_slots()[slot_idx_];
-    const AuxColumnType& aux_type =
-        scan_node_->hdfs_table()->GetColumnDesc(slot_desc).auxType();
-    if (!text_converter_->WriteSlot(slot_desc, &aux_type, partial_tuple_,
-        fields[i].start, len, true, need_escape, boundary_pool_.get())) {
+    if (!text_converter_->WriteSlot(slot_desc, partial_tuple_, 
fields[i].start, len,
+        true, need_escape, boundary_pool_.get())) {
       ReportColumnParseError(slot_desc, fields[i].start, len);
       error_in_row_ = true;
     }
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index b880f6cac..aa716cca8 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -166,8 +166,7 @@ inline bool SlotDescriptor::IsChildOfStruct() const {
 
 ColumnDescriptor::ColumnDescriptor(const TColumnDescriptor& tdesc)
   : name_(tdesc.name),
-    type_(ColumnType::FromThrift(tdesc.type)),
-    aux_type_(tdesc.type) {
+    type_(ColumnType::FromThrift(tdesc.type)) {
   if (tdesc.__isset.icebergFieldId) {
     field_id_ = tdesc.icebergFieldId;
     // Get key and value field_id for Iceberg table column with Map type.
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index b8df42346..a73420205 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -334,7 +334,6 @@ class ColumnDescriptor {
   int field_map_key_id() const { return field_map_key_id_; }
   int field_map_value_id() const { return field_map_value_id_; }
 
-  const AuxColumnType& auxType() const { return aux_type_; }
   std::string DebugString() const;
 
  private:
@@ -344,8 +343,6 @@ class ColumnDescriptor {
   int field_id_ = -1;
   int field_map_key_id_ = -1;
   int field_map_value_id_ = -1;
-
-  AuxColumnType aux_type_;
 };
 
 /// Base class for table descriptors.
diff --git a/be/src/runtime/types.cc b/be/src/runtime/types.cc
index dd1beeeb1..dfe209fbf 100644
--- a/be/src/runtime/types.cc
+++ b/be/src/runtime/types.cc
@@ -37,14 +37,8 @@ const int ColumnType::MAX_DECIMAL8_PRECISION;
 
 const char* ColumnType::LLVM_CLASS_NAME = "struct.impala::ColumnType";
 
-AuxColumnType::AuxColumnType(const TColumnType& col_type) {
-  if (col_type.types.size() != 1 || !col_type.types[0].__isset.scalar_type) 
return;
-  TPrimitiveType::type primitive_type = col_type.types[0].scalar_type.type;
-  if (primitive_type == TPrimitiveType::BINARY) string_subtype = 
StringSubtype::BINARY;
-}
-
 ColumnType::ColumnType(const std::vector<TTypeNode>& types, int* idx)
-  : len(-1), precision(-1), scale(-1) {
+  : len(-1), precision(-1), scale(-1), is_binary_(false) {
   DCHECK_GE(*idx, 0);
   DCHECK_LT(*idx, types.size());
   const TTypeNode& node = types[*idx];
@@ -57,6 +51,8 @@ ColumnType::ColumnType(const std::vector<TTypeNode>& types, 
int* idx)
           || type == TYPE_FIXED_UDA_INTERMEDIATE) {
         DCHECK(scalar_type.__isset.len);
         len = scalar_type.len;
+      } else if (type == TYPE_STRING) {
+        is_binary_ = scalar_type.type == TPrimitiveType::BINARY;
       } else if (type == TYPE_DECIMAL) {
         DCHECK(scalar_type.__isset.precision);
         DCHECK(scalar_type.__isset.scale);
@@ -120,7 +116,7 @@ PrimitiveType ThriftToType(TPrimitiveType::type ttype) {
   }
 }
 
-TPrimitiveType::type ToThrift(PrimitiveType ptype) {
+TPrimitiveType::type ToThrift(PrimitiveType ptype, bool is_binary) {
   switch (ptype) {
     case INVALID_TYPE: return TPrimitiveType::INVALID_TYPE;
     case TYPE_NULL: return TPrimitiveType::NULL_TYPE;
@@ -134,7 +130,8 @@ TPrimitiveType::type ToThrift(PrimitiveType ptype) {
     case TYPE_DATE: return TPrimitiveType::DATE;
     case TYPE_DATETIME: return TPrimitiveType::DATETIME;
     case TYPE_TIMESTAMP: return TPrimitiveType::TIMESTAMP;
-    case TYPE_STRING: return TPrimitiveType::STRING;
+    case TYPE_STRING:
+      return is_binary ? TPrimitiveType::BINARY : TPrimitiveType::STRING;
     case TYPE_VARCHAR: return TPrimitiveType::VARCHAR;
     case TYPE_BINARY:
       DCHECK(false) << "STRING should be used instead of BINARY in the 
backend.";
@@ -184,7 +181,6 @@ string TypeToOdbcString(const TColumnType& type) {
   DCHECK(type.types[0].__isset.scalar_type);
   TPrimitiveType::type col_type = type.types[0].scalar_type.type;
   PrimitiveType primitive_type = ThriftToType(col_type);
-  AuxColumnType aux_type(type);
   // ODBC driver requires types in lower case
   switch (primitive_type) {
     case INVALID_TYPE: return "invalid";
@@ -200,7 +196,7 @@ string TypeToOdbcString(const TColumnType& type) {
     case TYPE_DATETIME: return "datetime";
     case TYPE_TIMESTAMP: return "timestamp";
     case TYPE_STRING:
-      if(aux_type.IsBinaryStringSubtype()) {
+      if(col_type == TPrimitiveType::BINARY) {
         return "binary";
       } else {
         return "string";
@@ -247,7 +243,7 @@ void ColumnType::ToThrift(TColumnType* thrift_type) const {
     node.type = TTypeNodeType::SCALAR;
     node.__set_scalar_type(TScalarType());
     TScalarType& scalar_type = node.scalar_type;
-    scalar_type.__set_type(impala::ToThrift(type));
+    scalar_type.__set_type(impala::ToThrift(type, is_binary_));
     if (type == TYPE_CHAR || type == TYPE_VARCHAR
         || type == TYPE_FIXED_UDA_INTERMEDIATE) {
       DCHECK_NE(len, -1);
@@ -294,11 +290,12 @@ ostream& operator<<(ostream& os, const ColumnType& type) {
 }
 
 llvm::ConstantStruct* ColumnType::ToIR(LlvmCodeGen* codegen) const {
-  // ColumnType = { i32, i32, i32, i32, <vector>, <vector> }
+  // ColumnType = { i32, i8, i32, i32, i32, <vector>, <vector>, <vector> }
   llvm::StructType* column_type_type = codegen->GetStructType<ColumnType>();
 
   DCHECK_EQ(sizeof(type), sizeof(int32_t));
   llvm::Constant* type_field = codegen->GetI32Constant(type);
+
   DCHECK_EQ(sizeof(len), sizeof(int32_t));
   llvm::Constant* len_field = codegen->GetI32Constant(len);
   DCHECK_EQ(sizeof(precision), sizeof(int32_t));
@@ -306,7 +303,7 @@ llvm::ConstantStruct* ColumnType::ToIR(LlvmCodeGen* 
codegen) const {
   DCHECK_EQ(sizeof(scale), sizeof(int32_t));
   llvm::Constant* scale_field = codegen->GetI32Constant(scale);
 
-  // Create empty 'children' and 'field_names' vectors
+  // Create empty 'children', 'field_names' and 'field_ids' vectors
   DCHECK(children.empty()) << "Nested types NYI";
   DCHECK(field_names.empty()) << "Nested types NYI";
   DCHECK(field_ids.empty()) << "Nested types NYI";
@@ -317,9 +314,16 @@ llvm::ConstantStruct* ColumnType::ToIR(LlvmCodeGen* 
codegen) const {
   llvm::Constant* field_ids_field =
       llvm::Constant::getNullValue(column_type_type->getElementType(6));
 
+  DCHECK_EQ(sizeof(is_binary_), sizeof(uint8_t));
+  llvm::Constant* is_binary_field = codegen->GetI8Constant(is_binary_);
+
+  llvm::Constant* padding =
+      llvm::Constant::getNullValue(column_type_type->getElementType(8));
+
   return llvm::cast<llvm::ConstantStruct>(
       llvm::ConstantStruct::get(column_type_type, type_field, len_field, 
precision_field,
-          scale_field, children_field, field_names_field, field_ids_field));
+        scale_field, children_field, field_names_field, field_ids_field,
+        is_binary_field, padding));
 }
 
 }
diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index d899e20f2..94d8e0258 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -47,7 +47,7 @@ enum PrimitiveType {
   TYPE_STRING,
   TYPE_DATE,
   TYPE_DATETIME,    // Not implemented
-  TYPE_BINARY,      // Not used, see AuxColumnType::StringSubtype
+  TYPE_BINARY,      // Not used, see ColumnType::is_binary
   TYPE_DECIMAL,
   TYPE_CHAR,
   TYPE_VARCHAR,
@@ -62,28 +62,6 @@ PrimitiveType ThriftToType(TPrimitiveType::type ttype);
 TPrimitiveType::type ToThrift(PrimitiveType ptype);
 std::string TypeToString(PrimitiveType t);
 
-// Contains information about a type that generally doesn't affect how it 
should be
-// handled by backend, but can affect encoding / decoding.
-struct AuxColumnType {
-  // Differentiates between STRING and BINARY.
-  // As STRING is just a byte array in Impala (no UTF-8 encoding), the two 
types are
-  // practically the same in the backend - only some file format 
readers/writers
-  // differentiate between the two. Instead of using 
PrimitiveType::TYPE_BINARY BINARY
-  // uses TYPE_STRING to ensure that everything that works for STRING also 
works for
-  // BINARY.
-  enum class StringSubtype {
-    STRING,
-    BINARY
-  };
-  StringSubtype string_subtype = StringSubtype::STRING;
-
-  AuxColumnType(const TColumnType& thrift_type);
-
-  inline bool IsBinaryStringSubtype() const {
-    return string_subtype == StringSubtype::BINARY;
-  }
-};
-
 std::string TypeToOdbcString(const TColumnType& type);
 
 // Describes a type. Includes the enum, children types, and any type-specific 
metadata
@@ -91,6 +69,7 @@ std::string TypeToOdbcString(const TColumnType& type);
 // TODO for 2.3: rename to TypeDescriptor
 struct ColumnType {
   PrimitiveType type;
+
   /// Only set if type one of TYPE_CHAR, TYPE_VARCHAR, 
TYPE_FIXED_UDA_INTERMEDIATE.
   int len;
   static const int MAX_VARCHAR_LENGTH = (1 << 16) - 1; // 65535
@@ -120,7 +99,7 @@ struct ColumnType {
   static const char* LLVM_CLASS_NAME;
 
   explicit ColumnType(PrimitiveType type = INVALID_TYPE)
-    : type(type), len(-1), precision(-1), scale(-1) {
+    : type(type), len(-1), precision(-1), scale(-1), is_binary_(false) {
     DCHECK_NE(type, TYPE_CHAR);
     DCHECK_NE(type, TYPE_VARCHAR);
     DCHECK_NE(type, TYPE_BINARY);
@@ -234,6 +213,8 @@ struct ColumnType {
     return type == TYPE_STRING || type == TYPE_VARCHAR;
   }
 
+  inline bool IsBinaryType() const { return is_binary_; }
+
   inline bool IsComplexType() const {
     return type == TYPE_STRUCT || type == TYPE_ARRAY || type == TYPE_MAP;
   }
@@ -276,6 +257,16 @@ struct ColumnType {
   ColumnType(const std::vector<TTypeNode>& types, int* idx);
 
  private:
+  // Differentiates between STRING and BINARY. As STRING is just a byte array 
in Impala
+  // (no UTF-8 encoding), the two types are practically the same in the 
backend - only
+  // some code parts, e.g. file format readers/writers differentiate between 
the two.
+  // Instead of PrimitiveType::TYPE_BINARY, TYPE_STRING is used for the BINARY 
type to
+  // ensure that everything that works for STRING also works for BINARY.
+  //
+  // This variable is true if 'type' is TYPE_STRING and this object represents 
the BINARY
+  // type, and false in all other cases.
+  bool  is_binary_ = false;
+
   /// Recursive implementation of ToThrift() that populates 'thrift_type' with 
the
   /// TTypeNodes for this type and its children.
   void ToThrift(TColumnType* thrift_type) const;
diff --git a/be/src/service/hs2-util.cc b/be/src/service/hs2-util.cc
index ea0b51f1e..ce6654db3 100644
--- a/be/src/service/hs2-util.cc
+++ b/be/src/service/hs2-util.cc
@@ -866,7 +866,6 @@ bool impala::isOneFieldSet(const impala::TColumnValue& 
value) {
 thrift::TTypeEntry impala::ColumnToHs2Type(
     const TColumnType& columnType) {
   const ColumnType& type = ColumnType::FromThrift(columnType);
-  AuxColumnType aux_type(columnType);
   thrift::TPrimitiveTypeEntry type_entry;
   switch (type.type) {
     // Map NULL_TYPE to BOOLEAN, otherwise Hive's JDBC driver won't
@@ -902,7 +901,7 @@ thrift::TTypeEntry impala::ColumnToHs2Type(
       type_entry.__set_type(thrift::TTypeId::TIMESTAMP_TYPE);
       break;
     case TYPE_STRING:
-      if (aux_type.string_subtype == AuxColumnType::StringSubtype::BINARY) {
+      if (type.IsBinaryType()) {
         type_entry.__set_type(thrift::TTypeId::BINARY_TYPE);
       } else {
         type_entry.__set_type(thrift::TTypeId::STRING_TYPE);
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/binary-type.test 
b/testdata/workloads/functional-query/queries/QueryTest/binary-type.test
index 1462b4820..8d8941d9d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/binary-type.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/binary-type.test
@@ -22,6 +22,14 @@ INT, STRING
 8,'FF4433221100'
 ====
 ---- QUERY
+select id, hex(cast(binary_col as string)) from binary_tbl
+where binary_col = cast(unhex("FF4433221100") as binary)
+---- TYPES
+INT, STRING
+---- RESULTS
+8,'FF4433221100'
+====
+---- QUERY
 set utf8_mode=0;
 select string_col, length(binary_col) from binary_tbl
 ---- TYPES

Reply via email to