This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch dev-1.0.1 in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
commit 11374d67b2195d24778657b2b29cbbe1b2b2c395 Author: Zhengguo Yang <yangz...@gmail.com> AuthorDate: Fri Mar 25 09:28:41 2022 +0800 [api-change] add soft limit of String type length (#8567) 1. add a config string_type_soft_limit to soft limit max length of string type 2. disable using String type in Key column, partition column and distribution column 3. remove String type alias BLOB for futrue use --- be/src/common/config.h | 7 ++++ be/src/exec/tablet_sink.cpp | 43 ++++++++++++++-------- be/src/olap/delete_handler.cpp | 4 +- be/src/olap/olap_define.h | 3 -- be/src/olap/row_block2.cpp | 8 ++-- be/src/olap/types.h | 4 +- be/src/olap/wrapper_field.cpp | 5 ++- be/src/runtime/types.h | 3 +- be/src/vec/sink/vtablet_sink.cpp | 29 +++++++-------- docs/en/administrator-guide/config/be_config.md | 5 +++ .../sql-statements/Data Types/STRING.md | 2 +- docs/zh-CN/administrator-guide/config/be_config.md | 5 +++ .../sql-statements/Data Types/STRING.md | 2 +- fe/fe-core/src/main/cup/sql_parser.cup | 2 - .../java/org/apache/doris/analysis/ColumnDef.java | 5 ++- .../org/apache/doris/analysis/CreateTableStmt.java | 12 ++++-- .../apache/doris/analysis/DistributionDesc.java | 2 +- .../doris/analysis/HashDistributionDesc.java | 18 ++++++++- .../org/apache/doris/analysis/PartitionDesc.java | 5 +++ .../doris/analysis/RandomDistributionDesc.java | 2 +- .../java/org/apache/doris/catalog/ScalarType.java | 1 - .../java/org/apache/doris/planner/PlannerTest.java | 15 ++++++++ 22 files changed, 123 insertions(+), 59 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index 5cb5a17..9c255c2 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -704,6 +704,13 @@ CONF_String(function_service_protocol, "h2:grpc"); // use which load balancer to select server to connect CONF_String(rpc_load_balancer, "rr"); +// a soft limit of string type length, the hard limit is 2GB - 4, but if too long will cause very low performance, +// so we set a soft limit, default is 1MB +CONF_mInt32(string_type_length_soft_limit_bytes, "1048576"); + +CONF_Validator(string_type_length_soft_limit_bytes, + [](const int config) -> bool { return config > 0 && config <= 2147483643; }); + } // namespace config } // namespace doris diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index 0a56369..2dcd9d4 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -191,7 +191,8 @@ Status NodeChannel::open_wait() { return; } // If rpc failed, mark all tablets on this node channel as failed - _index_channel->mark_as_failed(this->node_id(), this->host(), _add_batch_closure->cntl.ErrorText(), -1); + _index_channel->mark_as_failed(this->node_id(), this->host(), + _add_batch_closure->cntl.ErrorText(), -1); Status st = _index_channel->check_intolerable_failure(); if (!st.ok()) { _cancel_with_msg(fmt::format("{}, err: {}", channel_info(), st.get_error_msg())); @@ -214,7 +215,8 @@ Status NodeChannel::open_wait() { if (status.ok()) { // if has error tablet, handle them first for (auto& error : result.tablet_errors()) { - _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), error.tablet_id()); + _index_channel->mark_as_failed(this->node_id(), this->host(), error.msg(), + error.tablet_id()); } Status st = _index_channel->check_intolerable_failure(); @@ -387,7 +389,7 @@ Status NodeChannel::close_wait(RuntimeState* state) { while (!_add_batches_finished && !_cancelled) { SleepFor(MonoDelta::FromMilliseconds(1)); } - _close_time_ms = UnixMillis() - _close_time_ms; + _close_time_ms = UnixMillis() - _close_time_ms; if (_add_batches_finished) { { @@ -676,7 +678,8 @@ OlapTableSink::~OlapTableSink() { // OlapTableSink::_mem_tracker and its parents. // But their destructions are after OlapTableSink's. for (auto index_channel : _channels) { - index_channel->for_each_node_channel([](const std::shared_ptr<NodeChannel>& ch) { ch->clear_all_batches(); }); + index_channel->for_each_node_channel( + [](const std::shared_ptr<NodeChannel>& ch) { ch->clear_all_batches(); }); } } @@ -838,11 +841,13 @@ Status OlapTableSink::open(RuntimeState* state) { RETURN_IF_ERROR(Expr::open(_output_expr_ctxs, state)); for (auto index_channel : _channels) { - index_channel->for_each_node_channel([](const std::shared_ptr<NodeChannel>& ch) { ch->open(); }); + index_channel->for_each_node_channel( + [](const std::shared_ptr<NodeChannel>& ch) { ch->open(); }); } for (auto index_channel : _channels) { - index_channel->for_each_node_channel([&index_channel](const std::shared_ptr<NodeChannel>& ch) { + index_channel->for_each_node_channel([&index_channel]( + const std::shared_ptr<NodeChannel>& ch) { auto st = ch->open_wait(); if (!st.ok()) { // The open() phase is mainly to generate DeltaWriter instances on the nodes corresponding to each node channel. @@ -932,13 +937,13 @@ Status OlapTableSink::send(RuntimeState* state, RowBatch* input_batch) { uint32_t tablet_index = 0; if (findTabletMode != FindTabletMode::FIND_TABLET_EVERY_ROW) { if (_partition_to_tablet_map.find(partition->id) == _partition_to_tablet_map.end()) { - tablet_index = _partition->find_tablet(tuple,*partition); + tablet_index = _partition->find_tablet(tuple, *partition); _partition_to_tablet_map.emplace(partition->id, tablet_index); } else { tablet_index = _partition_to_tablet_map[partition->id]; } } else { - tablet_index = _partition->find_tablet(tuple,*partition); + tablet_index = _partition->find_tablet(tuple, *partition); } _partition_ids.emplace(partition->id); for (int j = 0; j < partition->indexes.size(); ++j) { @@ -976,7 +981,8 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { { SCOPED_TIMER(_close_timer); for (auto index_channel : _channels) { - index_channel->for_each_node_channel([](const std::shared_ptr<NodeChannel>& ch) { ch->mark_close(); }); + index_channel->for_each_node_channel( + [](const std::shared_ptr<NodeChannel>& ch) { ch->mark_close(); }); num_node_channels += index_channel->num_node_channels(); } @@ -989,7 +995,8 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { &total_add_batch_num](const std::shared_ptr<NodeChannel>& ch) { auto s = ch->close_wait(state); if (!s.ok()) { - index_channel->mark_as_failed(ch->node_id(), ch->host(), s.get_error_msg(), -1); + index_channel->mark_as_failed(ch->node_id(), ch->host(), + s.get_error_msg(), -1); LOG(WARNING) << ch->channel_info() << ", close channel failed, err: " << s.get_error_msg(); @@ -1039,7 +1046,8 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { // print log of add batch time of all node, for tracing load performance easily std::stringstream ss; ss << "finished to close olap table sink. load_id=" << print_id(_load_id) - << ", txn_id=" << _txn_id << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; + << ", txn_id=" << _txn_id + << ", node add batch time(ms)/wait execution time(ms)/close time(ms)/num: "; for (auto const& pair : node_add_batch_counter_map) { ss << "{" << pair.first << ":(" << (pair.second.add_batch_execution_time_us / 1000) << ")(" << (pair.second.add_batch_wait_execution_time_us / 1000) << ")(" @@ -1048,8 +1056,9 @@ Status OlapTableSink::close(RuntimeState* state, Status close_status) { LOG(INFO) << ss.str(); } else { for (auto channel : _channels) { - channel->for_each_node_channel( - [&status](const std::shared_ptr<NodeChannel>& ch) { ch->cancel(status.get_error_msg()); }); + channel->for_each_node_channel([&status](const std::shared_ptr<NodeChannel>& ch) { + ch->cancel(status.get_error_msg()); + }); } } @@ -1181,13 +1190,14 @@ Status OlapTableSink::_validate_data(RuntimeState* state, RowBatch* batch, Bitma } case TYPE_STRING: { StringValue* str_val = (StringValue*)slot; - if (str_val->len > OLAP_STRING_MAX_LENGTH) { + if (str_val->len > config::string_type_length_soft_limit_bytes) { fmt::format_to(error_msg, "{}", "the length of input is too long than schema. "); fmt::format_to(error_msg, "column_name: {}; ", desc->col_name()); fmt::format_to(error_msg, "first 128 bytes of input str: [{}] ", std::string(str_val->ptr, 128)); - fmt::format_to(error_msg, "schema length: {}; ", OLAP_STRING_MAX_LENGTH); + fmt::format_to(error_msg, "schema length: {}; ", + config::string_type_length_soft_limit_bytes); fmt::format_to(error_msg, "actual length: {}; ", str_val->len); row_valid = false; continue; @@ -1258,7 +1268,8 @@ void OlapTableSink::_send_batch_process() { if (running_channels_num == 0) { LOG(INFO) << "all node channels are stopped(maybe finished/offending/cancelled), " - "sender thread exit. " << print_id(_load_id); + "sender thread exit. " + << print_id(_load_id); return; } } while (!_stop_background_threads_latch.wait_for( diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index 6dd655f..2234044 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -143,7 +143,7 @@ bool DeleteConditionHandler::is_condition_value_valid(const TabletColumn& column case OLAP_FIELD_TYPE_VARCHAR: return value_str.size() <= column.length(); case OLAP_FIELD_TYPE_STRING: - return value_str.size() <= OLAP_STRING_MAX_LENGTH; + return value_str.size() <= config::string_type_length_soft_limit_bytes; case OLAP_FIELD_TYPE_DATE: case OLAP_FIELD_TYPE_DATETIME: return valid_datetime(value_str); @@ -303,7 +303,7 @@ OLAPStatus DeleteHandler::init(const TabletSchema& schema, } bool DeleteHandler::is_filter_data(const int64_t data_version, const RowCursor& row) const { - // According to semantics, the delete condition stored in _del_conds should be an OR relationship, + // According to semantics, the delete condition stored in _del_conds should be an OR relationship, // so as long as the data matches one of the _del_conds, it will return true. for (const auto& del_cond : _del_conds) { if (data_version <= del_cond.filter_version && diff --git a/be/src/olap/olap_define.h b/be/src/olap/olap_define.h index b16c614..6af5024 100644 --- a/be/src/olap/olap_define.h +++ b/be/src/olap/olap_define.h @@ -56,9 +56,6 @@ static const uint16_t OLAP_VARCHAR_MAX_LENGTH = 65535; // the max length supported for string type 2GB static const uint32_t OLAP_STRING_MAX_LENGTH = 2147483647; -// the max length supported for vec string type 1MB -static constexpr size_t MAX_SIZE_OF_VEC_STRING = 1024 * 1024; - // the max length supported for array static const uint16_t OLAP_ARRAY_MAX_LENGTH = 65535; diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp index 5d43c94..8a99ca4 100644 --- a/be/src/olap/row_block2.cpp +++ b/be/src/olap/row_block2.cpp @@ -17,6 +17,7 @@ #include "olap/row_block2.h" +#include <algorithm> #include <sstream> #include <utility> @@ -192,15 +193,16 @@ Status RowBlockV2::_copy_data_to_column(int cid, doris::vectorized::MutableColum } case OLAP_FIELD_TYPE_STRING: { auto column_string = assert_cast<vectorized::ColumnString*>(column); - + size_t limit = config::string_type_length_soft_limit_bytes; for (uint16_t j = 0; j < _selected_size; ++j) { if (!nullable_mark_array[j]) { uint16_t row_idx = _selection_vector[j]; auto slice = reinterpret_cast<const Slice*>(column_block(cid).cell_ptr(row_idx)); - if (LIKELY(slice->size <= MAX_SIZE_OF_VEC_STRING)) { + if (LIKELY(slice->size <= limit)) { column_string->insert_data(slice->data, slice->size); } else { - return Status::NotSupported("Not support string len over than 1MB in vec engine."); + return Status::NotSupported(fmt::format( + "Not support string len over than {} in vec engine.", limit)); } } else { column_string->insert_default(); diff --git a/be/src/olap/types.h b/be/src/olap/types.h index e734c7d..00c45d7 100644 --- a/be/src/olap/types.h +++ b/be/src/olap/types.h @@ -1117,9 +1117,9 @@ template <> struct FieldTypeTraits<OLAP_FIELD_TYPE_STRING> : public FieldTypeTraits<OLAP_FIELD_TYPE_CHAR> { static OLAPStatus from_string(void* buf, const std::string& scan_key) { size_t value_len = scan_key.length(); - if (value_len > OLAP_STRING_MAX_LENGTH) { + if (value_len > config::string_type_length_soft_limit_bytes) { LOG(WARNING) << "the len of value string is too long, len=" << value_len - << ", max_len=" << OLAP_STRING_MAX_LENGTH; + << ", max_len=" << config::string_type_length_soft_limit_bytes; return OLAP_ERR_INPUT_PARAMETER_ERROR; } diff --git a/be/src/olap/wrapper_field.cpp b/be/src/olap/wrapper_field.cpp index be908a2..887b93b 100644 --- a/be/src/olap/wrapper_field.cpp +++ b/be/src/olap/wrapper_field.cpp @@ -28,8 +28,9 @@ WrapperField* WrapperField::create(const TabletColumn& column, uint32_t len) { (column.type() == OLAP_FIELD_TYPE_CHAR || column.type() == OLAP_FIELD_TYPE_VARCHAR || column.type() == OLAP_FIELD_TYPE_HLL || column.type() == OLAP_FIELD_TYPE_OBJECT || column.type() == OLAP_FIELD_TYPE_STRING); - size_t max_length = column.type() == OLAP_FIELD_TYPE_STRING ? OLAP_STRING_MAX_LENGTH - : OLAP_VARCHAR_MAX_LENGTH; + size_t max_length = column.type() == OLAP_FIELD_TYPE_STRING + ? config::string_type_length_soft_limit_bytes + : OLAP_VARCHAR_MAX_LENGTH; if (is_string_type && len > max_length) { OLAP_LOG_WARNING("length of string parameter is too long[len=%lu, max_len=%lu].", len, max_length); diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h index 9235e6c..4742565 100644 --- a/be/src/runtime/types.h +++ b/be/src/runtime/types.h @@ -45,7 +45,6 @@ struct TypeDescriptor { /// Only set if type == TYPE_CHAR or type == TYPE_VARCHAR int len; static const int MAX_VARCHAR_LENGTH = OLAP_VARCHAR_MAX_LENGTH; - static const int MAX_STRING_LENGTH = OLAP_STRING_MAX_LENGTH; static const int MAX_CHAR_LENGTH = 255; static const int MAX_CHAR_INLINE_LENGTH = 128; @@ -99,7 +98,7 @@ struct TypeDescriptor { static TypeDescriptor create_string_type() { TypeDescriptor ret; ret.type = TYPE_STRING; - ret.len = MAX_STRING_LENGTH; + ret.len = config::string_type_length_soft_limit_bytes; return ret; } diff --git a/be/src/vec/sink/vtablet_sink.cpp b/be/src/vec/sink/vtablet_sink.cpp index ea93c09..e9768c9 100644 --- a/be/src/vec/sink/vtablet_sink.cpp +++ b/be/src/vec/sink/vtablet_sink.cpp @@ -115,10 +115,11 @@ Status VOlapTableSink::send(RuntimeState* state, vectorized::Block* input_block) if (!_vpartition->find_partition(&block_row, &partition)) { RETURN_IF_ERROR(state->append_error_msg_to_file([]() -> std::string { return ""; }, [&]() -> std::string { - fmt::memory_buffer buf; - fmt::format_to(buf, "no partition for this tuple. tuple=[]"); - return fmt::to_string(buf); - }, &stop_processing)); + fmt::memory_buffer buf; + fmt::format_to(buf, "no partition for this tuple. tuple=[]"); + return fmt::to_string(buf); + }, + &stop_processing)); _number_filtered_rows++; if (stop_processing) { return Status::EndOfFile("Encountered unqualified data, stop processing"); @@ -162,9 +163,11 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl const auto num_rows = block->rows(); fmt::memory_buffer error_msg; auto set_invalid_and_append_error_msg = [&](int row) { - filter_bitmap->Set(row, true); - return state->append_error_msg_to_file([]() -> std::string { return ""; }, - [&error_msg]() -> std::string { return fmt::to_string(error_msg); }, stop_processing); + filter_bitmap->Set(row, true); + return state->append_error_msg_to_file( + []() -> std::string { return ""; }, + [&error_msg]() -> std::string { return fmt::to_string(error_msg); }, + stop_processing); }; for (int i = 0; i < _output_tuple_desc->slots().size(); ++i) { @@ -199,12 +202,8 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl const auto column_string = assert_cast<const vectorized::ColumnString*>(real_column_ptr.get()); - size_t limit = MAX_SIZE_OF_VEC_STRING; - if (desc->type().type != TYPE_STRING) { - DCHECK(desc->type().len >= 0); - limit = std::min(limit, (size_t)desc->type().len); - } - + size_t limit = + std::min(config::string_type_length_soft_limit_bytes, desc->type().len); for (int j = 0; j < num_rows; ++j) { if (!filter_bitmap->Get(j)) { auto str_val = column_string->get_data_at(j); @@ -218,14 +217,14 @@ Status VOlapTableSink::_validate_data(RuntimeState* state, vectorized::Block* bl fmt::format_to(error_msg, "input str: [{}] ", str_val.to_prefix(10)); fmt::format_to(error_msg, "schema length: {}; ", desc->type().len); fmt::format_to(error_msg, "actual length: {}; ", str_val.size); - } else if (str_val.size > MAX_SIZE_OF_VEC_STRING) { + } else if (str_val.size > limit) { fmt::format_to( error_msg, "{}", "the length of input string is too long than vec schema. "); fmt::format_to(error_msg, "column_name: {}; ", desc->col_name()); fmt::format_to(error_msg, "input str: [{}] ", str_val.to_prefix(10)); fmt::format_to(error_msg, "schema length: {}; ", desc->type().len); - fmt::format_to(error_msg, "limit length: {}; ", MAX_SIZE_OF_VEC_STRING); + fmt::format_to(error_msg, "limit length: {}; ", limit); fmt::format_to(error_msg, "actual length: {}; ", str_val.size); } diff --git a/docs/en/administrator-guide/config/be_config.md b/docs/en/administrator-guide/config/be_config.md index c917492..0f8bfaa 100644 --- a/docs/en/administrator-guide/config/be_config.md +++ b/docs/en/administrator-guide/config/be_config.md @@ -1499,3 +1499,8 @@ Translated with www.DeepL.com/Translator (free version) * Type: int32 * Description: The maximum amount of data read by each OlapScanner. * Default: 1024 + +### `string_type_length_soft_limit_bytes` +* Type: int32 +* Description: A soft limit of string type length. +* Description: 1048576 \ No newline at end of file diff --git a/docs/en/sql-reference/sql-statements/Data Types/STRING.md b/docs/en/sql-reference/sql-statements/Data Types/STRING.md index 654a7b9..9a01939 100644 --- a/docs/en/sql-reference/sql-statements/Data Types/STRING.md +++ b/docs/en/sql-reference/sql-statements/Data Types/STRING.md @@ -27,7 +27,7 @@ under the License. # STRING ## Description STRING (M) -A variable length string, max legnth is 2147483643(2GB - 4). +A variable length string, max legnth is 2147483643(2GB - 4). The length of the String type is also limited by the configuration `string_type_length_soft_limit_bytes` of be, the actual maximum length that can be stored take the minimum value of both, the String type can only be used in the value column, not in the key column and the partition and bucket columns Note: Variable length strings are stored in UTF-8 encoding, so usually English characters occupies 1 byte, and Chinese characters occupies 3 bytes. diff --git a/docs/zh-CN/administrator-guide/config/be_config.md b/docs/zh-CN/administrator-guide/config/be_config.md index 65277f2..2b86af5 100644 --- a/docs/zh-CN/administrator-guide/config/be_config.md +++ b/docs/zh-CN/administrator-guide/config/be_config.md @@ -1516,3 +1516,8 @@ webserver默认工作线程数 * 类型: int32 * 描述: 每个OlapScanner 读取的最大数据量 * 默认值: 1024 + +### `string_type_length_soft_limit_bytes` +* 类型: int32 +* 描述: String 类型最大长度的软限,单位是字节 +* 默认值: 1048576 diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md b/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md index 7b80e79..88fb608 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Types/STRING.md @@ -27,7 +27,7 @@ under the License. # STRING ## description STRING - 变长字符串,最大支持2147483643 字节(2GB-4)。用法类似VARCHAR。 + 变长字符串,最大支持2147483643 字节(2GB-4)。String类型的长度还受 be 配置 `string_type_soft_limit`, 实际能存储的最大长度 取两者最小值,String类型只能用在value 列,不能用在 key 列和分区 分桶列 注意:变长字符串是以UTF-8编码存储的,因此通常英文字符占1个字节,中文字符占3个字节。 diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index da8d815..f42e864 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -4569,8 +4569,6 @@ type ::= {: RESULT = ScalarType.createStringType(); :} | KW_TEXT {: RESULT = ScalarType.createStringType(); :} - | KW_BLOB - {: RESULT = ScalarType.createStringType(); :} | KW_VARCHAR LPAREN INTEGER_LITERAL:len RPAREN {: ScalarType type = ScalarType.createVarcharType(len.intValue()); type.setAssignedStrLenInColDefinition(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java index bd32419..cf61e09 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ColumnDef.java @@ -215,7 +215,10 @@ public class ColumnDef { throw new AnalysisException("Array type column default value only support null"); } } - + if (isKey() && type.getPrimitiveType() == PrimitiveType.STRING) { + throw new AnalysisException("String Type should not be used in key column[" + getName() + + "]."); + } if (type.getPrimitiveType() == PrimitiveType.MAP) { if (defaultValue.isSet && defaultValue != DefaultValue.NULL_DEFAULT_VALUE) { throw new AnalysisException("Map type column default value just support null"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java index ed2b689..47a34b9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/CreateTableStmt.java @@ -296,7 +296,8 @@ public class CreateTableStmt extends DdlStmt { } if (hasAggregate) { for (ColumnDef columnDef : columnDefs) { - if (columnDef.getAggregateType() == null) { + if (columnDef.getAggregateType() == null + && !columnDef.getType().isScalarType(PrimitiveType.STRING)) { keysColumnNames.add(columnDef.getName()); } } @@ -315,6 +316,9 @@ public class CreateTableStmt extends DdlStmt { if (columnDef.getType().isFloatingPointType()) { break; } + if (columnDef.getType().getPrimitiveType() == PrimitiveType.STRING) { + break; + } if (columnDef.getType().getPrimitiveType() == PrimitiveType.VARCHAR) { keysColumnNames.add(columnDef.getName()); break; @@ -324,8 +328,8 @@ public class CreateTableStmt extends DdlStmt { // The OLAP table must has at least one short key and the float and double should not be short key. // So the float and double could not be the first column in OLAP table. if (keysColumnNames.isEmpty()) { - throw new AnalysisException("The olap table first column could not be float or double," - + " use decimal instead."); + throw new AnalysisException("The olap table first column could not be float, double, string" + + " use decimal or varchar instead."); } keysDesc = new KeysDesc(KeysType.DUP_KEYS, keysColumnNames); } @@ -422,7 +426,7 @@ public class CreateTableStmt extends DdlStmt { if (distributionDesc == null) { throw new AnalysisException("Create olap table should contain distribution desc"); } - distributionDesc.analyze(columnSet); + distributionDesc.analyze(columnSet, columnDefs); } else if (engineName.equalsIgnoreCase("elasticsearch")) { EsUtil.analyzePartitionAndDistributionDesc(partitionDesc, distributionDesc); } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java index c349185..5d5b132 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DistributionDesc.java @@ -40,7 +40,7 @@ public class DistributionDesc implements Writable { } - public void analyze(Set<String> colSet) throws AnalysisException { + public void analyze(Set<String> colSet, List<ColumnDef> columnDefs) throws AnalysisException { throw new NotImplementedException(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java index 2379d3d..d02faa4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/HashDistributionDesc.java @@ -22,11 +22,13 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DistributionInfo; import org.apache.doris.catalog.DistributionInfo.DistributionInfoType; import org.apache.doris.catalog.HashDistributionInfo; +import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.DdlException; import org.apache.doris.common.io.Text; import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import java.io.DataInput; import java.io.DataOutput; @@ -58,17 +60,29 @@ public class HashDistributionDesc extends DistributionDesc { } @Override - public void analyze(Set<String> cols) throws AnalysisException { + public void analyze(Set<String> colSet, List<ColumnDef> columnDefs) throws AnalysisException { if (numBucket <= 0) { throw new AnalysisException("Number of hash distribution should be larger than zero."); } if (distributionColumnNames == null || distributionColumnNames.size() == 0) { throw new AnalysisException("Number of hash column should be larger than zero."); } + Set<String> distColSet = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER); for (String columnName : distributionColumnNames) { - if (!cols.contains(columnName)) { + if (!colSet.contains(columnName)) { throw new AnalysisException("Distribution column(" + columnName + ") doesn't exist."); } + if (!distColSet.add(columnName)) { + throw new AnalysisException("Duplicated distribution column " + columnName); + } + for (ColumnDef columnDef : columnDefs) { + if (columnDef.getName().equals(columnName)) { + if (columnDef.getType().isScalarType(PrimitiveType.STRING)) { + throw new AnalysisException("String Type should not be used in distribution column[" + + columnDef.getName() + "]."); + } + } + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java index 7566cdd..83c592c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionDesc.java @@ -21,6 +21,7 @@ import org.apache.doris.catalog.AggregateType; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.PartitionType; +import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.DdlException; import org.apache.doris.qe.ConnectContext; @@ -78,6 +79,10 @@ public class PartitionDesc { if (columnDef.getType().isFloatingPointType()) { throw new AnalysisException("Floating point type column can not be partition column"); } + if (columnDef.getType().isScalarType(PrimitiveType.STRING)) { + throw new AnalysisException("String Type should not be used in partition column[" + + columnDef.getName() + "]."); + } if (!ConnectContext.get().getSessionVariable().isAllowPartitionColumnNullable() && columnDef.isAllowNull()) { throw new AnalysisException("The partition column must be NOT NULL"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java index 947cebb..d32af89 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/RandomDistributionDesc.java @@ -42,7 +42,7 @@ public class RandomDistributionDesc extends DistributionDesc { } @Override - public void analyze(Set<String> colSet) throws AnalysisException { + public void analyze(Set<String> colSet, List<ColumnDef> columnDefs) throws AnalysisException { if (numBucket <= 0) { throw new AnalysisException("Number of random distribution should be larger than zero."); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java index 9484d7a..714e067 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/ScalarType.java @@ -196,7 +196,6 @@ public class ScalarType extends Type { return createVarcharType(); case "STRING": case "TEXT": - case "BLOB": return createStringType(); case "HLL": return createHllType(); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java index ff29dc2..8548ba6 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/PlannerTest.java @@ -22,6 +22,7 @@ import org.apache.doris.analysis.CreateTableStmt; import org.apache.doris.analysis.ExplainOptions; import org.apache.doris.analysis.Expr; import org.apache.doris.catalog.Catalog; +import org.apache.doris.common.AnalysisException; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.utframe.UtFrameUtils; @@ -31,7 +32,9 @@ import org.apache.commons.lang3.StringUtils; import org.junit.After; import org.junit.Assert; import org.junit.BeforeClass; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.io.File; import java.util.List; @@ -41,6 +44,9 @@ public class PlannerTest { private static String runningDir = "fe/mocked/DemoTest/" + UUID.randomUUID().toString() + "/"; private static ConnectContext ctx; + @Rule + public ExpectedException expectedEx = ExpectedException.none(); + @After public void tearDown() throws Exception { FileUtils.deleteDirectory(new File(runningDir)); @@ -434,4 +440,13 @@ public class PlannerTest { compare.accept("select * from db1.tbl2 where k1 > 2.1", "select * from db1.tbl2 where k1 > 2"); } + @Test + public void testStringType() throws Exception { + String createTbl1 = "create table db1.tbl1(k1 string, k2 varchar(32), k3 varchar(32), k4 int) " + + "AGGREGATE KEY(k1, k2,k3,k4) distributed by hash(k1) buckets 3 properties('replication_num' = '1')"; + expectedEx.expect(AnalysisException.class); + expectedEx.expectMessage("String Type should not be used in key column[k1]."); + UtFrameUtils.parseAndAnalyzeStmt(createTbl1, ctx); + } + } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org