SaintBacchus commented on code in PR #16941: URL: https://github.com/apache/doris/pull/16941#discussion_r1111662671
########## be/src/exec/es/es_scroll_parser.cpp: ########## @@ -519,7 +521,178 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, } break; } + case TYPE_ARRAY: { + vectorized::Array array; + const auto& sub_type = tuple_desc->slots()[i]->type().children[0].type; + for (auto& sub_col : col.GetArray()) { + switch (sub_type) { + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_STRING: { + std::string val; + if (pure_doc_value) { + if (!sub_col[0].IsString()) { + val = json_value_to_string(sub_col[0]); + } else { + val = sub_col[0].GetString(); + } + } else { + RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, type); + if (!sub_col.IsString()) { + val = json_value_to_string(sub_col); + } else { + val = sub_col.GetString(); + } + } + array.push_back(val); + break; + } + case TYPE_TINYINT: { + int8_t val; + RETURN_IF_ERROR(get_int_value<int8_t>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_SMALLINT: { + int16_t val; + RETURN_IF_ERROR( + get_int_value<int16_t>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_INT: { + int32 val; + RETURN_IF_ERROR(get_int_value<int32>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_BIGINT: { + int64_t val; + RETURN_IF_ERROR( + get_int_value<int64_t>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_LARGEINT: { + __int128 val; + RETURN_IF_ERROR( + get_int_value<__int128>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_FLOAT: { + float val; + RETURN_IF_ERROR( + get_float_value<float>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_DOUBLE: { + double val; + RETURN_IF_ERROR( + get_float_value<double>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_BOOLEAN: { + if (sub_col.IsBool()) { + array.push_back(sub_col.GetBool()); + break; + } + 
+ if (sub_col.IsNumber()) { + array.push_back(sub_col.GetInt()); + break; + } + + bool is_nested_str = false; + if (pure_doc_value && sub_col.IsArray() && sub_col[0].IsBool()) { + array.push_back(sub_col[0].GetBool()); + break; + } else if (pure_doc_value && sub_col.IsArray() && sub_col[0].IsString()) { + is_nested_str = true; + } else if (pure_doc_value && sub_col.IsArray()) { + return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN"); + } + + const rapidjson::Value& str_col = is_nested_str ? sub_col[0] : sub_col; + + const std::string& val = str_col.GetString(); + size_t val_size = str_col.GetStringLength(); + StringParser::ParseResult result; + bool b = StringParser::string_to_bool(val.c_str(), val_size, &result); + RETURN_ERROR_IF_PARSING_FAILED(result, str_col, type); + array.push_back(b); + break; + } + // date/datetime v2 is the default type for catalog table, + // see https://github.com/apache/doris/pull/16304 + // No need to support date and datetime types. + case TYPE_DATEV2: { + // this would happend just only when `enable_docvalue_scan = false`, and field has timestamp format date from _source + if (sub_col.IsNumber()) { + // ES process date/datetime field would use millisecond timestamp for index or docvalue + // processing date type field, if a number is encountered, Doris On ES will force it to be processed according to ms + // Doris On ES needs to be consistent with ES, so just divided by 1000 because the unit for from_unixtime is seconds + array.push_back(get_datev2_int(sub_col, false)); + } else if (sub_col.IsArray() && pure_doc_value) { + // this would happened just only when `enable_docvalue_scan = true` + // ES add default format for all field after ES 6.4, if we not provided format for `date` field ES would impose + // a standard date-format for date field as `2020-06-16T00:00:00.000Z` + // At present, we just process this string format date. 
After some PR were merged into Doris, we would impose `epoch_mills` for + // date field's docvalue + if (sub_col[0].IsString()) { + array.push_back(get_datev2_int(sub_col, true)); + break; + } + // ES would return millisecond timestamp for date field, divided by 1000 because the unit for from_unixtime is seconds + array.push_back(get_datev2_int(sub_col, false)); + } else { + // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source + RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, type); + RETURN_ERROR_IF_COL_IS_NOT_STRING(sub_col, type); + array.push_back(get_datev2_int(sub_col, true)); + } + break; + } + case TYPE_DATETIMEV2: { + // this would happend just only when `enable_docvalue_scan = false`, and field has timestamp format date from _source + if (sub_col.IsNumber()) { + // ES process date/datetime field would use millisecond timestamp for index or docvalue + // processing date type field, if a number is encountered, Doris On ES will force it to be processed according to ms + // Doris On ES needs to be consistent with ES, so just divided by 1000 because the unit for from_unixtime is seconds + array.push_back(get_datetimev2_int(sub_col, false)); + } else if (sub_col.IsArray() && pure_doc_value) { + // this would happened just only when `enable_docvalue_scan = true` + // ES add default format for all field after ES 6.4, if we not provided format for `date` field ES would impose + // a standard date-format for date field as `2020-06-16T00:00:00.000Z` + // At present, we just process this string format date. 
After some PR were merged into Doris, we would impose `epoch_mills` for + // date field's docvalue + if (sub_col[0].IsString()) { + array.push_back(get_datetimev2_int(sub_col, true)); + break; + } + // ES would return millisecond timestamp for date field, divided by 1000 because the unit for from_unixtime is seconds + array.push_back(get_datetimev2_int(sub_col, false)); + } else { + // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source + RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, type); + RETURN_ERROR_IF_COL_IS_NOT_STRING(sub_col, type); + array.push_back(get_datetimev2_int(sub_col, true)); Review Comment: As discussed with @stalary, it's better to handle `DateV2` and `DateTimeV2` in one method, since the code in these two case branches is very similar. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org