This is an automated email from the ASF dual-hosted git repository. zhaoc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 14769b0 Improve to_bitmap parse int performance (#2223) 14769b0 is described below commit 14769b0beb7671626da011611d2c73ffe841b831 Author: kangkaisen <kangkai...@apache.org> AuthorDate: Tue Nov 19 18:00:19 2019 +0800 Improve to_bitmap parse int performance (#2223) --- be/src/exprs/bitmap_function.cpp | 24 ++++------- be/src/util/string_parser.hpp | 68 +++++++++++++++++++++++++++++++ be/test/exprs/bitmap_function_test.cpp | 4 +- be/test/util/CMakeLists.txt | 1 + be/test/util/string_parser_test.cpp | 73 +++++++++++++++++++++++++++++++--- run-ut.sh | 1 + 6 files changed, 146 insertions(+), 25 deletions(-) diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index 107362a..2f1942a 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -19,6 +19,7 @@ #include "exprs/anyval_util.h" #include "util/bitmap.h" +#include "util/string_parser.hpp" namespace doris { void BitmapFunctions::init() { @@ -79,27 +80,16 @@ BigIntVal BitmapFunctions::bitmap_count(FunctionContext* ctx, const StringVal& s StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) { std::unique_ptr<RoaringBitmap> bitmap {new RoaringBitmap()}; if (!src.is_null) { - std::string tmp_str = std::string(reinterpret_cast<char*>(src.ptr), src.len) ; - unsigned long uint32_value = 0; - try { - uint32_value = std::stoul(tmp_str); - // the std::stoul result type is unsigned long, not uint32_t. so we need check it - if(UNLIKELY(uint32_value > std::numeric_limits<unsigned int>::max())) { - throw std::out_of_range(""); - } - } catch (std::invalid_argument& e) { + StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS; + uint32_t int_value = StringParser::string_to_unsigned_int<uint32_t>(reinterpret_cast<char*>(src.ptr), src.len, &parse_result); + if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { std::stringstream error_msg; - error_msg << "The to_bitmap function argument: " << tmp_str << " type isn't integer family"; - ctx->set_error(error_msg.str().c_str()); - return StringVal::null(); - } catch (std::out_of_range& e) { - std::stringstream error_msg; - error_msg << "The to_bitmap function argument: " << tmp_str << " exceed unsigned integer max value " - << std::numeric_limits<unsigned int>::max(); + error_msg << "The to_bitmap function argument: " << std::string(reinterpret_cast<char*>(src.ptr), src.len) + << " type isn't integer family or exceed unsigned integer max value 4294967295"; ctx->set_error(error_msg.str().c_str()); return StringVal::null(); } - bitmap->update(uint32_value); + bitmap->update(int_value); } std::string buf; buf.resize(bitmap->size()); diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp index 3a0aa73..0354343 100644 --- a/be/src/util/string_parser.hpp +++ b/be/src/util/string_parser.hpp @@ -86,6 +86,20 @@ public: return string_to_int_internal<T>(s + i, len - i, result); } + // This is considerably faster than glibc's implementation. + // In the case of overflow, the max/min value for the data type will be returned. + // Assumes s represents a decimal number. + template <typename T> + static inline T string_to_unsigned_int(const char* s, int len, ParseResult* result) { + T ans = string_to_unsigned_int_internal<T>(s, len, result); + if (LIKELY(*result == PARSE_SUCCESS)){ + return ans; + } + + int i = skip_leading_whitespace(s, len); + return string_to_unsigned_int_internal<T>(s + i, len - i, result); + } + // Convert a string s representing a number in given base into a decimal number. template <typename T> static inline T string_to_int(const char* s, int len, int base, ParseResult* result) { @@ -159,6 +173,13 @@ private: template <typename T> static inline T string_to_int_internal(const char* s, int len, ParseResult* result); + // This is considerably faster than glibc's implementation. + // In the case of overflow, the max/min value for the data type will be returned. + // Assumes s represents a decimal number. + // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. + template <typename T> + static inline T string_to_unsigned_int_internal(const char* s, int len, ParseResult* result); + // Convert a string s representing a number in given base into a decimal number. // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. template <typename T> @@ -267,6 +288,53 @@ inline T StringParser::string_to_int_internal(const char* s, int len, ParseResul } template <typename T> +inline T StringParser::string_to_unsigned_int_internal(const char* s, int len, ParseResult* result) { + if (UNLIKELY(len <= 0)) { + *result = PARSE_FAILURE; + return 0; + } + + T val = 0; + T max_val = std::numeric_limits<T>::max(); + int i = 0; + + typedef typename std::make_signed<T>::type signedT; + // This is the fast path where the string cannot overflow. + if (LIKELY(len - i < StringParseTraits<signedT>::max_ascii_len())) { + val = string_to_int_no_overflow<T>(s + i, len - i, result); + return val; + } + + const T max_div_10 = max_val / 10; + const T max_mod_10 = max_val % 10; + + int first = i; + for (; i < len; ++i) { + if (LIKELY(s[i] >= '0' && s[i] <= '9')) { + T digit = s[i] - '0'; + // This is a tricky check to see if adding this digit will cause an overflow. + if (UNLIKELY(val > (max_div_10 - (digit > max_mod_10)))) { + *result = PARSE_OVERFLOW; + return max_val; + } + val = val * 10 + digit; + } else { + if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) { + // Reject the string because either the first char was not a digit, + // or the remaining chars are not all whitespace + *result = PARSE_FAILURE; + return 0; + } + // Returning here is slightly faster than breaking the loop. + *result = PARSE_SUCCESS; + return val; + } + } + *result = PARSE_SUCCESS; + return val; +} + +template <typename T> inline T StringParser::string_to_int_internal( const char* s, int len, int base, ParseResult* result) { typedef typename std::make_unsigned<T>::type UnsignedT; diff --git a/be/test/exprs/bitmap_function_test.cpp b/be/test/exprs/bitmap_function_test.cpp index 228a104..b8492f2 100644 --- a/be/test/exprs/bitmap_function_test.cpp +++ b/be/test/exprs/bitmap_function_test.cpp @@ -90,7 +90,7 @@ TEST_F(BitmapFunctionsTest, to_bitmap_invalid_argument) { ASSERT_EQ(expected, result); ASSERT_TRUE(ctx->has_error()); - std::string error_msg("The to_bitmap function argument: xxxxxx type isn't integer family"); + std::string error_msg("The to_bitmap function argument: xxxxxx type isn't integer family or exceed unsigned integer max value 4294967295"); ASSERT_EQ(error_msg, ctx->error_msg()); } @@ -103,7 +103,7 @@ TEST_F(BitmapFunctionsTest, to_bitmap_out_of_range) { ASSERT_TRUE(ctx->has_error()); - std::string error_msg("The to_bitmap function argument: 4294967296 exceed unsigned integer max value 4294967295"); + std::string error_msg("The to_bitmap function argument: 4294967296 type isn't integer family or exceed unsigned integer max value 4294967295"); ASSERT_EQ(error_msg, ctx->error_msg()); } diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt index bc78c18..1b4739e 100644 --- a/be/test/util/CMakeLists.txt +++ b/be/test/util/CMakeLists.txt @@ -32,6 +32,7 @@ ADD_BE_TEST(new_metrics_test) ADD_BE_TEST(doris_metrics_test) ADD_BE_TEST(system_metrics_test) ADD_BE_TEST(string_util_test) +ADD_BE_TEST(string_parser_test) ADD_BE_TEST(core_local_test) ADD_BE_TEST(types_test) ADD_BE_TEST(json_util_test) diff --git a/be/test/util/string_parser_test.cpp b/be/test/util/string_parser_test.cpp index 0fd308b..dcfb3a6 100644 --- a/be/test/util/string_parser_test.cpp +++ b/be/test/util/string_parser_test.cpp @@ -47,6 +47,21 @@ void test_int_value(const char* s, T exp_val, StringParser::ParseResult exp_resu } } +// Tests conversion of s to integer with and without leading/trailing whitespace +template<typename T> +void test_unsigned_int_value(const char* s, T exp_val, StringParser::ParseResult exp_result) { + for (int i = 0; i < space_len; ++i) { + for (int j = 0; j < space_len; ++j) { + // All combinations of leading and/or trailing whitespace. + std::string str = space[i] + s + space[j]; + StringParser::ParseResult result; + T val = StringParser::string_to_unsigned_int<T>(str.data(), str.length(), &result); + EXPECT_EQ(exp_val, val) << str; + EXPECT_EQ(result, exp_result); + } + } +} + // Tests conversion of s, given a base, to an integer with and without leading/trailing whitespace template<typename T> void test_int_value(const char* s, int base, T exp_val, StringParser::ParseResult exp_result) { @@ -209,13 +224,59 @@ TEST(StringToInt, Limit) { test_int_value<int32_t>("2147483647", 2147483647, StringParser::PARSE_SUCCESS); test_int_value<int32_t>("-2147483648", -2147483648, StringParser::PARSE_SUCCESS); test_int_value<int64_t>( - "9223372036854775807", - std::numeric_limits<int64_t>::max(), - StringParser::PARSE_SUCCESS); + "9223372036854775807", + std::numeric_limits<int64_t>::max(), + StringParser::PARSE_SUCCESS); test_int_value<int64_t>( - "-9223372036854775808", - std::numeric_limits<int64_t>::min(), - StringParser::PARSE_SUCCESS); + "-9223372036854775808", + std::numeric_limits<int64_t>::min(), + StringParser::PARSE_SUCCESS); +} + +TEST(StringToUnsignedInt, Basic) { + test_unsigned_int_value<uint8_t>("123", 123, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint16_t>("123", 123, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint32_t>("123", 123, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint64_t>("123", 123, StringParser::PARSE_SUCCESS); + + test_unsigned_int_value<uint8_t>("123", 123, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint16_t>("12345", 12345, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint32_t>("12345678", 12345678, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint64_t>("12345678901234", 12345678901234, StringParser::PARSE_SUCCESS); + + test_unsigned_int_value<uint8_t>("-10", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint16_t>("-10", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint32_t>("-10", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint64_t>("-10", 0, StringParser::PARSE_FAILURE); + + test_unsigned_int_value<uint8_t>("+1", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint16_t>("+1", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint32_t>("+1", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint64_t>("+1", 0, StringParser::PARSE_FAILURE); + + test_unsigned_int_value<uint8_t>("+0", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint16_t>("-0", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint32_t>("+0", 0, StringParser::PARSE_FAILURE); + test_unsigned_int_value<uint64_t>("-0", 0, StringParser::PARSE_FAILURE); +} + +TEST(StringToUnsignedInt, Limit) { + test_unsigned_int_value<uint8_t>("255", 255, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint16_t>("65535", 65535, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint32_t>("4294967295", 4294967295, StringParser::PARSE_SUCCESS); + test_unsigned_int_value<uint64_t>("18446744073709551615", + std::numeric_limits<uint64_t>::max(), + StringParser::PARSE_SUCCESS); +} + +TEST(StringToUnsignedInt, Overflow) { + test_unsigned_int_value<uint8_t>("256", 255, StringParser::PARSE_OVERFLOW); + test_unsigned_int_value<uint16_t>("65536", 65535, StringParser::PARSE_OVERFLOW); + test_unsigned_int_value<uint32_t>("4294967296", 4294967295, StringParser::PARSE_OVERFLOW); + test_unsigned_int_value<uint64_t>( + "18446744073709551616", + std::numeric_limits<uint64_t>::max(), + StringParser::PARSE_OVERFLOW); } TEST(StringToInt, Overflow) { diff --git a/run-ut.sh b/run-ut.sh index 07c035d..d0720ed 100755 --- a/run-ut.sh +++ b/run-ut.sh @@ -151,6 +151,7 @@ ${DORIS_TEST_BINARY_DIR}/util/byte_buffer_test2 ${DORIS_TEST_BINARY_DIR}/util/uid_util_test ${DORIS_TEST_BINARY_DIR}/util/aes_util_test ${DORIS_TEST_BINARY_DIR}/util/string_util_test +${DORIS_TEST_BINARY_DIR}/util/string_parser_test ${DORIS_TEST_BINARY_DIR}/util/coding_test ${DORIS_TEST_BINARY_DIR}/util/faststring_test ${DORIS_TEST_BINARY_DIR}/util/tdigest_test --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org