kbobyrev created this revision. kbobyrev added reviewers: ilya-biryukov, ioeric. Herald added a reviewer: javed.absar. Herald added a subscriber: kristof.beyls.
This patch significantly improves performance of the YAML serializer by optimizing `YAML::isNumeric` function. This function is called on the most strings and is highly inefficient for two reasons: - It uses `Regex`, which is parsed and compiled each time this function is called - It uses multiple passes which are not necessary This patch introduces stateful ad hoc YAML number parser which does not rely on `Regex`. It also fixes YAML number format inconsistency: current implementation supports C-stile octal number format (`01234567`) which was present in YAML 1.0 specialization (http://yaml.org/spec/1.0/), [Section 2.4. Tags, Example 2.19] but was deprecated and is no longer present in latest YAML 1.2 specification (http://yaml.org/spec/1.2/spec.html), see [Section 10.3.2. Tag Resolution]. Since the rest of the rest of the implementation does not support other deprecated YAML 1.0 numeric features such as sexagecimal numbers, commas as delimiters it is treated as inconsistency and not longer supported. This performance bottleneck was identified while profiling Clangd's global-symbol-builder tool with my colleague @ilya-biryukov. The substantial part of the runtime was spent during a single-thread Reduce phase, which concludes with YAML serialization of collected symbol collection. Regex matching was accountable for approximately 45% of the whole runtime (which involves sharded Map phase), now it is reduced to 18% (which is spent in `clang::clangd::CanonicalIncludes` and can be also optimized because all used regexes are in fact either suffix matches or exact matches). Benchmarking `global-symbol-builder` (using `hyperfine --warmup 2 --min-runs 5 'command 1' 'command 2'` tool by processing a reasonable amount of code (26 source files matched by `clang-tools-extra/clangd/*.cpp` with all transitive includes) confirmed our understanding of the performance bottleneck nature as it speeds up the command by the factor of 1.6x: | Command | Mean [s] | Min…Max [s] | | :--- | ---: | ---: | | patch | 84.7 ± 0.6 | 83.3…84.7 | | master (https://reviews.llvm.org/rL339849) | 133.1 ± 0.8 | 132.4…134.6 | | Using smaller samples (e.g. by collecting symbols from `clang-tools-extra/clangd/AST.cpp` only) yields even better performance improvement, which is expected because Map phase takes less time compared to Reduce and is 2.05x faster: | Command | Mean [ms] | Min…Max [ms] | | :--- | ---: | ---: | | patch | 7607.6 ± 109.5 | 7533.3…7796.4 | | master (https://reviews.llvm.org/rL339849) | 3702.2 ± 48.7 | 3635.1…3752.3 | https://reviews.llvm.org/D50839 Files: llvm/include/llvm/Support/YAMLTraits.h llvm/unittests/Support/YAMLIOTest.cpp
Index: llvm/unittests/Support/YAMLIOTest.cpp =================================================================== --- llvm/unittests/Support/YAMLIOTest.cpp +++ llvm/unittests/Support/YAMLIOTest.cpp @@ -16,16 +16,17 @@ #include "gmock/gmock.h" #include "gtest/gtest.h" +using llvm::yaml::Hex16; +using llvm::yaml::Hex32; +using llvm::yaml::Hex64; +using llvm::yaml::Hex8; using llvm::yaml::Input; -using llvm::yaml::Output; using llvm::yaml::IO; -using llvm::yaml::MappingTraits; +using llvm::yaml::isNumeric; using llvm::yaml::MappingNormalization; +using llvm::yaml::MappingTraits; +using llvm::yaml::Output; using llvm::yaml::ScalarTraits; -using llvm::yaml::Hex8; -using llvm::yaml::Hex16; -using llvm::yaml::Hex32; -using llvm::yaml::Hex64; using ::testing::StartsWith; @@ -2569,3 +2570,63 @@ TestEscaped((char const *)foobar, "\"foo\\u200Bbar\""); } } + +TEST(YAMLIO, Numeric) { + EXPECT_TRUE(isNumeric(".inf")); + EXPECT_TRUE(isNumeric(".INF")); + EXPECT_TRUE(isNumeric(".Inf")); + EXPECT_TRUE(isNumeric("-.inf")); + EXPECT_TRUE(isNumeric("+.inf")); + + EXPECT_TRUE(isNumeric(".nan")); + EXPECT_TRUE(isNumeric(".NaN")); + EXPECT_TRUE(isNumeric(".NAN")); + + EXPECT_TRUE(isNumeric("0")); + EXPECT_TRUE(isNumeric("0.")); + EXPECT_TRUE(isNumeric("0.0")); + EXPECT_TRUE(isNumeric("-0.0")); + EXPECT_TRUE(isNumeric("+0.0")); + + EXPECT_TRUE(isNumeric("+12.0")); + EXPECT_TRUE(isNumeric(".5")); + EXPECT_TRUE(isNumeric("+.5")); + EXPECT_TRUE(isNumeric("-1.0")); + + EXPECT_TRUE(isNumeric("2.3e4")); + EXPECT_TRUE(isNumeric("-2E+05")); + EXPECT_TRUE(isNumeric("+12e03")); + EXPECT_TRUE(isNumeric("6.8523015e+5")); + + EXPECT_TRUE(isNumeric("1.e+1")); + EXPECT_TRUE(isNumeric(".0e+1")); + + EXPECT_TRUE(isNumeric("0x2aF3")); + EXPECT_TRUE(isNumeric("0o01234567")); + + EXPECT_FALSE(isNumeric("not a number")); + EXPECT_FALSE(isNumeric(".")); + EXPECT_FALSE(isNumeric(".e+1")); + EXPECT_FALSE(isNumeric(".1e")); + EXPECT_FALSE(isNumeric(".1e+")); + EXPECT_FALSE(isNumeric(".1e++1")); + + EXPECT_FALSE(isNumeric("+0x2AF3")); + EXPECT_FALSE(isNumeric("-0x2AF3")); + EXPECT_FALSE(isNumeric("0x2AF3Z")); + EXPECT_FALSE(isNumeric("0o012345678")); + EXPECT_FALSE(isNumeric("-0o012345678")); + + // Deprecated formats: as for YAML 1.2 specification, the following are not + // valid numbers anymore: + // + // * Octal numbers with "0" prefix instead of "0o" + // * Sexagecimal numbers + // * Decimal numbers with comma s the delimiter + // * "inf", "nan" without '.' prefix + EXPECT_FALSE(isNumeric("3:25:45")); + EXPECT_FALSE(isNumeric("014")); + EXPECT_FALSE(isNumeric("+12,345")); + EXPECT_FALSE(isNumeric("-inf")); + EXPECT_FALSE(isNumeric("1,230.15")); +} Index: llvm/include/llvm/Support/YAMLTraits.h =================================================================== --- llvm/include/llvm/Support/YAMLTraits.h +++ llvm/include/llvm/Support/YAMLTraits.h @@ -27,6 +27,7 @@ #include <cctype> #include <cstddef> #include <cstdint> +#include <iterator> #include <map> #include <memory> #include <new> @@ -449,46 +450,76 @@ static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1); }; -inline bool isNumber(StringRef S) { - static const char OctalChars[] = "01234567"; - if (S.startswith("0") && - S.drop_front().find_first_not_of(OctalChars) == StringRef::npos) - return true; - - if (S.startswith("0o") && - S.drop_front(2).find_first_not_of(OctalChars) == StringRef::npos) - return true; +inline bool isNumeric(StringRef S) { + if (S.empty()) + return false; - static const char HexChars[] = "0123456789abcdefABCDEF"; - if (S.startswith("0x") && - S.drop_front(2).find_first_not_of(HexChars) == StringRef::npos) + if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN")) return true; - static const char DecChars[] = "0123456789"; - if (S.find_first_not_of(DecChars) == StringRef::npos) - return true; + StringRef Tail = (S.front() == '-' || S.front() == '+') ? S.drop_front() : S; - if (S.equals(".inf") || S.equals(".Inf") || S.equals(".INF")) + if (Tail.equals(".inf") || Tail.equals(".Inf") || Tail.equals(".INF")) return true; - Regex FloatMatcher("^(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$"); - if (FloatMatcher.match(S)) + static const char HexChars[] = "0123456789abcdefABCDEF"; + static const char OctalChars[] = "01234567"; + bool ParseHex = S.startswith("0x"); + bool ParseOct = S.startswith("0o"); + if (ParseHex || ParseOct) { + if (S.size() < 3) + return false; + for (const auto &Char : S.drop_front(2)) { + if (ParseHex && std::find(std::begin(HexChars), std::end(HexChars), + Char) == std::end(HexChars)) + return false; + if (ParseOct && std::find(std::begin(OctalChars), std::end(OctalChars), + Char) == std::end(OctalChars)) + return false; + } return true; + } - return false; -} - -inline bool isNumeric(StringRef S) { - if ((S.front() == '-' || S.front() == '+') && isNumber(S.drop_front())) - return true; + static const char DecChars[] = "0123456789"; - if (isNumber(S)) - return true; + // Leading zeros are not allowed. + if (Tail.front() == '0' && Tail.size() > 1 && + std::find(std::begin(DecChars), std::end(DecChars), Tail[1]) != + std::end(DecChars)) + return false; + + // Parse float: [-+]? (\. [0-9]+ | [0-9]+ (\. [0-9]* )?) ([eE] [-+]? [0-9]+)? + bool FoundDot = false; + bool FoundExponent = false; + for (size_t I = 0; I < Tail.size(); ++I) { + char Symbol = Tail[I]; + if (Symbol == '.') { + // There can only be one dot in the number. + if (FoundDot) + return false; + FoundDot = true; + // If string starts with '.' it has to be followed by at least one digit. + if (I == 0 && (Tail.size() == 1 || Tail.find_first_of(DecChars) != 1)) + return false; + } else if (Symbol == 'e' || Symbol == 'E') { + // There can only be one exponent sign in the number. + if (FoundExponent) + return false; + FoundExponent = true; + } else if (Symbol == '+' || Symbol == '-') { + // Sign can only follow an exponent sign. + if (!FoundExponent || (Tail[I - 1] != 'e' && Tail[I - 1] != 'E')) + return false; + } else if ('0' > Symbol || Symbol > '9') { + return false; + } + } - if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN")) - return true; + // Exponent sign has been found: it should be followed by at least one digit. + if (FoundExponent) + return ('0' <= S.back() && S.back() <= '9'); - return false; + return true; } inline bool isNull(StringRef S) {
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits