kbobyrev created this revision.
kbobyrev added reviewers: ilya-biryukov, ioeric.
Herald added a reviewer: javed.absar.
Herald added a subscriber: kristof.beyls.

This patch significantly improves performance of the YAML serializer by 
optimizing `YAML::isNumeric` function. This function is called on the most 
strings and is highly inefficient for two reasons:

- It uses `Regex`, which is parsed and compiled each time this function is 
called
- It uses multiple passes which are not necessary

This patch introduces stateful ad hoc YAML number parser which does not rely on 
`Regex`. It also fixes YAML number format inconsistency: current implementation 
supports C-stile octal number format (`01234567`) which was present in YAML 1.0 
specialization (http://yaml.org/spec/1.0/), [Section 2.4. Tags, Example 2.19] 
but was deprecated and is no longer present in latest YAML 1.2 specification 
(http://yaml.org/spec/1.2/spec.html), see [Section 10.3.2. Tag Resolution]. 
Since the rest of the rest of the implementation does not support other 
deprecated YAML 1.0 numeric features such as sexagecimal numbers, commas as 
delimiters it is treated as inconsistency and not longer supported.

This performance bottleneck was identified while profiling Clangd's 
global-symbol-builder tool with my colleague @ilya-biryukov. The substantial 
part of the runtime was spent during a single-thread Reduce phase, which 
concludes with YAML serialization of collected symbol collection. Regex 
matching was accountable for approximately 45% of the whole runtime (which 
involves sharded Map phase), now it is reduced to 18% (which is spent in 
`clang::clangd::CanonicalIncludes` and can be also optimized because all used 
regexes are in fact either suffix matches or exact matches).

Benchmarking `global-symbol-builder` (using `hyperfine --warmup 2 --min-runs 5 
'command 1' 'command 2'` tool by processing a reasonable amount of code (26 
source files matched by `clang-tools-extra/clangd/*.cpp` with all transitive 
includes) confirmed our understanding of the performance bottleneck nature as 
it speeds up the command by the factor of 1.6x:

| Command    | Mean [s]    | Min…Max [s] |
| :---       | ---:        | ---:        |
| patch      | 84.7 ± 0.6  | 83.3…84.7   |
| master (https://reviews.llvm.org/rL339849) | 133.1 ± 0.8 | 132.4…134.6 |
|

Using smaller samples (e.g. by collecting symbols from 
`clang-tools-extra/clangd/AST.cpp` only) yields even better performance 
improvement, which is expected because Map phase takes less time compared to 
Reduce and is 2.05x faster:

| Command    | Mean [ms]      | Min…Max [ms]  |
| :---       | ---:           | ---:          |
| patch      | 7607.6 ± 109.5 | 7533.3…7796.4 |
| master (https://reviews.llvm.org/rL339849) | 3702.2 ± 48.7  | 3635.1…3752.3 |


https://reviews.llvm.org/D50839

Files:
  llvm/include/llvm/Support/YAMLTraits.h
  llvm/unittests/Support/YAMLIOTest.cpp

Index: llvm/unittests/Support/YAMLIOTest.cpp
===================================================================
--- llvm/unittests/Support/YAMLIOTest.cpp
+++ llvm/unittests/Support/YAMLIOTest.cpp
@@ -16,16 +16,17 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
+using llvm::yaml::Hex16;
+using llvm::yaml::Hex32;
+using llvm::yaml::Hex64;
+using llvm::yaml::Hex8;
 using llvm::yaml::Input;
-using llvm::yaml::Output;
 using llvm::yaml::IO;
-using llvm::yaml::MappingTraits;
+using llvm::yaml::isNumeric;
 using llvm::yaml::MappingNormalization;
+using llvm::yaml::MappingTraits;
+using llvm::yaml::Output;
 using llvm::yaml::ScalarTraits;
-using llvm::yaml::Hex8;
-using llvm::yaml::Hex16;
-using llvm::yaml::Hex32;
-using llvm::yaml::Hex64;
 using ::testing::StartsWith;
 
 
@@ -2569,3 +2570,63 @@
     TestEscaped((char const *)foobar, "\"foo\\u200Bbar\"");
   }
 }
+
+TEST(YAMLIO, Numeric) {
+  EXPECT_TRUE(isNumeric(".inf"));
+  EXPECT_TRUE(isNumeric(".INF"));
+  EXPECT_TRUE(isNumeric(".Inf"));
+  EXPECT_TRUE(isNumeric("-.inf"));
+  EXPECT_TRUE(isNumeric("+.inf"));
+
+  EXPECT_TRUE(isNumeric(".nan"));
+  EXPECT_TRUE(isNumeric(".NaN"));
+  EXPECT_TRUE(isNumeric(".NAN"));
+
+  EXPECT_TRUE(isNumeric("0"));
+  EXPECT_TRUE(isNumeric("0."));
+  EXPECT_TRUE(isNumeric("0.0"));
+  EXPECT_TRUE(isNumeric("-0.0"));
+  EXPECT_TRUE(isNumeric("+0.0"));
+
+  EXPECT_TRUE(isNumeric("+12.0"));
+  EXPECT_TRUE(isNumeric(".5"));
+  EXPECT_TRUE(isNumeric("+.5"));
+  EXPECT_TRUE(isNumeric("-1.0"));
+
+  EXPECT_TRUE(isNumeric("2.3e4"));
+  EXPECT_TRUE(isNumeric("-2E+05"));
+  EXPECT_TRUE(isNumeric("+12e03"));
+  EXPECT_TRUE(isNumeric("6.8523015e+5"));
+
+  EXPECT_TRUE(isNumeric("1.e+1"));
+  EXPECT_TRUE(isNumeric(".0e+1"));
+
+  EXPECT_TRUE(isNumeric("0x2aF3"));
+  EXPECT_TRUE(isNumeric("0o01234567"));
+
+  EXPECT_FALSE(isNumeric("not a number"));
+  EXPECT_FALSE(isNumeric("."));
+  EXPECT_FALSE(isNumeric(".e+1"));
+  EXPECT_FALSE(isNumeric(".1e"));
+  EXPECT_FALSE(isNumeric(".1e+"));
+  EXPECT_FALSE(isNumeric(".1e++1"));
+
+  EXPECT_FALSE(isNumeric("+0x2AF3"));
+  EXPECT_FALSE(isNumeric("-0x2AF3"));
+  EXPECT_FALSE(isNumeric("0x2AF3Z"));
+  EXPECT_FALSE(isNumeric("0o012345678"));
+  EXPECT_FALSE(isNumeric("-0o012345678"));
+
+  // Deprecated formats: as for YAML 1.2 specification, the following are not
+  // valid numbers anymore:
+  //
+  // * Octal numbers with "0" prefix instead of "0o"
+  // * Sexagecimal numbers
+  // * Decimal numbers with comma s the delimiter
+  // * "inf", "nan" without '.' prefix
+  EXPECT_FALSE(isNumeric("3:25:45"));
+  EXPECT_FALSE(isNumeric("014"));
+  EXPECT_FALSE(isNumeric("+12,345"));
+  EXPECT_FALSE(isNumeric("-inf"));
+  EXPECT_FALSE(isNumeric("1,230.15"));
+}
Index: llvm/include/llvm/Support/YAMLTraits.h
===================================================================
--- llvm/include/llvm/Support/YAMLTraits.h
+++ llvm/include/llvm/Support/YAMLTraits.h
@@ -27,6 +27,7 @@
 #include <cctype>
 #include <cstddef>
 #include <cstdint>
+#include <iterator>
 #include <map>
 #include <memory>
 #include <new>
@@ -449,46 +450,76 @@
   static bool const value = (sizeof(test<DocumentListTraits<T>>(nullptr))==1);
 };
 
-inline bool isNumber(StringRef S) {
-  static const char OctalChars[] = "01234567";
-  if (S.startswith("0") &&
-      S.drop_front().find_first_not_of(OctalChars) == StringRef::npos)
-    return true;
-
-  if (S.startswith("0o") &&
-      S.drop_front(2).find_first_not_of(OctalChars) == StringRef::npos)
-    return true;
+inline bool isNumeric(StringRef S) {
+  if (S.empty())
+    return false;
 
-  static const char HexChars[] = "0123456789abcdefABCDEF";
-  if (S.startswith("0x") &&
-      S.drop_front(2).find_first_not_of(HexChars) == StringRef::npos)
+  if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
     return true;
 
-  static const char DecChars[] = "0123456789";
-  if (S.find_first_not_of(DecChars) == StringRef::npos)
-    return true;
+  StringRef Tail = (S.front() == '-' || S.front() == '+') ? S.drop_front() : S;
 
-  if (S.equals(".inf") || S.equals(".Inf") || S.equals(".INF"))
+  if (Tail.equals(".inf") || Tail.equals(".Inf") || Tail.equals(".INF"))
     return true;
 
-  Regex FloatMatcher("^(\\.[0-9]+|[0-9]+(\\.[0-9]*)?)([eE][-+]?[0-9]+)?$");
-  if (FloatMatcher.match(S))
+  static const char HexChars[] = "0123456789abcdefABCDEF";
+  static const char OctalChars[] = "01234567";
+  bool ParseHex = S.startswith("0x");
+  bool ParseOct = S.startswith("0o");
+  if (ParseHex || ParseOct) {
+    if (S.size() < 3)
+      return false;
+    for (const auto &Char : S.drop_front(2)) {
+      if (ParseHex && std::find(std::begin(HexChars), std::end(HexChars),
+                                Char) == std::end(HexChars))
+        return false;
+      if (ParseOct && std::find(std::begin(OctalChars), std::end(OctalChars),
+                                Char) == std::end(OctalChars))
+        return false;
+    }
     return true;
+  }
 
-  return false;
-}
-
-inline bool isNumeric(StringRef S) {
-  if ((S.front() == '-' || S.front() == '+') && isNumber(S.drop_front()))
-    return true;
+  static const char DecChars[] = "0123456789";
 
-  if (isNumber(S))
-    return true;
+  // Leading zeros are not allowed.
+  if (Tail.front() == '0' && Tail.size() > 1 &&
+      std::find(std::begin(DecChars), std::end(DecChars), Tail[1]) !=
+          std::end(DecChars))
+    return false;
+
+  // Parse float: [-+]? (\. [0-9]+ | [0-9]+ (\. [0-9]* )?) ([eE] [-+]? [0-9]+)?
+  bool FoundDot = false;
+  bool FoundExponent = false;
+  for (size_t I = 0; I < Tail.size(); ++I) {
+    char Symbol = Tail[I];
+    if (Symbol == '.') {
+      // There can only be one dot in the number.
+      if (FoundDot)
+        return false;
+      FoundDot = true;
+      // If string starts with '.' it has to be followed by at least one digit.
+      if (I == 0 && (Tail.size() == 1 || Tail.find_first_of(DecChars) != 1))
+        return false;
+    } else if (Symbol == 'e' || Symbol == 'E') {
+      // There can only be one exponent sign in the number.
+      if (FoundExponent)
+        return false;
+      FoundExponent = true;
+    } else if (Symbol == '+' || Symbol == '-') {
+      // Sign can only follow an exponent sign.
+      if (!FoundExponent || (Tail[I - 1] != 'e' && Tail[I - 1] != 'E'))
+        return false;
+    } else if ('0' > Symbol || Symbol > '9') {
+      return false;
+    }
+  }
 
-  if (S.equals(".nan") || S.equals(".NaN") || S.equals(".NAN"))
-    return true;
+  // Exponent sign has been found: it should be followed by at least one digit.
+  if (FoundExponent)
+    return ('0' <= S.back() && S.back() <= '9');
 
-  return false;
+  return true;
 }
 
 inline bool isNull(StringRef S) {
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to