Author: Sam McCall Date: 2020-06-10T11:40:23+02:00 New Revision: f2c8f6e16d25ca356f58995109292735b222b1b7
URL: https://github.com/llvm/llvm-project/commit/f2c8f6e16d25ca356f58995109292735b222b1b7 DIFF: https://github.com/llvm/llvm-project/commit/f2c8f6e16d25ca356f58995109292735b222b1b7.diff LOG: [clangd] Log rather than assert on bad UTF-8. Summary: I don't love this behavior, but it prevents crashing when indexing boost headers, and I can't think of a better practical alternative. Fixes https://reviews.llvm.org/D81530 Based on a patch by AnakinZheng! Reviewers: kadircet Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits, AnakinZheng Tags: #clang Differential Revision: https://reviews.llvm.org/D81530 Added: Modified: clang-tools-extra/clangd/SourceCode.cpp clang-tools-extra/clangd/unittests/SourceCodeTests.cpp clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp Removed: ################################################################################ diff --git a/clang-tools-extra/clangd/SourceCode.cpp b/clang-tools-extra/clangd/SourceCode.cpp index f6f9371d436f..c8502c580996 100644 --- a/clang-tools-extra/clangd/SourceCode.cpp +++ b/clang-tools-extra/clangd/SourceCode.cpp @@ -55,8 +55,14 @@ namespace clangd { // Iterates over unicode codepoints in the (UTF-8) string. For each, // invokes CB(UTF-8 length, UTF-16 length), and breaks if it returns true. // Returns true if CB returned true, false if we hit the end of string. +// +// If the string is not valid UTF-8, we log this error and "decode" the +// text in some arbitrary way. This is pretty sad, but this tends to happen deep +// within indexing of headers where clang misdetected the encoding, and +// propagating the error all the way back up is (probably?) not be worth it. template <typename Callback> static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { + bool LoggedInvalid = false; // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). // Astral codepoints are encoded as 4 bytes in UTF-8, starting with 11110xxx. for (size_t I = 0; I < U8.size();) { @@ -70,9 +76,20 @@ static bool iterateCodepoints(llvm::StringRef U8, const Callback &CB) { // This convenient property of UTF-8 holds for all non-ASCII characters. size_t UTF8Length = llvm::countLeadingOnes(C); // 0xxx is ASCII, handled above. 10xxx is a trailing byte, invalid here. - // 11111xxx is not valid UTF-8 at all. Assert because it's probably our bug. - assert((UTF8Length >= 2 && UTF8Length <= 4) && - "Invalid UTF-8, or transcoding bug?"); + // 11111xxx is not valid UTF-8 at all, maybe some ISO-8859-*. + if (LLVM_UNLIKELY(UTF8Length < 2 || UTF8Length > 4)) { + if (!LoggedInvalid) { + elog("File has invalid UTF-8 near offset {0}: {1}", I, llvm::toHex(U8)); + LoggedInvalid = true; + } + // We can't give a correct result, but avoid returning something wild. + // Pretend this is a valid ASCII byte, for lack of better options. + // (Too late to get ISO-8859-* right, we've skipped some bytes already). + if (CB(1, 1)) + return true; + ++I; + continue; + } I += UTF8Length; // Skip over all trailing bytes. // A codepoint takes two UTF-16 code unit if it's astral (outside BMP). // Astral codepoints are encoded as 4 bytes in UTF-8 (11110xxx ...) diff --git a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp index 0521327b623b..9c3ae4df51ff 100644 --- a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp +++ b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp @@ -71,6 +71,21 @@ TEST(SourceCodeTests, lspLength) { EXPECT_EQ(lspLength("😂"), 1UL); } +TEST(SourceCodeTests, lspLengthBadUTF8) { + // Results are not well-defined if source file isn't valid UTF-8. + // However we shouldn't crash or return something totally wild. + const char *BadUTF8[] = {"\xa0", "\xff\xff\xff\xff\xff"}; + + for (OffsetEncoding Encoding : + {OffsetEncoding::UTF8, OffsetEncoding::UTF16, OffsetEncoding::UTF32}) { + WithContextValue UTF32(kCurrentOffsetEncoding, Encoding); + for (const char *Bad : BadUTF8) { + EXPECT_GE(lspLength(Bad), 0u); + EXPECT_LE(lspLength(Bad), strlen(Bad)); + } + } +} + // The = → 🡆 below are ASCII (1 byte), BMP (3 bytes), and astral (4 bytes). const char File[] = R"(0:0 = 0 1:0 → 8 diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp index d3cd9f542de3..d90f99ff9f8c 100644 --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -1490,6 +1490,20 @@ TEST_F(SymbolCollectorTest, InvalidSourceLoc) { EXPECT_THAT(Symbols, Contains(QName("operator delete"))); } +TEST_F(SymbolCollectorTest, BadUTF8) { + // Extracted from boost/spirit/home/support/char_encoding/iso8859_1.hpp + // This looks like UTF-8 and fools clang, but has high-ISO-8859-1 comments. + const char *Header = "int PUNCT = 0;\n" + "int types[] = { /* \xa1 */PUNCT };"; + CollectorOpts.RefFilter = RefKind::All; + CollectorOpts.RefsInHeaders = true; + runSymbolCollector(Header, ""); + EXPECT_THAT(Symbols, Contains(QName("types"))); + EXPECT_THAT(Symbols, Contains(QName("PUNCT"))); + // Reference is stored, although offset within line is not reliable. + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PUNCT").ID, _))); +} + } // namespace } // namespace clangd } // namespace clang _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits