Author: cmtice Date: 2025-02-05T10:47:11-08:00 New Revision: d9a7498aa24a35bdd95fd20a5c63e9495b6669f6
URL: https://github.com/llvm/llvm-project/commit/d9a7498aa24a35bdd95fd20a5c63e9495b6669f6 DIFF: https://github.com/llvm/llvm-project/commit/d9a7498aa24a35bdd95fd20a5c63e9495b6669f6.diff LOG: [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (#123521) This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with https://github.com/llvm/llvm-project/pull/120971. Added: lldb/include/lldb/ValueObject/DILLexer.h lldb/source/ValueObject/DILLexer.cpp lldb/unittests/ValueObject/DILLexerTests.cpp Modified: lldb/source/ValueObject/CMakeLists.txt lldb/unittests/ValueObject/CMakeLists.txt Removed: ################################################################################ diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h new file mode 100644 index 00000000000000..e1182da5b20ab2 --- /dev/null +++ b/lldb/include/lldb/ValueObject/DILLexer.h @@ -0,0 +1,123 @@ +//===-- DILLexer.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_VALUEOBJECT_DILLEXER_H +#define LLDB_VALUEOBJECT_DILLEXER_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +namespace lldb_private::dil { + +/// Class defining the tokens generated by the DIL lexer and used by the +/// DIL parser. 
+class Token { +public: + enum Kind { + coloncolon, + eof, + identifier, + l_paren, + r_paren, + }; + + Token(Kind kind, std::string spelling, uint32_t start) + : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {} + + Kind GetKind() const { return m_kind; } + + std::string GetSpelling() const { return m_spelling; } + + bool Is(Kind kind) const { return m_kind == kind; } + + bool IsNot(Kind kind) const { return m_kind != kind; } + + bool IsOneOf(Kind kind1, Kind kind2) const { return Is(kind1) || Is(kind2); } + + template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const { + return Is(kind) || IsOneOf(Ks...); + } + + uint32_t GetLocation() const { return m_start_pos; } + + static llvm::StringRef GetTokenName(Kind kind); + +private: + Kind m_kind; + std::string m_spelling; + uint32_t m_start_pos; // offset of the token's start within the entire expression string +}; + +/// Class for doing the simple lexing required by DIL. +class DILLexer { +public: + /// Lexes all the tokens in expr and calls the private constructor + /// with the lexed tokens. Returns an error if expr cannot be lexed. + static llvm::Expected<DILLexer> Create(llvm::StringRef expr); + + /// Return the current token to be handled by the DIL parser. + const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; } + + /// Advance the current token position by N, stopping at the last lexed token. + void Advance(uint32_t N = 1) { + if (m_tokens_idx + N >= m_lexed_tokens.size()) + // N is too large; advance to the end of the lexed tokens. + m_tokens_idx = m_lexed_tokens.size() - 1; + else + m_tokens_idx += N; + } + + /// Return the lexed token N positions ahead of the 'current' token + /// being handled by the DIL parser, or the last lexed token if that is past the end. + const Token &LookAhead(uint32_t N) { + if (m_tokens_idx + N < m_lexed_tokens.size()) + return m_lexed_tokens[m_tokens_idx + N]; + + // Last token should be an 'eof' token. + return m_lexed_tokens.back(); + } + + /// Return the index for the 'current' token being handled by the DIL parser. 
+ uint32_t GetCurrentTokenIdx() { return m_tokens_idx; } + + /// Set the index for the 'current' token (to be handled by the parser) + /// to a particular position. Used for either committing 'look ahead' parsing + /// or rolling back tentative parsing. new_value must be a valid token index. + void ResetTokenIdx(uint32_t new_value) { + assert(new_value < m_lexed_tokens.size()); // NOTE(review): <cassert> is not included directly by this header; confirm it arrives transitively. + m_tokens_idx = new_value; + } + + uint32_t NumLexedTokens() { return m_lexed_tokens.size(); } + +private: + DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens) + : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)), + m_tokens_idx(0) {} + + static llvm::Expected<Token> Lex(llvm::StringRef expr, + llvm::StringRef &remainder); + + // The input string we are lexing & parsing (a non-owning view). + llvm::StringRef m_expr; + + // Holds all of the tokens lexed from the input string. + std::vector<Token> m_lexed_tokens; + + // Index into m_lexed_tokens; indicates which token the DIL parser is + // currently trying to parse/handle. + uint32_t m_tokens_idx; +}; + +} // namespace lldb_private::dil + +#endif // LLDB_VALUEOBJECT_DILLEXER_H diff --git a/lldb/source/ValueObject/CMakeLists.txt b/lldb/source/ValueObject/CMakeLists.txt index 70cb3d6d53f071..30c34472289e7b 100644 --- a/lldb/source/ValueObject/CMakeLists.txt +++ b/lldb/source/ValueObject/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_library(lldbValueObject + DILLexer.cpp ValueObject.cpp ValueObjectCast.cpp ValueObjectChild.cpp diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp new file mode 100644 index 00000000000000..c7acfec347af48 --- /dev/null +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -0,0 +1,97 @@ +//===-- DILLexer.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This implements the lexer for the Data Inspection Language (DIL), and its +// helper functions, which will eventually underlie the 'frame variable' +// command. The language that this lexer tokenizes is described in +// lldb/docs/dil-expr-lang.ebnf +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILLexer.h" +#include "lldb/Utility/Status.h" +#include "llvm/ADT/StringSwitch.h" + +namespace lldb_private::dil { + +llvm::StringRef Token::GetTokenName(Kind kind) { + switch (kind) { + case Kind::coloncolon: + return "coloncolon"; + case Kind::eof: + return "eof"; + case Kind::identifier: + return "identifier"; + case Kind::l_paren: + return "l_paren"; + case Kind::r_paren: + return "r_paren"; + } // NOTE(review): nothing is returned after the switch, so control reaches the end of this non-void function if kind matches no case; consider llvm_unreachable. +} + +static bool IsLetter(char c) { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); +} + +static bool IsDigit(char c) { return '0' <= c && c <= '9'; } + +// A word starts with a letter, underscore, or dollar sign, followed by +// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), underscores, and/or dollar signs. On success the word is removed from the front of remainder and returned; otherwise std::nullopt is returned and remainder is left unchanged. +static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr, + llvm::StringRef &remainder) { // NOTE(review): <optional> is not included directly in this file; confirm it arrives transitively. The expr parameter is not used in the body. + // Find the longest prefix consisting of letters, digits, underscores and + // '$'. If it doesn't start with a digit, then it's a word. 
+ llvm::StringRef candidate = remainder.take_while( + [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; }); + if (candidate.empty() || IsDigit(candidate[0])) + return std::nullopt; + remainder = remainder.drop_front(candidate.size()); + return candidate; +} + +llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) { + std::vector<Token> tokens; + llvm::StringRef remainder = expr; + do { + if (llvm::Expected<Token> t = Lex(expr, remainder)) { + tokens.push_back(std::move(*t)); + } else { + return t.takeError(); + } + } while (tokens.back().GetKind() != Token::eof); + return DILLexer(expr, std::move(tokens)); +} + +llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr, + llvm::StringRef &remainder) { + // Skip over leading whitespace. + remainder = remainder.ltrim(); + llvm::StringRef::iterator cur_pos = remainder.begin(); + + // Check to see if we've reached the end of our input string; if so, return an eof token located at the end of expr. + if (remainder.empty()) + return Token(Token::eof, "", (uint32_t)expr.size()); + + uint32_t position = cur_pos - expr.begin(); // offset of this token's first character within expr + std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder); + if (maybe_word) + return Token(Token::identifier, maybe_word->str(), position); + + constexpr std::pair<Token::Kind, const char *> operators[] = { + {Token::l_paren, "("}, + {Token::r_paren, ")"}, + {Token::coloncolon, "::"}, + }; + for (auto [kind, str] : operators) { + if (remainder.consume_front(str)) + return Token(kind, str, position); + } + + // Unrecognized character(s) in string; unable to lex it. 
+ return llvm::createStringError("Unable to lex input string"); +} + +} // namespace lldb_private::dil diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt index 8fcc8d62a79979..14808aa2f213a5 100644 --- a/lldb/unittests/ValueObject/CMakeLists.txt +++ b/lldb/unittests/ValueObject/CMakeLists.txt @@ -1,10 +1,12 @@ add_lldb_unittest(LLDBValueObjectTests DumpValueObjectOptionsTests.cpp + DILLexerTests.cpp LINK_LIBS lldbValueObject lldbPluginPlatformLinux lldbPluginScriptInterpreterNone + LLVMTestingSupport LINK_COMPONENTS Support diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp new file mode 100644 index 00000000000000..9e5b8efd7af80a --- /dev/null +++ b/lldb/unittests/ValueObject/DILLexerTests.cpp @@ -0,0 +1,156 @@ +//===-- DILLexerTests.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILLexer.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include <string> + +using llvm::StringRef; + +using namespace lldb_private::dil; + +llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>> +ExtractTokenData(llvm::StringRef input_expr) { + + llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr); + if (!maybe_lexer) + return maybe_lexer.takeError(); + DILLexer lexer(*maybe_lexer); + + std::vector<std::pair<Token::Kind, std::string>> data; + do { + Token tok = lexer.GetCurrentToken(); + data.push_back(std::make_pair(tok.GetKind(), tok.GetSpelling())); + lexer.Advance(); + } while (data.back().first != Token::eof); + // Don't return the eof token (the loop above always ends by pushing it). 
+ data.pop_back(); + return data; +} + +TEST(DILLexerTests, SimpleTest) { + StringRef input_expr("simple_var"); + llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + DILLexer lexer(*maybe_lexer); + Token token = lexer.GetCurrentToken(); + + EXPECT_EQ(token.GetKind(), Token::identifier); + EXPECT_EQ(token.GetSpelling(), "simple_var"); + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), Token::eof); +} + +TEST(DILLexerTests, TokenKindTest) { + Token token = Token(Token::identifier, "ident", 0); + + EXPECT_TRUE(token.Is(Token::identifier)); + EXPECT_FALSE(token.Is(Token::l_paren)); + EXPECT_TRUE(token.IsOneOf(Token::eof, Token::identifier)); + EXPECT_FALSE(token.IsOneOf(Token::l_paren, Token::r_paren, Token::coloncolon, + Token::eof)); +} + +TEST(DILLexerTests, LookAheadTest) { + StringRef input_expr("(anonymous namespace)::some_var"); + llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr); + ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + DILLexer lexer(*maybe_lexer); + Token token = lexer.GetCurrentToken(); + + // Current token is '('; check the next 4 tokens, to make + // sure they are the identifier 'anonymous', the identifier 'namespace', + // ')' and '::', in that order. + EXPECT_EQ(token.GetKind(), Token::l_paren); + EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier); + EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous"); + EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier); + EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace"); + EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren); + EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon); + + // Our current index should still be 0, as we only looked ahead; we are still + // officially on the '('. 
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u); + + // Accept the 'lookahead', so our current token is '::', which has the index + // 4 in our vector of tokens (which starts at zero). + lexer.Advance(4); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), Token::coloncolon); + EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u); + + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), Token::identifier); + EXPECT_EQ(token.GetSpelling(), "some_var"); + EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u); + EXPECT_EQ(token.GetLocation(), strlen("(anonymous namespace)::")); // NOTE(review): strlen is used but <cstring> is not included directly in this file; confirm it arrives transitively. + + lexer.Advance(); + token = lexer.GetCurrentToken(); + EXPECT_EQ(token.GetKind(), Token::eof); +} + +TEST(DILLexerTests, MultiTokenLexTest) { + EXPECT_THAT_EXPECTED( + ExtractTokenData("This string has (several ) ::identifiers"), + llvm::HasValue(testing::ElementsAre( + testing::Pair(Token::identifier, "This"), + testing::Pair(Token::identifier, "string"), + testing::Pair(Token::identifier, "has"), + testing::Pair(Token::l_paren, "("), + testing::Pair(Token::identifier, "several"), + testing::Pair(Token::r_paren, ")"), + testing::Pair(Token::coloncolon, "::"), + testing::Pair(Token::identifier, "identifiers")))); +} + +TEST(DILLexerTests, IdentifiersTest) { + // These strings should lex into identifier tokens. + std::vector<std::string> valid_identifiers = { + "$My_name1", "$pc", "abcd", "_", "_a", "_a_", "$", + "a_b", "this", "self", "a", "MyName", "namespace"}; + + // The lexer can lex these strings, but they should not be identifiers. + std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"}; + + // The lexer is expected to fail attempting to lex these strings (each starts + // with a digit, so it cannot create valid tokens out of them). + std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"}; + + // Verify that each valid identifier lexes to exactly one identifier token spelled like the input. 
+ for (auto &str : valid_identifiers) { + SCOPED_TRACE(str); + EXPECT_THAT_EXPECTED(ExtractTokenData(str), + llvm::HasValue(testing::ElementsAre( + testing::Pair(Token::identifier, str)))); + } + + // Verify that DILLexer::Create returns an error for each invalid token string. + for (auto &str : invalid_tok_strings) { + SCOPED_TRACE(str); + auto maybe_lexer = DILLexer::Create(str); + EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed()); + } + + // Verify that lexing the invalid identifiers succeeds, but that the first token is not an identifier. + for (auto &str : invalid_identifiers) { + SCOPED_TRACE(str); + llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str); + EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded()); + DILLexer lexer(*maybe_lexer); + Token token = lexer.GetCurrentToken(); + EXPECT_TRUE(token.IsNot(Token::identifier)); + EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren, + Token::r_paren)); + } +} _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits