https://github.com/cmtice created https://github.com/llvm/llvm-project/pull/123521
This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with https://github.com/llvm/llvm-project/pull/120971. >From 468f73f8539dcb8addf8ed9618d9eb797dabbb01 Mon Sep 17 00:00:00 2001 From: Caroline Tice <cmt...@google.com> Date: Sun, 19 Jan 2025 09:15:34 -0800 Subject: [PATCH] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with https://github.com/llvm/llvm-project/pull/120971. --- lldb/include/lldb/ValueObject/DILLexer.h | 156 ++++++++++++++ lldb/source/ValueObject/DILLexer.cpp | 205 +++++++++++++++++++ lldb/unittests/ValueObject/CMakeLists.txt | 1 + lldb/unittests/ValueObject/DILLexerTests.cpp | 193 +++++++++++++++++ 4 files changed, 555 insertions(+) create mode 100644 lldb/include/lldb/ValueObject/DILLexer.h create mode 100644 lldb/source/ValueObject/DILLexer.cpp create mode 100644 lldb/unittests/ValueObject/DILLexerTests.cpp diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h new file mode 100644 index 00000000000000..45c506b2f4106d --- /dev/null +++ b/lldb/include/lldb/ValueObject/DILLexer.h @@ -0,0 +1,156 @@ +//===-- DILLexer.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_VALUEOBJECT_DILLEXER_H_ +#define LLDB_VALUEOBJECT_DILLEXER_H_ + +#include "llvm/ADT/StringRef.h" +#include <cstdint> +#include <limits.h> +#include <memory> +#include <string> +#include <vector> + +namespace lldb_private { + +namespace dil { + +enum class TokenKind { + coloncolon, + eof, + identifier, + invalid, + kw_namespace, + l_paren, + none, + r_paren, + unknown, +}; + +/// Class defining the tokens generated by the DIL lexer and used by the +/// DIL parser. +class DILToken { +public: + DILToken(dil::TokenKind kind, std::string spelling, uint32_t start) + : m_kind(kind), m_spelling(spelling), m_start_pos(start) {} + + DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {} + + void setKind(dil::TokenKind kind) { m_kind = kind; } + dil::TokenKind getKind() const { return m_kind; } + + std::string getSpelling() const { return m_spelling; } + + uint32_t getLength() const { return m_spelling.size(); } + + bool is(dil::TokenKind kind) const { return m_kind == kind; } + + bool isNot(dil::TokenKind kind) const { return m_kind != kind; } + + bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const { + return is(kind1) || is(kind2); + } + + template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const { + return is(kind) || isOneOf(Ks...); + } + + uint32_t getLocation() const { return m_start_pos; } + + void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) { + m_kind = kind; + m_spelling = spelling; + m_start_pos = start; + } + + static const std::string getTokenName(dil::TokenKind kind); + +private: + dil::TokenKind m_kind; + std::string m_spelling; + uint32_t m_start_pos; // within entire expression string +}; + +/// Class for doing the simple lexing required by DIL. 
+class DILLexer { +public: + DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) { + m_cur_pos = m_expr.begin(); + // Use UINT_MAX to indicate invalid/uninitialized value. + m_tokens_idx = UINT_MAX; + } + + bool Lex(DILToken &result, bool look_ahead = false); + + bool Is_Word(std::string::iterator start, uint32_t &length); + + uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); } + + /// Update 'result' with the other parameter values, create a + /// duplicate token, and push the duplicate token onto the vector of + /// lexed tokens. + void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind, + std::string tok_str, uint32_t tok_pos); + + /// Return the lexed token N+1 positions ahead of the 'current' token + /// being handled by the DIL parser. + const DILToken &LookAhead(uint32_t N); + + const DILToken &AcceptLookAhead(uint32_t N); + + /// Return the index for the 'current' token being handled by the DIL parser. + uint32_t GetCurrentTokenIdx() { return m_tokens_idx; } + + /// Return the current token to be handled by the DIL parser. + DILToken &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; } + + /// Update the index for the 'current' token, to point to the next lexed + /// token. + bool IncrementTokenIdx() { + if (m_tokens_idx >= m_lexed_tokens.size() - 1) + return false; + + m_tokens_idx++; + return true; + } + + /// Set the index for the 'current' token (to be handled by the parser) + /// to a particular position. Used for either committing 'look ahead' parsing + /// or rolling back tentative parsing. + bool ResetTokenIdx(uint32_t new_value) { + if (new_value > m_lexed_tokens.size() - 1) + return false; + + m_tokens_idx = new_value; + return true; + } + +private: + // The input string we are lexing & parsing. + std::string m_expr; + + // The current position of the lexer within m_expr (the character position, + // within the string, of the next item to be lexed). 
+ std::string::iterator m_cur_pos; + + // Holds all of the tokens lexed so far. + std::vector<DILToken> m_lexed_tokens; + + // Index into m_lexed_tokens; indicates which token the DIL parser is + // currently trying to parse/handle. + uint32_t m_tokens_idx; + + // "invalid" token; to be returned by lexer when 'look ahead' fails. + DILToken m_invalid_token; +}; + +} // namespace dil + +} // namespace lldb_private + +#endif // LLDB_VALUEOBJECT_DILLEXER_H_ diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp new file mode 100644 index 00000000000000..4c2b0b1813bb96 --- /dev/null +++ b/lldb/source/ValueObject/DILLexer.cpp @@ -0,0 +1,205 @@ +//===-- DILLexer.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This implements the lexer for the Data Inspection Language (DIL), and its +// helper functions; DIL will eventually underlie the 'frame variable' +// command. The language whose tokens this lexer recognizes is +// described in lldb/docs/dil-expr-lang.ebnf +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILLexer.h" +#include "llvm/ADT/StringMap.h" + +namespace lldb_private { + +namespace dil { + +// For fast keyword lookup. More keywords will be added later. 
+const llvm::StringMap<dil::TokenKind> Keywords = { + {"namespace", dil::TokenKind::kw_namespace}, +}; + +const std::string DILToken::getTokenName(dil::TokenKind kind) { + switch (kind) { + case dil::TokenKind::coloncolon: + return "coloncolon"; + case dil::TokenKind::eof: + return "eof"; + case dil::TokenKind::identifier: + return "identifier"; + case dil::TokenKind::kw_namespace: + return "namespace"; + case dil::TokenKind::l_paren: + return "l_paren"; + case dil::TokenKind::r_paren: + return "r_paren"; + case dil::TokenKind::unknown: + return "unknown"; + default: + return "token_name"; + } +} + +static bool Is_Letter(char c) { + if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) + return true; + return false; +} + +static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); } + +// A word starts with a letter, underscore, or dollar sign, followed by +// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores. +bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) { + bool done = false; + bool dollar_start = false; + + // Must not start with a digit. + if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos)) + return false; + + // First character *may* be a '$', for a register name or convenience + // variable. + if (*m_cur_pos == '$') { + dollar_start = true; + ++m_cur_pos; + length++; + } + + // Contains only letters, digits or underscores + for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) { + char c = *m_cur_pos; + if (!Is_Letter(c) && !Is_Digit(c) && c != '_') { + done = true; + break; + } else + length++; + } + + if (dollar_start && length > 1) // Must have something besides just '$' + return true; + + if (!dollar_start && length > 0) + return true; + + // Not a valid word, so re-set the lexing position. 
+ m_cur_pos = start; + return false; +} + +void DILLexer::UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind, + std::string tok_str, uint32_t tok_pos) { + DILToken new_token; + result.setValues(tok_kind, tok_str, tok_pos); + new_token = result; + m_lexed_tokens.push_back(std::move(new_token)); +} + +bool DILLexer::Lex(DILToken &result, bool look_ahead) { + bool retval = true; + + if (!look_ahead) { + // We're being asked for the 'next' token, and not a part of a LookAhead. + // Check to see if we've already lexed it and pushed it onto our tokens + // vector; if so, return the next token from the vector, rather than doing + // more lexing. + if ((m_tokens_idx != UINT_MAX) && + (m_tokens_idx < m_lexed_tokens.size() - 1)) { + result = m_lexed_tokens[m_tokens_idx + 1]; + return retval; + } + } + + // Skip over whitespace (spaces). + while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ') + m_cur_pos++; + + // Check to see if we've reached the end of our input string. + if (m_cur_pos == m_expr.end()) { + UpdateLexedTokens(result, dil::TokenKind::eof, "", m_expr.length()); + return retval; + } + + uint32_t position = m_cur_pos - m_expr.begin(); + ; + std::string::iterator start = m_cur_pos; + uint32_t length = 0; + if (Is_Word(start, length)) { + dil::TokenKind kind; + std::string word = m_expr.substr(position, length); + auto iter = Keywords.find(word); + if (iter != Keywords.end()) + kind = iter->second; + else + kind = dil::TokenKind::identifier; + + UpdateLexedTokens(result, kind, word, position); + return true; + } + + switch (*m_cur_pos) { + case '(': + m_cur_pos++; + UpdateLexedTokens(result, dil::TokenKind::l_paren, "(", position); + return true; + case ')': + m_cur_pos++; + UpdateLexedTokens(result, dil::TokenKind::r_paren, ")", position); + return true; + case ':': + if (position + 1 < m_expr.size() && m_expr[position + 1] == ':') { + m_cur_pos += 2; + UpdateLexedTokens(result, dil::TokenKind::coloncolon, "::", position); + return true; + } + break; + 
default: + break; + } + // Empty Token + result.setValues(dil::TokenKind::none, "", m_expr.length()); + return false; +} + +const DILToken &DILLexer::LookAhead(uint32_t N) { + uint32_t extra_lexed_tokens = m_lexed_tokens.size() - m_tokens_idx - 1; + + if (N + 1 < extra_lexed_tokens) + return m_lexed_tokens[m_tokens_idx + N + 1]; + + uint32_t remaining_tokens = + (m_tokens_idx + N + 1) - m_lexed_tokens.size() + 1; + + bool done = false; + bool look_ahead = true; + while (!done && remaining_tokens > 0) { + DILToken tok; + Lex(tok, look_ahead); + if (tok.getKind() == dil::TokenKind::eof) + done = true; + remaining_tokens--; + }; + + if (remaining_tokens > 0) { + m_invalid_token.setValues(dil::TokenKind::invalid, "", 0); + return m_invalid_token; + } + + return m_lexed_tokens[m_tokens_idx + N + 1]; +} + +const DILToken &DILLexer::AcceptLookAhead(uint32_t N) { + if (m_tokens_idx + N + 1 > m_lexed_tokens.size()) + return m_invalid_token; + + m_tokens_idx += N + 1; + return m_lexed_tokens[m_tokens_idx]; +} + +} // namespace dil + +} // namespace lldb_private diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt index 8fcc8d62a79979..952f5411a98057 100644 --- a/lldb/unittests/ValueObject/CMakeLists.txt +++ b/lldb/unittests/ValueObject/CMakeLists.txt @@ -1,5 +1,6 @@ add_lldb_unittest(LLDBValueObjectTests DumpValueObjectOptionsTests.cpp + DILLexerTests.cpp LINK_LIBS lldbValueObject diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp new file mode 100644 index 00000000000000..ec6ff86b64d36b --- /dev/null +++ b/lldb/unittests/ValueObject/DILLexerTests.cpp @@ -0,0 +1,193 @@ +//===-- DILLexerTests.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/ValueObject/DILLexer.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <string> + +using llvm::StringRef; + +TEST(DILLexerTests, SimpleTest) { + StringRef dil_input_expr("simple_var"); + uint32_t tok_len = 10; + lldb_private::dil::DILLexer dil_lexer(dil_input_expr); + lldb_private::dil::DILToken dil_token; + dil_token.setKind(lldb_private::dil::TokenKind::unknown); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::unknown); + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + EXPECT_EQ(dil_token.getSpelling(), "simple_var"); + EXPECT_EQ(dil_token.getLength(), tok_len); + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof); +} + +TEST(DILLexerTests, TokenKindTest) { + StringRef dil_input_expr("namespace"); + lldb_private::dil::DILLexer dil_lexer(dil_input_expr); + lldb_private::dil::DILToken dil_token; + dil_token.setKind(lldb_private::dil::TokenKind::unknown); + + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX); + dil_lexer.ResetTokenIdx(0); + + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::kw_namespace); + EXPECT_TRUE(dil_token.isNot(lldb_private::dil::TokenKind::identifier)); + EXPECT_FALSE(dil_token.is(lldb_private::dil::TokenKind::l_paren)); + EXPECT_TRUE(dil_token.isOneOf(lldb_private::dil::TokenKind::eof, + lldb_private::dil::TokenKind::kw_namespace)); + EXPECT_FALSE(dil_token.isOneOf(lldb_private::dil::TokenKind::l_paren, + lldb_private::dil::TokenKind::r_paren, + lldb_private::dil::TokenKind::coloncolon, + lldb_private::dil::TokenKind::eof)); + + dil_token.setKind(lldb_private::dil::TokenKind::identifier); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); +} + +TEST(DILLexerTests, LookAheadTest) { 
+ StringRef dil_input_expr("(anonymous namespace)::some_var"); + lldb_private::dil::DILLexer dil_lexer(dil_input_expr); + lldb_private::dil::DILToken dil_token; + dil_token.setKind(lldb_private::dil::TokenKind::unknown); + uint32_t expect_loc = 23; + + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX); + dil_lexer.ResetTokenIdx(0); + + // Current token is '('; check the next 4 tokens, to make + // sure they are the identifier 'anonymous', the namespace keyword, + // ')' and '::', in that order. + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::l_paren); + EXPECT_EQ(dil_lexer.LookAhead(0).getKind(), + lldb_private::dil::TokenKind::identifier); + EXPECT_EQ(dil_lexer.LookAhead(0).getSpelling(), "anonymous"); + EXPECT_EQ(dil_lexer.LookAhead(1).getKind(), + lldb_private::dil::TokenKind::kw_namespace); + EXPECT_EQ(dil_lexer.LookAhead(2).getKind(), + lldb_private::dil::TokenKind::r_paren); + EXPECT_EQ(dil_lexer.LookAhead(3).getKind(), + lldb_private::dil::TokenKind::coloncolon); + // Verify we've advanced our position counter (lexing location) in the + // input 23 characters (the length of '(anonymous namespace)::'. + EXPECT_EQ(dil_lexer.GetLocation(), expect_loc); + + // Our current index should still be 0, as we only looked ahead; we are still + // officially on the '('. + EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 0); + + // Accept the 'lookahead', so our current token is '::', which has the index + // 4 in our vector of tokens (which starts at zero). 
+ dil_token = dil_lexer.AcceptLookAhead(3); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::coloncolon); + EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 4); + + // Lex the final variable name in the input string + dil_lexer.Lex(dil_token); + dil_lexer.IncrementTokenIdx(); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + EXPECT_EQ(dil_token.getSpelling(), "some_var"); + EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), 5); + + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof); +} + +TEST(DILLexerTests, MultiTokenLexTest) { + StringRef dil_input_expr("This string has several identifiers"); + lldb_private::dil::DILLexer dil_lexer(dil_input_expr); + lldb_private::dil::DILToken dil_token; + dil_token.setKind(lldb_private::dil::TokenKind::unknown); + + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_lexer.GetCurrentTokenIdx(), UINT_MAX); + dil_lexer.ResetTokenIdx(0); + + EXPECT_EQ(dil_token.getSpelling(), "This"); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + dil_lexer.Lex(dil_token); + dil_lexer.IncrementTokenIdx(); + + EXPECT_EQ(dil_token.getSpelling(), "string"); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + dil_lexer.Lex(dil_token); + dil_lexer.IncrementTokenIdx(); + + EXPECT_EQ(dil_token.getSpelling(), "has"); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + dil_lexer.Lex(dil_token); + dil_lexer.IncrementTokenIdx(); + + EXPECT_EQ(dil_token.getSpelling(), "several"); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + dil_lexer.Lex(dil_token); + dil_lexer.IncrementTokenIdx(); + + EXPECT_EQ(dil_token.getSpelling(), "identifiers"); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + dil_lexer.Lex(dil_token); + dil_lexer.IncrementTokenIdx(); + + EXPECT_EQ(dil_token.getSpelling(), ""); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::eof); +} + 
+TEST(DILLexerTests, IdentifiersTest) { + std::vector<std::string> valid_identifiers = { + "$My_name1", + "$pc", + "abcd", + "ab cd", + "_", + "_a", + "_a_", + "a_b", + "this", + "self", + "a", + "MyName" + }; + std::vector<std::string> invalid_identifiers = { + "234", + "2a", + "2", + "$", + "1MyName", + "", + "namespace" + }; + + // Verify that all of the valid identifiers come out as identifier tokens. + for (auto str : valid_identifiers) { + StringRef dil_input_expr(str); + lldb_private::dil::DILLexer dil_lexer(dil_input_expr); + lldb_private::dil::DILToken dil_token; + dil_token.setKind(lldb_private::dil::TokenKind::unknown); + + dil_lexer.Lex(dil_token); + EXPECT_EQ(dil_token.getKind(), lldb_private::dil::TokenKind::identifier); + } + + // Verify that none of the invalid identifiers come out as identifier tokens. + for (auto str : invalid_identifiers) { + StringRef dil_input_expr(str); + lldb_private::dil::DILLexer dil_lexer(dil_input_expr); + lldb_private::dil::DILToken dil_token; + dil_token.setKind(lldb_private::dil::TokenKind::unknown); + + dil_lexer.Lex(dil_token); + EXPECT_TRUE(dil_token.isNot(lldb_private::dil::TokenKind::identifier)); + EXPECT_TRUE(dil_token.isOneOf(lldb_private::dil::TokenKind::unknown, + lldb_private::dil::TokenKind::none, + lldb_private::dil::TokenKind::eof, + lldb_private::dil::TokenKind::kw_namespace)); + } +} _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits