Author: cmtice
Date: 2025-02-05T10:47:11-08:00
New Revision: d9a7498aa24a35bdd95fd20a5c63e9495b6669f6


LOG: [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (#123521)

This adds the basic lexer, with unittests, for the Data Inspection
Language (DIL) -- see

This version of the lexer only handles local variables and namespaces,
and is designed to work with




diff  --git a/lldb/include/lldb/ValueObject/DILLexer.h 
new file mode 100644
index 00000000000000..e1182da5b20ab2
--- /dev/null
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,123 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
+// See for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+namespace lldb_private::dil {
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class Token {
+  enum Kind {
+    coloncolon,
+    eof,
+    identifier,
+    l_paren,
+    r_paren,
+  };
+  Token(Kind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
+  Kind GetKind() const { return m_kind; }
+  std::string GetSpelling() const { return m_spelling; }
+  bool Is(Kind kind) const { return m_kind == kind; }
+  bool IsNot(Kind kind) const { return m_kind != kind; }
+  bool IsOneOf(Kind kind1, Kind kind2) const { return Is(kind1) || Is(kind2); }
+  template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
+    return Is(kind) || IsOneOf(Ks...);
+  }
+  uint32_t GetLocation() const { return m_start_pos; }
+  static llvm::StringRef GetTokenName(Kind kind);
+  Kind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+  /// Lexes all the tokens in expr and calls the private constructor
+  /// with the lexed tokens.
+  static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
+  /// Return the current token to be handled by the DIL parser.
+  const Token &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
+  /// Advance the current token position by N. If that would move past the
+  /// end of the lexed tokens, the position is clamped to the last lexed token
+  /// instead.
+  void Advance(uint32_t N = 1) {
+    // Compare N against the distance to the last token rather than computing
+    // m_tokens_idx + N: that sum is evaluated in uint32_t and can wrap around
+    // for large N, which would pass the bounds check and move the position
+    // backwards instead of clamping.
+    const auto last_idx = m_lexed_tokens.size() - 1;
+    if (N >= last_idx - m_tokens_idx)
+      // N is too large; advance to the end of the lexed tokens.
+      m_tokens_idx = static_cast<uint32_t>(last_idx);
+    else
+      m_tokens_idx += N;
+  }
+  /// Return the lexed token N positions ahead of the 'current' token
+  /// being handled by the DIL parser. If that position is past the end of the
+  /// lexed tokens, the last lexed token is returned instead.
+  const Token &LookAhead(uint32_t N) {
+    // Test N against the number of tokens from the current one to the end
+    // rather than computing m_tokens_idx + N in the condition: that sum is
+    // evaluated in uint32_t and can wrap around for large N, which would
+    // select an earlier token instead of the trailing one.
+    if (N < m_lexed_tokens.size() - m_tokens_idx)
+      return m_lexed_tokens[m_tokens_idx + N];
+    // Last token should be an 'eof' token.
+    return m_lexed_tokens.back();
+  }
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+  /// Set the index for the 'current' token (to be handled by the parser)
+  /// to a particular position. Used for either committing 'look ahead' parsing
+  /// or rolling back tentative parsing. new_value must index a lexed token.
+  void ResetTokenIdx(uint32_t new_value) {
+    assert(new_value < m_lexed_tokens.size()); // Precondition check only.
+    m_tokens_idx = new_value;
+  }
+  uint32_t NumLexedTokens() { return m_lexed_tokens.size(); }
+  DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
+      : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
+        m_tokens_idx(0) {}
+  static llvm::Expected<Token> Lex(llvm::StringRef expr,
+                                   llvm::StringRef &remainder);
+  // The input string we are lexing & parsing.
+  llvm::StringRef m_expr;
+  // Holds all of the tokens lexed so far.
+  std::vector<Token> m_lexed_tokens;
+  // Index into m_lexed_tokens; indicates which token the DIL parser is
+  // currently trying to parse/handle.
+  uint32_t m_tokens_idx;
+} // namespace lldb_private::dil

diff  --git a/lldb/source/ValueObject/CMakeLists.txt 
index 70cb3d6d53f071..30c34472289e7b 100644
--- a/lldb/source/ValueObject/CMakeLists.txt
+++ b/lldb/source/ValueObject/CMakeLists.txt
@@ -1,4 +1,5 @@
+  DILLexer.cpp

diff  --git a/lldb/source/ValueObject/DILLexer.cpp 
new file mode 100644
index 00000000000000..c7acfec347af48
--- /dev/null
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,97 @@
+//===-- DILLexer.cpp 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
+// See for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// This implements the lexer for the Data Inspection Language (DIL), and its
+// helper functions, which will eventually underlie the 'frame variable'
+// command. The language that this lexer tokenizes is described in
+// lldb/docs/dil-expr-lang.ebnf
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/ADT/StringSwitch.h"
+namespace lldb_private::dil {
+llvm::StringRef Token::GetTokenName(Kind kind) {
+  switch (kind) {
+  case Kind::coloncolon:
+    return "coloncolon";
+  case Kind::eof:
+    return "eof";
+  case Kind::identifier:
+    return "identifier";
+  case Kind::l_paren:
+    return "l_paren";
+  case Kind::r_paren:
+    return "r_paren";
+  }
+static bool IsLetter(char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
+static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), underscores, and/or dollar
+// signs.
+static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
+                                             llvm::StringRef &remainder) {
+  // Find the longest prefix consisting of letters, digits, underscores and
+  // '$'. If it is non-empty and doesn't start with a digit, then it's a word.
+  llvm::StringRef candidate = remainder.take_while(
+      [](char c) { return IsDigit(c) || IsLetter(c) || c == '_' || c == '$'; 
+  if (candidate.empty() || IsDigit(candidate[0]))
+    return std::nullopt;
+  remainder = remainder.drop_front(candidate.size());
+  return candidate;
+llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
+  std::vector<Token> tokens;
+  llvm::StringRef remainder = expr;
+  do {
+    if (llvm::Expected<Token> t = Lex(expr, remainder)) {
+      tokens.push_back(std::move(*t));
+    } else {
+      return t.takeError();
+    }
+  } while (tokens.back().GetKind() != Token::eof);
+  return DILLexer(expr, std::move(tokens));
+llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
+                                    llvm::StringRef &remainder) {
+  // Skip over leading whitespace (spaces, tabs, newlines, etc.).
+  remainder = remainder.ltrim();
+  llvm::StringRef::iterator cur_pos = remainder.begin();
+  // Check to see if we've reached the end of our input string.
+  if (remainder.empty())
+    return Token(Token::eof, "", (uint32_t)expr.size());
+  uint32_t position = cur_pos - expr.begin();
+  std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
+  if (maybe_word)
+    return Token(Token::identifier, maybe_word->str(), position);
+  constexpr std::pair<Token::Kind, const char *> operators[] = {
+      {Token::l_paren, "("},
+      {Token::r_paren, ")"},
+      {Token::coloncolon, "::"},
+  };
+  for (auto [kind, str] : operators) {
+    if (remainder.consume_front(str))
+      return Token(kind, str, position);
+  }
+  // Unrecognized character(s) in string; unable to lex it.
+  return llvm::createStringError("Unable to lex input string");
+} // namespace lldb_private::dil

diff  --git a/lldb/unittests/ValueObject/CMakeLists.txt 
index 8fcc8d62a79979..14808aa2f213a5 100644
--- a/lldb/unittests/ValueObject/CMakeLists.txt
+++ b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,10 +1,12 @@
+  DILLexerTests.cpp
+    LLVMTestingSupport

diff  --git a/lldb/unittests/ValueObject/DILLexerTests.cpp 
new file mode 100644
index 00000000000000..9e5b8efd7af80a
--- /dev/null
+++ b/lldb/unittests/ValueObject/DILLexerTests.cpp
@@ -0,0 +1,156 @@
+//===-- DILLexerTests.cpp --------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
+// See for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include <string>
+using llvm::StringRef;
+using namespace lldb_private::dil;
+llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
+ExtractTokenData(llvm::StringRef input_expr) {
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  if (!maybe_lexer)
+    return maybe_lexer.takeError();
+  DILLexer lexer(*maybe_lexer);
+  std::vector<std::pair<Token::Kind, std::string>> data;
+  do {
+    Token tok = lexer.GetCurrentToken();
+    data.push_back(std::make_pair(tok.GetKind(), tok.GetSpelling()));
+    lexer.Advance();
+  } while (data.back().first != Token::eof);
+  // Don't return the eof token.
+  data.pop_back();
+  return data;
+TEST(DILLexerTests, SimpleTest) {
+  StringRef input_expr("simple_var");
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  DILLexer lexer(*maybe_lexer);
+  Token token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetKind(), Token::identifier);
+  EXPECT_EQ(token.GetSpelling(), "simple_var");
+  lexer.Advance();
+  token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetKind(), Token::eof);
+TEST(DILLexerTests, TokenKindTest) {
+  Token token = Token(Token::identifier, "ident", 0);
+  EXPECT_TRUE(token.Is(Token::identifier));
+  EXPECT_FALSE(token.Is(Token::l_paren));
+  EXPECT_TRUE(token.IsOneOf(Token::eof, Token::identifier));
+  EXPECT_FALSE(token.IsOneOf(Token::l_paren, Token::r_paren, Token::coloncolon,
+                             Token::eof));
+TEST(DILLexerTests, LookAheadTest) {
+  StringRef input_expr("(anonymous namespace)::some_var");
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  DILLexer lexer(*maybe_lexer);
+  Token token = lexer.GetCurrentToken();
+  // Current token is '('; check the next 4 tokens, to make
+  // sure they are the identifier 'anonymous', the identifier 'namespace',
+  // ')' and '::', in that order.
+  EXPECT_EQ(token.GetKind(), Token::l_paren);
+  EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
+  EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
+  EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
+  EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
+  EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
+  EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);
+  // Our current index should still be 0, as we only looked ahead; we are still
+  // officially on the '('.
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);
+  // Accept the 'lookahead', so our current token is '::', which has the index
+  // 4 in our vector of tokens (which starts at zero).
+  lexer.Advance(4);
+  token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetKind(), Token::coloncolon);
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);
+  lexer.Advance();
+  token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetKind(), Token::identifier);
+  EXPECT_EQ(token.GetSpelling(), "some_var");
+  EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
+  EXPECT_EQ(token.GetLocation(), strlen("(anonymous namespace)::"));
+  lexer.Advance();
+  token = lexer.GetCurrentToken();
+  EXPECT_EQ(token.GetKind(), Token::eof);
+TEST(DILLexerTests, MultiTokenLexTest) {
+      ExtractTokenData("This string has (several ) ::identifiers"),
+      llvm::HasValue(testing::ElementsAre(
+          testing::Pair(Token::identifier, "This"),
+          testing::Pair(Token::identifier, "string"),
+          testing::Pair(Token::identifier, "has"),
+          testing::Pair(Token::l_paren, "("),
+          testing::Pair(Token::identifier, "several"),
+          testing::Pair(Token::r_paren, ")"),
+          testing::Pair(Token::coloncolon, "::"),
+          testing::Pair(Token::identifier, "identifiers"))));
+TEST(DILLexerTests, IdentifiersTest) {
+  // These strings should lex into identifier tokens.
+  std::vector<std::string> valid_identifiers = {
+      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
+      "a_b",       "this", "self", "a", "MyName", "namespace"};
+  // The lexer can lex these strings, but they should not be identifiers.
+  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};
+  // The lexer is expected to fail attempting to lex these strings (it cannot
+  // create valid tokens out of them).
+  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};
+  // Verify that all of the valid identifiers come out as identifier tokens.
+  for (auto &str : valid_identifiers) {
+    SCOPED_TRACE(str);
+    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
+                         llvm::HasValue(testing::ElementsAre(
+                             testing::Pair(Token::identifier, str))));
+  }
+  // Verify that the lexer fails on invalid token strings.
+  for (auto &str : invalid_tok_strings) {
+    SCOPED_TRACE(str);
+    auto maybe_lexer = DILLexer::Create(str);
+    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
+  }
+  // Verify that none of the invalid identifiers come out as identifier tokens.
+  for (auto &str : invalid_identifiers) {
+    SCOPED_TRACE(str);
+    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
+    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+    DILLexer lexer(*maybe_lexer);
+    Token token = lexer.GetCurrentToken();
+    EXPECT_TRUE(token.IsNot(Token::identifier));
+    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
+                              Token::r_paren));
+  }

