[Lldb-commits] [lldb] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (PR #123521)

Pavel Labath via lldb-commits Mon, 27 Jan 2025 04:37:40 -0800

================
@@ -0,0 +1,189 @@
+//===-- DILLexer.cpp 
------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the recursive descent parser for the Data Inspection
+// Language (DIL), and its helper functions, which will eventually underlie the
+// 'frame variable' command. The language that this parser recognizes is
+// described in lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/ADT/StringSwitch.h"
+
+namespace lldb_private {
+
+namespace dil {
+
+llvm::StringRef Token::GetTokenName(Kind kind) {
+  switch (kind) {
+  case Kind::coloncolon:
+    return "coloncolon";
+  case Kind::eof:
+    return "eof";
+  case Kind::identifier:
+    return "identifier";
+  case Kind::invalid:
+    return "invalid";
+  case Kind::kw_namespace:
+    return "namespace";
+  case Kind::l_paren:
+    return "l_paren";
+  case Kind::none:
+    return "none";
+  case Kind::r_paren:
+    return "r_paren";
+  case Kind::unknown:
+    return "unknown";
+  }
+}
+
+static bool IsLetter(char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
+}
+
+static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or  underscores.
+llvm::iterator_range<llvm::StringRef::iterator> DILLexer::IsWord() {
+  llvm::StringRef::iterator start = m_cur_pos;
+  bool dollar_start = false;
+
+  // Must not start with a digit.
+  if (m_cur_pos == m_expr.end() || IsDigit(*m_cur_pos))
+    return llvm::make_range(m_cur_pos, m_cur_pos);
+
+  // First character *may* be a '$', for a register name or convenience
+  // variable.
+  if (*m_cur_pos == '$') {
+    dollar_start = true;
+    ++m_cur_pos;
+  }
+
+  // Contains only letters, digits or underscores
+  for (; m_cur_pos != m_expr.end(); ++m_cur_pos) {
+    char c = *m_cur_pos;
+    if (!IsLetter(c) && !IsDigit(c) && c != '_')
+      break;
+  }
+
+  // If first char is '$', make sure there's at least one mare char, or it's
+  // invalid.
+  if (dollar_start && (m_cur_pos - start <= 1)) {
+    m_cur_pos = start;
+    return llvm::make_range(start, start); // Empty range
+  }
+
+  return llvm::make_range(start, m_cur_pos);
+}
+
+void DILLexer::UpdateLexedTokens(Token &result, Token::Kind tok_kind,
+                                 std::string tok_str, uint32_t tok_pos) {
+  Token new_token(tok_kind, tok_str, tok_pos);
+  result = new_token;
+  m_lexed_tokens.push_back(std::move(new_token));
+}
+
+llvm::Expected<bool> DILLexer::LexAll() {
+  bool done = false;
+  while (!done) {
+    auto tok_or_err = Lex();
+    if (!tok_or_err)
+      return tok_or_err.takeError();
+    Token token = *tok_or_err;
+    if (token.GetKind() == Token::eof) {
+      done = true;
+    }
+  }
+  return true;
+}
+
+llvm::Expected<Token> DILLexer::Lex() {
+  Token result;
+
+  // Skip over whitespace (spaces).
+  while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
+    m_cur_pos++;
+
+  // Check to see if we've reached the end of our input string.
+  if (m_cur_pos == m_expr.end()) {
+    UpdateLexedTokens(result, Token::eof, "", (uint32_t)m_expr.size());
+    return result;
+  }
+
+  uint32_t position = m_cur_pos - m_expr.begin();
+  llvm::StringRef::iterator start = m_cur_pos;
+  llvm::iterator_range<llvm::StringRef::iterator> word_range = IsWord();
+  if (!word_range.empty()) {
+    uint32_t length = word_range.end() - word_range.begin();
+    llvm::StringRef word(m_expr.substr(position, length));
+    // We will be adding more keywords here in the future...
+    Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
+                           .Case("namespace", Token::kw_namespace)
+                           .Default(Token::identifier);
+    UpdateLexedTokens(result, kind, word.str(), position);
+    return result;
+  }
+
+  m_cur_pos = start;
+  llvm::StringRef remainder(m_expr.substr(position, m_expr.end() - m_cur_pos));
+  std::vector<std::pair<Token::Kind, const char *>> operators = {
+      {Token::l_paren, "("},
+      {Token::r_paren, ")"},
+      {Token::coloncolon, "::"},
+  };
+  for (auto [kind, str] : operators) {
+    if (remainder.consume_front(str)) {
+      m_cur_pos += strlen(str);
+      UpdateLexedTokens(result, kind, str, position);
+      return result;
+    }
+  }
+
+  // Unrecognized character(s) in string; unable to lex it.
+  Status error("Unable to lex input string");
+  return error.ToError();
+}
+
+const Token &DILLexer::LookAhead(uint32_t N) {
+  if (m_tokens_idx + N + 1 < m_lexed_tokens.size())
----------------
labath wrote:


You didn't say anything about what you make of my suggestion to use an 
"infinite" stream of `eof` tokens at the end. The current implementation uses 
an infinite stream of `invalid` tokens, but I doubt that anything cares whether 
it's working with an invalid or eof token (if you're looking ahead, chances are 
you know what you're expecting to find, and you just want to know whether your 
token matches that expectation).

That would allow us to get rid of another pseudo-token and the m_invalid_token 
member.

https://github.com/llvm/llvm-project/pull/123521
_______________________________________________
lldb-commits mailing list
lldb-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits

[Lldb-commits] [lldb] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (PR #123521)

Reply via email to