[Lldb-commits] [PATCH] D123375: [lldb][intelpt] Reduce trace memory usage by grouping instructions

Alisamar Husain via Phabricator via lldb-commits Fri, 08 Apr 2022 03:18:22 -0700

zrthxn created this revision.
zrthxn added reviewers: wallace, jj10306.
Herald added a project: All.
zrthxn requested review of this revision.
Herald added a project: LLDB.
Herald added a subscriber: lldb-commits.


Grouping instructions with the same high 48 bits and storing prefixes. Simple 
lookup improved to get xponentially faster instruction address lookups.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D123375

Files:
  lldb/source/Plugins/Trace/intel-pt/DecodedThread.cpp
  lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
  lldb/source/Plugins/Trace/intel-pt/IntelPTDecoder.cpp
  lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.cpp
  lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.h

Index: lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.h
===================================================================
--- lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.h
+++ lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.h
@@ -46,6 +46,8 @@
   DecodedThreadSP m_decoded_thread_sp;
   /// Internal instruction index currently pointing at.
   size_t m_pos;
+  /// Internal instruction ID currently pointing at.
+  lldb::user_id_t m_id;
   /// Tsc range covering the current instruction.
   llvm::Optional<DecodedThread::TscRange> m_tsc_range;
 };
Index: lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.cpp
===================================================================
--- lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.cpp
+++ lldb/source/Plugins/Trace/intel-pt/TraceCursorIntelPT.cpp
@@ -23,8 +23,9 @@
   assert(m_decoded_thread_sp->GetInstructionsCount() > 0 &&
          "a trace should have at least one instruction or error");
   m_pos = m_decoded_thread_sp->GetInstructionsCount() - 1;
+  m_id = m_decoded_thread_sp->ToID(m_pos);
   m_tsc_range =
-      m_decoded_thread_sp->CalculateTscRange(m_pos, /*hint_range*/ None);
+      m_decoded_thread_sp->CalculateTscRangeByIndex(m_pos, /*hint_range*/ None);
 }
 
 size_t TraceCursorIntelPT::GetInternalInstructionSize() {
@@ -42,6 +43,7 @@
 
   while (canMoveOne()) {
     m_pos += IsForwards() ? 1 : -1;
+    m_id = m_decoded_thread_sp->ToID(m_pos);
 
     if (m_tsc_range && !m_tsc_range->InRange(m_pos))
       m_tsc_range = IsForwards() ? m_tsc_range->Next() : m_tsc_range->Prev();
@@ -81,12 +83,14 @@
   };
 
   int64_t dist = FindDistanceAndSetPos();
-  m_tsc_range = m_decoded_thread_sp->CalculateTscRange(m_pos, m_tsc_range);
+  m_id = m_decoded_thread_sp->ToID(m_pos);
+  m_tsc_range =
+      m_decoded_thread_sp->CalculateTscRangeByIndex(m_pos, m_tsc_range);
   return dist;
 }
 
 bool TraceCursorIntelPT::IsError() {
-  return m_decoded_thread_sp->IsInstructionAnError(m_pos);
+  return m_decoded_thread_sp->IsInstructionAnError(m_id);
 }
 
 const char *TraceCursorIntelPT::GetError() {
@@ -94,7 +98,7 @@
 }
 
 lldb::addr_t TraceCursorIntelPT::GetLoadAddress() {
-  return m_decoded_thread_sp->GetInstructionLoadAddress(m_pos);
+  return m_decoded_thread_sp->GetInstructionLoadAddress(m_id);
 }
 
 Optional<uint64_t>
@@ -110,16 +114,19 @@
 
 TraceInstructionControlFlowType
 TraceCursorIntelPT::GetInstructionControlFlowType() {
-  return m_decoded_thread_sp->GetInstructionControlFlowType(m_pos);
+  return m_decoded_thread_sp->GetInstructionControlFlowType(m_id);
 }
 
 bool TraceCursorIntelPT::GoToId(user_id_t id) {
-  if (m_decoded_thread_sp->GetInstructionsCount() <= id)
+  size_t idx = m_decoded_thread_sp->ToIndex(id);
+  if (m_decoded_thread_sp->GetInstructionsCount() <= idx)
     return false;
-  m_pos = id;
-  m_tsc_range = m_decoded_thread_sp->CalculateTscRange(m_pos, m_tsc_range);
+  m_pos = idx;
+  m_id = id;
+  m_tsc_range =
+      m_decoded_thread_sp->CalculateTscRangeByIndex(m_pos, m_tsc_range);
 
   return true;
 }
 
-user_id_t TraceCursorIntelPT::GetId() const { return m_pos; }
+user_id_t TraceCursorIntelPT::GetId() const { return m_id; }
Index: lldb/source/Plugins/Trace/intel-pt/IntelPTDecoder.cpp
===================================================================
--- lldb/source/Plugins/Trace/intel-pt/IntelPTDecoder.cpp
+++ lldb/source/Plugins/Trace/intel-pt/IntelPTDecoder.cpp
@@ -197,6 +197,8 @@
       AppendInstruction(decoded_thread, insn, tsc_info);
     }
   }
+
+  decoded_thread.CalcIPBlockSizes();
 }
 
 /// Callback used by libipt for reading the process memory.
Index: lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
===================================================================
--- lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
+++ lldb/source/Plugins/Trace/intel-pt/DecodedThread.h
@@ -109,6 +109,31 @@
     void RecordError(int libipt_error_code);
   };
 
+  /// \struct InstructionPointer
+  /// Structure that stores a 6 byte prefix a list of 2 byte suffixes for each
+  /// instruction that is appended to the trace, in order. Concatenating the
+  /// prefix and suffix for any instruction ID (which is the concatenation of
+  /// two indices) will give the instruction load address.
+  ///
+  /// Generally most consecutive instructions will be close. Unless you are
+  /// doing a function call between different shared libraries, all your
+  /// instructions should be close in memory. This means that the addresses of
+  /// most consecutive instructions will have the same prefix. For we'll divide
+  /// the instruction addresses by the first 48 bits (6 bytes).
+  /// Each new instruction will now occupy 2 bytes. And each time you see a
+  /// change in the first 6 bytes, you'll pay the cost of storing that prefix,
+  /// which takes 8 bytes.
+  struct InstructionPointer {
+    uint64_t insn_prefix;
+    std::vector<uint16_t> suffixes;
+
+    InstructionPointer(uint64_t prefix, uint16_t suffix)
+        : insn_prefix(prefix), suffixes(std::vector<uint16_t>({suffix})){};
+
+    size_t AppendInstructionSuffix(uint16_t suffix);
+    uint64_t GetFullInstructionAddress(size_t suffix_index) const;
+  };
+
   DecodedThread(lldb::ThreadSP thread_sp);
 
   /// Utility constructor that initializes the trace with a provided error.
@@ -132,10 +157,23 @@
   /// message with the \a GetErrorByInstructionIndex() method.
   size_t GetInstructionsCount() const;
 
+  /// For each \a InstructionPointer block, calculate the net number of
+  /// instructions upto that block. See also, \a m_ipblock_sizes member.
+  void CalcIPBlockSizes();
+
+  /// Convert instruction ID to index
+  size_t ToIndex(lldb::user_id_t id) const;
+  /// Convert instruction index to ID
+  lldb::user_id_t ToID(size_t index) const;
+  /// Calculate next instruction ID
+  lldb::user_id_t NextID(lldb::user_id_t id) const;
+  /// Calculate previous instruction ID
+  lldb::user_id_t PrevID(lldb::user_id_t id) const;
+
   /// \return
   ///     The load address of the instruction at the given index, or \a
   ///     LLDB_INVALID_ADDRESS if it is an error.
-  lldb::addr_t GetInstructionLoadAddress(size_t insn_index) const;
+  lldb::addr_t GetInstructionLoadAddress(size_t insn_id) const;
 
   /// Get the \a lldb::TraceInstructionControlFlowType categories of the
   /// instruction.
@@ -143,7 +181,7 @@
   /// \return
   ///     The control flow categories, or \b 0 if the instruction is an error.
   lldb::TraceInstructionControlFlowType
-  GetInstructionControlFlowType(size_t insn_index) const;
+  GetInstructionControlFlowType(lldb::user_id_t insn_id) const;
 
   /// Construct the TSC range that covers the given instruction index.
   /// This operation is O(logn) and should be used sparingly.
@@ -158,7 +196,7 @@
   ///   An optional range that might include the given index or might be a
   ///   neighbor of it. It might help speed it traversals of the trace with
   ///   short jumps.
-  llvm::Optional<TscRange> CalculateTscRange(
+  llvm::Optional<TscRange> CalculateTscRangeByIndex(
       size_t insn_index,
       const llvm::Optional<DecodedThread::TscRange> &hint_range) const;
 
@@ -214,9 +252,11 @@
   /// When adding new members to this class, make sure
   /// to update \a CalculateApproximateMemoryUsage() accordingly.
   lldb::ThreadSP m_thread_sp;
-  /// The low level storage of all instruction addresses. Each instruction has
-  /// an index in this vector and it will be used in other parts of the code.
-  std::vector<lldb::addr_t> m_instruction_ips;
+  /// The low level storage of all instruction addresses. This stores
+  /// instructions in blocks such that instructions with the same high 48 bits
+  /// are stored together. Each instruction has an ID in this vector and it
+  /// will be used in other parts of the code.
+  std::vector<InstructionPointer> m_instruction_blocks;
   /// The size in bytes of each instruction.
   std::vector<uint8_t> m_instruction_sizes;
   /// The libipt instruction class for each instruction.
@@ -231,8 +271,11 @@
   std::map<size_t, uint64_t> m_instruction_timestamps;
   /// This is the chronologically last TSC that has been added.
   llvm::Optional<uint64_t> m_last_tsc = llvm::None;
-  // This variables stores the messages of all the error instructions in the
-  // trace. It maps `instruction index -> error message`.
+  /// For each \a InstructionPointer block, this contains the net number of
+  /// instructions upto that block
+  std::vector<size_t> m_ipblock_sizes;
+  /// This variables stores the messages of all the error instructions in the
+  /// trace. It maps `instruction index -> error message`.
   llvm::DenseMap<uint64_t, std::string> m_errors;
   /// The size in bytes of the raw buffer before decoding. It might be None if
   /// the decoding failed.
Index: lldb/source/Plugins/Trace/intel-pt/DecodedThread.cpp
===================================================================
--- lldb/source/Plugins/Trace/intel-pt/DecodedThread.cpp
+++ lldb/source/Plugins/Trace/intel-pt/DecodedThread.cpp
@@ -10,6 +10,7 @@
 
 #include <intel-pt.h>
 #include <memory>
+#include <vector>
 
 #include "TraceCursorIntelPT.h"
 #include "lldb/Utility/StreamString.h"
@@ -21,7 +22,7 @@
 
 char IntelPTError::ID;
 
-IntelPTError::IntelPTError(int libipt_error_code, lldb::addr_t address)
+IntelPTError::IntelPTError(int libipt_error_code, addr_t address)
     : m_libipt_error_code(libipt_error_code), m_address(address) {
   assert(libipt_error_code < 0);
 }
@@ -35,27 +36,124 @@
   OS << "error: " << libipt_error_message;
 }
 
+DecodedThread::DecodedThread(ThreadSP thread_sp) : m_thread_sp(thread_sp) {}
+
+DecodedThread::DecodedThread(ThreadSP thread_sp, Error &&error)
+    : m_thread_sp(thread_sp) {
+  AppendError(std::move(error));
+}
+
+void DecodedThread::SetRawTraceSize(size_t size) { m_raw_trace_size = size; }
+
+TraceCursorUP DecodedThread::GetCursor() {
+  // We insert a fake error signaling an empty trace if needed becasue the
+  // TraceCursor requires non-empty traces.
+  if (m_instruction_blocks.empty())
+    AppendError(createStringError(inconvertibleErrorCode(), "empty trace"));
+  return std::make_unique<TraceCursorIntelPT>(m_thread_sp, shared_from_this());
+}
+
 Optional<size_t> DecodedThread::GetRawTraceSize() const {
   return m_raw_trace_size;
 }
 
 size_t DecodedThread::GetInstructionsCount() const {
-  return m_instruction_ips.size();
+  size_t count = 0;
+  for (const InstructionPointer &ip : m_instruction_blocks)
+    count += ip.suffixes.size();
+  return count;
+}
+
+uint64_t DecodedThread::InstructionPointer::GetFullInstructionAddress(
+    size_t suffix_index) const {
+  uint16_t suffix = suffixes[suffix_index];
+  uint64_t address = (insn_prefix << 16) | (uint64_t)suffix;
+  return address;
+}
+
+void DecodedThread::CalcIPBlockSizes() {
+  size_t acc_insn_count = 0;
+  for (const InstructionPointer &ip : m_instruction_blocks) {
+    acc_insn_count += ip.suffixes.size();
+    m_ipblock_sizes.push_back(acc_insn_count);
+  }
 }
 
-lldb::addr_t DecodedThread::GetInstructionLoadAddress(size_t insn_index) const {
-  return m_instruction_ips[insn_index];
+size_t DecodedThread::ToIndex(user_id_t id) const {
+  return m_ipblock_sizes[id >> 32] + (UINT32_MAX & id);
+}
+
+user_id_t DecodedThread::ToID(size_t index) const {
+  if (m_ipblock_sizes.size() == 0) {
+    // Simple slow search if block sizes were not set up
+    size_t prefix_index = 0;
+    for (const InstructionPointer &ip : m_instruction_blocks) {
+      if (index >= 0 && index < ip.suffixes.size())
+        return (prefix_index << 32 | UINT32_MAX & index);
+
+      index -= ip.suffixes.size();
+      prefix_index++;
+    }
+
+    return LLDB_INVALID_ADDRESS;
+  }
+
+  // Binary fast search if block sizes are available
+  size_t lower = 0, upper = m_ipblock_sizes.size() - 1;
+  while (upper != lower && upper - lower != 1) {
+    size_t mid = (upper + lower) / 2;
+    if (index > m_ipblock_sizes[mid])
+      upper = mid;
+    else
+      lower = mid;
+  }
+
+  if (lower > 0)
+    index -= m_ipblock_sizes[lower];
+  return (upper << 32 | UINT32_MAX & index);
+}
+
+user_id_t DecodedThread::NextID(user_id_t id) const {
+  size_t prefix_idx = id >> 32;
+  size_t suffix_idx = UINT32_MAX & id;
+  if (m_instruction_blocks[prefix_idx].suffixes.size() - 1 > suffix_idx &&
+      suffix_idx < UINT32_MAX)
+    return id + 1;
+  else if (m_instruction_blocks.size() - 1 > prefix_idx &&
+           prefix_idx < UINT32_MAX)
+    return (prefix_idx + 1) << 32;
+  else
+    return id; // Sorry cant help you, already at MAX,MAX
+}
+
+user_id_t DecodedThread::PrevID(user_id_t id) const {
+  size_t prefix_idx = id >> 32;
+  size_t suffix_idx = UINT32_MAX & id;
+  if (suffix_idx < 0)
+    return id - 1;
+  else if (prefix_idx < 0)
+    return (prefix_idx - 1) << 32 |
+           (UINT32_MAX &
+            (m_instruction_blocks[prefix_idx - 1].suffixes.size() - 1));
+  else
+    return id; // Sorry cant help you, already at 0,0
+}
+
+addr_t DecodedThread::GetInstructionLoadAddress(user_id_t insn_id) const {
+  return m_instruction_blocks[insn_id >> 32].GetFullInstructionAddress(
+      UINT32_MAX & insn_id);
 }
 
 TraceInstructionControlFlowType
-DecodedThread::GetInstructionControlFlowType(size_t insn_index) const {
-  if (IsInstructionAnError(insn_index))
+DecodedThread::GetInstructionControlFlowType(user_id_t insn_id) const {
+  if (IsInstructionAnError(insn_id))
     return (TraceInstructionControlFlowType)0;
 
   TraceInstructionControlFlowType mask =
       eTraceInstructionControlFlowTypeInstruction;
 
-  lldb::addr_t load_address = m_instruction_ips[insn_index];
+  addr_t load_address = GetInstructionLoadAddress(insn_id);
+  size_t insn_index = ToIndex(insn_id);
   uint8_t insn_byte_size = m_instruction_sizes[insn_index];
   pt_insn_class iclass = m_instruction_classes[insn_index];
 
@@ -64,8 +162,9 @@
   case ptic_jump:
   case ptic_far_jump:
     mask |= eTraceInstructionControlFlowTypeBranch;
-    if (insn_index + 1 < m_instruction_ips.size() &&
-        load_address + insn_byte_size != m_instruction_ips[insn_index + 1])
+    if (insn_index + 1 < m_instruction_blocks.size() &&
+        load_address + insn_byte_size !=
+            GetInstructionLoadAddress(NextID(insn_id)))
       mask |= eTraceInstructionControlFlowTypeTakenBranch;
     break;
   case ptic_return:
@@ -85,20 +184,24 @@
 
 ThreadSP DecodedThread::GetThread() { return m_thread_sp; }
 
-void DecodedThread::RecordTscForLastInstruction(uint64_t tsc) {
-  if (!m_last_tsc || *m_last_tsc != tsc) {
-    // In case the first instructions are errors or did not have a TSC, we'll
-    // get a first valid TSC not in position 0. We can safely force these error
-    // instructions to use the first valid TSC, so that all the trace has TSCs.
-    size_t start_index =
-        m_instruction_timestamps.empty() ? 0 : m_instruction_ips.size() - 1;
-    m_instruction_timestamps.emplace(start_index, tsc);
-    m_last_tsc = tsc;
-  }
+size_t
+DecodedThread::InstructionPointer::AppendInstructionSuffix(uint16_t suffix) {
+  suffixes.push_back(suffix);
+  return suffixes.size() - 1;
 }
 
 void DecodedThread::AppendInstruction(const pt_insn &insn) {
-  m_instruction_ips.emplace_back(insn.ip);
+  uint16_t suffix = UINT16_MAX & insn.ip;
+  uint64_t prefix = insn.ip >> 16;
+
+  auto last_ip = m_instruction_blocks.end();
+  last_ip--;
+  if (!m_instruction_blocks.empty() && last_ip->insn_prefix == prefix &&
+      last_ip->suffixes.size() < UINT32_MAX)
+    last_ip->AppendInstructionSuffix(suffix);
+  else
+    m_instruction_blocks.emplace_back(prefix, suffix);
+
   m_instruction_sizes.emplace_back(insn.size);
   m_instruction_classes.emplace_back(insn.iclass);
 }
@@ -109,10 +212,10 @@
 }
 
 void DecodedThread::AppendError(llvm::Error &&error) {
-  m_errors.try_emplace(m_instruction_ips.size(), toString(std::move(error)));
-  m_instruction_ips.emplace_back(LLDB_INVALID_ADDRESS);
-  m_instruction_sizes.emplace_back(0);
-  m_instruction_classes.emplace_back(pt_insn_class::ptic_error);
+  m_errors.try_emplace(m_instruction_blocks.size(), toString(std::move(error)));
+  AppendInstruction(pt_insn{.ip = LLDB_INVALID_ADDRESS,
+                            .iclass = pt_insn_class::ptic_error,
+                            .size = 0});
 }
 
 void DecodedThread::AppendError(llvm::Error &&error, uint64_t tsc) {
@@ -125,6 +228,18 @@
   total_count++;
 }
 
+bool DecodedThread::IsInstructionAnError(user_id_t insn_id) const {
+  return GetInstructionLoadAddress(insn_id) == LLDB_INVALID_ADDRESS;
+}
+
+const char *DecodedThread::GetErrorByInstructionIndex(size_t insn_idx) {
+  auto it = m_errors.find(insn_idx);
+  if (it == m_errors.end())
+    return nullptr;
+
+  return it->second.c_str();
+}
+
 void DecodedThread::RecordTscError(int libipt_error_code) {
   m_tsc_errors.RecordError(libipt_error_code);
 }
@@ -133,7 +248,7 @@
   return m_tsc_errors;
 }
 
-Optional<DecodedThread::TscRange> DecodedThread::CalculateTscRange(
+Optional<DecodedThread::TscRange> DecodedThread::CalculateTscRangeByIndex(
     size_t insn_index,
     const Optional<DecodedThread::TscRange> &hint_range) const {
   // We first try to check the given hint range in case we are traversing the
@@ -159,41 +274,16 @@
   return TscRange(--it, *this);
 }
 
-bool DecodedThread::IsInstructionAnError(size_t insn_idx) const {
-  return m_instruction_ips[insn_idx] == LLDB_INVALID_ADDRESS;
-}
-
-const char *DecodedThread::GetErrorByInstructionIndex(size_t insn_idx) {
-  auto it = m_errors.find(insn_idx);
-  if (it == m_errors.end())
-    return nullptr;
-
-  return it->second.c_str();
-}
-
-DecodedThread::DecodedThread(ThreadSP thread_sp) : m_thread_sp(thread_sp) {}
-
-DecodedThread::DecodedThread(ThreadSP thread_sp, Error &&error)
-    : m_thread_sp(thread_sp) {
-  AppendError(std::move(error));
-}
-
-void DecodedThread::SetRawTraceSize(size_t size) { m_raw_trace_size = size; }
-
-lldb::TraceCursorUP DecodedThread::GetCursor() {
-  // We insert a fake error signaling an empty trace if needed becasue the
-  // TraceCursor requires non-empty traces.
-  if (m_instruction_ips.empty())
-    AppendError(createStringError(inconvertibleErrorCode(), "empty trace"));
-  return std::make_unique<TraceCursorIntelPT>(m_thread_sp, shared_from_this());
-}
-
-size_t DecodedThread::CalculateApproximateMemoryUsage() const {
-  return sizeof(pt_insn::ip) * m_instruction_ips.size() +
-         sizeof(pt_insn::size) * m_instruction_sizes.size() +
-         sizeof(pt_insn::iclass) * m_instruction_classes.size() +
-         (sizeof(size_t) + sizeof(uint64_t)) * m_instruction_timestamps.size() +
-         m_errors.getMemorySize();
+void DecodedThread::RecordTscForLastInstruction(uint64_t tsc) {
+  if (!m_last_tsc || *m_last_tsc != tsc) {
+    // In case the first instructions are errors or did not have a TSC, we'll
+    // get a first valid TSC not in position 0. We can safely force these error
+    // instructions to use the first valid TSC, so that all the trace has TSCs.
+    size_t start_index =
+        m_instruction_timestamps.empty() ? 0 : m_instruction_blocks.size() - 1;
+    m_instruction_timestamps.emplace(start_index, tsc);
+    m_last_tsc = tsc;
+  }
 }
 
 DecodedThread::TscRange::TscRange(std::map<size_t, uint64_t>::const_iterator it,
@@ -236,3 +326,11 @@
   --prev_it;
   return TscRange(prev_it, *m_decoded_thread);
 }
+
+size_t DecodedThread::CalculateApproximateMemoryUsage() const {
+  return sizeof(InstructionPointer) * m_instruction_blocks.size() +
+         sizeof(pt_insn::size) * m_instruction_sizes.size() +
+         sizeof(pt_insn::iclass) * m_instruction_classes.size() +
+         (sizeof(size_t) + sizeof(uint64_t)) * m_instruction_timestamps.size() +
+         m_errors.getMemorySize();
+}

_______________________________________________
lldb-commits mailing list
lldb-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits

[Lldb-commits] [PATCH] D123375: [lldb][intelpt] Reduce trace memory usage by grouping instructions

Reply via email to