[llvm-branch-commits] [llvm] [AllocToken] Introduce AllocToken instrumentation pass (PR #156838)

Oliver Hunt via llvm-branch-commits Thu, 25 Sep 2025 14:55:42 -0700

================
@@ -0,0 +1,469 @@
+//===- AllocToken.cpp - Allocation token instrumentation 
------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements AllocToken, an instrumentation pass that
+// replaces allocation calls with token-enabled versions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/AllocToken.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Analysis.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/RandomNumberGenerator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/xxhash.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <variant>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "alloc-token"
+
+namespace {
+
+//===--- Constants 
--------------------------------------------------------===//
+
+enum class TokenMode : unsigned {
+  /// Incrementally increasing token ID.
+  Increment = 0,
+
+  /// Simple mode that returns a statically-assigned random token ID.
+  Random = 1,
+
+  /// Token ID based on allocated type hash.
+  TypeHash = 2,
+};
+
+//===--- Command-line options 
---------------------------------------------===//
+
+cl::opt<TokenMode>
+    ClMode("alloc-token-mode", cl::Hidden, cl::desc("Token assignment mode"),
+           cl::init(TokenMode::TypeHash),
+           cl::values(clEnumValN(TokenMode::Increment, "increment",
+                                 "Incrementally increasing token ID"),
+                      clEnumValN(TokenMode::Random, "random",
+                                 "Statically-assigned random token ID"),
+                      clEnumValN(TokenMode::TypeHash, "typehash",
+                                 "Token ID based on allocated type hash")));
+
+cl::opt<std::string> ClFuncPrefix("alloc-token-prefix",
+                                  cl::desc("The allocation function prefix"),
+                                  cl::Hidden, cl::init("__alloc_token_"));
+
+cl::opt<uint64_t> ClMaxTokens("alloc-token-max",
+                              cl::desc("Maximum number of tokens (0 = no 
max)"),
+                              cl::Hidden, cl::init(0));
+
+cl::opt<bool>
+    ClFastABI("alloc-token-fast-abi",
+              cl::desc("The token ID is encoded in the function name"),
+              cl::Hidden, cl::init(false));
+
+// Instrument libcalls only by default - compatible allocators only need to 
take
+// care of providing standard allocation functions. With extended coverage, 
also
+// instrument non-libcall allocation function calls with !alloc_token
+// metadata.
+cl::opt<bool>
+    ClExtended("alloc-token-extended",
+               cl::desc("Extend coverage to custom allocation functions"),
+               cl::Hidden, cl::init(false));
+
+// C++ defines ::operator new (and variants) as replaceable (vs. standard
+// library versions), which are nobuiltin, and are therefore not covered by
+// isAllocationFn(). Cover by default, as users of AllocToken are already
+// required to provide token-aware allocation functions (no defaults).
+cl::opt<bool> ClCoverReplaceableNew("alloc-token-cover-replaceable-new",
+                                    cl::desc("Cover replaceable operator new"),
+                                    cl::Hidden, cl::init(true));
+
+cl::opt<uint64_t> ClFallbackToken(
+    "alloc-token-fallback",
+    cl::desc("The default fallback token where none could be determined"),
+    cl::Hidden, cl::init(0));
+
+//===--- Statistics 
-------------------------------------------------------===//
+
+STATISTIC(NumFunctionsInstrumented, "Functions instrumented");
+STATISTIC(NumAllocationsInstrumented, "Allocations instrumented");
+
+//===----------------------------------------------------------------------===//
+
+/// Returns the !alloc_token metadata if available.
+///
+/// Expected format is: !{<type-name>}
+MDNode *getAllocTokenMetadata(const CallBase &CB) {
+  MDNode *Ret = CB.getMetadata(LLVMContext::MD_alloc_token);
+  if (!Ret)
+    return nullptr;
+  assert(Ret->getNumOperands() == 1 && "bad !alloc_token");
+  assert(isa<MDString>(Ret->getOperand(0)));
+  return Ret;
+}
+
+class ModeBase {
+public:
+  explicit ModeBase(uint64_t MaxTokens) : MaxTokens(MaxTokens) {}
+
+protected:
+  uint64_t boundedToken(uint64_t Val) const {
+    return MaxTokens ? Val % MaxTokens : Val;
+  }
+
+  const uint64_t MaxTokens;
+};
+
+/// Implementation for TokenMode::Increment.
+class IncrementMode : public ModeBase {
+public:
+  using ModeBase::ModeBase;
+
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &) {
+    return boundedToken(Counter++);
+  }
+
+private:
+  uint64_t Counter = 0;
+};
+
+/// Implementation for TokenMode::Random.
+class RandomMode : public ModeBase {
+public:
+  RandomMode(uint64_t MaxTokens, std::unique_ptr<RandomNumberGenerator> RNG)
+      : ModeBase(MaxTokens), RNG(std::move(RNG)) {}
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &) {
+    return boundedToken((*RNG)());
+  }
+
+private:
+  std::unique_ptr<RandomNumberGenerator> RNG;
+};
+
+/// Implementation for TokenMode::TypeHash. The implementation ensures
+/// hashes are stable across different compiler invocations. Uses xxHash as the
+/// hash function.
+class TypeHashMode : public ModeBase {
+public:
+  using ModeBase::ModeBase;
+
+  uint64_t operator()(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+    if (MDNode *N = getAllocTokenMetadata(CB)) {
+      MDString *S = cast<MDString>(N->getOperand(0));
+      return boundedToken(xxHash64(S->getString()));
+    }
+    remarkNoMetadata(CB, ORE);
+    return ClFallbackToken;
+  }
+
+  /// Remark that there was no precise type information.
+  static void remarkNoMetadata(const CallBase &CB,
+                               OptimizationRemarkEmitter &ORE) {
+    ORE.emit([&] {
+      ore::NV FuncNV("Function", CB.getParent()->getParent());
+      const Function *Callee = CB.getCalledFunction();
+      ore::NV CalleeNV("Callee", Callee ? Callee->getName() : "<unknown>");
+      return OptimizationRemark(DEBUG_TYPE, "NoAllocToken", &CB)
+             << "Call to '" << CalleeNV << "' in '" << FuncNV
+             << "' without source-level type token";
+    });
+  }
+};
+
+// Apply opt overrides.
+AllocTokenOptions transformOptionsFromCl(AllocTokenOptions Opts) {
+  if (!Opts.MaxTokens.has_value())
+    Opts.MaxTokens = ClMaxTokens;
+  Opts.FastABI |= ClFastABI;
+  Opts.Extended |= ClExtended;
+  return Opts;
+}
+
+class AllocToken {
+public:
+  explicit AllocToken(AllocTokenOptions Opts, Module &M,
+                      ModuleAnalysisManager &MAM)
+      : Options(transformOptionsFromCl(std::move(Opts))), Mod(M),
+        FAM(MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
+        Mode(IncrementMode(*Options.MaxTokens)) {
+    switch (ClMode.getValue()) {
+    case TokenMode::Increment:
+      break;
+    case TokenMode::Random:
+      Mode.emplace<RandomMode>(*Options.MaxTokens, M.createRNG(DEBUG_TYPE));
+      break;
+    case TokenMode::TypeHash:
+      Mode.emplace<TypeHashMode>(*Options.MaxTokens);
+      break;
+    }
+  }
+
+  bool instrumentFunction(Function &F);
+
+private:
+  /// Returns true for !isAllocationFn() functions that are also eligible for
+  /// instrumentation.
+  static bool isInstrumentableLibFunc(LibFunc Func, const Value *V,
+                                      const TargetLibraryInfo *TLI);
+
+  /// Returns true for isAllocationFn() functions that we should ignore.
+  static bool ignoreInstrumentableLibFunc(LibFunc Func);
+
+  /// Replace a call/invoke with a call/invoke to the allocation function
+  /// with token ID.
+  bool replaceAllocationCall(CallBase *CB, LibFunc Func,
+                             OptimizationRemarkEmitter &ORE,
+                             const TargetLibraryInfo &TLI);
+
+  /// Return replacement function for a LibFunc that takes a token ID.
+  FunctionCallee getTokenAllocFunction(const CallBase &CB, uint64_t TokenID,
+                                       LibFunc OriginalFunc);
+
+  /// Return the token ID from metadata in the call.
+  uint64_t getToken(const CallBase &CB, OptimizationRemarkEmitter &ORE) {
+    return std::visit([&](auto &&Mode) { return Mode(CB, ORE); }, Mode);
+  }
+
+  const AllocTokenOptions Options;
+  Module &Mod;
+  IntegerType *IntPtrTy = Mod.getDataLayout().getIntPtrType(Mod.getContext());
+  FunctionAnalysisManager &FAM;
+  // Cache for replacement functions.
+  DenseMap<std::pair<LibFunc, uint64_t>, FunctionCallee> TokenAllocFunctions;
+  // Selected mode.
+  std::variant<IncrementMode, RandomMode, TypeHashMode> Mode;
+};
+
+bool AllocToken::instrumentFunction(Function &F) {
+  // Do not apply any instrumentation for naked functions.
+  if (F.hasFnAttribute(Attribute::Naked))
+    return false;
+  if (F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation))
+    return false;
+  // Don't touch available_externally functions, their actual body is 
elsewhere.
+  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+    return false;
+  // Only instrument functions that have the sanitize_alloc_token attribute.
+  if (!F.hasFnAttribute(Attribute::SanitizeAllocToken))
+    return false;
+
+  auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+  SmallVector<std::pair<CallBase *, LibFunc>, 4> AllocCalls;
+
+  // Collect all allocation calls to avoid iterator invalidation.
+  for (Instruction &I : instructions(F)) {
+    auto *CB = dyn_cast<CallBase>(&I);
+    if (!CB)
+      continue;
+    const Function *Callee = CB->getCalledFunction();
+    if (!Callee)
+      continue;
+    // Ignore nobuiltin of the CallBase, so that we can cover nobuiltin 
libcalls
+    // if requested via isInstrumentableLibFunc(). Note that isAllocationFn() 
is
+    // returning false for nobuiltin calls.
+    LibFunc Func;
+    if (TLI.getLibFunc(*Callee, Func)) {
+      if (ignoreInstrumentableLibFunc(Func))
+        continue;
+      if (isInstrumentableLibFunc(Func, CB, &TLI))
+        AllocCalls.emplace_back(CB, Func);
+    } else if (Options.Extended && getAllocTokenMetadata(*CB)) {
+      AllocCalls.emplace_back(CB, NotLibFunc);
+    }
+  }
+
+  bool Modified = false;
+  for (auto &[CB, Func] : AllocCalls)
+    Modified |= replaceAllocationCall(CB, Func, ORE, TLI);
+
+  if (Modified)
+    NumFunctionsInstrumented++;
+  return Modified;
+}
+
+bool AllocToken::isInstrumentableLibFunc(LibFunc Func, const Value *V,
+                                         const TargetLibraryInfo *TLI) {
+  if (isAllocationFn(V, TLI))
+    return true;
+
+  switch (Func) {
+  case LibFunc_posix_memalign:
----------------
ojhunt wrote:


out parameter returns are a problem we have with TMO as well (currently we 
assume pointer return == allocation return) so include any casts on that output 
if we are unable to infer the type from the expression, but I'm pondering 
including an optional "this is where the returned allocation is" parameter to 
the attribute. In principle I think the -fbounds-safety attributes could be 
used to infer the out location but those aren't available in c++, and I'm not 
sure that such a guess is particularly trivial.

Of course TMO does not care about error/alloc failed responses (it's purely 
intended as an input side semantic info flag)

https://github.com/llvm/llvm-project/pull/156838
_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

[llvm-branch-commits] [llvm] [AllocToken] Introduce AllocToken instrumentation pass (PR #156838)

Reply via email to