cuda and make it header-only (#19621)

bohan Wed, 27 May 2026 12:31:04 -0700

This is an automated email from the ASF dual-hosted git repository.

spectrometerHBH pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 66a4bcd401 [REFACTOR][RUNTIME] Relocate nvtx.h to tvm/support/cuda and make it header-only (#19621)
66a4bcd401 is described below

commit 66a4bcd40154f86015c85bfe5f2de8b1e70ec235
Author: Tianqi Chen <[email protected]>
AuthorDate: Wed May 27 15:30:48 2026 -0400

    [REFACTOR][RUNTIME] Relocate nvtx.h to tvm/support/cuda and make it header-only (#19621)
    
    ## Summary
    
    The NVTXScopedRange utility is a thin RAII wrapper over nvtxRangePush/Pop
    with a no-op fallback when NVTX is not enabled. The two function bodies
    and the conditional include of `<nvtx3/nvToolsExt.h>` fit naturally inline
    in the header, eliminating the separate translation unit and its
    `TVM_RUNTIME_DLL` export annotations.
    
    - Move `include/tvm/runtime/nvtx.h` to `include/tvm/support/cuda/nvtx.h`
      under namespace `tvm::support`; delete `src/runtime/nvtx.cc`.
    - Inline the constructor/destructor; gate the real-vs-stub split with
      `TVM_NVTX_ENABLED` in the header.
    - Switch the CMake gate from a per-file `COMPILE_DEFINITIONS` on
      `nvtx.cc` to a global `add_compile_definitions(TVM_NVTX_ENABLED=1)`
      when `USE_CUDA AND USE_NVTX`, so every TU that includes the header
      agrees on the definition.
    - Update the three call-site files (`vm.cc`, `paged_kv_cache.cc`,
      `attn_utils.h`) to the new include path and qualify `NVTXScopedRange`
      as `support::NVTXScopedRange`.
---
 CMakeLists.txt                               |  2 +-
 include/tvm/{runtime => support/cuda}/nvtx.h | 48 +++++++++++++++++++++-------
 src/runtime/nvtx.cc                          | 42 ------------------------
 src/runtime/vm/attn_utils.h                  |  2 +-
 src/runtime/vm/paged_kv_cache.cc             |  4 +--
 src/runtime/vm/vm.cc                         |  4 +--
 web/emcc/wasm_runtime.cc                     |  1 -
 7 files changed, 42 insertions(+), 61 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c35af4b95..82794b185c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -834,7 +834,7 @@ if(USE_CUDA AND USE_CUTLASS)
 endif()
 
 if(USE_CUDA AND USE_NVTX)
-  set_source_files_properties(src/runtime/nvtx.cc PROPERTIES COMPILE_DEFINITIONS "TVM_NVTX_ENABLED=1")
+  add_compile_definitions(TVM_NVTX_ENABLED=1)
 endif()
 
 # Note: NCCL, NVSHMEM, RCCL target_link_libraries are handled in the inline
diff --git a/include/tvm/runtime/nvtx.h b/include/tvm/support/cuda/nvtx.h
similarity index 56%
rename from include/tvm/runtime/nvtx.h
rename to include/tvm/support/cuda/nvtx.h
index 2dbaeb9257..ef9083cfcd 100644
--- a/include/tvm/runtime/nvtx.h
+++ b/include/tvm/support/cuda/nvtx.h
@@ -16,14 +16,29 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#ifndef TVM_RUNTIME_NVTX_H_
-#define TVM_RUNTIME_NVTX_H_
-
-#include <tvm/runtime/base.h>
+/*!
+ * \file tvm/support/cuda/nvtx.h
+ * \brief NVTX scoped range utility (header-only).
+ *
+ * Provides NVTXScopedRange: a lightweight RAII wrapper over
+ * nvtxRangePush/Pop.  When TVM_NVTX_ENABLED is not defined or is 0,
+ * all methods are no-ops compiled away by the optimizer.
+ */
+#ifndef TVM_SUPPORT_CUDA_NVTX_H_
+#define TVM_SUPPORT_CUDA_NVTX_H_
 
 #include <string>
+
+#ifndef TVM_NVTX_ENABLED
+#define TVM_NVTX_ENABLED 0
+#endif
+
+#if TVM_NVTX_ENABLED
+#include <nvtx3/nvToolsExt.h>
+#endif  // TVM_NVTX_ENABLED
+
 namespace tvm {
-namespace runtime {
+namespace support {
 
 /*!
  * \brief A class to create a NVTX range. No-op if TVM is not built against NVTX.
@@ -31,11 +46,19 @@ namespace runtime {
 class NVTXScopedRange {
  public:
   /*! \brief Enter an NVTX scoped range */
-  TVM_RUNTIME_DLL explicit NVTXScopedRange(const char* name);
+#if TVM_NVTX_ENABLED
+  explicit NVTXScopedRange(const char* name) { nvtxRangePush(name); }
+#else
+  explicit NVTXScopedRange(const char* name) {}
+#endif  // TVM_NVTX_ENABLED
   /*! \brief Enter an NVTX scoped range */
   explicit NVTXScopedRange(const std::string& name) : NVTXScopedRange(name.c_str()) {}
-  /*! \brief Exist an NVTX scoped range */
-  TVM_RUNTIME_DLL ~NVTXScopedRange();
+  /*! \brief Exit an NVTX scoped range */
+#if TVM_NVTX_ENABLED
+  ~NVTXScopedRange() { nvtxRangePop(); }
+#else
+  ~NVTXScopedRange() {}
+#endif  // TVM_NVTX_ENABLED
   NVTXScopedRange(const NVTXScopedRange& other) = delete;
   NVTXScopedRange(NVTXScopedRange&& other) = delete;
   NVTXScopedRange& operator=(const NVTXScopedRange& other) = delete;
@@ -43,12 +66,13 @@ class NVTXScopedRange {
 };
 
 #ifdef _MSC_VER
-#define TVM_NVTX_FUNC_SCOPE() NVTXScopedRange _nvtx_func_scope_(__FUNCSIG__);
+#define TVM_NVTX_FUNC_SCOPE() ::tvm::support::NVTXScopedRange _nvtx_func_scope_(__FUNCSIG__);
 #else
-#define TVM_NVTX_FUNC_SCOPE() NVTXScopedRange _nvtx_func_scope_(__PRETTY_FUNCTION__);
+#define TVM_NVTX_FUNC_SCOPE() \
+  ::tvm::support::NVTXScopedRange _nvtx_func_scope_(__PRETTY_FUNCTION__);
 #endif
 
-}  // namespace runtime
+}  // namespace support
 }  // namespace tvm
 
-#endif  // TVM_RUNTIME_NVTX_H_
+#endif  // TVM_SUPPORT_CUDA_NVTX_H_
diff --git a/src/runtime/nvtx.cc b/src/runtime/nvtx.cc
deleted file mode 100644
index 9cfd788714..0000000000
--- a/src/runtime/nvtx.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#include <string>
-#ifndef TVM_NVTX_ENABLED
-#define TVM_NVTX_ENABLED 0
-#endif
-
-#if TVM_NVTX_ENABLED
-#include <nvtx3/nvToolsExt.h>
-#endif  // TVM_NVTX_ENABLED
-
-#include <tvm/runtime/nvtx.h>
-
-namespace tvm {
-namespace runtime {
-
-#if TVM_NVTX_ENABLED
-NVTXScopedRange::NVTXScopedRange(const char* name) { nvtxRangePush(name); }
-NVTXScopedRange::~NVTXScopedRange() { nvtxRangePop(); }
-#else
-NVTXScopedRange::NVTXScopedRange(const char* name) {}
-NVTXScopedRange::~NVTXScopedRange() {}
-#endif  // TVM_NVTX_ENABLED
-
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/vm/attn_utils.h b/src/runtime/vm/attn_utils.h
index 9f46a2d2ec..2ee86bb075 100644
--- a/src/runtime/vm/attn_utils.h
+++ b/src/runtime/vm/attn_utils.h
@@ -27,8 +27,8 @@
 #include <tvm/ffi/container/map.h>
 #include <tvm/ffi/container/shape.h>
 #include <tvm/runtime/logging.h>
-#include <tvm/runtime/nvtx.h>
 #include <tvm/runtime/tensor.h>
+#include <tvm/support/cuda/nvtx.h>
 
 #include <algorithm>
 #include <limits>
diff --git a/src/runtime/vm/paged_kv_cache.cc b/src/runtime/vm/paged_kv_cache.cc
index 6e54f0bce0..e5c4576e01 100644
--- a/src/runtime/vm/paged_kv_cache.cc
+++ b/src/runtime/vm/paged_kv_cache.cc
@@ -27,8 +27,8 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/disco/disco_worker.h>
 #include <tvm/runtime/memory/memory_manager.h>
-#include <tvm/runtime/nvtx.h>
 #include <tvm/runtime/tensor.h>
+#include <tvm/support/cuda/nvtx.h>
 
 #include <algorithm>
 #include <numeric>
@@ -2306,7 +2306,7 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
    * invoked before running attention computation on device.
    */
   void SyncAuxArrayToDevice() {
-    NVTXScopedRange range("SyncAuxArrayToDevice");
+    support::NVTXScopedRange range("SyncAuxArrayToDevice");
     TVM_FFI_ICHECK(dtype_aux_.bits == 32 && dtype_aux_.code == kDLInt);
     int64_t total_append_length = 0;
     int num_sequences = cur_append_lengths_.size();
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index d6ffab9be0..0d84e64c7a 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -25,8 +25,8 @@
 #include <tvm/ffi/function.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/runtime/memory/memory_manager.h>
-#include <tvm/runtime/nvtx.h>
 #include <tvm/runtime/vm/vm.h>
+#include <tvm/support/cuda/nvtx.h>
 
 #include <thread>
 
@@ -547,7 +547,7 @@ void VirtualMachineImpl::InvokeClosurePacked(const ffi::ObjectRef& closure_or_pa
   packed_args[0] = static_cast<void*>(static_cast<VirtualMachine*>(this));
   std::copy(args.data(), args.data() + args.size(), packed_args.begin() + 1);
   {
-    NVTXScopedRange scope("RelaxVM: " + clo->func_name);
+    support::NVTXScopedRange scope("RelaxVM: " + clo->func_name);
     clo->impl.CallPacked(ffi::PackedArgs(packed_args.data(), packed_args.size()), rv);
   }
 }
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index d2bfe326e1..b2b9a470be 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -63,7 +63,6 @@
 #include "3rdparty/tvm-ffi/src/ffi/tensor.cc"
 #include "3rdparty/tvm-ffi/src/ffi/testing/testing.cc"
 #include "src/runtime/memory/memory_manager.cc"
-#include "src/runtime/nvtx.cc"
 #include "src/runtime/vm/attn_backend.cc"
 #include "src/runtime/vm/builtin.cc"
 #include "src/runtime/vm/bytecode.cc"

(tvm) branch main updated: [REFACTOR][RUNTIME] Relocate nvtx.h to tvm/support/cuda and make it header-only (#19621)

Reply via email to