This is an automated email from the ASF dual-hosted git repository.
spectrometerHBH pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 66a4bcd401 [REFACTOR][RUNTIME] Relocate nvtx.h to tvm/support/cuda and
make it header-only (#19621)
66a4bcd401 is described below
commit 66a4bcd40154f86015c85bfe5f2de8b1e70ec235
Author: Tianqi Chen <[email protected]>
AuthorDate: Wed May 27 15:30:48 2026 -0400
[REFACTOR][RUNTIME] Relocate nvtx.h to tvm/support/cuda and make it
header-only (#19621)
## Summary
The NVTXScopedRange utility is a thin RAII wrapper over nvtxRangePush/Pop
with a no-op fallback when NVTX is not enabled. The constructor and
destructor bodies and the conditional include of `<nvtx3/nvToolsExt.h>`
fit naturally inline in the header, eliminating the separate translation
unit and its `TVM_RUNTIME_DLL` export annotations.
- Move `include/tvm/runtime/nvtx.h` to `include/tvm/support/cuda/nvtx.h`
under namespace `tvm::support`; delete `src/runtime/nvtx.cc`.
- Inline the constructor/destructor; gate the real-vs-stub split with
`TVM_NVTX_ENABLED` in the header.
- Switch the CMake gate from a per-file `COMPILE_DEFINITIONS` on
`nvtx.cc` to a global `add_compile_definitions(TVM_NVTX_ENABLED=1)`
when `USE_CUDA AND USE_NVTX`, so every TU that includes the header
agrees on the definition.
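- Update the three call-site files (`vm.cc`, `paged_kv_cache.cc`,
`attn_utils.h`) to the new include path and qualify `NVTXScopedRange`
as `support::NVTXScopedRange`.
- Drop the `#include "src/runtime/nvtx.cc"` line from
`web/emcc/wasm_runtime.cc`, since that source file is deleted.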
---
CMakeLists.txt | 2 +-
include/tvm/{runtime => support/cuda}/nvtx.h | 48 +++++++++++++++++++++-------
src/runtime/nvtx.cc | 42 ------------------------
src/runtime/vm/attn_utils.h | 2 +-
src/runtime/vm/paged_kv_cache.cc | 4 +--
src/runtime/vm/vm.cc | 4 +--
web/emcc/wasm_runtime.cc | 1 -
7 files changed, 42 insertions(+), 61 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c35af4b95..82794b185c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -834,7 +834,7 @@ if(USE_CUDA AND USE_CUTLASS)
endif()
if(USE_CUDA AND USE_NVTX)
- set_source_files_properties(src/runtime/nvtx.cc PROPERTIES
COMPILE_DEFINITIONS "TVM_NVTX_ENABLED=1")
+ add_compile_definitions(TVM_NVTX_ENABLED=1)
endif()
# Note: NCCL, NVSHMEM, RCCL target_link_libraries are handled in the inline
diff --git a/include/tvm/runtime/nvtx.h b/include/tvm/support/cuda/nvtx.h
similarity index 56%
rename from include/tvm/runtime/nvtx.h
rename to include/tvm/support/cuda/nvtx.h
index 2dbaeb9257..ef9083cfcd 100644
--- a/include/tvm/runtime/nvtx.h
+++ b/include/tvm/support/cuda/nvtx.h
@@ -16,14 +16,29 @@
* specific language governing permissions and limitations
* under the License.
*/
-#ifndef TVM_RUNTIME_NVTX_H_
-#define TVM_RUNTIME_NVTX_H_
-
-#include <tvm/runtime/base.h>
+/*!
+ * \file tvm/support/cuda/nvtx.h
+ * \brief NVTX scoped range utility (header-only).
+ *
+ * Provides NVTXScopedRange: a lightweight RAII wrapper over
+ * nvtxRangePush/Pop. When TVM_NVTX_ENABLED is not defined or is 0,
+ * all methods are no-ops compiled away by the optimizer.
+ */
+#ifndef TVM_SUPPORT_CUDA_NVTX_H_
+#define TVM_SUPPORT_CUDA_NVTX_H_
#include <string>
+
+#ifndef TVM_NVTX_ENABLED
+#define TVM_NVTX_ENABLED 0
+#endif
+
+#if TVM_NVTX_ENABLED
+#include <nvtx3/nvToolsExt.h>
+#endif // TVM_NVTX_ENABLED
+
namespace tvm {
-namespace runtime {
+namespace support {
/*!
* \brief A class to create a NVTX range. No-op if TVM is not built against
NVTX.
@@ -31,11 +46,19 @@ namespace runtime {
class NVTXScopedRange {
public:
/*! \brief Enter an NVTX scoped range */
- TVM_RUNTIME_DLL explicit NVTXScopedRange(const char* name);
+#if TVM_NVTX_ENABLED
+ explicit NVTXScopedRange(const char* name) { nvtxRangePush(name); }
+#else
+ explicit NVTXScopedRange(const char* name) {}
+#endif // TVM_NVTX_ENABLED
/*! \brief Enter an NVTX scoped range */
explicit NVTXScopedRange(const std::string& name) :
NVTXScopedRange(name.c_str()) {}
- /*! \brief Exist an NVTX scoped range */
- TVM_RUNTIME_DLL ~NVTXScopedRange();
+ /*! \brief Exit an NVTX scoped range */
+#if TVM_NVTX_ENABLED
+ ~NVTXScopedRange() { nvtxRangePop(); }
+#else
+ ~NVTXScopedRange() {}
+#endif // TVM_NVTX_ENABLED
NVTXScopedRange(const NVTXScopedRange& other) = delete;
NVTXScopedRange(NVTXScopedRange&& other) = delete;
NVTXScopedRange& operator=(const NVTXScopedRange& other) = delete;
@@ -43,12 +66,13 @@ class NVTXScopedRange {
};
#ifdef _MSC_VER
-#define TVM_NVTX_FUNC_SCOPE() NVTXScopedRange _nvtx_func_scope_(__FUNCSIG__);
+#define TVM_NVTX_FUNC_SCOPE() ::tvm::support::NVTXScopedRange
_nvtx_func_scope_(__FUNCSIG__);
#else
-#define TVM_NVTX_FUNC_SCOPE() NVTXScopedRange
_nvtx_func_scope_(__PRETTY_FUNCTION__);
+#define TVM_NVTX_FUNC_SCOPE() \
+ ::tvm::support::NVTXScopedRange _nvtx_func_scope_(__PRETTY_FUNCTION__);
#endif
-} // namespace runtime
+} // namespace support
} // namespace tvm
-#endif // TVM_RUNTIME_NVTX_H_
+#endif // TVM_SUPPORT_CUDA_NVTX_H_
diff --git a/src/runtime/nvtx.cc b/src/runtime/nvtx.cc
deleted file mode 100644
index 9cfd788714..0000000000
--- a/src/runtime/nvtx.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-#include <string>
-#ifndef TVM_NVTX_ENABLED
-#define TVM_NVTX_ENABLED 0
-#endif
-
-#if TVM_NVTX_ENABLED
-#include <nvtx3/nvToolsExt.h>
-#endif // TVM_NVTX_ENABLED
-
-#include <tvm/runtime/nvtx.h>
-
-namespace tvm {
-namespace runtime {
-
-#if TVM_NVTX_ENABLED
-NVTXScopedRange::NVTXScopedRange(const char* name) { nvtxRangePush(name); }
-NVTXScopedRange::~NVTXScopedRange() { nvtxRangePop(); }
-#else
-NVTXScopedRange::NVTXScopedRange(const char* name) {}
-NVTXScopedRange::~NVTXScopedRange() {}
-#endif // TVM_NVTX_ENABLED
-
-} // namespace runtime
-} // namespace tvm
diff --git a/src/runtime/vm/attn_utils.h b/src/runtime/vm/attn_utils.h
index 9f46a2d2ec..2ee86bb075 100644
--- a/src/runtime/vm/attn_utils.h
+++ b/src/runtime/vm/attn_utils.h
@@ -27,8 +27,8 @@
#include <tvm/ffi/container/map.h>
#include <tvm/ffi/container/shape.h>
#include <tvm/runtime/logging.h>
-#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/tensor.h>
+#include <tvm/support/cuda/nvtx.h>
#include <algorithm>
#include <limits>
diff --git a/src/runtime/vm/paged_kv_cache.cc b/src/runtime/vm/paged_kv_cache.cc
index 6e54f0bce0..e5c4576e01 100644
--- a/src/runtime/vm/paged_kv_cache.cc
+++ b/src/runtime/vm/paged_kv_cache.cc
@@ -27,8 +27,8 @@
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/disco/disco_worker.h>
#include <tvm/runtime/memory/memory_manager.h>
-#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/tensor.h>
+#include <tvm/support/cuda/nvtx.h>
#include <algorithm>
#include <numeric>
@@ -2306,7 +2306,7 @@ class PagedAttentionKVCacheObj : public
AttentionKVCacheObj {
* invoked before running attention computation on device.
*/
void SyncAuxArrayToDevice() {
- NVTXScopedRange range("SyncAuxArrayToDevice");
+ support::NVTXScopedRange range("SyncAuxArrayToDevice");
TVM_FFI_ICHECK(dtype_aux_.bits == 32 && dtype_aux_.code == kDLInt);
int64_t total_append_length = 0;
int num_sequences = cur_append_lengths_.size();
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index d6ffab9be0..0d84e64c7a 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -25,8 +25,8 @@
#include <tvm/ffi/function.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/memory/memory_manager.h>
-#include <tvm/runtime/nvtx.h>
#include <tvm/runtime/vm/vm.h>
+#include <tvm/support/cuda/nvtx.h>
#include <thread>
@@ -547,7 +547,7 @@ void VirtualMachineImpl::InvokeClosurePacked(const
ffi::ObjectRef& closure_or_pa
packed_args[0] = static_cast<void*>(static_cast<VirtualMachine*>(this));
std::copy(args.data(), args.data() + args.size(), packed_args.begin() + 1);
{
- NVTXScopedRange scope("RelaxVM: " + clo->func_name);
+ support::NVTXScopedRange scope("RelaxVM: " + clo->func_name);
clo->impl.CallPacked(ffi::PackedArgs(packed_args.data(),
packed_args.size()), rv);
}
}
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index d2bfe326e1..b2b9a470be 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -63,7 +63,6 @@
#include "3rdparty/tvm-ffi/src/ffi/tensor.cc"
#include "3rdparty/tvm-ffi/src/ffi/testing/testing.cc"
#include "src/runtime/memory/memory_manager.cc"
-#include "src/runtime/nvtx.cc"
#include "src/runtime/vm/attn_backend.cc"
#include "src/runtime/vm/builtin.cc"
#include "src/runtime/vm/bytecode.cc"