(tvm) branch main updated: [REFACTOR][RUNTIME] Phase out profiling.h heavy types, rename to timer.h (#19455)

tqchen Mon, 27 Apr 2026 17:27:23 -0700

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 4e5b869c27 [REFACTOR][RUNTIME] Phase out profiling.h heavy types, rename to timer.h (#19455)
4e5b869c27 is described below

commit 4e5b869c27cdb922de10a6b6d7d83ca845b9803e
Author: Tianqi Chen <[email protected]>
AuthorDate: Mon Apr 27 20:27:07 2026 -0400

    [REFACTOR][RUNTIME] Phase out profiling.h heavy types, rename to timer.h (#19455)
---
 apps/android_rpc/app/src/main/jni/tvm_runtime.h    |   2 +-
 docs/arch/relax_vm.rst                             |  18 +-
 docs/reference/api/python/index.rst                |   1 -
 docs/reference/api/python/runtime/profiling.rst    |  21 -
 include/tvm/runtime/profiling.h                    | 590 -------------
 include/tvm/runtime/timer.h                        | 195 +++++
 include/tvm/runtime/vm/executable.h                |   2 -
 include/tvm/runtime/vm/vm.h                        |   9 -
 python/tvm/relax/training/trainer.py               |  47 --
 python/tvm/runtime/__init__.py                     |   1 -
 python/tvm/runtime/profiling/__init__.py           | 272 ------
 python/tvm/runtime/profiling/_ffi_api.py           |  21 -
 python/tvm/runtime/vm.py                           |  32 +-
 src/ir/structural_hash.cc                          |  64 +-
 src/runtime/contrib/clml/clml_runtime.cc           | 154 +---
 src/runtime/contrib/clml/clml_runtime.h            |   1 -
 src/runtime/contrib/json/json_runtime.h            |  36 -
 src/runtime/cuda/cuda_device_api.cc                |   4 +-
 src/runtime/hexagon/hexagon_common.cc              |   4 +-
 src/runtime/metal/metal_device_api.mm              |   4 +-
 src/runtime/opencl/opencl_common.h                 |   2 +-
 src/runtime/opencl/opencl_device_api.cc            |   4 +-
 src/runtime/profiling.cc                           | 937 ---------------------
 src/runtime/rocm/rocm_device_api.cc                |   4 +-
 src/runtime/rpc/rpc_module.cc                      |  14 +-
 src/runtime/timer.cc                               | 176 ++++
 src/runtime/vm/executable.cc                       |   7 -
 src/runtime/vm/vm.cc                               | 123 ---
 tests/python/relax/test_codegen_coreml.py          |   4 +-
 .../python/relax/test_training_trainer_numeric.py  |   3 +-
 tests/python/relax/test_vm_profiler.py             | 129 ---
 web/emcc/wasm_runtime.cc                           |   2 +-
 32 files changed, 402 insertions(+), 2481 deletions(-)

diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index 460bca0bc7..920ae6bb1d 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -50,7 +50,6 @@
 #include "../src/runtime/logging.cc"
 #include "../src/runtime/memory/memory_manager.cc"
 #include "../src/runtime/minrpc/minrpc_logger.cc"
-#include "../src/runtime/profiling.cc"
 #include "../src/runtime/registry.cc"
 #include "../src/runtime/rpc/rpc_channel.cc"
 #include "../src/runtime/rpc/rpc_endpoint.cc"
@@ -63,6 +62,7 @@
 #include "../src/runtime/tensor.cc"
 #include "../src/runtime/thread_pool.cc"
 #include "../src/runtime/threading_backend.cc"
+#include "../src/runtime/timer.cc"
 #include "../src/runtime/workspace_pool.cc"
 
 #ifdef TVM_OPENCL_RUNTIME
diff --git a/docs/arch/relax_vm.rst b/docs/arch/relax_vm.rst
index 30ce5bd058..222329ce0b 100644
--- a/docs/arch/relax_vm.rst
+++ b/docs/arch/relax_vm.rst
@@ -354,26 +354,14 @@ Key methods:
   reducing dictionary lookup overhead during benchmarking.
 - ``vm.time_evaluator(func_name, dev)`` — returns a timing function following the same convention
   as ``tvm.runtime.Module.time_evaluator``.
-- ``vm.profile(func_name, *args)`` — returns a per-operator profiling report 
(requires
-  ``profile=True`` at VM construction).
 - ``vm.set_instrument(func)`` — register an instrumentation callback that is invoked before/after
   every ``Call`` instruction. The callback can return ``VMInstrumentReturnKind.SKIP_RUN`` to
   skip the call.
 
-Profiling and instrumentation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Instrumentation
+~~~~~~~~~~~~~~~
 
-The VM supports two levels of observability:
-
-**Profiling** via ``VirtualMachine(exec, dev, profile=True)``:
-
-.. code-block:: python
-
-   vm = relax.VirtualMachine(ex, tvm.cuda(), profile=True)
-   report = vm.profile("main", inp)
-   print(report)
-
-This produces a ``tvm.runtime.profiling.Report`` with per-operator timing 
breakdown.
+The VM supports observability via instrumentation:
 
 **Instrumentation** via ``set_instrument()``:
 
diff --git a/docs/reference/api/python/index.rst b/docs/reference/api/python/index.rst
index 89f9f0c577..4bef65f82d 100644
--- a/docs/reference/api/python/index.rst
+++ b/docs/reference/api/python/index.rst
@@ -40,7 +40,6 @@ Python API
     runtime/runtime
     runtime/vm
     runtime/disco
-    runtime/profiling
 
 .. toctree::
     :maxdepth: 1
diff --git a/docs/reference/api/python/runtime/profiling.rst b/docs/reference/api/python/runtime/profiling.rst
deleted file mode 100644
index d26f00af90..0000000000
--- a/docs/reference/api/python/runtime/profiling.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-..  Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-..    http://www.apache.org/licenses/LICENSE-2.0
-
-..  Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
-tvm.runtime.profiling
----------------------
-.. automodule:: tvm.runtime.profiling
-    :members:
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
deleted file mode 100644
index 7bdf602808..0000000000
--- a/include/tvm/runtime/profiling.h
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file include/tvm/runtime/profiling.h
- * \brief Runtime profiling including timers.
- */
-#ifndef TVM_RUNTIME_PROFILING_H_
-#define TVM_RUNTIME_PROFILING_H_
-
-#include <tvm/ffi/container/array.h>
-#include <tvm/ffi/container/map.h>
-#include <tvm/ffi/extra/module.h>
-#include <tvm/ffi/function.h>
-#include <tvm/runtime/base.h>
-#include <tvm/runtime/device_api.h>
-#include <tvm/runtime/object.h>
-#include <tvm/runtime/tensor.h>
-
-#include <stack>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-namespace tvm {
-
-namespace runtime {
-
-/*! \brief Base class for all implementations.
- *
- * New implementations of this interface should make sure that `Start` and 
`Stop`
- * are as lightweight as possible. Expensive state synchronization should be
- * done in `SyncAndGetElapsedNanos`.
- */
-class TimerNode : public Object {
- public:
-  /*! \brief Start the timer.
-   *
-   * Note: this function should only be called once per object.
-   */
-  virtual void Start() = 0;
-  /*! \brief Stop the timer.
-   *
-   * Note: this function should only be called once per object.
-   */
-  virtual void Stop() = 0;
-  /*! \brief Synchronize timer state and return elapsed time between `Start` 
and `Stop`.
-   * \return The time in nanoseconds between `Start` and `Stop`.
-   *
-   * This function is necessary because we want to avoid timing the overhead of
-   * doing timing. When using multiple timers, it is recommended to stop all of
-   * them before calling `SyncAndGetElapsedNanos` on any of them.
-   *
-   * Note: this function should be only called once per object. It may incur
-   * a large synchronization overhead (for example, with GPUs).
-   */
-  virtual int64_t SyncAndGetElapsedNanos() = 0;
-
-  virtual ~TimerNode() {}
-
-  static constexpr const bool _type_mutable = true;
-  TVM_FFI_DECLARE_OBJECT_INFO("runtime.TimerNode", TimerNode, Object);
-};
-
-/*! \brief Timer for a specific device.
- *
- * This is a managed reference to a TimerNode.
- *
- * \sa TimerNode
- */
-class Timer : public ObjectRef {
- public:
-  /*!
-   * \brief Get a device specific timer.
-   * \param dev The device to time.
-   * \return A `Timer` that has already been started.
-   *
-   * Use this function to time runtime of arbitrary regions of code on a 
specific
-   * device. The code that you want to time should be running on the device
-   * otherwise the timer will not return correct results. This is a lower level
-   * interface than TimeEvaluator and only runs the timed code once
-   * (TimeEvaluator runs the code multiple times).
-   *
-   * A default timer is used if a device specific one does not exist. This
-   * timer performs synchronization between the device and CPU, which can lead
-   * to overhead in the reported results.
-   *
-   * Example usage:
-   * \code{.cpp}
-   * Timer t = Timer::Start(Device::cpu());
-   * my_long_running_function();
-   * t->Stop();
-   * ... // some more computation
-   * int64_t nanosecs = t->SyncAndGetElapsedNanos() // elapsed time in 
nanoseconds
-   * \endcode
-   *
-   * To add a new device-specific timer, register a new function
-   * "profiler.timer.my_device" (where `my_device` is the `DeviceName` of your
-   * device). This function should accept a `Device` and return a new `Timer`
-   * that has already been started.
-   *
-   * For example, this is how the CPU timer is implemented:
-   * \code{.cpp}
-   *  class CPUTimerNode : public TimerNode {
-   *   public:
-   *    virtual void Start() { start_ = 
std::chrono::high_resolution_clock::now(); }
-   *    virtual void Stop() { duration_ = 
std::chrono::high_resolution_clock::now() - start_; }
-   *    virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
-   *    virtual ~CPUTimerNode() {}
-   *
-   *    static constexpr const char* _type_key = "runtime.CPUTimerNode";
-   *    TVM_FFI_DECLARE_OBJECT_INFO_FINAL(CPUTimerNode, TimerNode);
-   *
-   *   private:
-   *    std::chrono::high_resolution_clock::time_point start_;
-   *    std::chrono::duration<int64_t, std::nano> duration_;
-   *  };
-   *
-   *
-   *  TVM_FFI_STATIC_INIT_BLOCK() {
-   *    namespace refl = tvm::ffi::reflection;
-   *    refl::GlobalDef().def("profiling.timer.cpu", [](Device dev) {
-   *      return Timer(ffi::make_object<CPUTimerNode>());
-   *    });
-   *  }
-   * \endcode
-   */
-  static TVM_RUNTIME_DLL Timer Start(Device dev);
-
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Timer, ObjectRef, TimerNode);
-};
-
-/*!
- * \brief Default timer if one does not exist for the device.
- * \param dev The device to time on.
- *
- * Note that this timer performs synchronization between the device and CPU,
- * which can lead to overhead in the reported results.
- */
-Timer DefaultTimer(Device dev);
-
-namespace profiling {
-/*! \brief Wrapper for `Device` because `Device` is not passable across the
- * ffi::Function interface.
- */
-struct DeviceWrapperNode : public Object {
-  /*! The device */
-  Device device;
-
-  /*! Constructor */
-  explicit DeviceWrapperNode(Device device) : device(device) {}
-  TVM_FFI_DECLARE_OBJECT_INFO("runtime.profiling.DeviceWrapper", 
DeviceWrapperNode, Object);
-};
-
-/*! \brief Wrapper for `Device`. */
-class DeviceWrapper : public ObjectRef {
- public:
-  explicit DeviceWrapper(Device dev) { data_ = 
ffi::make_object<DeviceWrapperNode>(dev); }
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(DeviceWrapper, ObjectRef, 
DeviceWrapperNode);
-};
-
-/*! \brief Data collected from a profiling run. Includes per-call metrics and 
per-device metrics.
- */
-class TVM_RUNTIME_DLL ReportNode : public Object {
- public:
-  /*! \brief A list of function calls and the metrics recorded for that call.
-   *
-   * Each element is a mapping from metric name to value. Some metrics that
-   * appear in every call are "Name" (the function name), "Argument Shapes",
-   * and "Duration (us)". Values are one of `String`, `PercentNode`,
-   * `DurationNode`, or `CountNode`.
-   */
-  ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls;
-  /*! \brief Metrics collected for the entire run of the model on a per-device 
basis.
-   *
-   * `device_metrics` is indexed by device name then metric.
-   *
-   * These metrics may be larger than the sum of the same metric in `calls`
-   * because these metrics include the overhead of the executor.
-   */
-  ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> device_metrics;
-  /*! Configuration used for this profiling run. Includes number of threads, 
executor.
-   *
-   * Values must be an object type that can be used with device_metrics.
-   */
-  ffi::Map<ffi::String, ffi::Any> configuration;
-  /*! \brief Output `calls` in CSV format.
-   *
-   * Note that this does not include `device_metrics`, it only includes 
per-call metrics.
-   */
-  ffi::String AsCSV() const;
-  /*! \brief Create a human readable table of profiling metrics.
-   *
-   *  \param aggregate Whether or not to join multiple calls to the
-   *      same op into a single line.
-   *
-   *  \param sort Whether or not to sort call frames by descending
-   *      duration. If false and if `aggregate` is false, frames will
-   *      be sorted by order of appearance in the program. Order is
-   *      undefined if `sort` is false and `aggregate` is true.
-   *
-   *  \param compute_col_sums Whether or not to include sum totals for
-   *      the Count, Duation, and Percent columns.
-   *
-   */
-  ffi::String AsTable(bool sort = true, bool aggregate = true, bool 
compute_col_sums = true) const;
-  /*! \brief Convert this report to JSON.
-   *
-   * Output JSON will be of this format:
-   * \code
-   *  {
-   *    "calls": [
-   *      {
-   *        "Duration (us)": {
-   *          "microseconds": 12.3
-   *        },
-   *        "Name": "fused_dense",
-   *        "Count": {
-   *          "count": 1
-   *        },
-   *        "Percent": {
-   *          "percent": 10.3
-   *        }
-   *      }
-   *    ],
-   *    "device_metrics": {
-   *      "cpu": {
-   *        "Duration (us)": {
-   *          "microseconds": 334.2
-   *        },
-   *        "Percent": {
-   *          "percent": 100
-   *        }
-   *      }
-   *    }
-   *  }
-   * \endcode
-   */
-  ffi::String AsJSON() const;
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Report", ReportNode, 
Object);
-};
-
-class Report : public ObjectRef {
- public:
-  /*! Construct a Report from a set of calls (with associated metrics) and 
per-device metrics.
-   * \param calls Function calls and associated metrics.
-   * \param device_metrics Per-device metrics for overall execution.
-   * \param configuration Configuration data specific to this profiling run.
-   */
-  explicit Report(ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls,
-                  ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> 
device_metrics,
-                  ffi::Map<ffi::String, ffi::Any> configuration);
-
-  /*! Deserialize a Report from a JSON object. Needed for sending the report 
over RPC.
-   * \param json Serialized json report from `ReportNode::AsJSON`.
-   * \returns A Report.
-   */
-  static Report FromJSON(ffi::String json);
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(Report, ObjectRef, ReportNode);
-};
-
-/*! \brief Interface for user defined profiling metric collection.
- *
- * Users can register their own collector by registering a packed function with
- * the name "runtime.profiling.metrics.my_collector_name" where
- * "my_collector_name" is the name of their collector. This function should
- * take an Array of Device as input which contains the devices the collector
- * will be run on.
- *
- * `MetricCollectorNode`s will be called in the following fashion.
- * \code
- * MetricCollector mc;
- * for (auto op : model) {
- *   auto o = mc.Start();
- *   op();
- *   auto metrics = mc.Stop(o); // metrics are added the profiling report
- * }
- * \endcode
- */
-class MetricCollectorNode : public Object {
- public:
-  /*! \brief Initialization call. Called before profiling has started. Any
-   * expensive precomputation should happen here.
-   * \param devs The list of devices this collector will be run on.
-   */
-  virtual void Init(ffi::Array<DeviceWrapper> devs) = 0;
-  /*! \brief Start colling metrics for a function call.
-   * \param dev The device the call will be run on.
-   * \returns An object used to maintain state of the metric collection. This
-   * object will be passed to the corresponding `Stop` call. If the device is
-   * not supported, this function will return a nullptr ObjectRef.
-   */
-  virtual ObjectRef Start(Device dev) = 0;
-  /*! \brief Stop collecting metrics.
-   * \param obj The object created by the corresponding `Start` call.
-   * \returns A set of metric names and the associated values. Values must be
-   * one of DurationNode, PercentNode, CountNode, or String.
-   */
-  virtual ffi::Map<ffi::String, ffi::Any> Stop(ffi::ObjectRef obj) = 0;
-
-  virtual ~MetricCollectorNode() {}
-
-  static constexpr const bool _type_mutable = true;
-  TVM_FFI_DECLARE_OBJECT_INFO("runtime.profiling.MetricCollector", 
MetricCollectorNode, Object);
-};
-
-/*! \brief Wrapper for `MetricCollectorNode`. */
-class MetricCollector : public ObjectRef {
- public:
-  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(MetricCollector, ObjectRef, 
MetricCollectorNode);
-};
-
-/*! Information about a single function or operator call. */
-struct CallFrame {
-  /*! Device on which the call was made */
-  Device dev;
-  /*! Name of the function or op */
-  ffi::String name;
-  /*! Runtime of the function or op */
-  Timer timer;
-  /*! Extra performance metrics */
-  std::unordered_map<std::string, ffi::Any> extra_metrics;
-  /*! User defined metric collectors. Each pair is the MetricCollector and its
-   * associated data (returned from MetricCollector.Start).
-   */
-  std::vector<std::pair<MetricCollector, ObjectRef>> extra_collectors;
-};
-
-/*! Runtime profiler for function and/or operator calls. Used in the graph
- * runtime and VM to provide profiling information for all operators.
- *
- * Example usage:
- * \code{.cpp}
- * Device cpu, gpu;
- * Profiler prof({cpu, gpu});
- * my_gpu_kernel(); // do a warmup iteration
- * prof.Start();
- * prof.StartCall("my_gpu_kernel", gpu);
- * my_gpu_kernel();
- * prof.StopCall();
- * prof.StartCall("my_cpu_function", cpu);
- * my_cpu_function();
- * prof.StopCall();
- * prof.Stop();
- * std::cout << prof.Report << std::endl; // print profiling report
- * \endcode
- */
-class Profiler {
- public:
-  /*! Constructor.
-   *
-   * The profiler should be constructed before you do any warmup iterations.
-   *
-   * \note
-   * Calling this constructor will reset the TVM threadpool. It is necessary in
-   * order to install thread handlers required by certain collectors.
-   *
-   * \param devs The list of devices the profiler will be running on. Should
-   *             include all devices used by profiled operators.
-   * \param metric_collectors Additional `MetricCollector`s to use with this 
profiler.
-   * \param configuration Additional configuration data to add to the 
outputted profiling report.
-   */
-  explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> 
metric_collectors,
-                    std::unordered_map<ffi::String, ffi::Any> configuration = 
{});
-  /*! \brief Start the profiler.
-   *
-   * This function should only be called once per object.
-   */
-  void Start();
-  /*! \brief Stop the profiler.
-   *
-   * This function should only be called once per object after start has been 
called.
-   */
-  void Stop();
-  /*! \brief Start a function call.
-   * \param name The name of the function being called.
-   * \param dev The device on which the function is running.
-   * \param extra_metrics Optional additional profiling information to add to
-   * the frame (input sizes, allocations).
-   *
-   * `StartCall` may be nested, but each `StartCall` needs a matching
-   * `StopCall`. Function calls are stopped in LIFO order, so calls to
-   * `StartCall` and `StopCall` must be nested properly.
-   */
-  void StartCall(ffi::String name, Device dev,
-                 std::unordered_map<std::string, ffi::Any> extra_metrics = {});
-  /*! \brief Stop the last `StartCall`.
-   * \param extra_metrics Optional additional profiling information to add to
-   * the frame (input sizes, allocations).
-   */
-  void StopCall(std::unordered_map<std::string, ffi::Any> extra_metrics = {});
-  /*! \brief A report of total runtime between `Start` and `Stop` as
-   *        well as individual statistics for each `StartCall`-`StopCall` pair.
-   *  \returns A `Report` that can either be formatted as CSV (with `.AsCSV`)
-   *  or as a human readable table (with `.AsTable`).
-   */
-  profiling::Report Report();
-  /*! \brief Check if the profiler is currently running.
-   * \returns Whether or not the profiler is running.
-   */
-  bool IsRunning() const { return is_running_; }
-
- private:
-  std::vector<Device> devs_;
-  bool is_running_{false};
-  std::vector<CallFrame> calls_;
-  std::stack<CallFrame> in_flight_;
-  std::vector<MetricCollector> collectors_;
-  std::unordered_map<ffi::String, ffi::Any> configuration_;
-};
-
-/* \brief A duration in time. */
-class DurationNode : public Object {
- public:
-  /* The duration as a floating point number of microseconds. */
-  double microseconds;
-
-  /* \brief Construct a new duration.
-   * \param a The duration in microseconds.
-   */
-  explicit DurationNode(double a) : microseconds(a) {}
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Duration", 
DurationNode, Object);
-};
-
-/* A percentage of something */
-class PercentNode : public Object {
- public:
-  /* The percent as a floating point value out of 100%. i.e. if `percent` is 
10 then we have 10%. */
-  double percent;
-
-  /* \brief Construct a new percentage.
-   * \param a The percentage out of 100.
-   */
-  explicit PercentNode(double a) : percent(a) {}
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Percent", PercentNode, 
Object);
-};
-
-/* A count of something */
-class CountNode : public Object {
- public:
-  /* The actual count */
-  int64_t value;
-
-  /* \brief Construct a new count.
-   * \param a The count.
-   */
-  explicit CountNode(int64_t a) : value(a) {}
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Count", CountNode, 
Object);
-};
-
-/* \brief A ratio of two things. */
-class RatioNode : public Object {
- public:
-  /* The ratio as a double precision floating point number. */
-  double ratio;
-
-  /* \brief Construct a new ratio.
-   * \param a The ratio.
-   */
-  explicit RatioNode(double a) : ratio(a) {}
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Ratio", RatioNode, 
Object);
-};
-
-/*! \brief ffi::String representation of an array of Tensor shapes
- *  \param shapes Array of Tensors to get the shapes of.
- *  \return A textual representation of the shapes. For example: `float32[2], 
int64[1, 2]`.
- */
-ffi::String ShapeString(const std::vector<Tensor>& shapes);
-/*! \brief ffi::String representation of shape encoded as an Tensor
- *  \param shape Tensor containing the shape.
- *  \param dtype The dtype of the shape.
- *  \return A textual representation of the shape. For example: `float32[2]`.
- */
-ffi::String ShapeString(Tensor shape, DLDataType dtype);
-/*! \brief ffi::String representation of a shape encoded as a vector
- *  \param shape Shape as a vector of integers.
- *  \param dtype The dtype of the shape.
- *  \return A textual representation of the shape. For example: `float32[2]`.
- */
-ffi::String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);
-
-/*! \brief Collect performance information of a function execution. Usually
- * used with a compiled PrimFunc (via tvm.compile).
- *
- * This information can include performance counters like cache hits and FLOPs
- * that are useful in debugging performance issues of individual PrimFuncs.
- * Different metrics can be collected depending on which MetricCollector is
- * used.
- *
- * Example usage:
- * \code{.cpp}
- * // Use PAPI to measure the number of floating point operations.
- * ffi::Function profiler = ProfileModule(
- *     mod, "main", kDLCPU, 0, {CreatePAPIMetricCollector({{kDLCPU, 0}, 
{"PAPI_FP_OPS"}})});
- * Report r = profiler(arg1, arg2, arg);
- * std::cout << r << std::endl;
- * \endcode
- *
- * \param mod Module to profile. Usually a PrimFunc that has been compiled to 
machine code.
- * \param func_name Name of function to run in the module.
- * \param device_type Device type to run on. Profiling will include performance
- *                    metrics specific to this device type.
- * \param device_id Id of device to run on.
- * \param warmup_iters Number of iterations of the function to run before 
collecting
- *                     performance information. Recommend to set this larger
- *                     than 0 so that cache effects are consistent.
- * \param collectors List of different
- *                   ways to collect metrics. See MetricCollector.
- * \returns A ffi::Function which takes the same arguments as the 
`mod[func_name]`
- *          and returns performance metrics as a `ffi::Map<ffi::String, 
ffi::Any>` where
- *          values can be `CountNode`, `DurationNode`, `PercentNode`.
- */
-ffi::Function ProfileFunction(ffi::Module mod, std::string func_name, int 
device_type,
-                              int device_id, int warmup_iters,
-                              ffi::Array<MetricCollector> collectors);
-
-/*!
- * \brief Wrap a timer function to measure the time cost of a given packed 
function.
- *
- * Approximate implementation:
- * \code{.py}
- * f() // warmup
- * for i in range(repeat)
- *   f_preproc()
- *   while True:
- *     start = time()
- *     for j in range(number):
- *       f()
- *     duration_ms = time() - start
- *     if duration_ms >= min_repeat_ms:
- *       break
- *     else:
- *        number = (min_repeat_ms / (duration_ms / number) + 1
- *   if cooldown_interval_ms and i % repeats_to_cooldown == 0:
- *     sleep(cooldown_interval_ms)
- * \endcode
- *
- * \param f The function argument.
- * \param dev The device.
- * \param number The number of times to run this function for taking average.
- *        We call these runs as one `repeat` of measurement.
- * \param repeat The number of times to repeat the measurement.
- *        In total, the function will be invoked (1 + number x repeat) times,
- *        where the first one is warm up and will be discarded.
- *        The returned result contains `repeat` costs,
- *        each of which is an average of `number` costs.
- * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
- *        By default, one `repeat` contains `number` runs. If this parameter 
is set,
- *        the parameters `number` will be dynamically adjusted to meet the
- *        minimum duration requirement of one `repeat`.
- *        i.e., When the run time of one `repeat` falls below this time,
- *        the `number` parameter will be automatically increased.
- * \param limit_zero_time_iterations The maximum number of repeats when
- *        measured time is equal to 0.  It helps to avoid hanging during 
measurements.
- * \param cooldown_interval_ms The cooldown interval in milliseconds between 
the number of repeats
- *        defined by `repeats_to_cooldown`.
- * \param repeats_to_cooldown The number of repeats before the
- *        cooldown is activated.
- * \param cache_flush_bytes The number of bytes to flush from cache before
- * \param f_preproc The function to be executed before we execute time
- *        evaluator.
- * \return f_timer A timer function.
- */
-ffi::Function WrapTimeEvaluator(ffi::Function f, Device dev, int number, int 
repeat,
-                                int min_repeat_ms, int 
limit_zero_time_iterations,
-                                int cooldown_interval_ms, int 
repeats_to_cooldown,
-                                int cache_flush_bytes = 0, ffi::Function 
f_preproc = nullptr);
-
-}  // namespace profiling
-}  // namespace runtime
-}  // namespace tvm
-
-#endif  // TVM_RUNTIME_PROFILING_H_
diff --git a/include/tvm/runtime/timer.h b/include/tvm/runtime/timer.h
new file mode 100644
index 0000000000..25d963d7f3
--- /dev/null
+++ b/include/tvm/runtime/timer.h
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file include/tvm/runtime/timer.h
+ * \brief Runtime timer primitives: Timer, TimerNode, WrapTimeEvaluator.
+ */
+#ifndef TVM_RUNTIME_TIMER_H_
+#define TVM_RUNTIME_TIMER_H_
+
+#include <tvm/ffi/function.h>
+#include <tvm/runtime/base.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/object.h>
+#include <tvm/runtime/tensor.h>
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief Base class for all timer implementations.
+ *
+ * New implementations of this interface should make sure that `Start` and 
`Stop`
+ * are as lightweight as possible. Expensive state synchronization should be
+ * done in `SyncAndGetElapsedNanos`.
+ */
+class TimerNode : public Object {
+ public:
+  /*! \brief Start the timer.
+   *
+   * Note: this function should only be called once per object.
+   */
+  virtual void Start() = 0;
+  /*! \brief Stop the timer.
+   *
+   * Note: this function should only be called once per object.
+   */
+  virtual void Stop() = 0;
+  /*! \brief Synchronize timer state and return elapsed time between `Start` 
and `Stop`.
+   * \return The time in nanoseconds between `Start` and `Stop`.
+   *
+   * This function is necessary because we want to avoid timing the overhead of
+   * doing timing. When using multiple timers, it is recommended to stop all of
+   * them before calling `SyncAndGetElapsedNanos` on any of them.
+   *
+   * Note: this function should only be called once per object. It may incur
+   * a large synchronization overhead (for example, with GPUs).
+   */
+  virtual int64_t SyncAndGetElapsedNanos() = 0;
+
+  virtual ~TimerNode() {}
+
+  static constexpr const bool _type_mutable = true;
+  TVM_FFI_DECLARE_OBJECT_INFO("runtime.TimerNode", TimerNode, Object);
+};
+
+/*! \brief Timer for a specific device.
+ *
+ * This is a managed reference to a TimerNode.
+ *
+ * \sa TimerNode
+ */
+class Timer : public ObjectRef {
+ public:
+  /*!
+   * \brief Get a device specific timer.
+   * \param dev The device to time.
+   * \return A `Timer` that has already been started.
+   *
+   * Use this function to time runtime of arbitrary regions of code on a 
specific
+   * device. The code that you want to time should be running on the device
+   * otherwise the timer will not return correct results. This is a lower level
+   * interface than TimeEvaluator and only runs the timed code once
+   * (TimeEvaluator runs the code multiple times).
+   *
+   * A default timer is used if a device specific one does not exist. This
+   * timer performs synchronization between the device and CPU, which can lead
+   * to overhead in the reported results.
+   *
+   * Example usage:
+   * \code{.cpp}
+   * Timer t = Timer::Start(Device::cpu());
+   * my_long_running_function();
+   * t->Stop();
+   * ... // some more computation
+   * int64_t nanosecs = t->SyncAndGetElapsedNanos();  // elapsed time in 
nanoseconds
+   * \endcode
+   *
+   * To add a new device-specific timer, register a new function
+   * "runtime.timer.my_device" (where `my_device` is the `DeviceName` of your
+   * device). This function should accept a `Device` and return a new `Timer`
+   * that has not been started yet; the caller then invokes `Start()` on it.
+   *
+   * For example, this is how the CPU timer is implemented:
+   * \code{.cpp}
+   *  class CPUTimerNode : public TimerNode {
+   *   public:
+   *    virtual void Start() { start_ = 
std::chrono::high_resolution_clock::now(); }
+   *    virtual void Stop() { duration_ = 
std::chrono::high_resolution_clock::now() - start_; }
+   *    virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+   *    virtual ~CPUTimerNode() {}
+   *
+   *    TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.CPUTimerNode", CPUTimerNode,
+   *                                      TimerNode);
+   *
+   *   private:
+   *    std::chrono::high_resolution_clock::time_point start_;
+   *    std::chrono::duration<int64_t, std::nano> duration_;
+   *  };
+   *
+   *
+   *  TVM_FFI_STATIC_INIT_BLOCK() {
+   *    namespace refl = tvm::ffi::reflection;
+   *    refl::GlobalDef().def("runtime.timer.cpu", [](Device dev) {
+   *      return Timer(ffi::make_object<CPUTimerNode>());
+   *    });
+   *  }
+   * \endcode
+   */
+  static TVM_RUNTIME_DLL Timer Start(Device dev);
+
+  TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Timer, ObjectRef, TimerNode);
+};
+
+/*!
+ * \brief Wrap a timer function to measure the time cost of a given packed 
function.
+ *
+ * Approximate implementation:
+ * \code{.py}
+ * f()  # warmup
+ * for i in range(repeat):
+ *   f_preproc()
+ *   while True:
+ *     start = time()
+ *     for j in range(number):
+ *       f()
+ *     duration_ms = time() - start
+ *     if duration_ms >= min_repeat_ms:
+ *       break
+ *     else:
+ *       number = min_repeat_ms / (duration_ms / number) + 1
+ *   if cooldown_interval_ms and i % repeats_to_cooldown == 0:
+ *     sleep(cooldown_interval_ms)
+ * \endcode
+ *
+ * \param f The function argument.
+ * \param dev The device.
+ * \param number The number of times to run this function for taking average.
+ *        We call these runs as one `repeat` of measurement.
+ * \param repeat The number of times to repeat the measurement.
+ *        In total, the function will be invoked (1 + number x repeat) times,
+ *        where the first one is warm up and will be discarded.
+ *        The returned result contains `repeat` costs,
+ *        each of which is an average of `number` costs.
+ * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
+ *        By default, one `repeat` contains `number` runs. If this parameter 
is set,
+ *        the parameter `number` will be dynamically adjusted to meet the
+ *        minimum duration requirement of one `repeat`.
+ *        i.e., When the run time of one `repeat` falls below this time,
+ *        the `number` parameter will be automatically increased.
+ * \param limit_zero_time_iterations The maximum number of repeats when
+ *        measured time is equal to 0.  It helps to avoid hanging during 
measurements.
+ * \param cooldown_interval_ms The cooldown interval in milliseconds between 
the number of repeats
+ *        defined by `repeats_to_cooldown`.
+ * \param repeats_to_cooldown The number of repeats before the
+ *        cooldown is activated.
+ * \param cache_flush_bytes The number of bytes to flush from cache before timing.
+ * \param f_preproc The function to be executed before we execute time
+ *        evaluator.
+ * \return f_timer A timer function.
+ */
+ffi::Function WrapTimeEvaluator(ffi::Function f, Device dev, int number, int 
repeat,
+                                int min_repeat_ms, int 
limit_zero_time_iterations,
+                                int cooldown_interval_ms, int 
repeats_to_cooldown,
+                                int cache_flush_bytes = 0, ffi::Function 
f_preproc = nullptr);
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // TVM_RUNTIME_TIMER_H_
diff --git a/include/tvm/runtime/vm/executable.h 
b/include/tvm/runtime/vm/executable.h
index 06c6020efb..022a88a469 100644
--- a/include/tvm/runtime/vm/executable.h
+++ b/include/tvm/runtime/vm/executable.h
@@ -140,8 +140,6 @@ class TVM_RUNTIME_DLL VMExecutable : public ffi::ModuleObj {
   void WriteToFile(const ffi::String& file_name, const ffi::String& format) 
const final;
   /*! \brief Create a Relax virtual machine and load `this` as the executable. 
*/
   ffi::Module VMLoadExecutable() const;
-  /*! \brief Create a Relax virtual machine with profiler and load `this` as 
the executable. */
-  ffi::Module VMProfilerLoadExecutable() const;
   /*! \brief Check if the VMExecutable contains a specific function. */
   bool HasFunction(const ffi::String& name) const;
   /*!
diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 335d77f196..2804e17eb3 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -23,10 +23,6 @@
 #ifndef TVM_RUNTIME_VM_VM_H_
 #define TVM_RUNTIME_VM_VM_H_
 
-#ifndef TVM_VM_ENABLE_PROFILER
-#define TVM_VM_ENABLE_PROFILER 1
-#endif
-
 #include <tvm/ffi/extra/module.h>
 
 #include <memory>
@@ -201,11 +197,6 @@ class VirtualMachine : public ffi::ModuleObj {
    * \return Created VM
    */
   static ObjectPtr<VirtualMachine> Create();
-  /*!
-   * \brief Create an instance of VM with the profiling feature enabled.
-   * \return Created VM
-   */
-  static ObjectPtr<VirtualMachine> CreateProfiler();
   /*!
    * \brief Helper function for vm closure functions to get the context ptr
    * \param arg The argument value.
diff --git a/python/tvm/relax/training/trainer.py 
b/python/tvm/relax/training/trainer.py
index ce9d2368b7..f35f4ab69c 100644
--- a/python/tvm/relax/training/trainer.py
+++ b/python/tvm/relax/training/trainer.py
@@ -65,7 +65,6 @@ class Trainer:
         trainer.xaiver_uniform_init_params()
         trainer.predict(input_instances)
         trainer.update([input_instances], [labels])
-        trainer.profile_adjoint([input_instances], [labels])
     """
 
     BACKBONE_FUNC: str = "backbone"
@@ -347,49 +346,3 @@ class Trainer:
         self._params = list(new_params)
 
         return ret
-
-    def profile_adjoint(
-        self,
-        input_instances: list[np.ndarray | Tensor],
-        targets: list[np.ndarray | Tensor],
-    ) -> tvm.runtime.profiling.Report:
-        """Profile the adjoint function. It requires the VM to be constructed 
with `profile=True`,
-        and runs `tvm.relax.VirtualMachine.profile()` internally.
-
-        Parameters
-        ----------
-        input_instances : Union[np.ndarray, Tensor, List[Union[np.ndarray, 
Tensor]]]
-            The values corresponding to the input_instances part of the 
backbone function.
-            Parameters and model states are not needed to provide.
-
-            If there are more than one input instances, you can provide a list.
-
-        targets : Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]]
-            The values corresponding to the targets part of the backbone 
function.
-
-            If there are more than one targets, you can provide a list.
-
-        Returns
-        -------
-        report : tvm.runtime.profiling.Report
-            The formatted profiling result.
-        """
-        self._check_inited()
-
-        if not isinstance(input_instances, list):
-            input_instances = [input_instances]
-
-        if not isinstance(targets, list):
-            targets = [targets]
-
-        if len(input_instances) != self._input_num:
-            raise ValueError("The length of the input does not match the 
backbone")
-
-        all_inputs: list[Tensor] = (
-            [tvm.runtime.tensor(i) for i in input_instances]
-            + self._params
-            + self._states
-            + [tvm.runtime.tensor(i) for i in targets]
-        )
-        all_inputs = [i.copyto(self.device) for i in all_inputs]
-        return self.vm.profile(self.ADJOINT_FUNC, *all_inputs)
diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py
index 6f25216fb4..86f7507d7c 100644
--- a/python/tvm/runtime/__init__.py
+++ b/python/tvm/runtime/__init__.py
@@ -30,7 +30,6 @@ from .object_generic import ObjectConvertible
 from .device import Device
 from ._tensor import Tensor, tensor, empty
 from .module import Module
-from .profiling import Report
 from .executable import Executable
 
 # function exposures
diff --git a/python/tvm/runtime/profiling/__init__.py 
b/python/tvm/runtime/profiling/__init__.py
deleted file mode 100644
index 7ebc5e5c40..0000000000
--- a/python/tvm/runtime/profiling/__init__.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# isort: skip_file
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Registration of profiling objects in python."""
-
-from typing import Optional
-from collections.abc import Sequence
-from ... import ffi as _ffi
-from . import _ffi_api
-from .. import Object, Device
-
-
-@_ffi.register_object("runtime.profiling.Report")
-class Report(Object):
-    """A container for information gathered during a profiling run.
-
-    Attributes
-    ----------
-    calls : Array[Dict[str, Object]]
-        Per-call profiling metrics (function name, runtime, device, ...).
-
-    device_metrics : Dict[Device, Dict[str, Object]]
-        Per-device metrics collected over the entire run.
-    """
-
-    def __init__(
-        self,
-        calls: Sequence[dict[str, Object]],
-        device_metrics: dict[str, dict[str, Object]],
-        configuration: dict[str, Object],
-    ):
-        """Construct a profiling report from a list of metrics and per-device 
metrics.
-
-        Parameters
-        ----------
-        calls : Sequence[Dict[str, Object]]
-            Per function call metrics.
-
-        device_metrics : Dict[str, Dict[str, Object]]
-            Per device metrics.
-
-        configuration : Dict[str, Object]
-            Configuration of TVM for this profiling run. Includes number of
-            threads, executor.
-        """
-        self.__init_handle_by_constructor__(_ffi_api.Report, calls, 
device_metrics, configuration)
-
-    def csv(self):
-        """Convert this profiling report into CSV format.
-
-        This only includes calls and not overall metrics.
-
-        Returns
-        -------
-        csv : str
-            `calls` in CSV format.
-        """
-        return _ffi_api.AsCSV(self)
-
-    def table(self, sort=True, aggregate=True, col_sums=True):
-        """Generate a human-readable table
-
-        Parameters
-        ----------
-        sort : bool
-
-            If aggregate is true, whether to sort call frames by
-            descending duration.  If aggregate is False, whether to
-            sort frames by order of appearancei n the program.
-
-        aggregate : bool
-
-            Whether to join multiple calls to the same op into a
-            single line.
-
-        col_sums : bool
-
-            Whether to include the sum of each column.
-
-        Returns
-        -------
-        table : str
-
-            A human-readable table
-
-        """
-        return _ffi_api.AsTable(self, sort, aggregate, col_sums)
-
-    def json(self):
-        """Convert this profiling report into JSON format.
-
-        Example output:
-
-        .. code-block:
-
-            {
-              "calls": [
-                {
-                  "Duration (us)": {
-                    "microseconds": 12.3
-                  },
-                  "Name": "fused_dense",
-                  "Count": {
-                    "count": 1
-                  },
-                  "Percent": {
-                    "percent": 10.3
-                  }
-                }
-              ],
-              "device_metrics": {
-                "cpu": {
-                  "Duration (us)": {
-                    "microseconds": 334.2
-                  },
-                  "Percent": {
-                    "percent": 100
-                  }
-                }
-              }
-            }
-
-           {"calls":
-              [
-                {"Duration (us)": {"microseconds": 12.3}
-                 ,"Name": "fused_dense"
-                 ,"Count": {"count":1}
-                 ,"Percent": {"percent": 10.3}
-                 }
-              ],
-            "device_metrics":
-              {"cpu":
-                {"Duration (us)": {"microseconds": 334.2}
-                ,"Percent": {"percent": 100.0}
-                }
-              }
-           }
-
-        Returns
-        -------
-        json : str
-            Formatted JSON
-        """
-        return _ffi_api.AsJSON(self)
-
-    @classmethod
-    def from_json(cls, s):
-        """Deserialize a report from JSON.
-
-        Parameters
-        ----------
-        s : str
-            Report serialize via :py:meth:`json`.
-
-        Returns
-        -------
-        report : Report
-            The deserialized report.
-        """
-        return _ffi_api.FromJSON(s)
-
-
-@_ffi.register_object("runtime.profiling.Count")
-class Count(Object):
-    """A integer count of something"""
-
-    def __init__(self, count: int):
-        self.__init_handle_by_constructor__(_ffi_api.Count, count)
-
-
-@_ffi.register_object("runtime.profiling.Duration")
-class Duration(Object):
-    """A duration of something"""
-
-    def __init__(self, duration: float):
-        self.__init_handle_by_constructor__(_ffi_api.Duration, duration)
-
-
-@_ffi.register_object("runtime.profiling.Percent")
-class Percent(Object):
-    """A Percent of something"""
-
-    def __init__(self, percent: float):
-        self.__init_handle_by_constructor__(_ffi_api.Percent, percent)
-
-
-@_ffi.register_object("runtime.profiling.Ratio")
-class Ratio(Object):
-    """A Ratio of two things"""
-
-    def __init__(self, ratio: float):
-        self.__init_handle_by_constructor__(_ffi_api.Ratio, ratio)
-
-
-@_ffi.register_object("runtime.profiling.MetricCollector")
-class MetricCollector(Object):
-    """Interface for user defined profiling metric collection."""
-
-
-@_ffi.register_object("runtime.profiling.DeviceWrapper")
-class DeviceWrapper(Object):
-    """Wraps a tvm.runtime.Device"""
-
-    def __init__(self, dev: Device):
-        self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
-
-
-def profile_function(mod, dev, collectors, func_name=None, warmup_iters=10):
-    """Collect performance information of a function execution. Usually used 
with
-    a compiled PrimFunc.
-
-    This information can include performance counters like cache hits and FLOPs
-    that are useful in debugging performance issues of individual PrimFuncs.
-    Different metrics can be collected depending on which MetricCollector is
-    used.
-
-    Example
-    -------
-
-    .. code-block: python
-
-        f = tvm.compile(my_func, target="llvm", name="my_func")
-        prof = tvm.runtime.profiling.profile_function(
-            f,
-            tvm.cpu(),
-            [tvm.runtime.profiling.PAPIMetricCollector({tvm.cpu(): 
["PAPI_FP_OPS"]}),
-        )
-        counters = prof(*args)
-        print(counters)
-
-    Parameters
-    ----------
-    mod: Module
-        Module containing the function to profile.
-    dev: Device
-        Device to run the function on.
-
-    collectors: List[MetricCollector]
-        :py:class:`MetricCollector` which will collect performance information.
-    func_name: Optional[str]
-        Name of the function in `mod` to profile. Defaults to the `entry_name` 
of `mod`.
-    warmup_iters: int
-        Number of iterations to run the function before collecting performance
-        information. Recommended to set this larger than 0 for consistent cache
-        effects. Defaults to 10.
-
-    Returns
-    -------
-    prof: PackedFunc[args, Dict[str, ObjectRef]]
-        PackedFunc which takes the same arguments as the `mod[func_name]` and
-        returns performance metrics as a `Dict[str, ObjectRef]` where values
-        can be `CountNode`, `DurationNode`, `PercentNode`.
-    """
-    if func_name is None:
-        func_name = mod.entry_name
-    return _ffi_api.ProfileFunction(
-        mod, func_name, dev.dlpack_device_type(), dev.index, warmup_iters, 
collectors
-    )
diff --git a/python/tvm/runtime/profiling/_ffi_api.py 
b/python/tvm/runtime/profiling/_ffi_api.py
deleted file mode 100644
index c633c8144b..0000000000
--- a/python/tvm/runtime/profiling/_ffi_api.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI for profiling"""
-
-import tvm_ffi
-
-tvm_ffi.init_ffi_api("runtime.profiling", __name__)
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 982322d601..0adb446dbf 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -28,7 +28,6 @@ from tvm_ffi import Function, register_global_func
 
 import tvm
 from tvm.runtime import Device, Object
-from tvm.runtime.profiling import Report
 
 from ..rpc.base import RPC_SESS_MASK
 
@@ -50,7 +49,6 @@ class VirtualMachine:
         rt_mod: tvm.runtime.Module | tvm.runtime.Executable,
         device: Device | list[Device],
         memory_cfg: str | dict[Device, str] | None = None,
-        profile: bool = False,
     ) -> None:
         """
         Construct a VirtualMachine wrapper object.
@@ -70,9 +68,6 @@ class VirtualMachine:
             allocator type. If memory_cfg is a dict, each device uses the 
allocator
             type specified in the dict, or pooled allocator if not specified 
in the
             dict.
-
-        profile : Optional[bool]
-            Whether or not to enable profiling.
         """
         if not isinstance(rt_mod, tvm.runtime.Module):
             if isinstance(rt_mod, tvm.runtime.Executable):
@@ -80,8 +75,7 @@ class VirtualMachine:
             else:
                 raise ValueError("Expect the rt_mod to be an runtime.Module")
 
-        load_exec = "vm_profiler_load_executable" if profile else 
"vm_load_executable"
-        self.module = rt_mod[load_exec]()
+        self.module = rt_mod["vm_load_executable"]()
         self._invoke_closure = self.module["invoke_closure"]
         self._save_function = self.module["save_function"]
         self._set_input = self.module["set_input"]
@@ -477,30 +471,6 @@ class VirtualMachine:
             f_preproc=f_preproc,
         )
 
-    def profile(self, func_name: str, *args):
-        """Profile a function call.
-
-        Parameters
-        ----------
-        func_name : str
-            The name of the function.
-
-        args: List of Tensor or other objects supported by Function.
-            The arguments to the function.
-
-        Returns
-        -------
-        report: tvm.runtime.profiling.Report
-            The formatted profiling result, showing per-op timing measurements.
-        """
-        cargs: list[Any] = []
-
-        for arg in args:
-            self._convert(arg, cargs)
-
-        report_json = self.module["profile"](func_name, *cargs)
-        return Report.from_json(report_json)
-
 
 @register_global_func("vm.builtin.debug_print")
 def _print(lineo: str, array) -> None:
diff --git a/src/ir/structural_hash.cc b/src/ir/structural_hash.cc
index b875f86625..01ea19e4b7 100644
--- a/src/ir/structural_hash.cc
+++ b/src/ir/structural_hash.cc
@@ -26,7 +26,7 @@
 #include <tvm/ffi/reflection/access_path.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/node/functor.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/tensor.h>
 #include <tvm/support/io.h>
 #include <tvm/target/codegen.h>
 
@@ -85,66 +85,4 @@ struct RefToObjectPtr : public ObjectRef {
   }
 };
 
-struct ReportNodeTrait {
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<runtime::profiling::ReportNode>()
-        .def_ro("calls", &runtime::profiling::ReportNode::calls)
-        .def_ro("device_metrics", 
&runtime::profiling::ReportNode::device_metrics)
-        .def_ro("configuration", 
&runtime::profiling::ReportNode::configuration);
-  }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { ReportNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for ReportNode.
-
-struct CountNodeTrait {
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<runtime::profiling::CountNode>().def_ro("value",
-                                                            
&runtime::profiling::CountNode::value);
-  }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { CountNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for CountNode.
-
-struct DurationNodeTrait {
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<runtime::profiling::DurationNode>().def_ro(
-        "microseconds", &runtime::profiling::DurationNode::microseconds);
-  }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { DurationNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for DurationNode.
-
-struct PercentNodeTrait {
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<runtime::profiling::PercentNode>().def_ro(
-        "percent", &runtime::profiling::PercentNode::percent);
-  }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { PercentNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for PercentNode.
-
-struct RatioNodeTrait {
-  static void RegisterReflection() {
-    namespace refl = tvm::ffi::reflection;
-    refl::ObjectDef<runtime::profiling::RatioNode>().def_ro("ratio",
-                                                            
&runtime::profiling::RatioNode::ratio);
-  }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { RatioNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for RatioNode.
-
 }  // namespace tvm
diff --git a/src/runtime/contrib/clml/clml_runtime.cc 
b/src/runtime/contrib/clml/clml_runtime.cc
index 2487af6915..5ea6c1398e 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -36,7 +36,7 @@
 #include "clml_utils.h"
 #endif
 
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 
 namespace tvm {
 namespace runtime {
@@ -362,152 +362,6 @@ class CLMLRuntime : public JSONRuntimeBase {
   }
 #endif
 
-  void RunProfile(profiling::Profiler* prof) override {
-    cl_command_queue queue = CLML_QUEUE;
-    std::vector<cl_event>& evts = 
cws->workspace->GetEventQueue(cws->tentry->device);
-    std::vector<profiling::MetricCollector> cs;
-    std::vector<Device> devices;
-    devices.push_back(cws->tentry->device);
-    bool update_desc = false;
-
-    for (size_t i = 0; i < input_nodes_.size(); ++i) {
-      auto nid = input_nodes_[i];
-      uint32_t eid = EntryID(nid, 0);
-      if (nodes_[nid].GetOpType() == "input") {
-#if (CL_QCOM_ML_OPS_H_MAJOR_VERSION >= 5)
-        if (this->layer_.storage_map[nid].is_dynamic_tensor) {
-          SetTensorMemDesc(&this->layer_, nid, eid);
-          update_desc = true;
-        }
-#endif
-        // Assuming all inputs are from OpenCL
-        if (kDLOpenCL == data_entry_[eid]->device.device_type) {
-          if (this->layer_.storage_map[nid].layout == 
CL_TENSOR_LAYOUT_NCHW_QCOM) {
-            int index = layer_.tensorMemDescs_indexmap[nid];
-            layer_.tensorMemDescs[index].memory = static_cast<cl_mem>(
-                
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
-            update_desc = true;
-          } else {
-            layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
-                
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
-            cl_event cpy_evt = nullptr;
-            cl_event* evt = &cpy_evt;
-            if (cws->workspace->IsProfiling(cws->tentry->device)) {
-              evts.resize(evts.size() + 1);
-              evt = &(evts.back());
-            }
-            std::unordered_map<std::string, ffi::Any> metrics;
-            std::string shape_str;
-            std::vector<int64_t> shape(nodes_[nid].GetOpShape()[0].begin(),
-                                       nodes_[nid].GetOpShape()[0].end());
-            DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
-            shape_str.append(profiling::ShapeString(shape, tvm_dtype));
-            metrics["Argument Shapes"] = ffi::String(shape_str);
-
-            prof->StartCall("CopyIn", cws->tentry->device, metrics);
-            CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, 
layer_.in_placeholder[nid]->tensor,
-                      layer_.in_placeholder[nid]->memory, 
layer_.inputs[nid]->tensor,
-                      layer_.inputs[nid]->memory, 0, nullptr, evt);
-            prof->StopCall();
-          }
-        }
-      }
-    }
-
-    for (size_t i = 0; i < outputs_.size(); ++i) {
-      auto nid = outputs_[i].id_;
-      uint32_t eid = EntryID(outputs_[i]);
-#if (CL_QCOM_ML_OPS_H_MAJOR_VERSION >= 5)
-      if (this->layer_.storage_map[nid].is_dynamic_tensor) {
-        SetTensorMemDesc(&this->layer_, nid, eid);
-        update_desc = true;
-      }
-#endif
-      if (this->layer_.storage_map[nid].layout == CL_TENSOR_LAYOUT_NCHW_QCOM) {
-        int index = layer_.tensorMemDescs_indexmap[nid];
-        layer_.tensorMemDescs[index].memory = static_cast<cl_mem>(
-            
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
-        update_desc = true;
-      }
-    }
-
-    if (update_desc) {
-      CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM, 
this->layer_.descriptorSet,
-                static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
-                this->layer_.tensorMemDescs.data());
-    }
-
-    for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-      std::unordered_map<std::string, ffi::Any> metrics;
-      auto node = this->layer_.op_node_map[this->layer_.function[i].op].second;
-      std::string shape_str;
-      for (uint32_t j = 0; j < node.GetInputs().size(); ++j) {
-        const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_];
-        auto shape_arr = in_node.GetOpShape()[0];
-        std::vector<int64_t> shape(shape_arr.begin(), shape_arr.end());
-        DLDataType tvm_dtype = in_node.GetOpDataType()[0];
-        shape_str.append(profiling::ShapeString(shape, tvm_dtype));
-        shape_str.append(", ");
-      }
-      // Assuming one output per operation
-      auto shape_arr = node.GetOpShape()[0];
-      std::vector<int64_t> shape(shape_arr.begin(), shape_arr.end());
-      DLDataType tvm_dtype = node.GetOpDataType()[0];
-      shape_str.append(profiling::ShapeString(shape, tvm_dtype));
-      metrics["Argument Shapes"] = ffi::String(shape_str);
-
-      // Launch call
-      prof->StartCall(clml_symbol + "-" + this->layer_.function[i].layer_name, 
cws->tentry->device,
-                      metrics);
-      queue = CLML_QUEUE;
-      evts.resize(evts.size() + 1);
-      cl_event* evt = &(evts.back());
-#if (CL_QCOM_ML_OPS_H_MAJOR_VERSION >= 5)
-      if (this->layer_.function[i].op_props.size()) {
-        CLML_CALL_clUpdateMLOpQCOM(this->layer_.function[i].op,
-                                   this->layer_.function[i].op_props.data(),
-                                   this->layer_.descriptorSet, NULL);
-      }
-#endif
-      CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i].op, 
this->layer_.descriptorSet,
-                0, nullptr, evt);
-      prof->StopCall();
-    }
-
-    for (size_t i = 0; i < outputs_.size(); ++i) {
-      uint32_t eid = EntryID(outputs_[i]);
-      auto nid = outputs_[i].id_;
-      // Assuming all outputs are to OpenCL
-      if (kDLOpenCL == data_entry_[eid]->device.device_type) {
-        if (this->layer_.storage_map[nid].layout != 
CL_TENSOR_LAYOUT_NCHW_QCOM) {
-          layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
-              
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
-          cl_event cpy_evt = nullptr;
-          cl_event* evt = &cpy_evt;
-          if (cws->workspace->IsProfiling(cws->tentry->device)) {
-            evts.resize(evts.size() + 1);
-            evt = &(evts.back());
-          }
-
-          std::unordered_map<std::string, ffi::Any> metrics;
-          std::string shape_str;
-          std::vector<int64_t> shape(nodes_[eid].GetOpShape()[0].begin(),
-                                     nodes_[eid].GetOpShape()[0].end());
-          DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0];
-          shape_str.append(profiling::ShapeString(shape, tvm_dtype));
-          metrics["Argument Shapes"] = ffi::String(shape_str);
-
-          prof->StartCall("CopyOut", cws->tentry->device, metrics);
-          CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue, 
layer_.outputs[i]->tensor,
-                    layer_.outputs[i]->memory, 
layer_.out_placeholder[i]->tensor,
-                    layer_.out_placeholder[i]->memory, 0, nullptr, evt);
-          prof->StopCall();
-        }
-      }
-    }
-    return;
-  }
-
   /*!
    * \brief Unpack inputs and outputs and run inference on a given layer.
    *
@@ -599,7 +453,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       LOG_CLML << "Execution by Rec Queue";
       if (cws->workspace->IsProfiling(cws->tentry->device)) {
         Timer t;
-        auto f = 
tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
+        auto f = 
tvm::ffi::Function::GetGlobal(std::string("runtime.timer.opencl"));
         t = f->operator()(cws->tentry->device).cast<Timer>();
         t->Start();
         queue = CLML_QUEUE;
@@ -627,7 +481,7 @@ class CLMLRuntime : public JSONRuntimeBase {
 #endif
         if (cws->workspace->IsProfiling(cws->tentry->device)) {
           Timer t;
-          auto f = 
tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
+          auto f = 
tvm::ffi::Function::GetGlobal(std::string("runtime.timer.opencl"));
           t = f->operator()(cws->tentry->device).cast<Timer>();
           t->Start();
           queue = CLML_QUEUE;
@@ -2102,8 +1956,6 @@ class CLMLRuntime : public JSONRuntimeBase {
                  << "Please build with USE_CLML_GRAPH_EXECUTOR.";
   }
 #endif
-  bool CanDebug() override { return true; }
-
   /*! CLML sub graph symbol in TVM main module */
   std::string clml_symbol;
 };
diff --git a/src/runtime/contrib/clml/clml_runtime.h 
b/src/runtime/contrib/clml/clml_runtime.h
index 29aadc434a..3a0a7b12c0 100644
--- a/src/runtime/contrib/clml/clml_runtime.h
+++ b/src/runtime/contrib/clml/clml_runtime.h
@@ -33,7 +33,6 @@
 #include <CL/opencl.h>
 #include <stdlib.h>
 #include <tvm/ffi/function.h>
-#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/tensor.h>
 
 #include <fstream>
diff --git a/src/runtime/contrib/json/json_runtime.h 
b/src/runtime/contrib/json/json_runtime.h
index d00d03ec89..d980804bcb 100644
--- a/src/runtime/contrib/json/json_runtime.h
+++ b/src/runtime/contrib/json/json_runtime.h
@@ -27,7 +27,6 @@
 
 #include <tvm/ffi/extra/json.h>
 #include <tvm/ffi/extra/module.h>
-#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/tensor.h>
 #include <tvm/support/io.h>
 
@@ -72,17 +71,6 @@ class JSONRuntimeBase : public ffi::ModuleObj {
   /*! \brief Invoke the execution engine to inteprete a specific json runtime. 
*/
   virtual void Run() = 0;
 
-  /*! \brief Does the backend support debug & profiling */
-  virtual bool CanDebug() { return false; }
-
-  /*!
-   * \brief Invoke the profiler
-   * \param pointer to profiler
-   */
-  virtual void RunProfile(profiling::Profiler* prof) {
-    TVM_FFI_THROW(InternalError) << "Not expected to be here : Profiling call 
w/o support ?";
-  }
-
   /*!
    * \brief Invoke the debugger
    * \return External compiler specific debug blob
@@ -116,30 +104,6 @@ class JSONRuntimeBase : public ffi::ModuleObj {
         // Execute the subgraph.
         this->Run();
       });
-    } else if (this->symbol_name_ + "_debug" == name) {
-      // NOTE: the current debug convention is not very compatible with
-      // the FFI convention, consider clean up
-      if (!this->CanDebug()) {
-        return ffi::Function(nullptr);
-      }
-      return ffi::Function([sptr_to_self, this](ffi::PackedArgs args, 
ffi::Any* rv) {
-        TVM_FFI_ICHECK(this->initialized_) << "The module has not been 
initialized";
-
-        // Bind argument tensors to data entries.
-        this->SetInputOutputBuffers(args);
-
-        if (auto opt_str = rv->try_cast<ffi::String>()) {
-          ffi::String purpose = std::move(opt_str.value());
-          if ("debug_dump" == purpose) {
-            *rv = this->DebugDump();
-          }
-        } else {
-          // Profile the subgraph.
-          profiling::Profiler* prof = 
static_cast<profiling::Profiler*>(rv->cast<void*>());
-          this->RunProfile(prof);
-        }
-        // ffi::String vendor_prof = this->RunProfile(prof);
-      });
     } else if ("__init_" + this->symbol_name_ == name) {
       // The function to initialize constant tensors.
       return ffi::Function([sptr_to_self, this](ffi::PackedArgs args, 
ffi::Any* rv) {
diff --git a/src/runtime/cuda/cuda_device_api.cc 
b/src/runtime/cuda/cuda_device_api.cc
index 45ccbb3b10..a01d223ff6 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -27,7 +27,7 @@
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/device_api.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 
 #include <cstring>
 
@@ -333,7 +333,7 @@ class CUDATimerNode : public TimerNode {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("profiling.timer.cuda",
+  refl::GlobalDef().def("runtime.timer.cuda",
                         [](Device dev) { return 
Timer(ffi::make_object<CUDATimerNode>()); });
 }
 
diff --git a/src/runtime/hexagon/hexagon_common.cc 
b/src/runtime/hexagon/hexagon_common.cc
index 442ec5ff56..15ac224a8e 100644
--- a/src/runtime/hexagon/hexagon_common.cc
+++ b/src/runtime/hexagon/hexagon_common.cc
@@ -25,7 +25,7 @@
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/logging.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 
 #include <sstream>
 #include <string>
@@ -55,7 +55,7 @@ class HexagonTimerNode : public TimerNode {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("profiling.timer.hexagon",
+  refl::GlobalDef().def("runtime.timer.hexagon",
                         [](Device dev) { return 
Timer(ffi::make_object<HexagonTimerNode>()); });
 }
 }  // namespace hexagon
diff --git a/src/runtime/metal/metal_device_api.mm 
b/src/runtime/metal/metal_device_api.mm
index f240f589c1..47ab8148c5 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -22,7 +22,7 @@
  */
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 #include "metal_common.h"
 
 namespace tvm {
@@ -453,7 +453,7 @@ class MetalTimerNode : public TimerNode {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("profiling.timer.metal",
+  refl::GlobalDef().def("runtime.timer.metal",
                         [](Device dev) { return 
Timer(ffi::make_object<MetalTimerNode>(dev)); });
 }
 
diff --git a/src/runtime/opencl/opencl_common.h 
b/src/runtime/opencl/opencl_common.h
index a9fb5c01ec..2ae389046f 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -29,8 +29,8 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
 #include <tvm/runtime/memory/memory_manager.h>
-#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/tensor.h>
+#include <tvm/runtime/timer.h>
 
 /* There are many OpenCL platforms that do not yet support OpenCL 2.0,
  * hence we use 1.2 APIs, some of which are now deprecated.  In order
diff --git a/src/runtime/opencl/opencl_device_api.cc 
b/src/runtime/opencl/opencl_device_api.cc
index a4f7daf532..0b63f497db 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -22,7 +22,7 @@
  */
 #include <tvm/ffi/function.h>
 #include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 
 #include <sstream>
 
@@ -813,7 +813,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
 
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("profiling.timer.opencl",
+  refl::GlobalDef().def("runtime.timer.opencl",
                         [](Device dev) { return 
Timer(ffi::make_object<OpenCLTimerNode>(dev)); });
 }
 
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
deleted file mode 100644
index 99b5d77a6e..0000000000
--- a/src/runtime/profiling.cc
+++ /dev/null
@@ -1,937 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/runtime/profiling.cc
- * \brief Runtime profiling including timers.
- */
-
-#include <tvm/ffi/extra/json.h>
-#include <tvm/ffi/function.h>
-#include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/c_backend_api.h>
-#include <tvm/runtime/data_type.h>
-#include <tvm/runtime/profiling.h>
-#include <tvm/runtime/threading_backend.h>
-
-#include <algorithm>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <map>
-#include <mutex>
-#include <numeric>
-#include <set>
-#include <thread>
-#include <unordered_set>
-
-namespace tvm {
-namespace runtime {
-
-class DefaultTimerNode : public TimerNode {
- public:
-  virtual void Start() {
-    DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
-    start_ = std::chrono::high_resolution_clock::now();
-  }
-  virtual void Stop() {
-    DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
-    duration_ = std::chrono::high_resolution_clock::now() - start_;
-  }
-  virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
-  virtual ~DefaultTimerNode() {}
-
-  explicit DefaultTimerNode(Device dev) : device_(dev) {}
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.DefaultTimerNode", 
DefaultTimerNode, TimerNode);
-
- private:
-  std::chrono::high_resolution_clock::time_point start_;
-  std::chrono::duration<int64_t, std::nano> duration_;
-  Device device_;
-};
-
-Timer DefaultTimer(Device dev) { return 
Timer(ffi::make_object<DefaultTimerNode>(dev)); }
-
-class CPUTimerNode : public TimerNode {
- public:
-  virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
-  virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() 
- start_; }
-  virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
-  virtual ~CPUTimerNode() {}
-  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.CPUTimerNode", CPUTimerNode, 
TimerNode);
-
- private:
-  std::chrono::high_resolution_clock::time_point start_;
-  std::chrono::duration<int64_t, std::nano> duration_;
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() {
-  namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("profiling.timer.cpu",
-                        [](Device dev) { return 
Timer(ffi::make_object<CPUTimerNode>()); });
-}
-
-// keep track of which timers are not defined but we have already warned about
-std::set<DLDeviceType> seen_devices;
-std::mutex seen_devices_lock;
-
-Timer Timer::Start(Device dev) {
-  auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.") +
-                                         DLDeviceType2Str(dev.device_type));
-  if (!f.has_value()) {
-    {
-      std::lock_guard<std::mutex> lock(seen_devices_lock);
-      if (seen_devices.find(dev.device_type) == seen_devices.end()) {
-        LOG(WARNING)
-            << "No timer implementation for " << 
DLDeviceType2Str(dev.device_type)
-            << ", using default timer instead. It may be inaccurate or have 
extra overhead.";
-        seen_devices.insert(dev.device_type);
-      }
-    }
-    Timer t = DefaultTimer(dev);
-    t->Start();
-    return t;
-  } else {
-    Timer t = f->operator()(dev).cast<Timer>();
-    t->Start();
-    return t;
-  }
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
-  namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def("profiling.start_timer", Timer::Start);
-}
-
-namespace profiling {
-
-Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector> 
metric_collectors,
-                   std::unordered_map<ffi::String, ffi::Any> configuration)
-    : devs_(devs), collectors_(metric_collectors), 
configuration_(configuration) {
-  is_running_ = false;
-  std::vector<DeviceWrapper> wrapped_devs;
-  for (auto dev : devs) {
-    
wrapped_devs.push_back(DeviceWrapper(ffi::make_object<DeviceWrapperNode>(dev)));
-  }
-  for (auto& x : collectors_) {
-    x->Init(wrapped_devs);
-  }
-  // reset the thread pool so that PAPI eventset hooks are set in all threads.
-  threading::ResetThreadPool();
-
-  configuration_[ffi::String("Number of threads")] =
-      ObjectRef(ffi::make_object<CountNode>(threading::NumThreads()));
-}
-
-void Profiler::Start() {
-  is_running_ = true;
-  for (auto dev : devs_) {
-    StartCall("Total", dev, {});
-  }
-}
-
-void Profiler::StartCall(ffi::String name, Device dev,
-                         std::unordered_map<std::string, ffi::Any> 
extra_metrics) {
-  std::vector<std::pair<MetricCollector, ObjectRef>> objs;
-  for (auto& collector : collectors_) {
-    ObjectRef obj = collector->Start(dev);
-    if (obj.defined()) {
-      objs.emplace_back(collector, obj);
-    }
-  }
-  in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics, 
objs});
-}
-
-void Profiler::StopCall(std::unordered_map<std::string, ffi::Any> 
extra_metrics) {
-  CallFrame cf = in_flight_.top();
-  cf.timer->Stop();
-  for (auto& p : extra_metrics) {
-    cf.extra_metrics[p.first] = p.second;
-  }
-  // collect the extra metrics from user defined collectors
-  for (const auto& obj : cf.extra_collectors) {
-    auto collector_metrics = obj.first->Stop(obj.second);
-    for (auto& p : collector_metrics) {
-      cf.extra_metrics[p.first] = p.second;
-    }
-  }
-  in_flight_.pop();
-  calls_.push_back(cf);
-}
-
-void Profiler::Stop() {
-  is_running_ = false;
-  for (size_t i = 0; i < devs_.size(); i++) {
-    StopCall();
-  }
-}
-
-std::vector<int64_t> ToShape(Tensor shape_tensor) {
-  std::vector<int64_t> shape;
-  auto rank = shape_tensor.Shape().size();
-  auto dtype = shape_tensor.DataType();
-
-  // For 0-rank shapes we need to allocate a single scalar.
-  if (rank == 0) {
-    return shape;
-  }
-
-  // Otherwise we should be rank-1, and we will extract the number of 
dimensions
-  // for the output vector.
-  TVM_FFI_ICHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector, 
found " << rank;
-  int64_t ndim = shape_tensor.Shape().at(0);
-  shape.resize(ndim);
-
-  const DLTensor* dl_tensor = shape_tensor.operator->();
-  if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) {
-    int32_t* dims = reinterpret_cast<int32_t*>(dl_tensor->data);
-    shape.assign(dims, dims + ndim);
-  } else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) {
-    int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data);
-    shape.assign(dims, dims + ndim);
-  } else {
-    TVM_FFI_THROW(InternalError) << "invalid shape tensor datatype: " << dtype;
-  }
-
-  return shape;
-}
-
-ffi::String ShapeString(Tensor shape, DLDataType dtype) {
-  return ShapeString(ToShape(shape), dtype);
-}
-
-ffi::String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype) {
-  std::stringstream sizes;
-  sizes << dtype << "[";
-  for (size_t i = 0; i < shape.size(); i++) {
-    if (i != 0) {
-      sizes << ", ";
-    }
-    sizes << shape[i];
-  }
-  sizes << "]";
-  return ffi::String(sizes.str());
-}
-
-ffi::String ShapeString(const std::vector<Tensor>& shapes) {
-  std::stringstream sizes;
-  for (const Tensor& ary : shapes) {
-    if (sizes.tellp() > 0) {
-      sizes << ", ";
-    }
-    auto shape = ary.Shape();
-    sizes << ary.DataType() << "[";
-    for (size_t i = 0; i < shape.size(); i++) {
-      if (i != 0) {
-        sizes << ", ";
-      }
-      sizes << shape[i];
-    }
-    sizes << "]";
-  }
-  return ffi::String(sizes.str());
-}
-
-ffi::String ReportNode::AsCSV() const {
-  // get unique headers
-  std::set<std::string> unique_headers;
-
-  for (auto row : calls) {
-    for (auto p : row) {
-      unique_headers.insert(p.first);
-    }
-  }
-
-  std::vector<std::string> headers;
-  for (auto x : unique_headers) {
-    headers.push_back(x);
-  }
-
-  std::stringstream s;
-
-  for (size_t i = 0; i < headers.size(); i++) {
-    std::string header = headers[i];
-    s << header;
-    if (i < headers.size() - 1) {
-      s << ",";
-    }
-  }
-  s << std::endl;
-  for (auto row : calls) {
-    for (size_t i = 0; i < headers.size(); i++) {
-      std::string header = headers[i];
-      auto it = row.find(header);
-      if (it != row.end()) {
-        std::string val;
-        if ((*it).second.as<CountNode>()) {
-          s << (*it).second.as<CountNode>()->value;
-        } else if ((*it).second.as<DurationNode>()) {
-          s << (*it).second.as<DurationNode>()->microseconds;
-        } else if ((*it).second.as<PercentNode>()) {
-          s << (*it).second.as<PercentNode>()->percent;
-        } else if ((*it).second.as<RatioNode>()) {
-          s << (*it).second.as<RatioNode>()->ratio;
-        } else if (auto opt_str = (*it).second.as<ffi::String>()) {
-          s << "\"" << *opt_str << "\"";
-        }
-      }
-      if (i < headers.size() - 1) {
-        s << ",";
-      }
-    }
-    s << std::endl;
-  }
-  return s.str();
-}
-
-namespace {
-void metric_as_json(std::ostream& os, ffi::Any o) {
-  if (auto opt_str = o.as<ffi::String>()) {
-    os << "{\"string\":"
-       << "\"" << *opt_str << "\""
-       << "}";
-  } else if (const CountNode* n = o.as<CountNode>()) {
-    os << "{\"count\":" << n->value << "}";
-  } else if (const DurationNode* n = o.as<DurationNode>()) {
-    os << "{\"microseconds\":" << 
std::setprecision(std::numeric_limits<double>::max_digits10)
-       << std::fixed << n->microseconds << "}";
-  } else if (const PercentNode* n = o.as<PercentNode>()) {
-    os << "{\"percent\":" << 
std::setprecision(std::numeric_limits<double>::max_digits10)
-       << std::fixed << n->percent << "}";
-  } else if (const RatioNode* n = o.as<RatioNode>()) {
-    os << "{\"ratio\":" << 
std::setprecision(std::numeric_limits<double>::max_digits10)
-       << std::fixed << n->ratio << "}";
-  } else {
-    TVM_FFI_THROW(InternalError) << "Unprintable type " << o.GetTypeKey();
-  }
-}
-}  // namespace
-
-ffi::String ReportNode::AsJSON() const {
-  std::ostringstream s;
-  // We want a specific write for the value,
-  // so we would have to implement a custom data structure for each type of
-  // value we want to print. Instead we construct the json by hand because it
-  // is easier.
-  s << "{";
-
-  s << "\"calls\":[";
-  for (size_t i = 0; i < calls.size(); i++) {
-    size_t j = 0;
-    s << "{";
-    for (const auto& kv : calls[i]) {
-      s << "\"" << kv.first << "\":";
-      metric_as_json(s, kv.second);
-      if (j < calls[i].size() - 1) {
-        s << ",";
-      }
-      j++;
-    }
-    s << "}";
-    if (i < calls.size() - 1) {
-      s << ",";
-    }
-  }
-  s << "],";  // end calls
-
-  s << "\"device_metrics\":{";
-  size_t i = 0;
-  for (const auto& dev_kv : device_metrics) {
-    size_t j = 0;
-    s << "\"" << dev_kv.first << "\":{";
-    for (const auto& metric_kv : dev_kv.second) {
-      s << "\"" << metric_kv.first << "\":";
-      metric_as_json(s, metric_kv.second);
-      if (j < dev_kv.second.size() - 1) {
-        s << ",";
-      }
-      j++;
-    }
-    s << "}";
-    if (i < device_metrics.size() - 1) {
-      s << ",";
-    }
-    i++;
-  }
-  s << "},";  // end device metrics
-
-  s << "\"configuration\":{";
-  size_t k = 0;
-  for (const auto& kv : configuration) {
-    s << "\"" << kv.first << "\":";
-    metric_as_json(s, kv.second);
-    if (k < configuration.size() - 1) {
-      s << ",";
-    }
-    k++;
-  }
-  s << "}";  // end configuration
-  s << "}";
-  return s.str();
-}
-
-// Aggregate a set of values for a metric. Computes sum for Duration, Count,
-// and Percent; average for Ratio; and assumes all Strings are the same. All
-// ObjectRefs in metrics must have the same type.
-Any AggregateMetric(const std::vector<ffi::Any>& metrics) {
-  TVM_FFI_ICHECK_GT(metrics.size(), 0) << "Must pass a non-zero number of 
metrics";
-  if (metrics[0].as<DurationNode>()) {
-    double sum = 0;
-    for (auto& metric : metrics) {
-      sum += metric.as<DurationNode>()->microseconds;
-    }
-    return ObjectRef(ffi::make_object<DurationNode>(sum));
-  } else if (metrics[0].as<CountNode>()) {
-    int64_t sum = 0;
-    for (auto& metric : metrics) {
-      sum += metric.as<CountNode>()->value;
-    }
-    return ObjectRef(ffi::make_object<CountNode>(sum));
-  } else if (metrics[0].as<PercentNode>()) {
-    double sum = 0;
-    for (auto& metric : metrics) {
-      sum += metric.as<PercentNode>()->percent;
-    }
-    return ObjectRef(ffi::make_object<PercentNode>(sum));
-  } else if (metrics[0].as<RatioNode>()) {
-    double sum = 0;
-    for (auto& metric : metrics) {
-      sum += metric.as<RatioNode>()->ratio;
-    }
-    return ObjectRef(ffi::make_object<RatioNode>(sum / metrics.size()));
-  } else if (auto opt_str = metrics[0].as<ffi::String>()) {
-    for (auto& m : metrics) {
-      if (*opt_str != m.as<ffi::String>()) {
-        return ffi::String("");
-      }
-    }
-    // Assume all strings in metrics are the same.
-    return metrics[0];
-  } else {
-    TVM_FFI_THROW(InternalError)
-        << "Can only aggregate metrics with types DurationNode, CountNode, "
-           "PercentNode, RatioNode, and String, but got "
-        << metrics[0].GetTypeKey();
-    return ffi::Any();  // To silence warnings
-  }
-}
-
-// Try and set the locale of the provided stringstream so that it will print
-// numbers with thousands separators. Sometimes users will have a misconfigured
-// system where an invalid locale is set, so we catch and ignore any locale
-// errors.
-static void set_locale_for_separators(std::stringstream& s) {
-  try {
-    // empty string indicates locale should be the user's default, see man 3 
setlocale
-    s.imbue(std::locale(""));
-  } catch (std::runtime_error& e) {
-  }
-}
-
-static ffi::String print_metric(ffi::Any metric) {
-  std::string val;
-  if (metric.as<CountNode>()) {
-    std::stringstream s;
-    set_locale_for_separators(s);
-    s << std::fixed << metric.as<CountNode>()->value;
-    val = s.str();
-  } else if (metric.as<DurationNode>()) {
-    std::stringstream s;
-    set_locale_for_separators(s);
-    s << std::fixed << std::setprecision(2) << 
metric.as<DurationNode>()->microseconds;
-    val = s.str();
-  } else if (metric.as<PercentNode>()) {
-    std::stringstream s;
-    s << std::fixed << std::setprecision(2) << 
metric.as<PercentNode>()->percent;
-    val = s.str();
-  } else if (metric.as<RatioNode>()) {
-    std::stringstream s;
-    set_locale_for_separators(s);
-    s << std::setprecision(2) << metric.as<RatioNode>()->ratio;
-    val = s.str();
-  } else if (auto opt_str = metric.as<ffi::String>()) {
-    val = *opt_str;
-  } else {
-    TVM_FFI_THROW(InternalError) << "Cannot print metric of type " << 
metric.GetTypeKey();
-  }
-  return val;
-}
-
-ffi::String ReportNode::AsTable(bool sort, bool aggregate, bool 
compute_col_sums) const {
-  // aggregate calls by op hash (or op name if hash is not set) + argument 
shapes
-  std::vector<ffi::Map<ffi::String, ffi::Any>> aggregated_calls;
-  if (aggregate) {
-    std::unordered_map<std::string, std::vector<size_t>> aggregates;
-    for (size_t i = 0; i < calls.size(); i++) {
-      auto frame = calls[i];
-      auto it = frame.find("Hash");
-      std::string name = frame["Name"].cast<ffi::String>();
-      if (it != frame.end()) {
-        name = (*it).second.cast<ffi::String>();
-      }
-      if (frame.find("Argument Shapes") != frame.end()) {
-        name += frame["Argument Shapes"].cast<ffi::String>();
-      }
-      if (frame.find("Device") != frame.end()) {
-        name += frame["Device"].cast<ffi::String>();
-      }
-
-      if (aggregates.find(name) == aggregates.end()) {
-        aggregates[name] = {i};
-      } else {
-        aggregates[name].push_back(i);
-      }
-    }
-    for (const auto& p : aggregates) {
-      std::unordered_map<ffi::String, ffi::Any> aggregated;
-      std::unordered_set<std::string> metrics;
-      for (auto& call : calls) {
-        for (auto& metric : call) {
-          metrics.insert(metric.first);
-        }
-      }
-      for (const std::string& metric : metrics) {
-        std::vector<ffi::Any> per_call;
-        for (auto i : p.second) {
-          auto call = calls[i];
-          auto it = std::find_if(call.begin(), call.end(),
-                                 [&metric](const std::pair<ffi::String, 
ffi::Any>& call_metric) {
-                                   return std::string(call_metric.first) == 
metric;
-                                 });
-          if (it != call.end()) {
-            per_call.push_back((*it).second);
-          }
-        }
-        if (per_call.size() > 0) {
-          aggregated[metric] = AggregateMetric(per_call);
-        }
-      }
-      aggregated_calls.push_back(aggregated);
-    }
-  } else {
-    for (auto call : calls) {
-      aggregated_calls.push_back(call);
-    }
-  }
-
-  // sort rows by duration
-  if (sort) {
-    std::sort(
-        aggregated_calls.begin(), aggregated_calls.end(),
-        [&](const ffi::Map<ffi::String, ffi::Any>& a, const 
ffi::Map<ffi::String, ffi::Any>& b) {
-          return a.at("Duration (us)").as<DurationNode>()->microseconds >
-                 b.at("Duration (us)").as<DurationNode>()->microseconds;
-        });
-  }
-
-  // compute columnwise sums
-  if (compute_col_sums) {
-    std::unordered_map<ffi::String, ffi::Any> col_sums;
-    for (auto call : aggregated_calls) {
-      for (auto p : call) {
-        if (p.second.as<CountNode>()) {
-          int64_t val = p.second.as<CountNode>()->value;
-          auto it = col_sums.find(p.first);
-          if (it != col_sums.end()) {
-            val += it->second.as<CountNode>()->value;
-          }
-          col_sums[p.first] = ObjectRef(ffi::make_object<CountNode>(val));
-        } else if (p.second.as<DurationNode>()) {
-          double val = p.second.as<DurationNode>()->microseconds;
-          auto it = col_sums.find(p.first);
-          if (it != col_sums.end()) {
-            val += it->second.as<DurationNode>()->microseconds;
-          }
-          col_sums[p.first] = ObjectRef(ffi::make_object<DurationNode>(val));
-        } else if (p.second.as<PercentNode>()) {
-          double val = p.second.as<PercentNode>()->percent;
-          auto it = col_sums.find(p.first);
-          if (it != col_sums.end()) {
-            val += it->second.as<PercentNode>()->percent;
-          }
-          col_sums[p.first] = ObjectRef(ffi::make_object<PercentNode>(val));
-        } else if (p.second.as<RatioNode>()) {
-          // It does not make sense to sum ratios
-        }
-      }
-    }
-    col_sums["Name"] = ffi::String("Sum");
-    aggregated_calls.push_back({{ffi::String("Name"), 
ffi::String("----------")}});  // separator
-    aggregated_calls.push_back(col_sums);
-  }
-
-  // per-device metrics
-  for (auto p : device_metrics) {
-    ffi::Map<ffi::String, ffi::Any> metrics = p.second;
-    metrics.Set("Name", ffi::String("Total"));
-    aggregated_calls.push_back(metrics);
-  }
-
-  // Table formatting
-  std::set<std::string> unique_headers;
-  for (auto row : aggregated_calls) {
-    for (auto p : row) {
-      unique_headers.insert(p.first);
-    }
-  }
-
-  // always include these headers in this order
-  std::vector<std::string> headers = {"Name",   "Duration (us)", "Percent",
-                                      "Device", "Count",         "Argument 
Shapes"};
-  for (auto header : unique_headers) {
-    if (std::find(headers.begin(), headers.end(), header) == headers.end()) {
-      headers.push_back(header);
-    }
-  }
-
-  // Switch layout from row major to column major so we can easily compute 
column widths.
-  std::vector<std::vector<std::string>> cols;
-  for (auto header : headers) {
-    cols.push_back({header});
-  }
-  for (auto row : aggregated_calls) {
-    for (size_t i = 0; i < headers.size(); i++) {
-      auto it = row.find(headers[i]);
-      if (it == row.end()) {
-        // fill empty data with empty strings
-        cols[i].push_back("");
-      } else {
-        cols[i].push_back(print_metric((*it).second));
-      }
-    }
-  }
-
-  std::vector<size_t> widths;
-  for (auto v : cols) {
-    size_t width = 0;
-    for (auto x : v) {
-      width = std::max(width, x.size());
-    }
-    widths.push_back(width);
-  }
-  size_t length = 0;
-  for (auto v : cols) {
-    length = std::max(length, v.size());
-  }
-
-  std::stringstream s;
-  for (size_t row = 0; row < length; row++) {
-    for (size_t col = 0; col < cols.size(); col++) {
-      // left align first column
-      if (col == 0) {
-        s << std::left;
-      } else {
-        s << std::right;
-      }
-      if (row < cols[col].size()) {
-        s << std::setw(widths[col]) << cols[col][row] << "  ";
-      } else {
-        s << std::setw(widths[col]) << "  ";
-      }
-    }
-    s << std::endl;
-  }
-
-  // Add configuration information. It will not be aligned with the columns.
-  s << std::endl << "Configuration" << std::endl << "-------------" << 
std::endl;
-  for (auto kv : configuration) {
-    s << kv.first << ": " << print_metric(kv.second) << std::endl;
-  }
-  return s.str();
-}
-
-std::string DeviceString(Device dev) {
-  return DLDeviceType2Str(dev.device_type) + std::to_string(dev.device_id);
-}
-
-Report Profiler::Report() {
-  // sync all timers and normalize rows
-  std::vector<std::unordered_map<ffi::String, ffi::Any>> rows;
-  for (auto& cf : calls_) {
-    std::unordered_map<ffi::String, ffi::Any> row;
-    double us = cf.timer->SyncAndGetElapsedNanos() / 1e3;
-    row["Duration (us)"] = ObjectRef(ffi::make_object<DurationNode>(us));
-    row["Count"] = ObjectRef(ffi::make_object<CountNode>(1));
-    row["Name"] = cf.name;
-    row["Device"] = ffi::String(DeviceString(cf.dev));
-    for (auto p : cf.extra_metrics) {
-      row[p.first] = p.second;
-    }
-    rows.push_back(row);
-  }
-
-  // the last frames are the overall times
-  double overall_time_us = 0;
-  std::unordered_map<ffi::String, ffi::Map<ffi::String, ffi::Any>> 
device_metrics;
-  for (size_t i = 0; i < devs_.size(); i++) {
-    auto row = rows[rows.size() - 1];
-    rows.pop_back();
-    device_metrics[row["Device"].cast<ffi::String>()] = row;
-    overall_time_us =
-        std::max(overall_time_us, row["Duration 
(us)"].as<DurationNode>()->microseconds);
-  }
-
-  // Calculate percentages
-  for (auto& row : rows) {
-    row["Percent"] = ObjectRef(ffi::make_object<PercentNode>(
-        row["Duration (us)"].as<DurationNode>()->microseconds / 
overall_time_us * 100));
-  }
-
-  // convert to map
-  std::vector<ffi::Map<ffi::String, ffi::Any>> converted_rows;
-  for (const auto& row : rows) {
-    converted_rows.push_back(row);
-  }
-
-  return profiling::Report(converted_rows, device_metrics, configuration_);
-}
-
-Report::Report(ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls,
-               ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> 
device_metrics,
-               ffi::Map<ffi::String, ffi::Any> configuration) {
-  auto node = ffi::make_object<ReportNode>();
-  node->calls = std::move(calls);
-  node->device_metrics = std::move(device_metrics);
-  node->configuration = std::move(configuration);
-  data_ = std::move(node);
-}
-
-namespace json = ::tvm::ffi::json;
-
-ffi::Map<ffi::String, ffi::Any> parse_metrics(const json::Object& obj) {
-  ffi::Map<ffi::String, ffi::Any> metrics;
-  for (const auto& [k, v] : obj) {
-    std::string metric_name = k.cast<ffi::String>();
-    json::Object metric_obj = v.cast<json::Object>();
-    ffi::Any o;
-    // Each metric value is an object with a single key indicating the type
-    for (const auto& [type_key, type_val] : metric_obj) {
-      std::string metric_value_name = type_key.cast<ffi::String>();
-      if (metric_value_name == "microseconds") {
-        o = ObjectRef(ffi::make_object<DurationNode>(type_val.cast<double>()));
-      } else if (metric_value_name == "percent") {
-        o = ObjectRef(ffi::make_object<PercentNode>(type_val.cast<double>()));
-      } else if (metric_value_name == "count") {
-        o = ObjectRef(ffi::make_object<CountNode>(type_val.cast<int64_t>()));
-      } else if (metric_value_name == "ratio") {
-        o = ObjectRef(ffi::make_object<RatioNode>(type_val.cast<double>()));
-      } else if (metric_value_name == "string") {
-        o = ffi::String(type_val.cast<ffi::String>());
-      } else {
-        TVM_FFI_THROW(InternalError) << "Cannot parse metric of type " << 
metric_value_name
-                                     << " valid types are microseconds, 
percent, count.";
-      }
-    }
-    metrics.Set(metric_name, o);
-  }
-  return metrics;
-}
-
-Report Report::FromJSON(ffi::String json_str) {
-  auto root = json::Parse(json_str).cast<json::Object>();
-  ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls;
-  ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> device_metrics;
-  ffi::Map<ffi::String, ffi::Any> configuration;
-
-  for (const auto& [k, v] : root) {
-    std::string key = k.cast<ffi::String>();
-    if (key == "calls") {
-      json::Array calls_arr = v.cast<json::Array>();
-      for (const ffi::Any& item : calls_arr) {
-        calls.push_back(parse_metrics(item.cast<json::Object>()));
-      }
-    } else if (key == "device_metrics") {
-      json::Object dev_obj = v.cast<json::Object>();
-      for (const auto& [dev_key, dev_val] : dev_obj) {
-        std::string device_name = dev_key.cast<ffi::String>();
-        device_metrics.Set(device_name, 
parse_metrics(dev_val.cast<json::Object>()));
-      }
-    } else if (key == "configuration") {
-      configuration = parse_metrics(v.cast<json::Object>());
-    }
-  }
-  return Report(calls, device_metrics, configuration);
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
-  namespace refl = tvm::ffi::reflection;
-  refl::ObjectDef<MetricCollectorNode>();
-  refl::ObjectDef<DeviceWrapperNode>();
-
-  refl::GlobalDef()
-      .def_method("runtime.profiling.AsTable", &ReportNode::AsTable)
-      .def("runtime.profiling.AsCSV", [](Report n) { return n->AsCSV(); })
-      .def("runtime.profiling.AsJSON", [](Report n) { return n->AsJSON(); })
-      .def("runtime.profiling.FromJSON", Report::FromJSON)
-      .def("runtime.profiling.DeviceWrapper", [](Device dev) { return 
DeviceWrapper(dev); });
-}
-
-ffi::Function ProfileFunction(ffi::Module mod, std::string func_name, int 
device_type,
-                              int device_id, int warmup_iters,
-                              ffi::Array<MetricCollector> collectors) {
-  // Module::GetFunction is not const, so this lambda has to be mutable
-  return ffi::Function::FromPacked(
-      [=](const ffi::AnyView* args, int32_t num_args, ffi::Any* ret) mutable {
-        auto optf = mod->GetFunction(func_name);
-        TVM_FFI_ICHECK(optf.has_value())
-            << "There is no function called \"" << func_name << "\" in the 
module";
-        auto f = *optf;
-        Device dev{static_cast<DLDeviceType>(device_type), device_id};
-
-        // warmup
-        for (int i = 0; i < warmup_iters; i++) {
-          f.CallPacked(args, num_args, ret);
-        }
-
-        for (auto& collector : collectors) {
-          collector->Init({DeviceWrapper(dev)});
-        }
-        std::vector<ffi::Map<ffi::String, ffi::Any>> results;
-        results.reserve(collectors.size());
-        std::vector<std::pair<MetricCollector, ObjectRef>> collector_data;
-        collector_data.reserve(collectors.size());
-        for (auto& collector : collectors) {
-          ObjectRef o = collector->Start(dev);
-          // If not defined, then the collector cannot time this device.
-          if (o.defined()) {
-            collector_data.push_back({collector, o});
-          }
-        }
-
-        // TODO(tkonolige): repeated calls if the runtime is small?
-        f.CallPacked(args, num_args, ret);
-
-        for (auto& kv : collector_data) {
-          results.push_back(kv.first->Stop(kv.second));
-        }
-        ffi::Map<ffi::String, ffi::Any> combined_results;
-        for (auto m : results) {
-          for (auto p : m) {
-            // assume that there is no shared metric name between collectors
-            combined_results.Set(p.first, p.second);
-          }
-        }
-        *ret = combined_results;
-      });
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
-  namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef().def(
-      "runtime.profiling.ProfileFunction",
-      [](ffi::Module mod, ffi::String func_name, int device_type, int 
device_id, int warmup_iters,
-         ffi::Array<MetricCollector> collectors) {
-        if (mod->kind() == std::string("rpc")) {
-          TVM_FFI_THROW(InternalError)
-              << "Profiling a module over RPC is not yet supported";  // 
because we can't send
-                                                                      // 
MetricCollectors over rpc.
-          throw;
-        } else {
-          return ProfileFunction(mod, func_name, device_type, device_id, 
warmup_iters, collectors);
-        }
-      });
-}
-
-ffi::Function WrapTimeEvaluator(ffi::Function pf, Device dev, int number, int 
repeat,
-                                int min_repeat_ms, int 
limit_zero_time_iterations,
-                                int cooldown_interval_ms, int 
repeats_to_cooldown,
-                                int cache_flush_bytes, ffi::Function 
f_preproc) {
-  TVM_FFI_ICHECK(pf != nullptr);
-
-  auto ftimer = [pf, dev, number, repeat, min_repeat_ms, 
limit_zero_time_iterations,
-                 cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
-                 f_preproc](const ffi::AnyView* args, int num_args, ffi::Any* 
rv) mutable {
-    ffi::Any temp;
-    std::ostringstream os;
-    // skip first time call, to activate lazy compilation components.
-    pf.CallPacked(args, num_args, &temp);
-
-    // allocate two large arrays to flush L2 cache
-    Tensor arr1, arr2;
-    if (cache_flush_bytes > 0) {
-      arr1 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
-      arr2 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
-    }
-
-    DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
-
-    for (int i = 0; i < repeat; ++i) {
-      if (f_preproc != nullptr) {
-        f_preproc.CallPacked(args, num_args, &temp);
-      }
-      double duration_ms = 0.0;
-      int absolute_zero_times = 0;
-      do {
-        if (duration_ms > 0.0) {
-          const double golden_ratio = 1.618;
-          number = static_cast<int>(
-              std::max((min_repeat_ms / (duration_ms / number) + 1), number * 
golden_ratio));
-        }
-        if (cache_flush_bytes > 0) {
-          arr1.CopyFrom(arr2);
-        }
-        DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
-        // start timing
-        Timer t = Timer::Start(dev);
-        for (int j = 0; j < number; ++j) {
-          pf.CallPacked(args, num_args, &temp);
-        }
-        t->Stop();
-        int64_t t_nanos = t->SyncAndGetElapsedNanos();
-        if (t_nanos == 0) absolute_zero_times++;
-        duration_ms = t_nanos / 1e6;
-      } while (duration_ms < min_repeat_ms && absolute_zero_times < 
limit_zero_time_iterations);
-
-      double speed = duration_ms / 1e3 / number;
-      os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
-
-      if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) {
-        
std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));
-      }
-    }
-
-    std::string blob = os.str();
-    // return the time.
-    *rv = ffi::Bytes(std::move(blob));
-  };
-  return ffi::Function::FromPacked(ftimer);
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
-  namespace refl = tvm::ffi::reflection;
-  refl::GlobalDef()
-      .def("runtime.profiling.Report",
-           [](ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls,
-              ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> 
device_metrics,
-              ffi::Map<ffi::String, ffi::Any> configuration) {
-             return Report(calls, device_metrics, configuration);
-           })
-      .def("runtime.profiling.Count",
-           [](int64_t count) { return 
ObjectRef(ffi::make_object<CountNode>(count)); })
-      .def("runtime.profiling.Percent",
-           [](double percent) { return 
ObjectRef(ffi::make_object<PercentNode>(percent)); })
-      .def("runtime.profiling.Duration",
-           [](double duration) { return 
ObjectRef(ffi::make_object<DurationNode>(duration)); })
-      .def("runtime.profiling.Ratio",
-           [](double ratio) { return 
ObjectRef(ffi::make_object<RatioNode>(ratio)); });
-}
-
-}  // namespace profiling
-}  // namespace runtime
-}  // namespace tvm
diff --git a/src/runtime/rocm/rocm_device_api.cc 
b/src/runtime/rocm/rocm_device_api.cc
index ed7bd98ffe..6612f1a8fb 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -28,7 +28,7 @@
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/logging.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 
 #include "rocm_common.h"
 
@@ -297,7 +297,7 @@ class ROCMTimerNode : public TimerNode {
 TVM_FFI_STATIC_INIT_BLOCK() {
   namespace refl = tvm::ffi::reflection;
   refl::GlobalDef()
-      .def("profiling.timer.rocm",
+      .def("runtime.timer.rocm",
            [](Device dev) { return Timer(ffi::make_object<ROCMTimerNode>()); })
       .def("runtime.get_rocm_stream", []() {
         int device_id;
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index ad8f1967db..71639abf3f 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -25,7 +25,7 @@
 #include <tvm/ffi/reflection/registry.h>
 #include <tvm/ffi/string.h>
 #include <tvm/runtime/device_api.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
 
 #include <chrono>
 #include <cstring>
@@ -432,9 +432,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                  ffi::Optional<ffi::Function> pf = m->GetFunction(name);
                  TVM_FFI_ICHECK(pf.has_value())
                      << "Cannot find " << name << "` in the global registry";
-                 return profiling::WrapTimeEvaluator(
-                     *pf, dev, number, repeat, min_repeat_ms, 
limit_zero_time_iterations,
-                     cooldown_interval_ms, repeats_to_cooldown, 
cache_flush_bytes, f_preproc);
+                 return WrapTimeEvaluator(*pf, dev, number, repeat, 
min_repeat_ms,
+                                          limit_zero_time_iterations, 
cooldown_interval_ms,
+                                          repeats_to_cooldown, 
cache_flush_bytes, f_preproc);
                }
              } else {
                auto pf = tvm::ffi::Function::GetGlobal(name);
@@ -447,9 +447,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
                      << "Cannot find " << f_preproc_name << " in the global 
function";
                  f_preproc = *pf_preproc;
                }
-               return profiling::WrapTimeEvaluator(
-                   *pf, dev, number, repeat, min_repeat_ms, 
limit_zero_time_iterations,
-                   cooldown_interval_ms, repeats_to_cooldown, 
cache_flush_bytes, f_preproc);
+               return WrapTimeEvaluator(*pf, dev, number, repeat, 
min_repeat_ms,
+                                        limit_zero_time_iterations, 
cooldown_interval_ms,
+                                        repeats_to_cooldown, 
cache_flush_bytes, f_preproc);
              }
            })
       .def_packed("cache_flush_cpu_non_first_arg",
diff --git a/src/runtime/timer.cc b/src/runtime/timer.cc
new file mode 100644
index 0000000000..075f56337e
--- /dev/null
+++ b/src/runtime/timer.cc
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/timer.cc
+ * \brief Runtime timer primitives: Timer, WrapTimeEvaluator.
+ */
+
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/reflection/registry.h>
+#include <tvm/runtime/c_backend_api.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/timer.h>
+
+#include <chrono>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <thread>
+
+namespace tvm {
+namespace runtime {
+
+class DefaultTimerNode : public TimerNode {
+ public:
+  virtual void Start() {
+    DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
+    start_ = std::chrono::high_resolution_clock::now();
+  }
+  virtual void Stop() {
+    DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
+    duration_ = std::chrono::high_resolution_clock::now() - start_;
+  }
+  virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+  virtual ~DefaultTimerNode() {}
+
+  explicit DefaultTimerNode(Device dev) : device_(dev) {}
+  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.DefaultTimerNode", 
DefaultTimerNode, TimerNode);
+
+ private:
+  std::chrono::high_resolution_clock::time_point start_;
+  std::chrono::duration<int64_t, std::nano> duration_;
+  Device device_;
+};
+
+static Timer DefaultTimer(Device dev) { return 
Timer(ffi::make_object<DefaultTimerNode>(dev)); }
+
+class CPUTimerNode : public TimerNode {
+ public:
+  virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
+  virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() 
- start_; }
+  virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+  virtual ~CPUTimerNode() {}
+  TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.CPUTimerNode", CPUTimerNode, 
TimerNode);
+
+ private:
+  std::chrono::high_resolution_clock::time_point start_;
+  std::chrono::duration<int64_t, std::nano> duration_;
+};
+
+TVM_FFI_STATIC_INIT_BLOCK() {
+  namespace refl = tvm::ffi::reflection;
+  refl::GlobalDef().def("runtime.timer.cpu",
+                        [](Device dev) { return 
Timer(ffi::make_object<CPUTimerNode>()); });
+}
+
+Timer Timer::Start(Device dev) {
+  // Function-local statics: thread-safe lazy init (C++11 magic statics),
+  // visible only to this function.
+  static std::set<DLDeviceType> seen_devices;
+  static std::mutex seen_devices_lock;
+  auto f = tvm::ffi::Function::GetGlobal(std::string("runtime.timer.") +
+                                         DLDeviceType2Str(dev.device_type));
+  if (!f.has_value()) {
+    {
+      std::lock_guard<std::mutex> lock(seen_devices_lock);
+      if (seen_devices.find(dev.device_type) == seen_devices.end()) {
+        LOG(WARNING)
+            << "No timer implementation for " << 
DLDeviceType2Str(dev.device_type)
+            << ", using default timer instead. It may be inaccurate or have 
extra overhead.";
+        seen_devices.insert(dev.device_type);
+      }
+    }
+    Timer t = DefaultTimer(dev);
+    t->Start();
+    return t;
+  } else {
+    Timer t = f->operator()(dev).cast<Timer>();
+    t->Start();
+    return t;
+  }
+}
+
+ffi::Function WrapTimeEvaluator(ffi::Function pf, Device dev, int number, int 
repeat,
+                                int min_repeat_ms, int 
limit_zero_time_iterations,
+                                int cooldown_interval_ms, int 
repeats_to_cooldown,
+                                int cache_flush_bytes, ffi::Function 
f_preproc) {
+  TVM_FFI_ICHECK(pf != nullptr);
+
+  auto ftimer = [pf, dev, number, repeat, min_repeat_ms, 
limit_zero_time_iterations,
+                 cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
+                 f_preproc](const ffi::AnyView* args, int num_args, ffi::Any* 
rv) mutable {
+    ffi::Any temp;
+    std::ostringstream os;
+    // skip first time call, to activate lazy compilation components.
+    pf.CallPacked(args, num_args, &temp);
+
+    // allocate two large arrays to flush L2 cache
+    Tensor arr1, arr2;
+    if (cache_flush_bytes > 0) {
+      arr1 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
+      arr2 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
+    }
+
+    DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
+
+    for (int i = 0; i < repeat; ++i) {
+      if (f_preproc != nullptr) {
+        f_preproc.CallPacked(args, num_args, &temp);
+      }
+      double duration_ms = 0.0;
+      int absolute_zero_times = 0;
+      do {
+        if (duration_ms > 0.0) {
+          const double golden_ratio = 1.618;
+          number = static_cast<int>(
+              std::max((min_repeat_ms / (duration_ms / number) + 1), number * 
golden_ratio));
+        }
+        if (cache_flush_bytes > 0) {
+          arr1.CopyFrom(arr2);
+        }
+        DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
+        // start timing
+        Timer t = Timer::Start(dev);
+        for (int j = 0; j < number; ++j) {
+          pf.CallPacked(args, num_args, &temp);
+        }
+        t->Stop();
+        int64_t t_nanos = t->SyncAndGetElapsedNanos();
+        if (t_nanos == 0) absolute_zero_times++;
+        duration_ms = t_nanos / 1e6;
+      } while (duration_ms < min_repeat_ms && absolute_zero_times < 
limit_zero_time_iterations);
+
+      double speed = duration_ms / 1e3 / number;
+      os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
+
+      if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) {
+        
std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));
+      }
+    }
+
+    std::string blob = os.str();
+    // return the time.
+    *rv = ffi::Bytes(std::move(blob));
+  };
+  return ffi::Function::FromPacked(ftimer);
+}
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 24ac2949b4..18bb8f6880 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -56,7 +56,6 @@ ffi::Optional<ffi::Function> VMExecutable::GetFunction(const 
ffi::String& _name)
   TVM_MODULE_VTABLE_ENTRY("as_text", &VMExecutable::AsText);
   TVM_MODULE_VTABLE_ENTRY("as_python", &VMExecutable::AsPython);
   TVM_MODULE_VTABLE_ENTRY("vm_load_executable", 
&VMExecutable::VMLoadExecutable);
-  TVM_MODULE_VTABLE_ENTRY("vm_profiler_load_executable", 
&VMExecutable::VMProfilerLoadExecutable);
   TVM_MODULE_VTABLE_ENTRY("has_function", &VMExecutable::HasFunction);
   return std::nullopt;
 }
@@ -437,12 +436,6 @@ ffi::Module VMExecutable::VMLoadExecutable() const {
   return ffi::Module(vm);
 }
 
-ffi::Module VMExecutable::VMProfilerLoadExecutable() const {
-  ObjectPtr<VirtualMachine> vm = VirtualMachine::CreateProfiler();
-  
vm->LoadExecutable(GetObjectPtr<VMExecutable>(const_cast<VMExecutable*>(this)));
-  return ffi::Module(vm);
-}
-
 bool VMExecutable::HasFunction(const ffi::String& name) const { return 
func_map.count(name); }
 
 ffi::String VMExecutable::AsText() const {
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index f7d6765e6a..660565d9b3 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -24,10 +24,8 @@
 #include <tvm/ffi/function.h>
 #include <tvm/runtime/memory/memory_manager.h>
 #include <tvm/runtime/nvtx.h>
-#include <tvm/runtime/profiling.h>
 #include <tvm/runtime/vm/vm.h>
 
-#include <optional>
 #include <thread>
 
 #include "./module_utils.h"
@@ -971,127 +969,6 @@ ffi::Function VirtualMachineImpl::_LookupFunction(const 
ffi::String& name) {
   return ffi::Function(nullptr);
 }
 
-//----------------------------------------------------------------
-// Profiler can be optionally disabled via a macro to reduce dep.
-//----------------------------------------------------------------
-#if TVM_VM_ENABLE_PROFILER
-
-/*!
- * \brief An extension of VirtualMachineImpl to support per-op profiling
- * It overrides RunInstrCall to add instrumentations around it.
- */
-class VirtualMachineProfiler : public VirtualMachineImpl {
- public:
-  ffi::Optional<ffi::Function> GetFunction(const ffi::String& name) override {
-    ObjectPtr<Object> sptr_to_self = ffi::GetObjectPtr<Object>(this);
-    if (name == "profile") {
-      return ffi::Function([sptr_to_self, this](ffi::PackedArgs args, 
ffi::Any* rv) {
-        std::string f_name = args[0].cast<std::string>();
-        VMClosure clo = this->GetClosure(f_name);
-
-        std::vector<Device> devices;
-        for (auto dev : this->devices) {
-          if (dev.device_type > 0) {
-            devices.push_back(dev);
-          }
-        }
-
-        prof_ = profiling::Profiler(devices, {}, {{ffi::String("Executor"), 
ffi::String("VM")}});
-
-        auto inputs = GetInputsFor(f_name);
-
-        bool clear_inputs = false;
-        if (inputs.size() == 0) {
-          TVM_FFI_ICHECK(args.size() > 1) << "No input is provided";
-          SetInput(f_name, false, args.Slice(1));
-          inputs = GetInputsFor(f_name);
-          clear_inputs = true;
-        } else {
-          TVM_FFI_ICHECK_EQ(args.size(), 1) << "Inputs are already provided by 
set_input.";
-        }
-
-        // warmup
-        this->InvokeClosureInternal(clo, inputs);
-
-        prof_->Start();
-        this->InvokeClosureInternal(clo, inputs);
-        prof_->Stop();
-
-        // Return the report as json, since profiling::Report object is not 
supported by RPC
-        std::string report_json = prof_->Report()->AsJSON();
-        *rv = report_json;
-
-        prof_ = std::nullopt;  // releases hardware counters
-        if (clear_inputs) {
-          // SetInput modifies the internal states of VM. Undo the change 
after profiling.
-          ClearInputsFor(f_name);
-        }
-      });
-    } else {
-      return VirtualMachineImpl::GetFunction(name);
-    }
-  }
-
- protected:
-  void RunInstrCall(VMFrame* curr_frame, Instruction inst) override {
-    bool profiling = false;
-    if (prof_ && prof_->IsRunning()) {
-      auto f_name = GetFuncName(inst.func_idx);
-      std::optional<Device> dev;
-      std::vector<Tensor> arrs;
-
-      auto f_check_tensor_arg = [&dev, &arrs](const RegType& arg) {
-        if (auto opt_nd = arg.as<Tensor>()) {
-          Tensor arr = opt_nd.value();
-          if (arr.defined()) {
-            dev = arr->device;
-            arrs.push_back(arr);
-          }
-        }
-      };
-
-      for (Index i = 0; i < inst.num_args; ++i) {
-        Instruction::Arg arg = inst.args[i];
-        if (arg.kind() == Instruction::ArgKind::kRegister) {
-          auto reg = ReadRegister(curr_frame, arg.value());
-          f_check_tensor_arg(reg);
-        } else if (arg.kind() == Instruction::ArgKind::kConstIdx) {
-          const auto& const_val = this->const_pool_[arg.value()];
-          f_check_tensor_arg(const_val);
-        }
-      }
-
-      std::unordered_map<std::string, ffi::Any> metrics;
-      metrics["Argument Shapes"] = profiling::ShapeString(arrs);
-
-      // If a suitable device is found, enable profiling.
-      if (dev) {
-        profiling = true;
-        prof_->StartCall(f_name, *dev, metrics);
-      }
-    }
-
-    VirtualMachineImpl::RunInstrCall(curr_frame, inst);
-
-    if (profiling) {
-      prof_->StopCall();
-    }
-  }
-
- private:
-  std::optional<profiling::Profiler> prof_;
-};
-
-ObjectPtr<VirtualMachine> VirtualMachine::CreateProfiler() {
-  return ffi::make_object<VirtualMachineProfiler>();
-}
-
-#else
-ObjectPtr<VirtualMachine> VirtualMachine::CreateProfiler() {
-  TVM_FFI_THROW(InternalError) << "Profiler support is disabled";
-  return nullptr;
-}
-#endif  // TVM_VM_ENABLE_PROFILER
 }  // namespace vm
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/relax/test_codegen_coreml.py 
b/tests/python/relax/test_codegen_coreml.py
index de3a6d0789..63a704cc41 100644
--- a/tests/python/relax/test_codegen_coreml.py
+++ b/tests/python/relax/test_codegen_coreml.py
@@ -54,12 +54,12 @@ def verify(mod, inputs):
     assert relax.analysis.well_formed(mod1)
 
     ex1 = tvm.compile(mod1, target=target)
-    vm1 = relax.VirtualMachine(ex1, dev, profile=True)
+    vm1 = relax.VirtualMachine(ex1, dev)
     out1 = vm1["main"](*inputs)
 
     mod2 = relax.transform.LegalizeOps()(mod)
     ex2 = tvm.compile(mod2, target=target)
-    vm2 = relax.VirtualMachine(ex2, dev, profile=True)
+    vm2 = relax.VirtualMachine(ex2, dev)
     out2 = vm2["main"](*inputs)
 
     tvm.testing.assert_allclose(out1.numpy(), out2.numpy(), rtol=1e-3, 
atol=1e-3)
diff --git a/tests/python/relax/test_training_trainer_numeric.py 
b/tests/python/relax/test_training_trainer_numeric.py
index c3f1deee6b..b96c46dd8f 100644
--- a/tests/python/relax/test_training_trainer_numeric.py
+++ b/tests/python/relax/test_training_trainer_numeric.py
@@ -66,7 +66,7 @@ def test_execute(target, dev):
 
     train_mod = setup_trainer(backbone)
     ex = tvm.compile(train_mod, target)
-    vm = relax.VirtualMachine(ex, dev, profile=True)
+    vm = relax.VirtualMachine(ex, dev)
 
     trainer = Trainer(train_mod, vm, dev, False)
     trainer.zero_init_params()
@@ -75,7 +75,6 @@ def test_execute(target, dev):
     dataset = _make_dataset()
     trainer.predict(dataset[0][0])
     trainer.update(dataset[0][0], dataset[0][1])
-    trainer.profile_adjoint(dataset[0][0], dataset[0][1])
 
 
 @tvm.testing.parametrize_targets("llvm")
diff --git a/tests/python/relax/test_vm_profiler.py 
b/tests/python/relax/test_vm_profiler.py
deleted file mode 100644
index c661bbe5da..0000000000
--- a/tests/python/relax/test_vm_profiler.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# ruff: noqa: RUF005
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import relax, rpc
-from tvm.contrib import utils
-from tvm.relax.testing import nn
-from tvm.script import relax as R
-
-
-def get_exec(data_shape):
-    builder = relax.BlockBuilder()
-    weight1_np = np.random.randn(64, 64).astype("float32")
-    weight2_np = np.random.randn(64, 64).astype("float32")
-
-    with builder.function("main"):
-        model = nn.Sequential(
-            nn.Linear(data_shape[1], weight1_np.shape[0], bias=False),
-            nn.ReLU(),
-            nn.Linear(weight2_np.shape[0], weight2_np.shape[1], bias=False),
-            nn.ReLU(),
-        )
-        data = nn.Placeholder(data_shape, name="data")
-        output = model(data)
-        params = [data] + model.parameters()
-        builder.emit_func_output(output, params=params)
-
-    mod = builder.get()
-
-    params = {"linear_weight": weight1_np, "linear_weight1": weight2_np}
-    mod = relax.transform.BindParams("main", params)(mod)
-
-    target = "llvm"
-    return tvm.compile(mod, target)
-
-
-def test_conv2d_cpu():
-    data_np = np.random.randn(1, 64).astype("float32")
-    ex = get_exec(data_np.shape)
-
-    vm = relax.VirtualMachine(ex, tvm.cpu(), profile=True)
-    report = vm.profile("main", tvm.runtime.tensor(data_np))
-    print(report)
-
-    assert "Duration" in str(report)
-    assert "matmul" in str(report)
-
-
-def with_rpc(ex, f, data_np):
-    temp = utils.tempdir()
-    path = temp.relpath("vm_library.so")
-    ex.export_library(path)
-
-    server = rpc.Server("127.0.0.1")
-    remote = rpc.connect(server.host, server.port, session_timeout=10)
-
-    remote.upload(path)
-    rexec = remote.load_module("vm_library.so")
-
-    device = remote.cpu()
-
-    vm = relax.VirtualMachine(rexec, device=device, profile=True)
-    data = tvm.runtime.tensor(data_np, device)
-
-    f(vm, data)
-
-
-def test_rpc():
-    data_np = np.random.randn(1, 64).astype("float32")
-    ex = get_exec(data_np.shape)
-
-    def callback(vm, data):
-        vm.profile("main", data)
-
-        vm.set_input("main", data)
-        report = vm.profile("main")
-
-        assert "matmul" in str(report)
-        print(report)
-
-    with_rpc(ex, callback, data_np)
-
-
-def test_tuple():
-    @tvm.script.ir_module
-    class NestedTuple:
-        @R.function
-        def main(x: R.Tensor((16,), "float32")) -> R.Tuple(
-            R.Tuple(
-                R.Tensor((16,), "float32"),
-                R.Tuple(
-                    R.Tensor((16,), "float32"),
-                ),
-            ),
-            R.Tensor((16,), "float32"),
-        ):
-            return ((x, (x,)), x)
-
-    target = "llvm"
-    ex = tvm.compile(NestedTuple, target)
-
-    data_np = np.random.randn(16).astype("float32")
-
-    def callback(vm, data):
-        report = vm.profile("main", data)
-        assert "vm.builtin.make_tuple" in str(report)
-
-    with_rpc(ex, callback, data_np)
-
-
-if __name__ == "__main__":
-    tvm.testing.main()
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index baaff6b8ca..d2bfe326e1 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -37,7 +37,6 @@
 #include "src/runtime/device_api.cc"
 #include "src/runtime/file_utils.cc"
 #include "src/runtime/logging.cc"
-#include "src/runtime/profiling.cc"
 #include "src/runtime/rpc/rpc_channel.cc"
 #include "src/runtime/rpc/rpc_endpoint.cc"
 #include "src/runtime/rpc/rpc_event_impl.cc"
@@ -45,6 +44,7 @@
 #include "src/runtime/rpc/rpc_module.cc"
 #include "src/runtime/rpc/rpc_session.cc"
 #include "src/runtime/tensor.cc"
+#include "src/runtime/timer.cc"
 #include "src/runtime/workspace_pool.cc"
 // relax setup
 #include "3rdparty/tvm-ffi/src/ffi/backtrace.cc"

(tvm) branch main updated: [REFACTOR][RUNTIME] Phase out profiling.h heavy types, rename to timer.h (#19455)

Reply via email to