This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 4e5b869c27 [REFACTOR][RUNTIME] Phase out profiling.h heavy types,
rename to timer.h (#19455)
4e5b869c27 is described below
commit 4e5b869c27cdb922de10a6b6d7d83ca845b9803e
Author: Tianqi Chen <[email protected]>
AuthorDate: Mon Apr 27 20:27:07 2026 -0400
[REFACTOR][RUNTIME] Phase out profiling.h heavy types, rename to timer.h
(#19455)
---
apps/android_rpc/app/src/main/jni/tvm_runtime.h | 2 +-
docs/arch/relax_vm.rst | 18 +-
docs/reference/api/python/index.rst | 1 -
docs/reference/api/python/runtime/profiling.rst | 21 -
include/tvm/runtime/profiling.h | 590 -------------
include/tvm/runtime/timer.h | 195 +++++
include/tvm/runtime/vm/executable.h | 2 -
include/tvm/runtime/vm/vm.h | 9 -
python/tvm/relax/training/trainer.py | 47 --
python/tvm/runtime/__init__.py | 1 -
python/tvm/runtime/profiling/__init__.py | 272 ------
python/tvm/runtime/profiling/_ffi_api.py | 21 -
python/tvm/runtime/vm.py | 32 +-
src/ir/structural_hash.cc | 64 +-
src/runtime/contrib/clml/clml_runtime.cc | 154 +---
src/runtime/contrib/clml/clml_runtime.h | 1 -
src/runtime/contrib/json/json_runtime.h | 36 -
src/runtime/cuda/cuda_device_api.cc | 4 +-
src/runtime/hexagon/hexagon_common.cc | 4 +-
src/runtime/metal/metal_device_api.mm | 4 +-
src/runtime/opencl/opencl_common.h | 2 +-
src/runtime/opencl/opencl_device_api.cc | 4 +-
src/runtime/profiling.cc | 937 ---------------------
src/runtime/rocm/rocm_device_api.cc | 4 +-
src/runtime/rpc/rpc_module.cc | 14 +-
src/runtime/timer.cc | 176 ++++
src/runtime/vm/executable.cc | 7 -
src/runtime/vm/vm.cc | 123 ---
tests/python/relax/test_codegen_coreml.py | 4 +-
.../python/relax/test_training_trainer_numeric.py | 3 +-
tests/python/relax/test_vm_profiler.py | 129 ---
web/emcc/wasm_runtime.cc | 2 +-
32 files changed, 402 insertions(+), 2481 deletions(-)
diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
index 460bca0bc7..920ae6bb1d 100644
--- a/apps/android_rpc/app/src/main/jni/tvm_runtime.h
+++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h
@@ -50,7 +50,6 @@
#include "../src/runtime/logging.cc"
#include "../src/runtime/memory/memory_manager.cc"
#include "../src/runtime/minrpc/minrpc_logger.cc"
-#include "../src/runtime/profiling.cc"
#include "../src/runtime/registry.cc"
#include "../src/runtime/rpc/rpc_channel.cc"
#include "../src/runtime/rpc/rpc_endpoint.cc"
@@ -63,6 +62,7 @@
#include "../src/runtime/tensor.cc"
#include "../src/runtime/thread_pool.cc"
#include "../src/runtime/threading_backend.cc"
+#include "../src/runtime/timer.cc"
#include "../src/runtime/workspace_pool.cc"
#ifdef TVM_OPENCL_RUNTIME
diff --git a/docs/arch/relax_vm.rst b/docs/arch/relax_vm.rst
index 30ce5bd058..222329ce0b 100644
--- a/docs/arch/relax_vm.rst
+++ b/docs/arch/relax_vm.rst
@@ -354,26 +354,14 @@ Key methods:
reducing dictionary lookup overhead during benchmarking.
- ``vm.time_evaluator(func_name, dev)`` — returns a timing function following
the same convention
as ``tvm.runtime.Module.time_evaluator``.
-- ``vm.profile(func_name, *args)`` — returns a per-operator profiling report
(requires
- ``profile=True`` at VM construction).
- ``vm.set_instrument(func)`` — register an instrumentation callback that is
invoked before/after
every ``Call`` instruction. The callback can return
``VMInstrumentReturnKind.SKIP_RUN`` to
skip the call.
-Profiling and instrumentation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Instrumentation
+~~~~~~~~~~~~~~~
-The VM supports two levels of observability:
-
-**Profiling** via ``VirtualMachine(exec, dev, profile=True)``:
-
-.. code-block:: python
-
- vm = relax.VirtualMachine(ex, tvm.cuda(), profile=True)
- report = vm.profile("main", inp)
- print(report)
-
-This produces a ``tvm.runtime.profiling.Report`` with per-operator timing
breakdown.
+The VM supports observability via instrumentation:
**Instrumentation** via ``set_instrument()``:
diff --git a/docs/reference/api/python/index.rst
b/docs/reference/api/python/index.rst
index 89f9f0c577..4bef65f82d 100644
--- a/docs/reference/api/python/index.rst
+++ b/docs/reference/api/python/index.rst
@@ -40,7 +40,6 @@ Python API
runtime/runtime
runtime/vm
runtime/disco
- runtime/profiling
.. toctree::
:maxdepth: 1
diff --git a/docs/reference/api/python/runtime/profiling.rst
b/docs/reference/api/python/runtime/profiling.rst
deleted file mode 100644
index d26f00af90..0000000000
--- a/docs/reference/api/python/runtime/profiling.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
-
-tvm.runtime.profiling
----------------------
-.. automodule:: tvm.runtime.profiling
- :members:
diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
deleted file mode 100644
index 7bdf602808..0000000000
--- a/include/tvm/runtime/profiling.h
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file include/tvm/runtime/profiling.h
- * \brief Runtime profiling including timers.
- */
-#ifndef TVM_RUNTIME_PROFILING_H_
-#define TVM_RUNTIME_PROFILING_H_
-
-#include <tvm/ffi/container/array.h>
-#include <tvm/ffi/container/map.h>
-#include <tvm/ffi/extra/module.h>
-#include <tvm/ffi/function.h>
-#include <tvm/runtime/base.h>
-#include <tvm/runtime/device_api.h>
-#include <tvm/runtime/object.h>
-#include <tvm/runtime/tensor.h>
-
-#include <stack>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-namespace tvm {
-
-namespace runtime {
-
-/*! \brief Base class for all implementations.
- *
- * New implementations of this interface should make sure that `Start` and
`Stop`
- * are as lightweight as possible. Expensive state synchronization should be
- * done in `SyncAndGetElapsedNanos`.
- */
-class TimerNode : public Object {
- public:
- /*! \brief Start the timer.
- *
- * Note: this function should only be called once per object.
- */
- virtual void Start() = 0;
- /*! \brief Stop the timer.
- *
- * Note: this function should only be called once per object.
- */
- virtual void Stop() = 0;
- /*! \brief Synchronize timer state and return elapsed time between `Start`
and `Stop`.
- * \return The time in nanoseconds between `Start` and `Stop`.
- *
- * This function is necessary because we want to avoid timing the overhead of
- * doing timing. When using multiple timers, it is recommended to stop all of
- * them before calling `SyncAndGetElapsedNanos` on any of them.
- *
- * Note: this function should be only called once per object. It may incur
- * a large synchronization overhead (for example, with GPUs).
- */
- virtual int64_t SyncAndGetElapsedNanos() = 0;
-
- virtual ~TimerNode() {}
-
- static constexpr const bool _type_mutable = true;
- TVM_FFI_DECLARE_OBJECT_INFO("runtime.TimerNode", TimerNode, Object);
-};
-
-/*! \brief Timer for a specific device.
- *
- * This is a managed reference to a TimerNode.
- *
- * \sa TimerNode
- */
-class Timer : public ObjectRef {
- public:
- /*!
- * \brief Get a device specific timer.
- * \param dev The device to time.
- * \return A `Timer` that has already been started.
- *
- * Use this function to time runtime of arbitrary regions of code on a
specific
- * device. The code that you want to time should be running on the device
- * otherwise the timer will not return correct results. This is a lower level
- * interface than TimeEvaluator and only runs the timed code once
- * (TimeEvaluator runs the code multiple times).
- *
- * A default timer is used if a device specific one does not exist. This
- * timer performs synchronization between the device and CPU, which can lead
- * to overhead in the reported results.
- *
- * Example usage:
- * \code{.cpp}
- * Timer t = Timer::Start(Device::cpu());
- * my_long_running_function();
- * t->Stop();
- * ... // some more computation
- * int64_t nanosecs = t->SyncAndGetElapsedNanos() // elapsed time in
nanoseconds
- * \endcode
- *
- * To add a new device-specific timer, register a new function
- * "profiler.timer.my_device" (where `my_device` is the `DeviceName` of your
- * device). This function should accept a `Device` and return a new `Timer`
- * that has already been started.
- *
- * For example, this is how the CPU timer is implemented:
- * \code{.cpp}
- * class CPUTimerNode : public TimerNode {
- * public:
- * virtual void Start() { start_ =
std::chrono::high_resolution_clock::now(); }
- * virtual void Stop() { duration_ =
std::chrono::high_resolution_clock::now() - start_; }
- * virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
- * virtual ~CPUTimerNode() {}
- *
- * static constexpr const char* _type_key = "runtime.CPUTimerNode";
- * TVM_FFI_DECLARE_OBJECT_INFO_FINAL(CPUTimerNode, TimerNode);
- *
- * private:
- * std::chrono::high_resolution_clock::time_point start_;
- * std::chrono::duration<int64_t, std::nano> duration_;
- * };
- *
- *
- * TVM_FFI_STATIC_INIT_BLOCK() {
- * namespace refl = tvm::ffi::reflection;
- * refl::GlobalDef().def("profiling.timer.cpu", [](Device dev) {
- * return Timer(ffi::make_object<CPUTimerNode>());
- * });
- * }
- * \endcode
- */
- static TVM_RUNTIME_DLL Timer Start(Device dev);
-
- TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Timer, ObjectRef, TimerNode);
-};
-
-/*!
- * \brief Default timer if one does not exist for the device.
- * \param dev The device to time on.
- *
- * Note that this timer performs synchronization between the device and CPU,
- * which can lead to overhead in the reported results.
- */
-Timer DefaultTimer(Device dev);
-
-namespace profiling {
-/*! \brief Wrapper for `Device` because `Device` is not passable across the
- * ffi::Function interface.
- */
-struct DeviceWrapperNode : public Object {
- /*! The device */
- Device device;
-
- /*! Constructor */
- explicit DeviceWrapperNode(Device device) : device(device) {}
- TVM_FFI_DECLARE_OBJECT_INFO("runtime.profiling.DeviceWrapper",
DeviceWrapperNode, Object);
-};
-
-/*! \brief Wrapper for `Device`. */
-class DeviceWrapper : public ObjectRef {
- public:
- explicit DeviceWrapper(Device dev) { data_ =
ffi::make_object<DeviceWrapperNode>(dev); }
- TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(DeviceWrapper, ObjectRef,
DeviceWrapperNode);
-};
-
-/*! \brief Data collected from a profiling run. Includes per-call metrics and
per-device metrics.
- */
-class TVM_RUNTIME_DLL ReportNode : public Object {
- public:
- /*! \brief A list of function calls and the metrics recorded for that call.
- *
- * Each element is a mapping from metric name to value. Some metrics that
- * appear in every call are "Name" (the function name), "Argument Shapes",
- * and "Duration (us)". Values are one of `String`, `PercentNode`,
- * `DurationNode`, or `CountNode`.
- */
- ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls;
- /*! \brief Metrics collected for the entire run of the model on a per-device
basis.
- *
- * `device_metrics` is indexed by device name then metric.
- *
- * These metrics may be larger than the sum of the same metric in `calls`
- * because these metrics include the overhead of the executor.
- */
- ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> device_metrics;
- /*! Configuration used for this profiling run. Includes number of threads,
executor.
- *
- * Values must be an object type that can be used with device_metrics.
- */
- ffi::Map<ffi::String, ffi::Any> configuration;
- /*! \brief Output `calls` in CSV format.
- *
- * Note that this does not include `device_metrics`, it only includes
per-call metrics.
- */
- ffi::String AsCSV() const;
- /*! \brief Create a human readable table of profiling metrics.
- *
- * \param aggregate Whether or not to join multiple calls to the
- * same op into a single line.
- *
- * \param sort Whether or not to sort call frames by descending
- * duration. If false and if `aggregate` is false, frames will
- * be sorted by order of appearance in the program. Order is
- * undefined if `sort` is false and `aggregate` is true.
- *
- * \param compute_col_sums Whether or not to include sum totals for
- * the Count, Duation, and Percent columns.
- *
- */
- ffi::String AsTable(bool sort = true, bool aggregate = true, bool
compute_col_sums = true) const;
- /*! \brief Convert this report to JSON.
- *
- * Output JSON will be of this format:
- * \code
- * {
- * "calls": [
- * {
- * "Duration (us)": {
- * "microseconds": 12.3
- * },
- * "Name": "fused_dense",
- * "Count": {
- * "count": 1
- * },
- * "Percent": {
- * "percent": 10.3
- * }
- * }
- * ],
- * "device_metrics": {
- * "cpu": {
- * "Duration (us)": {
- * "microseconds": 334.2
- * },
- * "Percent": {
- * "percent": 100
- * }
- * }
- * }
- * }
- * \endcode
- */
- ffi::String AsJSON() const;
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Report", ReportNode,
Object);
-};
-
-class Report : public ObjectRef {
- public:
- /*! Construct a Report from a set of calls (with associated metrics) and
per-device metrics.
- * \param calls Function calls and associated metrics.
- * \param device_metrics Per-device metrics for overall execution.
- * \param configuration Configuration data specific to this profiling run.
- */
- explicit Report(ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls,
- ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>>
device_metrics,
- ffi::Map<ffi::String, ffi::Any> configuration);
-
- /*! Deserialize a Report from a JSON object. Needed for sending the report
over RPC.
- * \param json Serialized json report from `ReportNode::AsJSON`.
- * \returns A Report.
- */
- static Report FromJSON(ffi::String json);
- TVM_FFI_DEFINE_OBJECT_REF_METHODS_NOTNULLABLE(Report, ObjectRef, ReportNode);
-};
-
-/*! \brief Interface for user defined profiling metric collection.
- *
- * Users can register their own collector by registering a packed function with
- * the name "runtime.profiling.metrics.my_collector_name" where
- * "my_collector_name" is the name of their collector. This function should
- * take an Array of Device as input which contains the devices the collector
- * will be run on.
- *
- * `MetricCollectorNode`s will be called in the following fashion.
- * \code
- * MetricCollector mc;
- * for (auto op : model) {
- * auto o = mc.Start();
- * op();
- * auto metrics = mc.Stop(o); // metrics are added the profiling report
- * }
- * \endcode
- */
-class MetricCollectorNode : public Object {
- public:
- /*! \brief Initialization call. Called before profiling has started. Any
- * expensive precomputation should happen here.
- * \param devs The list of devices this collector will be run on.
- */
- virtual void Init(ffi::Array<DeviceWrapper> devs) = 0;
- /*! \brief Start colling metrics for a function call.
- * \param dev The device the call will be run on.
- * \returns An object used to maintain state of the metric collection. This
- * object will be passed to the corresponding `Stop` call. If the device is
- * not supported, this function will return a nullptr ObjectRef.
- */
- virtual ObjectRef Start(Device dev) = 0;
- /*! \brief Stop collecting metrics.
- * \param obj The object created by the corresponding `Start` call.
- * \returns A set of metric names and the associated values. Values must be
- * one of DurationNode, PercentNode, CountNode, or String.
- */
- virtual ffi::Map<ffi::String, ffi::Any> Stop(ffi::ObjectRef obj) = 0;
-
- virtual ~MetricCollectorNode() {}
-
- static constexpr const bool _type_mutable = true;
- TVM_FFI_DECLARE_OBJECT_INFO("runtime.profiling.MetricCollector",
MetricCollectorNode, Object);
-};
-
-/*! \brief Wrapper for `MetricCollectorNode`. */
-class MetricCollector : public ObjectRef {
- public:
- TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(MetricCollector, ObjectRef,
MetricCollectorNode);
-};
-
-/*! Information about a single function or operator call. */
-struct CallFrame {
- /*! Device on which the call was made */
- Device dev;
- /*! Name of the function or op */
- ffi::String name;
- /*! Runtime of the function or op */
- Timer timer;
- /*! Extra performance metrics */
- std::unordered_map<std::string, ffi::Any> extra_metrics;
- /*! User defined metric collectors. Each pair is the MetricCollector and its
- * associated data (returned from MetricCollector.Start).
- */
- std::vector<std::pair<MetricCollector, ObjectRef>> extra_collectors;
-};
-
-/*! Runtime profiler for function and/or operator calls. Used in the graph
- * runtime and VM to provide profiling information for all operators.
- *
- * Example usage:
- * \code{.cpp}
- * Device cpu, gpu;
- * Profiler prof({cpu, gpu});
- * my_gpu_kernel(); // do a warmup iteration
- * prof.Start();
- * prof.StartCall("my_gpu_kernel", gpu);
- * my_gpu_kernel();
- * prof.StopCall();
- * prof.StartCall("my_cpu_function", cpu);
- * my_cpu_function();
- * prof.StopCall();
- * prof.Stop();
- * std::cout << prof.Report << std::endl; // print profiling report
- * \endcode
- */
-class Profiler {
- public:
- /*! Constructor.
- *
- * The profiler should be constructed before you do any warmup iterations.
- *
- * \note
- * Calling this constructor will reset the TVM threadpool. It is necessary in
- * order to install thread handlers required by certain collectors.
- *
- * \param devs The list of devices the profiler will be running on. Should
- * include all devices used by profiled operators.
- * \param metric_collectors Additional `MetricCollector`s to use with this
profiler.
- * \param configuration Additional configuration data to add to the
outputted profiling report.
- */
- explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector>
metric_collectors,
- std::unordered_map<ffi::String, ffi::Any> configuration =
{});
- /*! \brief Start the profiler.
- *
- * This function should only be called once per object.
- */
- void Start();
- /*! \brief Stop the profiler.
- *
- * This function should only be called once per object after start has been
called.
- */
- void Stop();
- /*! \brief Start a function call.
- * \param name The name of the function being called.
- * \param dev The device on which the function is running.
- * \param extra_metrics Optional additional profiling information to add to
- * the frame (input sizes, allocations).
- *
- * `StartCall` may be nested, but each `StartCall` needs a matching
- * `StopCall`. Function calls are stopped in LIFO order, so calls to
- * `StartCall` and `StopCall` must be nested properly.
- */
- void StartCall(ffi::String name, Device dev,
- std::unordered_map<std::string, ffi::Any> extra_metrics = {});
- /*! \brief Stop the last `StartCall`.
- * \param extra_metrics Optional additional profiling information to add to
- * the frame (input sizes, allocations).
- */
- void StopCall(std::unordered_map<std::string, ffi::Any> extra_metrics = {});
- /*! \brief A report of total runtime between `Start` and `Stop` as
- * well as individual statistics for each `StartCall`-`StopCall` pair.
- * \returns A `Report` that can either be formatted as CSV (with `.AsCSV`)
- * or as a human readable table (with `.AsTable`).
- */
- profiling::Report Report();
- /*! \brief Check if the profiler is currently running.
- * \returns Whether or not the profiler is running.
- */
- bool IsRunning() const { return is_running_; }
-
- private:
- std::vector<Device> devs_;
- bool is_running_{false};
- std::vector<CallFrame> calls_;
- std::stack<CallFrame> in_flight_;
- std::vector<MetricCollector> collectors_;
- std::unordered_map<ffi::String, ffi::Any> configuration_;
-};
-
-/* \brief A duration in time. */
-class DurationNode : public Object {
- public:
- /* The duration as a floating point number of microseconds. */
- double microseconds;
-
- /* \brief Construct a new duration.
- * \param a The duration in microseconds.
- */
- explicit DurationNode(double a) : microseconds(a) {}
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Duration",
DurationNode, Object);
-};
-
-/* A percentage of something */
-class PercentNode : public Object {
- public:
- /* The percent as a floating point value out of 100%. i.e. if `percent` is
10 then we have 10%. */
- double percent;
-
- /* \brief Construct a new percentage.
- * \param a The percentage out of 100.
- */
- explicit PercentNode(double a) : percent(a) {}
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Percent", PercentNode,
Object);
-};
-
-/* A count of something */
-class CountNode : public Object {
- public:
- /* The actual count */
- int64_t value;
-
- /* \brief Construct a new count.
- * \param a The count.
- */
- explicit CountNode(int64_t a) : value(a) {}
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Count", CountNode,
Object);
-};
-
-/* \brief A ratio of two things. */
-class RatioNode : public Object {
- public:
- /* The ratio as a double precision floating point number. */
- double ratio;
-
- /* \brief Construct a new ratio.
- * \param a The ratio.
- */
- explicit RatioNode(double a) : ratio(a) {}
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.profiling.Ratio", RatioNode,
Object);
-};
-
-/*! \brief ffi::String representation of an array of Tensor shapes
- * \param shapes Array of Tensors to get the shapes of.
- * \return A textual representation of the shapes. For example: `float32[2],
int64[1, 2]`.
- */
-ffi::String ShapeString(const std::vector<Tensor>& shapes);
-/*! \brief ffi::String representation of shape encoded as an Tensor
- * \param shape Tensor containing the shape.
- * \param dtype The dtype of the shape.
- * \return A textual representation of the shape. For example: `float32[2]`.
- */
-ffi::String ShapeString(Tensor shape, DLDataType dtype);
-/*! \brief ffi::String representation of a shape encoded as a vector
- * \param shape Shape as a vector of integers.
- * \param dtype The dtype of the shape.
- * \return A textual representation of the shape. For example: `float32[2]`.
- */
-ffi::String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype);
-
-/*! \brief Collect performance information of a function execution. Usually
- * used with a compiled PrimFunc (via tvm.compile).
- *
- * This information can include performance counters like cache hits and FLOPs
- * that are useful in debugging performance issues of individual PrimFuncs.
- * Different metrics can be collected depending on which MetricCollector is
- * used.
- *
- * Example usage:
- * \code{.cpp}
- * // Use PAPI to measure the number of floating point operations.
- * ffi::Function profiler = ProfileModule(
- * mod, "main", kDLCPU, 0, {CreatePAPIMetricCollector({{kDLCPU, 0},
{"PAPI_FP_OPS"}})});
- * Report r = profiler(arg1, arg2, arg);
- * std::cout << r << std::endl;
- * \endcode
- *
- * \param mod Module to profile. Usually a PrimFunc that has been compiled to
machine code.
- * \param func_name Name of function to run in the module.
- * \param device_type Device type to run on. Profiling will include performance
- * metrics specific to this device type.
- * \param device_id Id of device to run on.
- * \param warmup_iters Number of iterations of the function to run before
collecting
- * performance information. Recommend to set this larger
- * than 0 so that cache effects are consistent.
- * \param collectors List of different
- * ways to collect metrics. See MetricCollector.
- * \returns A ffi::Function which takes the same arguments as the
`mod[func_name]`
- * and returns performance metrics as a `ffi::Map<ffi::String,
ffi::Any>` where
- * values can be `CountNode`, `DurationNode`, `PercentNode`.
- */
-ffi::Function ProfileFunction(ffi::Module mod, std::string func_name, int
device_type,
- int device_id, int warmup_iters,
- ffi::Array<MetricCollector> collectors);
-
-/*!
- * \brief Wrap a timer function to measure the time cost of a given packed
function.
- *
- * Approximate implementation:
- * \code{.py}
- * f() // warmup
- * for i in range(repeat)
- * f_preproc()
- * while True:
- * start = time()
- * for j in range(number):
- * f()
- * duration_ms = time() - start
- * if duration_ms >= min_repeat_ms:
- * break
- * else:
- * number = (min_repeat_ms / (duration_ms / number) + 1
- * if cooldown_interval_ms and i % repeats_to_cooldown == 0:
- * sleep(cooldown_interval_ms)
- * \endcode
- *
- * \param f The function argument.
- * \param dev The device.
- * \param number The number of times to run this function for taking average.
- * We call these runs as one `repeat` of measurement.
- * \param repeat The number of times to repeat the measurement.
- * In total, the function will be invoked (1 + number x repeat) times,
- * where the first one is warm up and will be discarded.
- * The returned result contains `repeat` costs,
- * each of which is an average of `number` costs.
- * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
- * By default, one `repeat` contains `number` runs. If this parameter
is set,
- * the parameters `number` will be dynamically adjusted to meet the
- * minimum duration requirement of one `repeat`.
- * i.e., When the run time of one `repeat` falls below this time,
- * the `number` parameter will be automatically increased.
- * \param limit_zero_time_iterations The maximum number of repeats when
- * measured time is equal to 0. It helps to avoid hanging during
measurements.
- * \param cooldown_interval_ms The cooldown interval in milliseconds between
the number of repeats
- * defined by `repeats_to_cooldown`.
- * \param repeats_to_cooldown The number of repeats before the
- * cooldown is activated.
- * \param cache_flush_bytes The number of bytes to flush from cache before
- * \param f_preproc The function to be executed before we execute time
- * evaluator.
- * \return f_timer A timer function.
- */
-ffi::Function WrapTimeEvaluator(ffi::Function f, Device dev, int number, int
repeat,
- int min_repeat_ms, int
limit_zero_time_iterations,
- int cooldown_interval_ms, int
repeats_to_cooldown,
- int cache_flush_bytes = 0, ffi::Function
f_preproc = nullptr);
-
-} // namespace profiling
-} // namespace runtime
-} // namespace tvm
-
-#endif // TVM_RUNTIME_PROFILING_H_
diff --git a/include/tvm/runtime/timer.h b/include/tvm/runtime/timer.h
new file mode 100644
index 0000000000..25d963d7f3
--- /dev/null
+++ b/include/tvm/runtime/timer.h
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file include/tvm/runtime/timer.h
+ * \brief Runtime timer primitives: Timer, TimerNode, WrapTimeEvaluator.
+ */
+#ifndef TVM_RUNTIME_TIMER_H_
+#define TVM_RUNTIME_TIMER_H_
+
+#include <tvm/ffi/function.h>
+#include <tvm/runtime/base.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/object.h>
+#include <tvm/runtime/tensor.h>
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief Base class for all timer implementations.
+ *
+ * New implementations of this interface should make sure that `Start` and
`Stop`
+ * are as lightweight as possible. Expensive state synchronization should be
+ * done in `SyncAndGetElapsedNanos`.
+ */
+class TimerNode : public Object {
+ public:
+ /*! \brief Start the timer.
+ *
+ * Note: this function should only be called once per object.
+ */
+ virtual void Start() = 0;
+ /*! \brief Stop the timer.
+ *
+ * Note: this function should only be called once per object.
+ */
+ virtual void Stop() = 0;
+ /*! \brief Synchronize timer state and return elapsed time between `Start`
and `Stop`.
+ * \return The time in nanoseconds between `Start` and `Stop`.
+ *
+ * This function is necessary because we want to avoid timing the overhead of
+ * doing timing. When using multiple timers, it is recommended to stop all of
+ * them before calling `SyncAndGetElapsedNanos` on any of them.
+ *
+ * Note: this function should be only called once per object. It may incur
+ * a large synchronization overhead (for example, with GPUs).
+ */
+ virtual int64_t SyncAndGetElapsedNanos() = 0;
+
+ virtual ~TimerNode() {}
+
+ static constexpr const bool _type_mutable = true;
+ TVM_FFI_DECLARE_OBJECT_INFO("runtime.TimerNode", TimerNode, Object);
+};
+
+/*! \brief Timer for a specific device.
+ *
+ * This is a managed reference to a TimerNode.
+ *
+ * \sa TimerNode
+ */
+class Timer : public ObjectRef {
+ public:
+ /*!
+ * \brief Get a device specific timer.
+ * \param dev The device to time.
+ * \return A `Timer` that has already been started.
+ *
+ * Use this function to time runtime of arbitrary regions of code on a
specific
+ * device. The code that you want to time should be running on the device
+ * otherwise the timer will not return correct results. This is a lower level
+ * interface than TimeEvaluator and only runs the timed code once
+ * (TimeEvaluator runs the code multiple times).
+ *
+ * A default timer is used if a device specific one does not exist. This
+ * timer performs synchronization between the device and CPU, which can lead
+ * to overhead in the reported results.
+ *
+ * Example usage:
+ * \code{.cpp}
+ * Timer t = Timer::Start(Device::cpu());
+ * my_long_running_function();
+ * t->Stop();
+ * ... // some more computation
+ * int64_t nanosecs = t->SyncAndGetElapsedNanos(); // elapsed time in
nanoseconds
+ * \endcode
+ *
+ * To add a new device-specific timer, register a new function
+ * "runtime.timer.my_device" (where `my_device` is the `DeviceName` of your
+ * device). This function should accept a `Device` and return a new `Timer`
+ * that has already been started.
+ *
+ * For example, this is how the CPU timer is implemented:
+ * \code{.cpp}
+ * class CPUTimerNode : public TimerNode {
+ * public:
+ * virtual void Start() { start_ =
std::chrono::high_resolution_clock::now(); }
+ * virtual void Stop() { duration_ =
std::chrono::high_resolution_clock::now() - start_; }
+ * virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+ * virtual ~CPUTimerNode() {}
+ *
+ * TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.CPUTimerNode", CPUTimerNode, TimerNode);
+ *
+ * private:
+ * std::chrono::high_resolution_clock::time_point start_;
+ * std::chrono::duration<int64_t, std::nano> duration_;
+ * };
+ *
+ *
+ * TVM_FFI_STATIC_INIT_BLOCK() {
+ * namespace refl = tvm::ffi::reflection;
+ * refl::GlobalDef().def("runtime.timer.cpu", [](Device dev) {
+ * return Timer(ffi::make_object<CPUTimerNode>());
+ * });
+ * }
+ * \endcode
+ */
+ static TVM_RUNTIME_DLL Timer Start(Device dev);
+
+ TVM_FFI_DEFINE_OBJECT_REF_METHODS_NULLABLE(Timer, ObjectRef, TimerNode);
+};
+
+/*!
+ * \brief Wrap a timer function to measure the time cost of a given packed
function.
+ *
+ * Approximate implementation:
+ * \code{.py}
+ * f()  # warmup
+ * for i in range(repeat):
+ *   f_preproc()
+ *   while True:
+ *     start = time()
+ *     for j in range(number):
+ *       f()
+ *     duration_ms = time() - start
+ *     if duration_ms >= min_repeat_ms:
+ *       break
+ *     else:
+ *       number = min_repeat_ms / (duration_ms / number) + 1
+ *   if cooldown_interval_ms and i % repeats_to_cooldown == 0:
+ *     sleep(cooldown_interval_ms)
+ * \endcode
+ *
+ * \param f The function argument.
+ * \param dev The device.
+ * \param number The number of times to run this function for taking average.
+ * Together, these runs make up one `repeat` of measurement.
+ * \param repeat The number of times to repeat the measurement.
+ * In total, the function will be invoked (1 + number x repeat) times,
+ * where the first one is warm up and will be discarded.
+ * The returned result contains `repeat` costs,
+ * each of which is an average of `number` costs.
+ * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds.
+ * By default, one `repeat` contains `number` runs. If this parameter
is set,
+ * the parameter `number` will be dynamically adjusted to meet the
+ * minimum duration requirement of one `repeat`.
+ * i.e., When the run time of one `repeat` falls below this time,
+ * the `number` parameter will be automatically increased.
+ * \param limit_zero_time_iterations The maximum number of repeats when
+ * measured time is equal to 0. It helps to avoid hanging during
measurements.
+ * \param cooldown_interval_ms The cooldown interval in milliseconds between
the number of repeats
+ * defined by `repeats_to_cooldown`.
+ * \param repeats_to_cooldown The number of repeats before the
+ * cooldown is activated.
+ * \param cache_flush_bytes The number of bytes to flush from cache before
+ * each measurement (0 disables flushing).
+ * \param f_preproc The function to be executed before we execute time
+ * evaluator.
+ * \return f_timer A timer function.
+ */
+ffi::Function WrapTimeEvaluator(ffi::Function f, Device dev, int number, int
repeat,
+ int min_repeat_ms, int
limit_zero_time_iterations,
+ int cooldown_interval_ms, int
repeats_to_cooldown,
+ int cache_flush_bytes = 0, ffi::Function
f_preproc = nullptr);
+
+} // namespace runtime
+} // namespace tvm
+
+#endif // TVM_RUNTIME_TIMER_H_
diff --git a/include/tvm/runtime/vm/executable.h
b/include/tvm/runtime/vm/executable.h
index 06c6020efb..022a88a469 100644
--- a/include/tvm/runtime/vm/executable.h
+++ b/include/tvm/runtime/vm/executable.h
@@ -140,8 +140,6 @@ class TVM_RUNTIME_DLL VMExecutable : public ffi::ModuleObj {
void WriteToFile(const ffi::String& file_name, const ffi::String& format)
const final;
/*! \brief Create a Relax virtual machine and load `this` as the executable.
*/
ffi::Module VMLoadExecutable() const;
- /*! \brief Create a Relax virtual machine with profiler and load `this` as
the executable. */
- ffi::Module VMProfilerLoadExecutable() const;
/*! \brief Check if the VMExecutable contains a specific function. */
bool HasFunction(const ffi::String& name) const;
/*!
diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h
index 335d77f196..2804e17eb3 100644
--- a/include/tvm/runtime/vm/vm.h
+++ b/include/tvm/runtime/vm/vm.h
@@ -23,10 +23,6 @@
#ifndef TVM_RUNTIME_VM_VM_H_
#define TVM_RUNTIME_VM_VM_H_
-#ifndef TVM_VM_ENABLE_PROFILER
-#define TVM_VM_ENABLE_PROFILER 1
-#endif
-
#include <tvm/ffi/extra/module.h>
#include <memory>
@@ -201,11 +197,6 @@ class VirtualMachine : public ffi::ModuleObj {
* \return Created VM
*/
static ObjectPtr<VirtualMachine> Create();
- /*!
- * \brief Create an instance of VM with the profiling feature enabled.
- * \return Created VM
- */
- static ObjectPtr<VirtualMachine> CreateProfiler();
/*!
* \brief Helper function for vm closure functions to get the context ptr
* \param arg The argument value.
diff --git a/python/tvm/relax/training/trainer.py
b/python/tvm/relax/training/trainer.py
index ce9d2368b7..f35f4ab69c 100644
--- a/python/tvm/relax/training/trainer.py
+++ b/python/tvm/relax/training/trainer.py
@@ -65,7 +65,6 @@ class Trainer:
trainer.xaiver_uniform_init_params()
trainer.predict(input_instances)
trainer.update([input_instances], [labels])
- trainer.profile_adjoint([input_instances], [labels])
"""
BACKBONE_FUNC: str = "backbone"
@@ -347,49 +346,3 @@ class Trainer:
self._params = list(new_params)
return ret
-
- def profile_adjoint(
- self,
- input_instances: list[np.ndarray | Tensor],
- targets: list[np.ndarray | Tensor],
- ) -> tvm.runtime.profiling.Report:
- """Profile the adjoint function. It requires the VM to be constructed
with `profile=True`,
- and runs `tvm.relax.VirtualMachine.profile()` internally.
-
- Parameters
- ----------
- input_instances : Union[np.ndarray, Tensor, List[Union[np.ndarray,
Tensor]]]
- The values corresponding to the input_instances part of the
backbone function.
- Parameters and model states are not needed to provide.
-
- If there are more than one input instances, you can provide a list.
-
- targets : Union[np.ndarray, Tensor, List[Union[np.ndarray, Tensor]]]
- The values corresponding to the targets part of the backbone
function.
-
- If there are more than one targets, you can provide a list.
-
- Returns
- -------
- report : tvm.runtime.profiling.Report
- The formatted profiling result.
- """
- self._check_inited()
-
- if not isinstance(input_instances, list):
- input_instances = [input_instances]
-
- if not isinstance(targets, list):
- targets = [targets]
-
- if len(input_instances) != self._input_num:
- raise ValueError("The length of the input does not match the
backbone")
-
- all_inputs: list[Tensor] = (
- [tvm.runtime.tensor(i) for i in input_instances]
- + self._params
- + self._states
- + [tvm.runtime.tensor(i) for i in targets]
- )
- all_inputs = [i.copyto(self.device) for i in all_inputs]
- return self.vm.profile(self.ADJOINT_FUNC, *all_inputs)
diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py
index 6f25216fb4..86f7507d7c 100644
--- a/python/tvm/runtime/__init__.py
+++ b/python/tvm/runtime/__init__.py
@@ -30,7 +30,6 @@ from .object_generic import ObjectConvertible
from .device import Device
from ._tensor import Tensor, tensor, empty
from .module import Module
-from .profiling import Report
from .executable import Executable
# function exposures
diff --git a/python/tvm/runtime/profiling/__init__.py
b/python/tvm/runtime/profiling/__init__.py
deleted file mode 100644
index 7ebc5e5c40..0000000000
--- a/python/tvm/runtime/profiling/__init__.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# isort: skip_file
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Registration of profiling objects in python."""
-
-from typing import Optional
-from collections.abc import Sequence
-from ... import ffi as _ffi
-from . import _ffi_api
-from .. import Object, Device
-
-
-@_ffi.register_object("runtime.profiling.Report")
-class Report(Object):
- """A container for information gathered during a profiling run.
-
- Attributes
- ----------
- calls : Array[Dict[str, Object]]
- Per-call profiling metrics (function name, runtime, device, ...).
-
- device_metrics : Dict[Device, Dict[str, Object]]
- Per-device metrics collected over the entire run.
- """
-
- def __init__(
- self,
- calls: Sequence[dict[str, Object]],
- device_metrics: dict[str, dict[str, Object]],
- configuration: dict[str, Object],
- ):
- """Construct a profiling report from a list of metrics and per-device
metrics.
-
- Parameters
- ----------
- calls : Sequence[Dict[str, Object]]
- Per function call metrics.
-
- device_metrics : Dict[str, Dict[str, Object]]
- Per device metrics.
-
- configuration : Dict[str, Object]
- Configuration of TVM for this profiling run. Includes number of
- threads, executor.
- """
- self.__init_handle_by_constructor__(_ffi_api.Report, calls,
device_metrics, configuration)
-
- def csv(self):
- """Convert this profiling report into CSV format.
-
- This only includes calls and not overall metrics.
-
- Returns
- -------
- csv : str
- `calls` in CSV format.
- """
- return _ffi_api.AsCSV(self)
-
- def table(self, sort=True, aggregate=True, col_sums=True):
- """Generate a human-readable table
-
- Parameters
- ----------
- sort : bool
-
- If aggregate is true, whether to sort call frames by
- descending duration. If aggregate is False, whether to
- sort frames by order of appearancei n the program.
-
- aggregate : bool
-
- Whether to join multiple calls to the same op into a
- single line.
-
- col_sums : bool
-
- Whether to include the sum of each column.
-
- Returns
- -------
- table : str
-
- A human-readable table
-
- """
- return _ffi_api.AsTable(self, sort, aggregate, col_sums)
-
- def json(self):
- """Convert this profiling report into JSON format.
-
- Example output:
-
- .. code-block:
-
- {
- "calls": [
- {
- "Duration (us)": {
- "microseconds": 12.3
- },
- "Name": "fused_dense",
- "Count": {
- "count": 1
- },
- "Percent": {
- "percent": 10.3
- }
- }
- ],
- "device_metrics": {
- "cpu": {
- "Duration (us)": {
- "microseconds": 334.2
- },
- "Percent": {
- "percent": 100
- }
- }
- }
- }
-
- {"calls":
- [
- {"Duration (us)": {"microseconds": 12.3}
- ,"Name": "fused_dense"
- ,"Count": {"count":1}
- ,"Percent": {"percent": 10.3}
- }
- ],
- "device_metrics":
- {"cpu":
- {"Duration (us)": {"microseconds": 334.2}
- ,"Percent": {"percent": 100.0}
- }
- }
- }
-
- Returns
- -------
- json : str
- Formatted JSON
- """
- return _ffi_api.AsJSON(self)
-
- @classmethod
- def from_json(cls, s):
- """Deserialize a report from JSON.
-
- Parameters
- ----------
- s : str
- Report serialize via :py:meth:`json`.
-
- Returns
- -------
- report : Report
- The deserialized report.
- """
- return _ffi_api.FromJSON(s)
-
-
-@_ffi.register_object("runtime.profiling.Count")
-class Count(Object):
- """A integer count of something"""
-
- def __init__(self, count: int):
- self.__init_handle_by_constructor__(_ffi_api.Count, count)
-
-
-@_ffi.register_object("runtime.profiling.Duration")
-class Duration(Object):
- """A duration of something"""
-
- def __init__(self, duration: float):
- self.__init_handle_by_constructor__(_ffi_api.Duration, duration)
-
-
-@_ffi.register_object("runtime.profiling.Percent")
-class Percent(Object):
- """A Percent of something"""
-
- def __init__(self, percent: float):
- self.__init_handle_by_constructor__(_ffi_api.Percent, percent)
-
-
-@_ffi.register_object("runtime.profiling.Ratio")
-class Ratio(Object):
- """A Ratio of two things"""
-
- def __init__(self, ratio: float):
- self.__init_handle_by_constructor__(_ffi_api.Ratio, ratio)
-
-
-@_ffi.register_object("runtime.profiling.MetricCollector")
-class MetricCollector(Object):
- """Interface for user defined profiling metric collection."""
-
-
-@_ffi.register_object("runtime.profiling.DeviceWrapper")
-class DeviceWrapper(Object):
- """Wraps a tvm.runtime.Device"""
-
- def __init__(self, dev: Device):
- self.__init_handle_by_constructor__(_ffi_api.DeviceWrapper, dev)
-
-
-def profile_function(mod, dev, collectors, func_name=None, warmup_iters=10):
- """Collect performance information of a function execution. Usually used
with
- a compiled PrimFunc.
-
- This information can include performance counters like cache hits and FLOPs
- that are useful in debugging performance issues of individual PrimFuncs.
- Different metrics can be collected depending on which MetricCollector is
- used.
-
- Example
- -------
-
- .. code-block: python
-
- f = tvm.compile(my_func, target="llvm", name="my_func")
- prof = tvm.runtime.profiling.profile_function(
- f,
- tvm.cpu(),
- [tvm.runtime.profiling.PAPIMetricCollector({tvm.cpu():
["PAPI_FP_OPS"]}),
- )
- counters = prof(*args)
- print(counters)
-
- Parameters
- ----------
- mod: Module
- Module containing the function to profile.
- dev: Device
- Device to run the function on.
-
- collectors: List[MetricCollector]
- :py:class:`MetricCollector` which will collect performance information.
- func_name: Optional[str]
- Name of the function in `mod` to profile. Defaults to the `entry_name`
of `mod`.
- warmup_iters: int
- Number of iterations to run the function before collecting performance
- information. Recommended to set this larger than 0 for consistent cache
- effects. Defaults to 10.
-
- Returns
- -------
- prof: PackedFunc[args, Dict[str, ObjectRef]]
- PackedFunc which takes the same arguments as the `mod[func_name]` and
- returns performance metrics as a `Dict[str, ObjectRef]` where values
- can be `CountNode`, `DurationNode`, `PercentNode`.
- """
- if func_name is None:
- func_name = mod.entry_name
- return _ffi_api.ProfileFunction(
- mod, func_name, dev.dlpack_device_type(), dev.index, warmup_iters,
collectors
- )
diff --git a/python/tvm/runtime/profiling/_ffi_api.py
b/python/tvm/runtime/profiling/_ffi_api.py
deleted file mode 100644
index c633c8144b..0000000000
--- a/python/tvm/runtime/profiling/_ffi_api.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""FFI for profiling"""
-
-import tvm_ffi
-
-tvm_ffi.init_ffi_api("runtime.profiling", __name__)
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 982322d601..0adb446dbf 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -28,7 +28,6 @@ from tvm_ffi import Function, register_global_func
import tvm
from tvm.runtime import Device, Object
-from tvm.runtime.profiling import Report
from ..rpc.base import RPC_SESS_MASK
@@ -50,7 +49,6 @@ class VirtualMachine:
rt_mod: tvm.runtime.Module | tvm.runtime.Executable,
device: Device | list[Device],
memory_cfg: str | dict[Device, str] | None = None,
- profile: bool = False,
) -> None:
"""
Construct a VirtualMachine wrapper object.
@@ -70,9 +68,6 @@ class VirtualMachine:
allocator type. If memory_cfg is a dict, each device uses the
allocator
type specified in the dict, or pooled allocator if not specified
in the
dict.
-
- profile : Optional[bool]
- Whether or not to enable profiling.
"""
if not isinstance(rt_mod, tvm.runtime.Module):
if isinstance(rt_mod, tvm.runtime.Executable):
@@ -80,8 +75,7 @@ class VirtualMachine:
else:
raise ValueError("Expect the rt_mod to be an runtime.Module")
- load_exec = "vm_profiler_load_executable" if profile else
"vm_load_executable"
- self.module = rt_mod[load_exec]()
+ self.module = rt_mod["vm_load_executable"]()
self._invoke_closure = self.module["invoke_closure"]
self._save_function = self.module["save_function"]
self._set_input = self.module["set_input"]
@@ -477,30 +471,6 @@ class VirtualMachine:
f_preproc=f_preproc,
)
- def profile(self, func_name: str, *args):
- """Profile a function call.
-
- Parameters
- ----------
- func_name : str
- The name of the function.
-
- args: List of Tensor or other objects supported by Function.
- The arguments to the function.
-
- Returns
- -------
- report: tvm.runtime.profiling.Report
- The formatted profiling result, showing per-op timing measurements.
- """
- cargs: list[Any] = []
-
- for arg in args:
- self._convert(arg, cargs)
-
- report_json = self.module["profile"](func_name, *cargs)
- return Report.from_json(report_json)
-
@register_global_func("vm.builtin.debug_print")
def _print(lineo: str, array) -> None:
diff --git a/src/ir/structural_hash.cc b/src/ir/structural_hash.cc
index b875f86625..01ea19e4b7 100644
--- a/src/ir/structural_hash.cc
+++ b/src/ir/structural_hash.cc
@@ -26,7 +26,7 @@
#include <tvm/ffi/reflection/access_path.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/node/functor.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/tensor.h>
#include <tvm/support/io.h>
#include <tvm/target/codegen.h>
@@ -85,66 +85,4 @@ struct RefToObjectPtr : public ObjectRef {
}
};
-struct ReportNodeTrait {
- static void RegisterReflection() {
- namespace refl = tvm::ffi::reflection;
- refl::ObjectDef<runtime::profiling::ReportNode>()
- .def_ro("calls", &runtime::profiling::ReportNode::calls)
- .def_ro("device_metrics",
&runtime::profiling::ReportNode::device_metrics)
- .def_ro("configuration",
&runtime::profiling::ReportNode::configuration);
- }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { ReportNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for ReportNode.
-
-struct CountNodeTrait {
- static void RegisterReflection() {
- namespace refl = tvm::ffi::reflection;
- refl::ObjectDef<runtime::profiling::CountNode>().def_ro("value",
-
&runtime::profiling::CountNode::value);
- }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { CountNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for CountNode.
-
-struct DurationNodeTrait {
- static void RegisterReflection() {
- namespace refl = tvm::ffi::reflection;
- refl::ObjectDef<runtime::profiling::DurationNode>().def_ro(
- "microseconds", &runtime::profiling::DurationNode::microseconds);
- }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { DurationNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for DurationNode.
-
-struct PercentNodeTrait {
- static void RegisterReflection() {
- namespace refl = tvm::ffi::reflection;
- refl::ObjectDef<runtime::profiling::PercentNode>().def_ro(
- "percent", &runtime::profiling::PercentNode::percent);
- }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { PercentNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for PercentNode.
-
-struct RatioNodeTrait {
- static void RegisterReflection() {
- namespace refl = tvm::ffi::reflection;
- refl::ObjectDef<runtime::profiling::RatioNode>().def_ro("ratio",
-
&runtime::profiling::RatioNode::ratio);
- }
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() { RatioNodeTrait::RegisterReflection(); }
-
-// Pattern A (RM): auto-default repr from reflection for RatioNode.
-
} // namespace tvm
diff --git a/src/runtime/contrib/clml/clml_runtime.cc
b/src/runtime/contrib/clml/clml_runtime.cc
index 2487af6915..5ea6c1398e 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -36,7 +36,7 @@
#include "clml_utils.h"
#endif
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
namespace tvm {
namespace runtime {
@@ -362,152 +362,6 @@ class CLMLRuntime : public JSONRuntimeBase {
}
#endif
- void RunProfile(profiling::Profiler* prof) override {
- cl_command_queue queue = CLML_QUEUE;
- std::vector<cl_event>& evts =
cws->workspace->GetEventQueue(cws->tentry->device);
- std::vector<profiling::MetricCollector> cs;
- std::vector<Device> devices;
- devices.push_back(cws->tentry->device);
- bool update_desc = false;
-
- for (size_t i = 0; i < input_nodes_.size(); ++i) {
- auto nid = input_nodes_[i];
- uint32_t eid = EntryID(nid, 0);
- if (nodes_[nid].GetOpType() == "input") {
-#if (CL_QCOM_ML_OPS_H_MAJOR_VERSION >= 5)
- if (this->layer_.storage_map[nid].is_dynamic_tensor) {
- SetTensorMemDesc(&this->layer_, nid, eid);
- update_desc = true;
- }
-#endif
- // Assuming all inputs are from OpenCL
- if (kDLOpenCL == data_entry_[eid]->device.device_type) {
- if (this->layer_.storage_map[nid].layout ==
CL_TENSOR_LAYOUT_NCHW_QCOM) {
- int index = layer_.tensorMemDescs_indexmap[nid];
- layer_.tensorMemDescs[index].memory = static_cast<cl_mem>(
-
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
- update_desc = true;
- } else {
- layer_.in_placeholder[nid]->memory = static_cast<cl_mem>(
-
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
- cl_event cpy_evt = nullptr;
- cl_event* evt = &cpy_evt;
- if (cws->workspace->IsProfiling(cws->tentry->device)) {
- evts.resize(evts.size() + 1);
- evt = &(evts.back());
- }
- std::unordered_map<std::string, ffi::Any> metrics;
- std::string shape_str;
- std::vector<int64_t> shape(nodes_[nid].GetOpShape()[0].begin(),
- nodes_[nid].GetOpShape()[0].end());
- DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
- shape_str.append(profiling::ShapeString(shape, tvm_dtype));
- metrics["Argument Shapes"] = ffi::String(shape_str);
-
- prof->StartCall("CopyIn", cws->tentry->device, metrics);
- CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue,
layer_.in_placeholder[nid]->tensor,
- layer_.in_placeholder[nid]->memory,
layer_.inputs[nid]->tensor,
- layer_.inputs[nid]->memory, 0, nullptr, evt);
- prof->StopCall();
- }
- }
- }
- }
-
- for (size_t i = 0; i < outputs_.size(); ++i) {
- auto nid = outputs_[i].id_;
- uint32_t eid = EntryID(outputs_[i]);
-#if (CL_QCOM_ML_OPS_H_MAJOR_VERSION >= 5)
- if (this->layer_.storage_map[nid].is_dynamic_tensor) {
- SetTensorMemDesc(&this->layer_, nid, eid);
- update_desc = true;
- }
-#endif
- if (this->layer_.storage_map[nid].layout == CL_TENSOR_LAYOUT_NCHW_QCOM) {
- int index = layer_.tensorMemDescs_indexmap[nid];
- layer_.tensorMemDescs[index].memory = static_cast<cl_mem>(
-
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
- update_desc = true;
- }
- }
-
- if (update_desc) {
- CLML_CALL(clUpdateMLTensorMemoryDescriptorSetQCOM,
this->layer_.descriptorSet,
- static_cast<uint32_t>(this->layer_.tensorMemDescs.size()),
- this->layer_.tensorMemDescs.data());
- }
-
- for (size_t i = 0; i < this->layer_.function.size(); ++i) {
- std::unordered_map<std::string, ffi::Any> metrics;
- auto node = this->layer_.op_node_map[this->layer_.function[i].op].second;
- std::string shape_str;
- for (uint32_t j = 0; j < node.GetInputs().size(); ++j) {
- const JSONGraphNode in_node = nodes_[node.GetInputs()[j].id_];
- auto shape_arr = in_node.GetOpShape()[0];
- std::vector<int64_t> shape(shape_arr.begin(), shape_arr.end());
- DLDataType tvm_dtype = in_node.GetOpDataType()[0];
- shape_str.append(profiling::ShapeString(shape, tvm_dtype));
- shape_str.append(", ");
- }
- // Assuming one output per operation
- auto shape_arr = node.GetOpShape()[0];
- std::vector<int64_t> shape(shape_arr.begin(), shape_arr.end());
- DLDataType tvm_dtype = node.GetOpDataType()[0];
- shape_str.append(profiling::ShapeString(shape, tvm_dtype));
- metrics["Argument Shapes"] = ffi::String(shape_str);
-
- // Launch call
- prof->StartCall(clml_symbol + "-" + this->layer_.function[i].layer_name,
cws->tentry->device,
- metrics);
- queue = CLML_QUEUE;
- evts.resize(evts.size() + 1);
- cl_event* evt = &(evts.back());
-#if (CL_QCOM_ML_OPS_H_MAJOR_VERSION >= 5)
- if (this->layer_.function[i].op_props.size()) {
- CLML_CALL_clUpdateMLOpQCOM(this->layer_.function[i].op,
- this->layer_.function[i].op_props.data(),
- this->layer_.descriptorSet, NULL);
- }
-#endif
- CLML_CALL(clEnqueueMLOpQCOM, queue, this->layer_.function[i].op,
this->layer_.descriptorSet,
- 0, nullptr, evt);
- prof->StopCall();
- }
-
- for (size_t i = 0; i < outputs_.size(); ++i) {
- uint32_t eid = EntryID(outputs_[i]);
- auto nid = outputs_[i].id_;
- // Assuming all outputs are to OpenCL
- if (kDLOpenCL == data_entry_[eid]->device.device_type) {
- if (this->layer_.storage_map[nid].layout !=
CL_TENSOR_LAYOUT_NCHW_QCOM) {
- layer_.out_placeholder[i]->memory = static_cast<cl_mem>(
-
((cl::BufferDescriptor*)const_cast<DLTensor*>(data_entry_[eid])->data)->buffer);
- cl_event cpy_evt = nullptr;
- cl_event* evt = &cpy_evt;
- if (cws->workspace->IsProfiling(cws->tentry->device)) {
- evts.resize(evts.size() + 1);
- evt = &(evts.back());
- }
-
- std::unordered_map<std::string, ffi::Any> metrics;
- std::string shape_str;
- std::vector<int64_t> shape(nodes_[eid].GetOpShape()[0].begin(),
- nodes_[eid].GetOpShape()[0].end());
- DLDataType tvm_dtype = nodes_[eid].GetOpDataType()[0];
- shape_str.append(profiling::ShapeString(shape, tvm_dtype));
- metrics["Argument Shapes"] = ffi::String(shape_str);
-
- prof->StartCall("CopyOut", cws->tentry->device, metrics);
- CLML_CALL(clEnqueueCopyMLTensorDataQCOM, queue,
layer_.outputs[i]->tensor,
- layer_.outputs[i]->memory,
layer_.out_placeholder[i]->tensor,
- layer_.out_placeholder[i]->memory, 0, nullptr, evt);
- prof->StopCall();
- }
- }
- }
- return;
- }
-
/*!
* \brief Unpack inputs and outputs and run inference on a given layer.
*
@@ -599,7 +453,7 @@ class CLMLRuntime : public JSONRuntimeBase {
LOG_CLML << "Execution by Rec Queue";
if (cws->workspace->IsProfiling(cws->tentry->device)) {
Timer t;
- auto f =
tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
+ auto f =
tvm::ffi::Function::GetGlobal(std::string("runtime.timer.opencl"));
t = f->operator()(cws->tentry->device).cast<Timer>();
t->Start();
queue = CLML_QUEUE;
@@ -627,7 +481,7 @@ class CLMLRuntime : public JSONRuntimeBase {
#endif
if (cws->workspace->IsProfiling(cws->tentry->device)) {
Timer t;
- auto f =
tvm::ffi::Function::GetGlobal(std::string("profiling.timer.opencl"));
+ auto f =
tvm::ffi::Function::GetGlobal(std::string("runtime.timer.opencl"));
t = f->operator()(cws->tentry->device).cast<Timer>();
t->Start();
queue = CLML_QUEUE;
@@ -2102,8 +1956,6 @@ class CLMLRuntime : public JSONRuntimeBase {
<< "Please build with USE_CLML_GRAPH_EXECUTOR.";
}
#endif
- bool CanDebug() override { return true; }
-
/*! CLML sub graph symbol in TVM main module */
std::string clml_symbol;
};
diff --git a/src/runtime/contrib/clml/clml_runtime.h
b/src/runtime/contrib/clml/clml_runtime.h
index 29aadc434a..3a0a7b12c0 100644
--- a/src/runtime/contrib/clml/clml_runtime.h
+++ b/src/runtime/contrib/clml/clml_runtime.h
@@ -33,7 +33,6 @@
#include <CL/opencl.h>
#include <stdlib.h>
#include <tvm/ffi/function.h>
-#include <tvm/runtime/profiling.h>
#include <tvm/runtime/tensor.h>
#include <fstream>
diff --git a/src/runtime/contrib/json/json_runtime.h
b/src/runtime/contrib/json/json_runtime.h
index d00d03ec89..d980804bcb 100644
--- a/src/runtime/contrib/json/json_runtime.h
+++ b/src/runtime/contrib/json/json_runtime.h
@@ -27,7 +27,6 @@
#include <tvm/ffi/extra/json.h>
#include <tvm/ffi/extra/module.h>
-#include <tvm/runtime/profiling.h>
#include <tvm/runtime/tensor.h>
#include <tvm/support/io.h>
@@ -72,17 +71,6 @@ class JSONRuntimeBase : public ffi::ModuleObj {
/*! \brief Invoke the execution engine to inteprete a specific json runtime.
*/
virtual void Run() = 0;
- /*! \brief Does the backend support debug & profiling */
- virtual bool CanDebug() { return false; }
-
- /*!
- * \brief Invoke the profiler
- * \param pointer to profiler
- */
- virtual void RunProfile(profiling::Profiler* prof) {
- TVM_FFI_THROW(InternalError) << "Not expected to be here : Profiling call
w/o support ?";
- }
-
/*!
* \brief Invoke the debugger
* \return External compiler specific debug blob
@@ -116,30 +104,6 @@ class JSONRuntimeBase : public ffi::ModuleObj {
// Execute the subgraph.
this->Run();
});
- } else if (this->symbol_name_ + "_debug" == name) {
- // NOTE: the current debug convention is not very compatible with
- // the FFI convention, consider clean up
- if (!this->CanDebug()) {
- return ffi::Function(nullptr);
- }
- return ffi::Function([sptr_to_self, this](ffi::PackedArgs args,
ffi::Any* rv) {
- TVM_FFI_ICHECK(this->initialized_) << "The module has not been
initialized";
-
- // Bind argument tensors to data entries.
- this->SetInputOutputBuffers(args);
-
- if (auto opt_str = rv->try_cast<ffi::String>()) {
- ffi::String purpose = std::move(opt_str.value());
- if ("debug_dump" == purpose) {
- *rv = this->DebugDump();
- }
- } else {
- // Profile the subgraph.
- profiling::Profiler* prof =
static_cast<profiling::Profiler*>(rv->cast<void*>());
- this->RunProfile(prof);
- }
- // ffi::String vendor_prof = this->RunProfile(prof);
- });
} else if ("__init_" + this->symbol_name_ == name) {
// The function to initialize constant tensors.
return ffi::Function([sptr_to_self, this](ffi::PackedArgs args,
ffi::Any* rv) {
diff --git a/src/runtime/cuda/cuda_device_api.cc
b/src/runtime/cuda/cuda_device_api.cc
index 45ccbb3b10..a01d223ff6 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -27,7 +27,7 @@
#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/runtime/device_api.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
#include <cstring>
@@ -333,7 +333,7 @@ class CUDATimerNode : public TimerNode {
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def("profiling.timer.cuda",
+ refl::GlobalDef().def("runtime.timer.cuda",
[](Device dev) { return
Timer(ffi::make_object<CUDATimerNode>()); });
}
diff --git a/src/runtime/hexagon/hexagon_common.cc
b/src/runtime/hexagon/hexagon_common.cc
index 442ec5ff56..15ac224a8e 100644
--- a/src/runtime/hexagon/hexagon_common.cc
+++ b/src/runtime/hexagon/hexagon_common.cc
@@ -25,7 +25,7 @@
#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
#include <tvm/runtime/logging.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
#include <sstream>
#include <string>
@@ -55,7 +55,7 @@ class HexagonTimerNode : public TimerNode {
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def("profiling.timer.hexagon",
+ refl::GlobalDef().def("runtime.timer.hexagon",
[](Device dev) { return
Timer(ffi::make_object<HexagonTimerNode>()); });
}
} // namespace hexagon
diff --git a/src/runtime/metal/metal_device_api.mm
b/src/runtime/metal/metal_device_api.mm
index f240f589c1..47ab8148c5 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -22,7 +22,7 @@
*/
#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
#include "metal_common.h"
namespace tvm {
@@ -453,7 +453,7 @@ class MetalTimerNode : public TimerNode {
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def("profiling.timer.metal",
+ refl::GlobalDef().def("runtime.timer.metal",
[](Device dev) { return
Timer(ffi::make_object<MetalTimerNode>(dev)); });
}
diff --git a/src/runtime/opencl/opencl_common.h
b/src/runtime/opencl/opencl_common.h
index a9fb5c01ec..2ae389046f 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -29,8 +29,8 @@
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/memory/memory_manager.h>
-#include <tvm/runtime/profiling.h>
#include <tvm/runtime/tensor.h>
+#include <tvm/runtime/timer.h>
/* There are many OpenCL platforms that do not yet support OpenCL 2.0,
* hence we use 1.2 APIs, some of which are now deprecated. In order
diff --git a/src/runtime/opencl/opencl_device_api.cc
b/src/runtime/opencl/opencl_device_api.cc
index a4f7daf532..0b63f497db 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -22,7 +22,7 @@
*/
#include <tvm/ffi/function.h>
#include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
#include <sstream>
@@ -813,7 +813,7 @@ TVM_FFI_STATIC_INIT_BLOCK() {
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def("profiling.timer.opencl",
+ refl::GlobalDef().def("runtime.timer.opencl",
[](Device dev) { return
Timer(ffi::make_object<OpenCLTimerNode>(dev)); });
}
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
deleted file mode 100644
index 99b5d77a6e..0000000000
--- a/src/runtime/profiling.cc
+++ /dev/null
@@ -1,937 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file src/runtime/profiling.cc
- * \brief Runtime profiling including timers.
- */
-
-#include <tvm/ffi/extra/json.h>
-#include <tvm/ffi/function.h>
-#include <tvm/ffi/reflection/registry.h>
-#include <tvm/runtime/c_backend_api.h>
-#include <tvm/runtime/data_type.h>
-#include <tvm/runtime/profiling.h>
-#include <tvm/runtime/threading_backend.h>
-
-#include <algorithm>
-#include <chrono>
-#include <iomanip>
-#include <iostream>
-#include <map>
-#include <mutex>
-#include <numeric>
-#include <set>
-#include <thread>
-#include <unordered_set>
-
-namespace tvm {
-namespace runtime {
-
-class DefaultTimerNode : public TimerNode {
- public:
- virtual void Start() {
- DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
- start_ = std::chrono::high_resolution_clock::now();
- }
- virtual void Stop() {
- DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
- duration_ = std::chrono::high_resolution_clock::now() - start_;
- }
- virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
- virtual ~DefaultTimerNode() {}
-
- explicit DefaultTimerNode(Device dev) : device_(dev) {}
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.DefaultTimerNode",
DefaultTimerNode, TimerNode);
-
- private:
- std::chrono::high_resolution_clock::time_point start_;
- std::chrono::duration<int64_t, std::nano> duration_;
- Device device_;
-};
-
-Timer DefaultTimer(Device dev) { return
Timer(ffi::make_object<DefaultTimerNode>(dev)); }
-
-class CPUTimerNode : public TimerNode {
- public:
- virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
- virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now()
- start_; }
- virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
- virtual ~CPUTimerNode() {}
- TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.CPUTimerNode", CPUTimerNode,
TimerNode);
-
- private:
- std::chrono::high_resolution_clock::time_point start_;
- std::chrono::duration<int64_t, std::nano> duration_;
-};
-
-TVM_FFI_STATIC_INIT_BLOCK() {
- namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def("profiling.timer.cpu",
- [](Device dev) { return
Timer(ffi::make_object<CPUTimerNode>()); });
-}
-
-// keep track of which timers are not defined but we have already warned about
-std::set<DLDeviceType> seen_devices;
-std::mutex seen_devices_lock;
-
-Timer Timer::Start(Device dev) {
- auto f = tvm::ffi::Function::GetGlobal(std::string("profiling.timer.") +
- DLDeviceType2Str(dev.device_type));
- if (!f.has_value()) {
- {
- std::lock_guard<std::mutex> lock(seen_devices_lock);
- if (seen_devices.find(dev.device_type) == seen_devices.end()) {
- LOG(WARNING)
- << "No timer implementation for " <<
DLDeviceType2Str(dev.device_type)
- << ", using default timer instead. It may be inaccurate or have
extra overhead.";
- seen_devices.insert(dev.device_type);
- }
- }
- Timer t = DefaultTimer(dev);
- t->Start();
- return t;
- } else {
- Timer t = f->operator()(dev).cast<Timer>();
- t->Start();
- return t;
- }
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
- namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def("profiling.start_timer", Timer::Start);
-}
-
-namespace profiling {
-
-Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector>
metric_collectors,
- std::unordered_map<ffi::String, ffi::Any> configuration)
- : devs_(devs), collectors_(metric_collectors),
configuration_(configuration) {
- is_running_ = false;
- std::vector<DeviceWrapper> wrapped_devs;
- for (auto dev : devs) {
-
wrapped_devs.push_back(DeviceWrapper(ffi::make_object<DeviceWrapperNode>(dev)));
- }
- for (auto& x : collectors_) {
- x->Init(wrapped_devs);
- }
- // reset the thread pool so that PAPI eventset hooks are set in all threads.
- threading::ResetThreadPool();
-
- configuration_[ffi::String("Number of threads")] =
- ObjectRef(ffi::make_object<CountNode>(threading::NumThreads()));
-}
-
-void Profiler::Start() {
- is_running_ = true;
- for (auto dev : devs_) {
- StartCall("Total", dev, {});
- }
-}
-
-void Profiler::StartCall(ffi::String name, Device dev,
- std::unordered_map<std::string, ffi::Any>
extra_metrics) {
- std::vector<std::pair<MetricCollector, ObjectRef>> objs;
- for (auto& collector : collectors_) {
- ObjectRef obj = collector->Start(dev);
- if (obj.defined()) {
- objs.emplace_back(collector, obj);
- }
- }
- in_flight_.push(CallFrame{dev, name, Timer::Start(dev), extra_metrics,
objs});
-}
-
-void Profiler::StopCall(std::unordered_map<std::string, ffi::Any>
extra_metrics) {
- CallFrame cf = in_flight_.top();
- cf.timer->Stop();
- for (auto& p : extra_metrics) {
- cf.extra_metrics[p.first] = p.second;
- }
- // collect the extra metrics from user defined collectors
- for (const auto& obj : cf.extra_collectors) {
- auto collector_metrics = obj.first->Stop(obj.second);
- for (auto& p : collector_metrics) {
- cf.extra_metrics[p.first] = p.second;
- }
- }
- in_flight_.pop();
- calls_.push_back(cf);
-}
-
-void Profiler::Stop() {
- is_running_ = false;
- for (size_t i = 0; i < devs_.size(); i++) {
- StopCall();
- }
-}
-
-std::vector<int64_t> ToShape(Tensor shape_tensor) {
- std::vector<int64_t> shape;
- auto rank = shape_tensor.Shape().size();
- auto dtype = shape_tensor.DataType();
-
- // For 0-rank shapes we need to allocate a single scalar.
- if (rank == 0) {
- return shape;
- }
-
- // Otherwise we should be rank-1, and we will extract the number of
dimensions
- // for the output vector.
- TVM_FFI_ICHECK_EQ(rank, 1U) << "shape tensor should be a k-length vector,
found " << rank;
- int64_t ndim = shape_tensor.Shape().at(0);
- shape.resize(ndim);
-
- const DLTensor* dl_tensor = shape_tensor.operator->();
- if (dtype.is_int() && dtype.bits() == 32 && dtype.lanes() == 1) {
- int32_t* dims = reinterpret_cast<int32_t*>(dl_tensor->data);
- shape.assign(dims, dims + ndim);
- } else if (dtype.is_int() && dtype.bits() == 64 && dtype.lanes() == 1) {
- int64_t* dims = reinterpret_cast<int64_t*>(dl_tensor->data);
- shape.assign(dims, dims + ndim);
- } else {
- TVM_FFI_THROW(InternalError) << "invalid shape tensor datatype: " << dtype;
- }
-
- return shape;
-}
-
-ffi::String ShapeString(Tensor shape, DLDataType dtype) {
- return ShapeString(ToShape(shape), dtype);
-}
-
-ffi::String ShapeString(const std::vector<int64_t>& shape, DLDataType dtype) {
- std::stringstream sizes;
- sizes << dtype << "[";
- for (size_t i = 0; i < shape.size(); i++) {
- if (i != 0) {
- sizes << ", ";
- }
- sizes << shape[i];
- }
- sizes << "]";
- return ffi::String(sizes.str());
-}
-
-ffi::String ShapeString(const std::vector<Tensor>& shapes) {
- std::stringstream sizes;
- for (const Tensor& ary : shapes) {
- if (sizes.tellp() > 0) {
- sizes << ", ";
- }
- auto shape = ary.Shape();
- sizes << ary.DataType() << "[";
- for (size_t i = 0; i < shape.size(); i++) {
- if (i != 0) {
- sizes << ", ";
- }
- sizes << shape[i];
- }
- sizes << "]";
- }
- return ffi::String(sizes.str());
-}
-
-ffi::String ReportNode::AsCSV() const {
- // get unique headers
- std::set<std::string> unique_headers;
-
- for (auto row : calls) {
- for (auto p : row) {
- unique_headers.insert(p.first);
- }
- }
-
- std::vector<std::string> headers;
- for (auto x : unique_headers) {
- headers.push_back(x);
- }
-
- std::stringstream s;
-
- for (size_t i = 0; i < headers.size(); i++) {
- std::string header = headers[i];
- s << header;
- if (i < headers.size() - 1) {
- s << ",";
- }
- }
- s << std::endl;
- for (auto row : calls) {
- for (size_t i = 0; i < headers.size(); i++) {
- std::string header = headers[i];
- auto it = row.find(header);
- if (it != row.end()) {
- std::string val;
- if ((*it).second.as<CountNode>()) {
- s << (*it).second.as<CountNode>()->value;
- } else if ((*it).second.as<DurationNode>()) {
- s << (*it).second.as<DurationNode>()->microseconds;
- } else if ((*it).second.as<PercentNode>()) {
- s << (*it).second.as<PercentNode>()->percent;
- } else if ((*it).second.as<RatioNode>()) {
- s << (*it).second.as<RatioNode>()->ratio;
- } else if (auto opt_str = (*it).second.as<ffi::String>()) {
- s << "\"" << *opt_str << "\"";
- }
- }
- if (i < headers.size() - 1) {
- s << ",";
- }
- }
- s << std::endl;
- }
- return s.str();
-}
-
-namespace {
-void metric_as_json(std::ostream& os, ffi::Any o) {
- if (auto opt_str = o.as<ffi::String>()) {
- os << "{\"string\":"
- << "\"" << *opt_str << "\""
- << "}";
- } else if (const CountNode* n = o.as<CountNode>()) {
- os << "{\"count\":" << n->value << "}";
- } else if (const DurationNode* n = o.as<DurationNode>()) {
- os << "{\"microseconds\":" <<
std::setprecision(std::numeric_limits<double>::max_digits10)
- << std::fixed << n->microseconds << "}";
- } else if (const PercentNode* n = o.as<PercentNode>()) {
- os << "{\"percent\":" <<
std::setprecision(std::numeric_limits<double>::max_digits10)
- << std::fixed << n->percent << "}";
- } else if (const RatioNode* n = o.as<RatioNode>()) {
- os << "{\"ratio\":" <<
std::setprecision(std::numeric_limits<double>::max_digits10)
- << std::fixed << n->ratio << "}";
- } else {
- TVM_FFI_THROW(InternalError) << "Unprintable type " << o.GetTypeKey();
- }
-}
-} // namespace
-
-ffi::String ReportNode::AsJSON() const {
- std::ostringstream s;
- // We want a specific write for the value,
- // so we would have to implement a custom data structure for each type of
- // value we want to print. Instead we construct the json by hand because it
- // is easier.
- s << "{";
-
- s << "\"calls\":[";
- for (size_t i = 0; i < calls.size(); i++) {
- size_t j = 0;
- s << "{";
- for (const auto& kv : calls[i]) {
- s << "\"" << kv.first << "\":";
- metric_as_json(s, kv.second);
- if (j < calls[i].size() - 1) {
- s << ",";
- }
- j++;
- }
- s << "}";
- if (i < calls.size() - 1) {
- s << ",";
- }
- }
- s << "],"; // end calls
-
- s << "\"device_metrics\":{";
- size_t i = 0;
- for (const auto& dev_kv : device_metrics) {
- size_t j = 0;
- s << "\"" << dev_kv.first << "\":{";
- for (const auto& metric_kv : dev_kv.second) {
- s << "\"" << metric_kv.first << "\":";
- metric_as_json(s, metric_kv.second);
- if (j < dev_kv.second.size() - 1) {
- s << ",";
- }
- j++;
- }
- s << "}";
- if (i < device_metrics.size() - 1) {
- s << ",";
- }
- i++;
- }
- s << "},"; // end device metrics
-
- s << "\"configuration\":{";
- size_t k = 0;
- for (const auto& kv : configuration) {
- s << "\"" << kv.first << "\":";
- metric_as_json(s, kv.second);
- if (k < configuration.size() - 1) {
- s << ",";
- }
- k++;
- }
- s << "}"; // end configuration
- s << "}";
- return s.str();
-}
-
-// Aggregate a set of values for a metric. Computes sum for Duration, Count,
-// and Percent; average for Ratio; and assumes all Strings are the same. All
-// ObjectRefs in metrics must have the same type.
-Any AggregateMetric(const std::vector<ffi::Any>& metrics) {
- TVM_FFI_ICHECK_GT(metrics.size(), 0) << "Must pass a non-zero number of
metrics";
- if (metrics[0].as<DurationNode>()) {
- double sum = 0;
- for (auto& metric : metrics) {
- sum += metric.as<DurationNode>()->microseconds;
- }
- return ObjectRef(ffi::make_object<DurationNode>(sum));
- } else if (metrics[0].as<CountNode>()) {
- int64_t sum = 0;
- for (auto& metric : metrics) {
- sum += metric.as<CountNode>()->value;
- }
- return ObjectRef(ffi::make_object<CountNode>(sum));
- } else if (metrics[0].as<PercentNode>()) {
- double sum = 0;
- for (auto& metric : metrics) {
- sum += metric.as<PercentNode>()->percent;
- }
- return ObjectRef(ffi::make_object<PercentNode>(sum));
- } else if (metrics[0].as<RatioNode>()) {
- double sum = 0;
- for (auto& metric : metrics) {
- sum += metric.as<RatioNode>()->ratio;
- }
- return ObjectRef(ffi::make_object<RatioNode>(sum / metrics.size()));
- } else if (auto opt_str = metrics[0].as<ffi::String>()) {
- for (auto& m : metrics) {
- if (*opt_str != m.as<ffi::String>()) {
- return ffi::String("");
- }
- }
- // Assume all strings in metrics are the same.
- return metrics[0];
- } else {
- TVM_FFI_THROW(InternalError)
- << "Can only aggregate metrics with types DurationNode, CountNode, "
- "PercentNode, RatioNode, and String, but got "
- << metrics[0].GetTypeKey();
- return ffi::Any(); // To silence warnings
- }
-}
-
-// Try and set the locale of the provided stringstream so that it will print
-// numbers with thousands separators. Sometimes users will have a misconfigured
-// system where an invalid locale is set, so we catch and ignore any locale
-// errors.
-static void set_locale_for_separators(std::stringstream& s) {
- try {
- // empty string indicates locale should be the user's default, see man 3
setlocale
- s.imbue(std::locale(""));
- } catch (std::runtime_error& e) {
- }
-}
-
-static ffi::String print_metric(ffi::Any metric) {
- std::string val;
- if (metric.as<CountNode>()) {
- std::stringstream s;
- set_locale_for_separators(s);
- s << std::fixed << metric.as<CountNode>()->value;
- val = s.str();
- } else if (metric.as<DurationNode>()) {
- std::stringstream s;
- set_locale_for_separators(s);
- s << std::fixed << std::setprecision(2) <<
metric.as<DurationNode>()->microseconds;
- val = s.str();
- } else if (metric.as<PercentNode>()) {
- std::stringstream s;
- s << std::fixed << std::setprecision(2) <<
metric.as<PercentNode>()->percent;
- val = s.str();
- } else if (metric.as<RatioNode>()) {
- std::stringstream s;
- set_locale_for_separators(s);
- s << std::setprecision(2) << metric.as<RatioNode>()->ratio;
- val = s.str();
- } else if (auto opt_str = metric.as<ffi::String>()) {
- val = *opt_str;
- } else {
- TVM_FFI_THROW(InternalError) << "Cannot print metric of type " <<
metric.GetTypeKey();
- }
- return val;
-}
-
-ffi::String ReportNode::AsTable(bool sort, bool aggregate, bool
compute_col_sums) const {
- // aggregate calls by op hash (or op name if hash is not set) + argument
shapes
- std::vector<ffi::Map<ffi::String, ffi::Any>> aggregated_calls;
- if (aggregate) {
- std::unordered_map<std::string, std::vector<size_t>> aggregates;
- for (size_t i = 0; i < calls.size(); i++) {
- auto frame = calls[i];
- auto it = frame.find("Hash");
- std::string name = frame["Name"].cast<ffi::String>();
- if (it != frame.end()) {
- name = (*it).second.cast<ffi::String>();
- }
- if (frame.find("Argument Shapes") != frame.end()) {
- name += frame["Argument Shapes"].cast<ffi::String>();
- }
- if (frame.find("Device") != frame.end()) {
- name += frame["Device"].cast<ffi::String>();
- }
-
- if (aggregates.find(name) == aggregates.end()) {
- aggregates[name] = {i};
- } else {
- aggregates[name].push_back(i);
- }
- }
- for (const auto& p : aggregates) {
- std::unordered_map<ffi::String, ffi::Any> aggregated;
- std::unordered_set<std::string> metrics;
- for (auto& call : calls) {
- for (auto& metric : call) {
- metrics.insert(metric.first);
- }
- }
- for (const std::string& metric : metrics) {
- std::vector<ffi::Any> per_call;
- for (auto i : p.second) {
- auto call = calls[i];
- auto it = std::find_if(call.begin(), call.end(),
- [&metric](const std::pair<ffi::String,
ffi::Any>& call_metric) {
- return std::string(call_metric.first) ==
metric;
- });
- if (it != call.end()) {
- per_call.push_back((*it).second);
- }
- }
- if (per_call.size() > 0) {
- aggregated[metric] = AggregateMetric(per_call);
- }
- }
- aggregated_calls.push_back(aggregated);
- }
- } else {
- for (auto call : calls) {
- aggregated_calls.push_back(call);
- }
- }
-
- // sort rows by duration
- if (sort) {
- std::sort(
- aggregated_calls.begin(), aggregated_calls.end(),
- [&](const ffi::Map<ffi::String, ffi::Any>& a, const
ffi::Map<ffi::String, ffi::Any>& b) {
- return a.at("Duration (us)").as<DurationNode>()->microseconds >
- b.at("Duration (us)").as<DurationNode>()->microseconds;
- });
- }
-
- // compute columnwise sums
- if (compute_col_sums) {
- std::unordered_map<ffi::String, ffi::Any> col_sums;
- for (auto call : aggregated_calls) {
- for (auto p : call) {
- if (p.second.as<CountNode>()) {
- int64_t val = p.second.as<CountNode>()->value;
- auto it = col_sums.find(p.first);
- if (it != col_sums.end()) {
- val += it->second.as<CountNode>()->value;
- }
- col_sums[p.first] = ObjectRef(ffi::make_object<CountNode>(val));
- } else if (p.second.as<DurationNode>()) {
- double val = p.second.as<DurationNode>()->microseconds;
- auto it = col_sums.find(p.first);
- if (it != col_sums.end()) {
- val += it->second.as<DurationNode>()->microseconds;
- }
- col_sums[p.first] = ObjectRef(ffi::make_object<DurationNode>(val));
- } else if (p.second.as<PercentNode>()) {
- double val = p.second.as<PercentNode>()->percent;
- auto it = col_sums.find(p.first);
- if (it != col_sums.end()) {
- val += it->second.as<PercentNode>()->percent;
- }
- col_sums[p.first] = ObjectRef(ffi::make_object<PercentNode>(val));
- } else if (p.second.as<RatioNode>()) {
- // It does not make sense to sum ratios
- }
- }
- }
- col_sums["Name"] = ffi::String("Sum");
- aggregated_calls.push_back({{ffi::String("Name"),
ffi::String("----------")}}); // separator
- aggregated_calls.push_back(col_sums);
- }
-
- // per-device metrics
- for (auto p : device_metrics) {
- ffi::Map<ffi::String, ffi::Any> metrics = p.second;
- metrics.Set("Name", ffi::String("Total"));
- aggregated_calls.push_back(metrics);
- }
-
- // Table formatting
- std::set<std::string> unique_headers;
- for (auto row : aggregated_calls) {
- for (auto p : row) {
- unique_headers.insert(p.first);
- }
- }
-
- // always include these headers in this order
- std::vector<std::string> headers = {"Name", "Duration (us)", "Percent",
- "Device", "Count", "Argument
Shapes"};
- for (auto header : unique_headers) {
- if (std::find(headers.begin(), headers.end(), header) == headers.end()) {
- headers.push_back(header);
- }
- }
-
- // Switch layout from row major to column major so we can easily compute
column widths.
- std::vector<std::vector<std::string>> cols;
- for (auto header : headers) {
- cols.push_back({header});
- }
- for (auto row : aggregated_calls) {
- for (size_t i = 0; i < headers.size(); i++) {
- auto it = row.find(headers[i]);
- if (it == row.end()) {
- // fill empty data with empty strings
- cols[i].push_back("");
- } else {
- cols[i].push_back(print_metric((*it).second));
- }
- }
- }
-
- std::vector<size_t> widths;
- for (auto v : cols) {
- size_t width = 0;
- for (auto x : v) {
- width = std::max(width, x.size());
- }
- widths.push_back(width);
- }
- size_t length = 0;
- for (auto v : cols) {
- length = std::max(length, v.size());
- }
-
- std::stringstream s;
- for (size_t row = 0; row < length; row++) {
- for (size_t col = 0; col < cols.size(); col++) {
- // left align first column
- if (col == 0) {
- s << std::left;
- } else {
- s << std::right;
- }
- if (row < cols[col].size()) {
- s << std::setw(widths[col]) << cols[col][row] << " ";
- } else {
- s << std::setw(widths[col]) << " ";
- }
- }
- s << std::endl;
- }
-
- // Add configuration information. It will not be aligned with the columns.
- s << std::endl << "Configuration" << std::endl << "-------------" <<
std::endl;
- for (auto kv : configuration) {
- s << kv.first << ": " << print_metric(kv.second) << std::endl;
- }
- return s.str();
-}
-
-std::string DeviceString(Device dev) {
- return DLDeviceType2Str(dev.device_type) + std::to_string(dev.device_id);
-}
-
-Report Profiler::Report() {
- // sync all timers and normalize rows
- std::vector<std::unordered_map<ffi::String, ffi::Any>> rows;
- for (auto& cf : calls_) {
- std::unordered_map<ffi::String, ffi::Any> row;
- double us = cf.timer->SyncAndGetElapsedNanos() / 1e3;
- row["Duration (us)"] = ObjectRef(ffi::make_object<DurationNode>(us));
- row["Count"] = ObjectRef(ffi::make_object<CountNode>(1));
- row["Name"] = cf.name;
- row["Device"] = ffi::String(DeviceString(cf.dev));
- for (auto p : cf.extra_metrics) {
- row[p.first] = p.second;
- }
- rows.push_back(row);
- }
-
- // the last frames are the overall times
- double overall_time_us = 0;
- std::unordered_map<ffi::String, ffi::Map<ffi::String, ffi::Any>>
device_metrics;
- for (size_t i = 0; i < devs_.size(); i++) {
- auto row = rows[rows.size() - 1];
- rows.pop_back();
- device_metrics[row["Device"].cast<ffi::String>()] = row;
- overall_time_us =
- std::max(overall_time_us, row["Duration
(us)"].as<DurationNode>()->microseconds);
- }
-
- // Calculate percentages
- for (auto& row : rows) {
- row["Percent"] = ObjectRef(ffi::make_object<PercentNode>(
- row["Duration (us)"].as<DurationNode>()->microseconds /
overall_time_us * 100));
- }
-
- // convert to map
- std::vector<ffi::Map<ffi::String, ffi::Any>> converted_rows;
- for (const auto& row : rows) {
- converted_rows.push_back(row);
- }
-
- return profiling::Report(converted_rows, device_metrics, configuration_);
-}
-
-Report::Report(ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls,
- ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>>
device_metrics,
- ffi::Map<ffi::String, ffi::Any> configuration) {
- auto node = ffi::make_object<ReportNode>();
- node->calls = std::move(calls);
- node->device_metrics = std::move(device_metrics);
- node->configuration = std::move(configuration);
- data_ = std::move(node);
-}
-
-namespace json = ::tvm::ffi::json;
-
-ffi::Map<ffi::String, ffi::Any> parse_metrics(const json::Object& obj) {
- ffi::Map<ffi::String, ffi::Any> metrics;
- for (const auto& [k, v] : obj) {
- std::string metric_name = k.cast<ffi::String>();
- json::Object metric_obj = v.cast<json::Object>();
- ffi::Any o;
- // Each metric value is an object with a single key indicating the type
- for (const auto& [type_key, type_val] : metric_obj) {
- std::string metric_value_name = type_key.cast<ffi::String>();
- if (metric_value_name == "microseconds") {
- o = ObjectRef(ffi::make_object<DurationNode>(type_val.cast<double>()));
- } else if (metric_value_name == "percent") {
- o = ObjectRef(ffi::make_object<PercentNode>(type_val.cast<double>()));
- } else if (metric_value_name == "count") {
- o = ObjectRef(ffi::make_object<CountNode>(type_val.cast<int64_t>()));
- } else if (metric_value_name == "ratio") {
- o = ObjectRef(ffi::make_object<RatioNode>(type_val.cast<double>()));
- } else if (metric_value_name == "string") {
- o = ffi::String(type_val.cast<ffi::String>());
- } else {
- TVM_FFI_THROW(InternalError) << "Cannot parse metric of type " <<
metric_value_name
- << " valid types are microseconds,
percent, count.";
- }
- }
- metrics.Set(metric_name, o);
- }
- return metrics;
-}
-
-Report Report::FromJSON(ffi::String json_str) {
- auto root = json::Parse(json_str).cast<json::Object>();
- ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls;
- ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>> device_metrics;
- ffi::Map<ffi::String, ffi::Any> configuration;
-
- for (const auto& [k, v] : root) {
- std::string key = k.cast<ffi::String>();
- if (key == "calls") {
- json::Array calls_arr = v.cast<json::Array>();
- for (const ffi::Any& item : calls_arr) {
- calls.push_back(parse_metrics(item.cast<json::Object>()));
- }
- } else if (key == "device_metrics") {
- json::Object dev_obj = v.cast<json::Object>();
- for (const auto& [dev_key, dev_val] : dev_obj) {
- std::string device_name = dev_key.cast<ffi::String>();
- device_metrics.Set(device_name,
parse_metrics(dev_val.cast<json::Object>()));
- }
- } else if (key == "configuration") {
- configuration = parse_metrics(v.cast<json::Object>());
- }
- }
- return Report(calls, device_metrics, configuration);
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
- namespace refl = tvm::ffi::reflection;
- refl::ObjectDef<MetricCollectorNode>();
- refl::ObjectDef<DeviceWrapperNode>();
-
- refl::GlobalDef()
- .def_method("runtime.profiling.AsTable", &ReportNode::AsTable)
- .def("runtime.profiling.AsCSV", [](Report n) { return n->AsCSV(); })
- .def("runtime.profiling.AsJSON", [](Report n) { return n->AsJSON(); })
- .def("runtime.profiling.FromJSON", Report::FromJSON)
- .def("runtime.profiling.DeviceWrapper", [](Device dev) { return
DeviceWrapper(dev); });
-}
-
-ffi::Function ProfileFunction(ffi::Module mod, std::string func_name, int
device_type,
- int device_id, int warmup_iters,
- ffi::Array<MetricCollector> collectors) {
- // Module::GetFunction is not const, so this lambda has to be mutable
- return ffi::Function::FromPacked(
- [=](const ffi::AnyView* args, int32_t num_args, ffi::Any* ret) mutable {
- auto optf = mod->GetFunction(func_name);
- TVM_FFI_ICHECK(optf.has_value())
- << "There is no function called \"" << func_name << "\" in the
module";
- auto f = *optf;
- Device dev{static_cast<DLDeviceType>(device_type), device_id};
-
- // warmup
- for (int i = 0; i < warmup_iters; i++) {
- f.CallPacked(args, num_args, ret);
- }
-
- for (auto& collector : collectors) {
- collector->Init({DeviceWrapper(dev)});
- }
- std::vector<ffi::Map<ffi::String, ffi::Any>> results;
- results.reserve(collectors.size());
- std::vector<std::pair<MetricCollector, ObjectRef>> collector_data;
- collector_data.reserve(collectors.size());
- for (auto& collector : collectors) {
- ObjectRef o = collector->Start(dev);
- // If not defined, then the collector cannot time this device.
- if (o.defined()) {
- collector_data.push_back({collector, o});
- }
- }
-
- // TODO(tkonolige): repeated calls if the runtime is small?
- f.CallPacked(args, num_args, ret);
-
- for (auto& kv : collector_data) {
- results.push_back(kv.first->Stop(kv.second));
- }
- ffi::Map<ffi::String, ffi::Any> combined_results;
- for (auto m : results) {
- for (auto p : m) {
- // assume that there is no shared metric name between collectors
- combined_results.Set(p.first, p.second);
- }
- }
- *ret = combined_results;
- });
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
- namespace refl = tvm::ffi::reflection;
- refl::GlobalDef().def(
- "runtime.profiling.ProfileFunction",
- [](ffi::Module mod, ffi::String func_name, int device_type, int
device_id, int warmup_iters,
- ffi::Array<MetricCollector> collectors) {
- if (mod->kind() == std::string("rpc")) {
- TVM_FFI_THROW(InternalError)
- << "Profiling a module over RPC is not yet supported"; //
because we can't send
- //
MetricCollectors over rpc.
- throw;
- } else {
- return ProfileFunction(mod, func_name, device_type, device_id,
warmup_iters, collectors);
- }
- });
-}
-
-ffi::Function WrapTimeEvaluator(ffi::Function pf, Device dev, int number, int
repeat,
- int min_repeat_ms, int
limit_zero_time_iterations,
- int cooldown_interval_ms, int
repeats_to_cooldown,
- int cache_flush_bytes, ffi::Function
f_preproc) {
- TVM_FFI_ICHECK(pf != nullptr);
-
- auto ftimer = [pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations,
- cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
- f_preproc](const ffi::AnyView* args, int num_args, ffi::Any*
rv) mutable {
- ffi::Any temp;
- std::ostringstream os;
- // skip first time call, to activate lazy compilation components.
- pf.CallPacked(args, num_args, &temp);
-
- // allocate two large arrays to flush L2 cache
- Tensor arr1, arr2;
- if (cache_flush_bytes > 0) {
- arr1 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
- arr2 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
- }
-
- DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
-
- for (int i = 0; i < repeat; ++i) {
- if (f_preproc != nullptr) {
- f_preproc.CallPacked(args, num_args, &temp);
- }
- double duration_ms = 0.0;
- int absolute_zero_times = 0;
- do {
- if (duration_ms > 0.0) {
- const double golden_ratio = 1.618;
- number = static_cast<int>(
- std::max((min_repeat_ms / (duration_ms / number) + 1), number *
golden_ratio));
- }
- if (cache_flush_bytes > 0) {
- arr1.CopyFrom(arr2);
- }
- DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
- // start timing
- Timer t = Timer::Start(dev);
- for (int j = 0; j < number; ++j) {
- pf.CallPacked(args, num_args, &temp);
- }
- t->Stop();
- int64_t t_nanos = t->SyncAndGetElapsedNanos();
- if (t_nanos == 0) absolute_zero_times++;
- duration_ms = t_nanos / 1e6;
- } while (duration_ms < min_repeat_ms && absolute_zero_times <
limit_zero_time_iterations);
-
- double speed = duration_ms / 1e3 / number;
- os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
-
- if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) {
-
std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));
- }
- }
-
- std::string blob = os.str();
- // return the time.
- *rv = ffi::Bytes(std::move(blob));
- };
- return ffi::Function::FromPacked(ftimer);
-}
-
-TVM_FFI_STATIC_INIT_BLOCK() {
- namespace refl = tvm::ffi::reflection;
- refl::GlobalDef()
- .def("runtime.profiling.Report",
- [](ffi::Array<ffi::Map<ffi::String, ffi::Any>> calls,
- ffi::Map<ffi::String, ffi::Map<ffi::String, ffi::Any>>
device_metrics,
- ffi::Map<ffi::String, ffi::Any> configuration) {
- return Report(calls, device_metrics, configuration);
- })
- .def("runtime.profiling.Count",
- [](int64_t count) { return
ObjectRef(ffi::make_object<CountNode>(count)); })
- .def("runtime.profiling.Percent",
- [](double percent) { return
ObjectRef(ffi::make_object<PercentNode>(percent)); })
- .def("runtime.profiling.Duration",
- [](double duration) { return
ObjectRef(ffi::make_object<DurationNode>(duration)); })
- .def("runtime.profiling.Ratio",
- [](double ratio) { return
ObjectRef(ffi::make_object<RatioNode>(ratio)); });
-}
-
-} // namespace profiling
-} // namespace runtime
-} // namespace tvm
diff --git a/src/runtime/rocm/rocm_device_api.cc
b/src/runtime/rocm/rocm_device_api.cc
index ed7bd98ffe..6612f1a8fb 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -28,7 +28,7 @@
#include <tvm/ffi/reflection/registry.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/logging.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
#include "rocm_common.h"
@@ -297,7 +297,7 @@ class ROCMTimerNode : public TimerNode {
TVM_FFI_STATIC_INIT_BLOCK() {
namespace refl = tvm::ffi::reflection;
refl::GlobalDef()
- .def("profiling.timer.rocm",
+ .def("runtime.timer.rocm",
[](Device dev) { return Timer(ffi::make_object<ROCMTimerNode>()); })
.def("runtime.get_rocm_stream", []() {
int device_id;
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index ad8f1967db..71639abf3f 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -25,7 +25,7 @@
#include <tvm/ffi/reflection/registry.h>
#include <tvm/ffi/string.h>
#include <tvm/runtime/device_api.h>
-#include <tvm/runtime/profiling.h>
+#include <tvm/runtime/timer.h>
#include <chrono>
#include <cstring>
@@ -432,9 +432,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
ffi::Optional<ffi::Function> pf = m->GetFunction(name);
TVM_FFI_ICHECK(pf.has_value())
<< "Cannot find " << name << "` in the global registry";
- return profiling::WrapTimeEvaluator(
- *pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations,
- cooldown_interval_ms, repeats_to_cooldown,
cache_flush_bytes, f_preproc);
+ return WrapTimeEvaluator(*pf, dev, number, repeat,
min_repeat_ms,
+ limit_zero_time_iterations,
cooldown_interval_ms,
+ repeats_to_cooldown,
cache_flush_bytes, f_preproc);
}
} else {
auto pf = tvm::ffi::Function::GetGlobal(name);
@@ -447,9 +447,9 @@ TVM_FFI_STATIC_INIT_BLOCK() {
<< "Cannot find " << f_preproc_name << " in the global
function";
f_preproc = *pf_preproc;
}
- return profiling::WrapTimeEvaluator(
- *pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations,
- cooldown_interval_ms, repeats_to_cooldown,
cache_flush_bytes, f_preproc);
+ return WrapTimeEvaluator(*pf, dev, number, repeat,
min_repeat_ms,
+ limit_zero_time_iterations,
cooldown_interval_ms,
+ repeats_to_cooldown,
cache_flush_bytes, f_preproc);
}
})
.def_packed("cache_flush_cpu_non_first_arg",
diff --git a/src/runtime/timer.cc b/src/runtime/timer.cc
new file mode 100644
index 0000000000..075f56337e
--- /dev/null
+++ b/src/runtime/timer.cc
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/runtime/timer.cc
+ * \brief Runtime timer primitives: Timer, WrapTimeEvaluator.
+ */
+
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/reflection/registry.h>
+#include <tvm/runtime/c_backend_api.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/timer.h>
+
+#include <chrono>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <thread>
+
+namespace tvm {
+namespace runtime {
+
+class DefaultTimerNode : public TimerNode {
+ public:
+ virtual void Start() {
+ DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
+ start_ = std::chrono::high_resolution_clock::now();
+ }
+ virtual void Stop() {
+ DeviceAPI::Get(device_)->StreamSync(device_, nullptr);
+ duration_ = std::chrono::high_resolution_clock::now() - start_;
+ }
+ virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+ virtual ~DefaultTimerNode() {}
+
+ explicit DefaultTimerNode(Device dev) : device_(dev) {}
+ TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.DefaultTimerNode",
DefaultTimerNode, TimerNode);
+
+ private:
+ std::chrono::high_resolution_clock::time_point start_;
+ std::chrono::duration<int64_t, std::nano> duration_;
+ Device device_;
+};
+
+static Timer DefaultTimer(Device dev) { return
Timer(ffi::make_object<DefaultTimerNode>(dev)); }
+
+class CPUTimerNode : public TimerNode {
+ public:
+ virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); }
+ virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now()
- start_; }
+ virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); }
+ virtual ~CPUTimerNode() {}
+ TVM_FFI_DECLARE_OBJECT_INFO_FINAL("runtime.CPUTimerNode", CPUTimerNode,
TimerNode);
+
+ private:
+ std::chrono::high_resolution_clock::time_point start_;
+ std::chrono::duration<int64_t, std::nano> duration_;
+};
+
+TVM_FFI_STATIC_INIT_BLOCK() {
+ namespace refl = tvm::ffi::reflection;
+ refl::GlobalDef().def("runtime.timer.cpu",
+ [](Device dev) { return
Timer(ffi::make_object<CPUTimerNode>()); });
+}
+
+Timer Timer::Start(Device dev) {
+ // Function-local statics: thread-safe lazy init (C++11 magic statics),
+ // visible only to this function.
+ static std::set<DLDeviceType> seen_devices;
+ static std::mutex seen_devices_lock;
+ auto f = tvm::ffi::Function::GetGlobal(std::string("runtime.timer.") +
+ DLDeviceType2Str(dev.device_type));
+ if (!f.has_value()) {
+ {
+ std::lock_guard<std::mutex> lock(seen_devices_lock);
+ if (seen_devices.find(dev.device_type) == seen_devices.end()) {
+ LOG(WARNING)
+ << "No timer implementation for " <<
DLDeviceType2Str(dev.device_type)
+ << ", using default timer instead. It may be inaccurate or have
extra overhead.";
+ seen_devices.insert(dev.device_type);
+ }
+ }
+ Timer t = DefaultTimer(dev);
+ t->Start();
+ return t;
+ } else {
+ Timer t = f->operator()(dev).cast<Timer>();
+ t->Start();
+ return t;
+ }
+}
+
+ffi::Function WrapTimeEvaluator(ffi::Function pf, Device dev, int number, int
repeat,
+ int min_repeat_ms, int
limit_zero_time_iterations,
+ int cooldown_interval_ms, int
repeats_to_cooldown,
+ int cache_flush_bytes, ffi::Function
f_preproc) {
+ TVM_FFI_ICHECK(pf != nullptr);
+
+ auto ftimer = [pf, dev, number, repeat, min_repeat_ms,
limit_zero_time_iterations,
+ cooldown_interval_ms, repeats_to_cooldown, cache_flush_bytes,
+ f_preproc](const ffi::AnyView* args, int num_args, ffi::Any*
rv) mutable {
+ ffi::Any temp;
+ std::ostringstream os;
+ // skip first time call, to activate lazy compilation components.
+ pf.CallPacked(args, num_args, &temp);
+
+ // allocate two large arrays to flush L2 cache
+ Tensor arr1, arr2;
+ if (cache_flush_bytes > 0) {
+ arr1 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
+ arr2 = Tensor::Empty({cache_flush_bytes / 4}, {kDLInt, 32, 1}, dev);
+ }
+
+ DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
+
+ for (int i = 0; i < repeat; ++i) {
+ if (f_preproc != nullptr) {
+ f_preproc.CallPacked(args, num_args, &temp);
+ }
+ double duration_ms = 0.0;
+ int absolute_zero_times = 0;
+ do {
+ if (duration_ms > 0.0) {
+ const double golden_ratio = 1.618;
+ number = static_cast<int>(
+ std::max((min_repeat_ms / (duration_ms / number) + 1), number *
golden_ratio));
+ }
+ if (cache_flush_bytes > 0) {
+ arr1.CopyFrom(arr2);
+ }
+ DeviceAPI::Get(dev)->StreamSync(dev, nullptr);
+ // start timing
+ Timer t = Timer::Start(dev);
+ for (int j = 0; j < number; ++j) {
+ pf.CallPacked(args, num_args, &temp);
+ }
+ t->Stop();
+ int64_t t_nanos = t->SyncAndGetElapsedNanos();
+ if (t_nanos == 0) absolute_zero_times++;
+ duration_ms = t_nanos / 1e6;
+ } while (duration_ms < min_repeat_ms && absolute_zero_times <
limit_zero_time_iterations);
+
+ double speed = duration_ms / 1e3 / number;
+ os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
+
+ if (cooldown_interval_ms > 0 && (i % repeats_to_cooldown) == 0) {
+
std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));
+ }
+ }
+
+ std::string blob = os.str();
+ // return the time.
+ *rv = ffi::Bytes(std::move(blob));
+ };
+ return ffi::Function::FromPacked(ftimer);
+}
+
+} // namespace runtime
+} // namespace tvm
diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc
index 24ac2949b4..18bb8f6880 100644
--- a/src/runtime/vm/executable.cc
+++ b/src/runtime/vm/executable.cc
@@ -56,7 +56,6 @@ ffi::Optional<ffi::Function> VMExecutable::GetFunction(const
ffi::String& _name)
TVM_MODULE_VTABLE_ENTRY("as_text", &VMExecutable::AsText);
TVM_MODULE_VTABLE_ENTRY("as_python", &VMExecutable::AsPython);
TVM_MODULE_VTABLE_ENTRY("vm_load_executable",
&VMExecutable::VMLoadExecutable);
- TVM_MODULE_VTABLE_ENTRY("vm_profiler_load_executable",
&VMExecutable::VMProfilerLoadExecutable);
TVM_MODULE_VTABLE_ENTRY("has_function", &VMExecutable::HasFunction);
return std::nullopt;
}
@@ -437,12 +436,6 @@ ffi::Module VMExecutable::VMLoadExecutable() const {
return ffi::Module(vm);
}
-ffi::Module VMExecutable::VMProfilerLoadExecutable() const {
- ObjectPtr<VirtualMachine> vm = VirtualMachine::CreateProfiler();
-
vm->LoadExecutable(GetObjectPtr<VMExecutable>(const_cast<VMExecutable*>(this)));
- return ffi::Module(vm);
-}
-
bool VMExecutable::HasFunction(const ffi::String& name) const { return
func_map.count(name); }
ffi::String VMExecutable::AsText() const {
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index f7d6765e6a..660565d9b3 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -24,10 +24,8 @@
#include <tvm/ffi/function.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/nvtx.h>
-#include <tvm/runtime/profiling.h>
#include <tvm/runtime/vm/vm.h>
-#include <optional>
#include <thread>
#include "./module_utils.h"
@@ -971,127 +969,6 @@ ffi::Function VirtualMachineImpl::_LookupFunction(const
ffi::String& name) {
return ffi::Function(nullptr);
}
-//----------------------------------------------------------------
-// Profiler can be optionally disabled via a macro to reduce dep.
-//----------------------------------------------------------------
-#if TVM_VM_ENABLE_PROFILER
-
-/*!
- * \brief An extension of VirtualMachineImpl to support per-op profiling
- * It overrides RunInstrCall to add instrumentations around it.
- */
-class VirtualMachineProfiler : public VirtualMachineImpl {
- public:
- ffi::Optional<ffi::Function> GetFunction(const ffi::String& name) override {
- ObjectPtr<Object> sptr_to_self = ffi::GetObjectPtr<Object>(this);
- if (name == "profile") {
- return ffi::Function([sptr_to_self, this](ffi::PackedArgs args,
ffi::Any* rv) {
- std::string f_name = args[0].cast<std::string>();
- VMClosure clo = this->GetClosure(f_name);
-
- std::vector<Device> devices;
- for (auto dev : this->devices) {
- if (dev.device_type > 0) {
- devices.push_back(dev);
- }
- }
-
- prof_ = profiling::Profiler(devices, {}, {{ffi::String("Executor"),
ffi::String("VM")}});
-
- auto inputs = GetInputsFor(f_name);
-
- bool clear_inputs = false;
- if (inputs.size() == 0) {
- TVM_FFI_ICHECK(args.size() > 1) << "No input is provided";
- SetInput(f_name, false, args.Slice(1));
- inputs = GetInputsFor(f_name);
- clear_inputs = true;
- } else {
- TVM_FFI_ICHECK_EQ(args.size(), 1) << "Inputs are already provided by
set_input.";
- }
-
- // warmup
- this->InvokeClosureInternal(clo, inputs);
-
- prof_->Start();
- this->InvokeClosureInternal(clo, inputs);
- prof_->Stop();
-
- // Return the report as json, since profiling::Report object is not
supported by RPC
- std::string report_json = prof_->Report()->AsJSON();
- *rv = report_json;
-
- prof_ = std::nullopt; // releases hardware counters
- if (clear_inputs) {
- // SetInput modifies the internal states of VM. Undo the change
after profiling.
- ClearInputsFor(f_name);
- }
- });
- } else {
- return VirtualMachineImpl::GetFunction(name);
- }
- }
-
- protected:
- void RunInstrCall(VMFrame* curr_frame, Instruction inst) override {
- bool profiling = false;
- if (prof_ && prof_->IsRunning()) {
- auto f_name = GetFuncName(inst.func_idx);
- std::optional<Device> dev;
- std::vector<Tensor> arrs;
-
- auto f_check_tensor_arg = [&dev, &arrs](const RegType& arg) {
- if (auto opt_nd = arg.as<Tensor>()) {
- Tensor arr = opt_nd.value();
- if (arr.defined()) {
- dev = arr->device;
- arrs.push_back(arr);
- }
- }
- };
-
- for (Index i = 0; i < inst.num_args; ++i) {
- Instruction::Arg arg = inst.args[i];
- if (arg.kind() == Instruction::ArgKind::kRegister) {
- auto reg = ReadRegister(curr_frame, arg.value());
- f_check_tensor_arg(reg);
- } else if (arg.kind() == Instruction::ArgKind::kConstIdx) {
- const auto& const_val = this->const_pool_[arg.value()];
- f_check_tensor_arg(const_val);
- }
- }
-
- std::unordered_map<std::string, ffi::Any> metrics;
- metrics["Argument Shapes"] = profiling::ShapeString(arrs);
-
- // If a suitable device is found, enable profiling.
- if (dev) {
- profiling = true;
- prof_->StartCall(f_name, *dev, metrics);
- }
- }
-
- VirtualMachineImpl::RunInstrCall(curr_frame, inst);
-
- if (profiling) {
- prof_->StopCall();
- }
- }
-
- private:
- std::optional<profiling::Profiler> prof_;
-};
-
-ObjectPtr<VirtualMachine> VirtualMachine::CreateProfiler() {
- return ffi::make_object<VirtualMachineProfiler>();
-}
-
-#else
-ObjectPtr<VirtualMachine> VirtualMachine::CreateProfiler() {
- TVM_FFI_THROW(InternalError) << "Profiler support is disabled";
- return nullptr;
-}
-#endif // TVM_VM_ENABLE_PROFILER
} // namespace vm
} // namespace runtime
} // namespace tvm
diff --git a/tests/python/relax/test_codegen_coreml.py
b/tests/python/relax/test_codegen_coreml.py
index de3a6d0789..63a704cc41 100644
--- a/tests/python/relax/test_codegen_coreml.py
+++ b/tests/python/relax/test_codegen_coreml.py
@@ -54,12 +54,12 @@ def verify(mod, inputs):
assert relax.analysis.well_formed(mod1)
ex1 = tvm.compile(mod1, target=target)
- vm1 = relax.VirtualMachine(ex1, dev, profile=True)
+ vm1 = relax.VirtualMachine(ex1, dev)
out1 = vm1["main"](*inputs)
mod2 = relax.transform.LegalizeOps()(mod)
ex2 = tvm.compile(mod2, target=target)
- vm2 = relax.VirtualMachine(ex2, dev, profile=True)
+ vm2 = relax.VirtualMachine(ex2, dev)
out2 = vm2["main"](*inputs)
tvm.testing.assert_allclose(out1.numpy(), out2.numpy(), rtol=1e-3,
atol=1e-3)
diff --git a/tests/python/relax/test_training_trainer_numeric.py
b/tests/python/relax/test_training_trainer_numeric.py
index c3f1deee6b..b96c46dd8f 100644
--- a/tests/python/relax/test_training_trainer_numeric.py
+++ b/tests/python/relax/test_training_trainer_numeric.py
@@ -66,7 +66,7 @@ def test_execute(target, dev):
train_mod = setup_trainer(backbone)
ex = tvm.compile(train_mod, target)
- vm = relax.VirtualMachine(ex, dev, profile=True)
+ vm = relax.VirtualMachine(ex, dev)
trainer = Trainer(train_mod, vm, dev, False)
trainer.zero_init_params()
@@ -75,7 +75,6 @@ def test_execute(target, dev):
dataset = _make_dataset()
trainer.predict(dataset[0][0])
trainer.update(dataset[0][0], dataset[0][1])
- trainer.profile_adjoint(dataset[0][0], dataset[0][1])
@tvm.testing.parametrize_targets("llvm")
diff --git a/tests/python/relax/test_vm_profiler.py
b/tests/python/relax/test_vm_profiler.py
deleted file mode 100644
index c661bbe5da..0000000000
--- a/tests/python/relax/test_vm_profiler.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# ruff: noqa: RUF005
-import numpy as np
-
-import tvm
-import tvm.testing
-from tvm import relax, rpc
-from tvm.contrib import utils
-from tvm.relax.testing import nn
-from tvm.script import relax as R
-
-
-def get_exec(data_shape):
- builder = relax.BlockBuilder()
- weight1_np = np.random.randn(64, 64).astype("float32")
- weight2_np = np.random.randn(64, 64).astype("float32")
-
- with builder.function("main"):
- model = nn.Sequential(
- nn.Linear(data_shape[1], weight1_np.shape[0], bias=False),
- nn.ReLU(),
- nn.Linear(weight2_np.shape[0], weight2_np.shape[1], bias=False),
- nn.ReLU(),
- )
- data = nn.Placeholder(data_shape, name="data")
- output = model(data)
- params = [data] + model.parameters()
- builder.emit_func_output(output, params=params)
-
- mod = builder.get()
-
- params = {"linear_weight": weight1_np, "linear_weight1": weight2_np}
- mod = relax.transform.BindParams("main", params)(mod)
-
- target = "llvm"
- return tvm.compile(mod, target)
-
-
-def test_conv2d_cpu():
- data_np = np.random.randn(1, 64).astype("float32")
- ex = get_exec(data_np.shape)
-
- vm = relax.VirtualMachine(ex, tvm.cpu(), profile=True)
- report = vm.profile("main", tvm.runtime.tensor(data_np))
- print(report)
-
- assert "Duration" in str(report)
- assert "matmul" in str(report)
-
-
-def with_rpc(ex, f, data_np):
- temp = utils.tempdir()
- path = temp.relpath("vm_library.so")
- ex.export_library(path)
-
- server = rpc.Server("127.0.0.1")
- remote = rpc.connect(server.host, server.port, session_timeout=10)
-
- remote.upload(path)
- rexec = remote.load_module("vm_library.so")
-
- device = remote.cpu()
-
- vm = relax.VirtualMachine(rexec, device=device, profile=True)
- data = tvm.runtime.tensor(data_np, device)
-
- f(vm, data)
-
-
-def test_rpc():
- data_np = np.random.randn(1, 64).astype("float32")
- ex = get_exec(data_np.shape)
-
- def callback(vm, data):
- vm.profile("main", data)
-
- vm.set_input("main", data)
- report = vm.profile("main")
-
- assert "matmul" in str(report)
- print(report)
-
- with_rpc(ex, callback, data_np)
-
-
-def test_tuple():
- @tvm.script.ir_module
- class NestedTuple:
- @R.function
- def main(x: R.Tensor((16,), "float32")) -> R.Tuple(
- R.Tuple(
- R.Tensor((16,), "float32"),
- R.Tuple(
- R.Tensor((16,), "float32"),
- ),
- ),
- R.Tensor((16,), "float32"),
- ):
- return ((x, (x,)), x)
-
- target = "llvm"
- ex = tvm.compile(NestedTuple, target)
-
- data_np = np.random.randn(16).astype("float32")
-
- def callback(vm, data):
- report = vm.profile("main", data)
- assert "vm.builtin.make_tuple" in str(report)
-
- with_rpc(ex, callback, data_np)
-
-
-if __name__ == "__main__":
- tvm.testing.main()
diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc
index baaff6b8ca..d2bfe326e1 100644
--- a/web/emcc/wasm_runtime.cc
+++ b/web/emcc/wasm_runtime.cc
@@ -37,7 +37,6 @@
#include "src/runtime/device_api.cc"
#include "src/runtime/file_utils.cc"
#include "src/runtime/logging.cc"
-#include "src/runtime/profiling.cc"
#include "src/runtime/rpc/rpc_channel.cc"
#include "src/runtime/rpc/rpc_endpoint.cc"
#include "src/runtime/rpc/rpc_event_impl.cc"
@@ -45,6 +44,7 @@
#include "src/runtime/rpc/rpc_module.cc"
#include "src/runtime/rpc/rpc_session.cc"
#include "src/runtime/tensor.cc"
+#include "src/runtime/timer.cc"
#include "src/runtime/workspace_pool.cc"
// relax setup
#include "3rdparty/tvm-ffi/src/ffi/backtrace.cc"