[Intel-gfx] [PATCH 09/15] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines
From: Sourab Gupta

This patch extends the i915 perf framework to handle perf sample collection for any given gpu engine. In particular, support for collecting the timestamp sample type is added, which can be requested for any engine. With this, for RCS, timestamps and OA reports can be collected together and provided to userspace in separate sample fields. For other engines, the capability to collect timestamps is added.

Note that only a single stream instance can still be opened at any particular time, though that stream may now be opened for any gpu engine, for collection of timestamp samples. So this patch doesn't yet add support for opening multiple concurrent streams, though it lays the groundwork for that support to be added subsequently. Part of this groundwork involves having separate command stream buffers, per engine, for holding the samples generated, and likewise per-engine instances of a few other data structures maintaining per-engine state.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_drv.h | 32 +- drivers/gpu/drm/i915/i915_perf.c | 648 ++- drivers/gpu/drm/i915/i915_reg.h | 2 + include/uapi/drm/i915_drm.h | 7 + 4 files changed, 465 insertions(+), 224 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9da5007..2a31b79 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1797,7 +1797,8 @@ struct i915_perf_stream_ops { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); + void (*command_stream_hook)(struct i915_perf_stream *stream, + struct drm_i915_gem_request *req, u32 tag); }; enum i915_perf_stream_state { @@ -1821,6 +1822,9 @@ struct i915_perf_stream { /* Whether command stream based data collection is enabled */ bool cs_mode; + /* Whether the OA unit is in use */ + bool using_oa; + const struct i915_perf_stream_ops *ops; }; @@ -1850,7 +1854,16 @@ struct i915_oa_ops { struct i915_perf_cs_data_node { struct list_head link; struct drm_i915_gem_request *request; - u32 offset; + + /* Offsets into the GEM obj holding the data */ + u32 start_offset; + u32 oa_offset; + u32 ts_offset; + + /* buffer size corresponding to this entry */ + u32 size; + + /* Other metadata */ u32 ctx_id; u32 pid; u32 tag; @@ -2147,14 +2160,13 @@ struct drm_i915_private { spinlock_t hook_lock; - struct { - struct i915_perf_stream *exclusive_stream; + struct hrtimer poll_check_timer; + struct i915_perf_stream *exclusive_stream; + wait_queue_head_t poll_wq[I915_NUM_ENGINES]; + struct { u32 specific_ctx_id; - struct hrtimer poll_check_timer; - wait_queue_head_t poll_wq; - bool periodic; int period_exponent; int timestamp_frequency; @@ -2197,13 +2209,13 @@ struct drm_i915_private { u8 *addr; #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) u32 status; - } command_stream_buf; + } command_stream_buf[I915_NUM_ENGINES]; u32 last_ctx_id; u32 last_pid; u32 last_tag; - struct list_head node_list; - spinlock_t node_list_lock; + struct list_head node_list[I915_NUM_ENGINES]; + spinlock_t node_list_lock[I915_NUM_ENGINES]; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 902f84f..4a6fc5e 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -84,12 +84,17 @@ static u32 i915_perf_stream_paranoid = true; /* For determining the behavior on
overflow of command stream samples */ #define CMD_STREAM_BUF_OVERFLOW_ALLOWED -/* Data common to periodic and RCS based samples */ -struct oa_sample_data { +#define OA_ADDR_ALIGN 64 +#define TS_ADDR_ALIGN 8 +#define I915_PERF_TS_SAMPLE_SIZE 8 + +/* Data common to all samples (periodic OA / CS based OA / Timestamps) */ +struct sample_data { u32 source; u32 ctx_id; u32 pid; u32 tag; + u64 ts; const u8 *report; }; @@ -147,6 +152,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) +#define
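For illustration, here is a minimal userspace-style sketch of walking one sample's payload under the field ordering this series establishes (source, ctx_id, pid, tag, ts, then the raw OA report). The SAMPLE_* values mirror the driver-internal flag defines visible in the diffs, but the exact record layout is an assumption here; field presence depends on which sample types were requested at stream open time.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Mirrors of the driver-internal sample flags; illustrative only */
    #define SAMPLE_OA_REPORT      (1<<0)
    #define SAMPLE_OA_SOURCE_INFO (1<<1)
    #define SAMPLE_CTX_ID         (1<<2)
    #define SAMPLE_PID            (1<<3)
    #define SAMPLE_TAG            (1<<4)
    #define SAMPLE_TS             (1<<5)

    struct parsed_sample {
        uint32_t source, ctx_id, pid, tag;
        uint64_t ts;
        const uint8_t *report;
    };

    /* Returns a pointer just past this sample's payload */
    static const uint8_t *parse_sample_payload(const uint8_t *buf,
                                               uint32_t flags,
                                               size_t oa_report_size,
                                               struct parsed_sample *s)
    {
        if (flags & SAMPLE_OA_SOURCE_INFO) { memcpy(&s->source, buf, 4); buf += 4; }
        if (flags & SAMPLE_CTX_ID)         { memcpy(&s->ctx_id, buf, 4); buf += 4; }
        if (flags & SAMPLE_PID)            { memcpy(&s->pid, buf, 4);    buf += 4; }
        if (flags & SAMPLE_TAG)            { memcpy(&s->tag, buf, 4);    buf += 4; }
        if (flags & SAMPLE_TS)             { memcpy(&s->ts, buf, 8);     buf += 8; }
        if (flags & SAMPLE_OA_REPORT)      { s->report = buf; buf += oa_report_size; }
        return buf;
    }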
[Intel-gfx] [PATCH 07/15] drm/i915: Add support for having pid output with OA report
From: Sourab Gupta This patch introduces flags and adds support for having pid output with the OA reports generated through the RCS commands. When the stream is opened with pid sample type, the pid information is also captured through the command stream samples and forwarded along with the OA reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 48 +++- include/uapi/drm/i915_drm.h | 7 ++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9d23ca1..60e94e6 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1851,6 +1851,7 @@ struct i915_perf_cs_data_node { struct drm_i915_gem_request *request; u32 offset; u32 ctx_id; + u32 pid; }; struct drm_i915_private { @@ -2197,6 +2198,7 @@ struct drm_i915_private { } command_stream_buf; u32 last_ctx_id; + u32 last_pid; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2496a4b..bb5356f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -88,6 +88,7 @@ static u32 i915_perf_stream_paranoid = true; struct oa_sample_data { u32 source; u32 ctx_id; + u32 pid; const u8 *report; }; @@ -143,6 +144,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_OA_REPORT (1<<0) #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) +#define SAMPLE_PID (1<<3) struct perf_open_properties { u32 sample_flags; @@ -322,6 +324,7 @@ static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req) goto out; entry->ctx_id = ctx->hw_id; + entry->pid = current->pid; i915_gem_request_assign(&entry->request, req); addr = dev_priv->perf.command_stream_buf.vma->node.start + @@ -582,6 +585,12 @@ static int append_oa_sample(struct i915_perf_stream *stream, buf += 4; } + if (sample_flags & SAMPLE_PID) { + if (copy_to_user(buf, &data->pid, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, data->report, report_size)) return -EFAULT; @@ -624,6 +633,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( stream, report); + if (sample_flags & SAMPLE_PID) + data.pid = dev_priv->perf.last_pid; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1092,6 +1104,11 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, dev_priv->perf.last_ctx_id = node->ctx_id; } + if (sample_flags & SAMPLE_PID) { + data.pid = node->pid; + dev_priv->perf.last_pid = node->pid; + } + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1873,6 +1890,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, struct drm_i915_private *dev_priv = stream->dev_priv; bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT | SAMPLE_OA_SOURCE_INFO); + bool require_cs_mode = props->sample_flags & SAMPLE_PID; bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT; int ret; @@ -2005,6 +2023,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, if (props->sample_flags & SAMPLE_CTX_ID) { stream->sample_flags |= SAMPLE_CTX_ID; stream->sample_size += 4; + + /* +* NB: it's meaningful to request SAMPLE_CTX_ID with just CS +* mode or periodic OA mode sampling but we don't allow +* SAMPLE_CTX_ID without either mode +*/ + if (!require_oa_unit) + require_cs_mode = true; + } + 
+ if (require_cs_mode && !props->cs_mode) { + DRM_ERROR("PID sampling requires a ring to be specified"); + ret = -EINVAL; + goto cs_error; } if (props->cs_mode) { @@ -2015,7 +2047,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto cs_error;
[Intel-gfx] [PATCH 04/15] drm/i915: flush periodic samples, in case of no pending CS sample requests
From: Sourab Gupta

When there are no pending CS OA samples, flush the periodic OA samples collected so far. We can safely forward the periodic OA samples when we have no pending CS samples, but we can't do so when we have pending CS samples, since we don't know what the ordering between pending CS samples and periodic samples will eventually be. If we have no pending CS samples, it won't be possible for a future CS sample to have a timestamp earlier than the current periodic timestamp.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_drv.h | 14 ++-- drivers/gpu/drm/i915/i915_perf.c | 173 +-- 2 files changed, 140 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f95b02b..7efdfc2 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1751,7 +1751,7 @@ struct i915_perf_stream_ops { /* Return: true if any i915 perf records are ready to read() * for this stream. */ - bool (*can_read)(struct i915_perf_stream *stream); + bool (*can_read_unlocked)(struct i915_perf_stream *stream); /* Call poll_wait, passing a wait queue that will be woken * once there is something ready to read() for the stream @@ -1763,8 +1763,8 @@ struct i915_perf_stream_ops { /* For handling a blocking read, wait until there is something * to ready to read() for the stream. E.g. wait on the same * wait queue that would be passed to poll_wait() until -* ->can_read() returns true (if its safe to call ->can_read() -* without the i915 perf lock held). +* ->can_read_unlocked() returns true (if its safe to call +* ->can_read_unlocked() without the i915 perf lock held). */ int (*wait_unlocked)(struct i915_perf_stream *stream); @@ -1834,8 +1834,10 @@ struct i915_oa_ops { u32 ctx_id); void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req); int (*read)(struct i915_perf_stream *stream, - struct i915_perf_read_state *read_state, u32 ts); - bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); + struct i915_perf_read_state *read_state, + u32 ts, u32 max_records); + int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, + u32 *last_ts); }; /* @@ -2175,6 +2177,8 @@ struct drm_i915_private { u32 gen7_latched_oastatus1; u32 ctx_oactxctrl_off; u32 ctx_flexeu0_off; + u32 n_pending_periodic_samples; + u32 pending_periodic_ts; struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 42e930f..b53ccf5 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -388,13 +388,30 @@ static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv) * pointers. A race here could result in a false positive !empty status which * is acceptable.
*/ -static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv) +static int +gen8_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, + u32 *last_ts) { int report_size = dev_priv->perf.oa.oa_buffer.format_size; - u32 head = I915_READ(GEN8_OAHEADPTR); - u32 tail = I915_READ(GEN8_OATAILPTR); + u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr; + u32 head = I915_READ(GEN8_OAHEADPTR) & GEN8_OAHEADPTR_MASK; + u32 tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK; + u32 mask = (OA_BUFFER_SIZE - 1); + u32 num_samples; + u8 *report; + + head -= dev_priv->perf.oa.oa_buffer.gtt_offset; + tail -= dev_priv->perf.oa.oa_buffer.gtt_offset; + num_samples = OA_TAKEN(tail, head) / report_size; - return OA_TAKEN(tail, head) < report_size; + /* read the timestamp of the last sample */ + if (num_samples) { + head += report_size*(num_samples - 1); + report = oa_buf_base + (head & mask); + *last_ts = *(u32 *)(report + 4); + } + + return num_samples; } /* NB: This is either called via fops or the poll check hrtimer (atomic ctx) @@ -408,16 +425,32 @@ static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_pr * pointers. A race here could result in a false positive !empty status which * is acceptable. */ -static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv) +static int +gen7_oa_buffer_num_
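For reference, a stand-alone sketch (plain C, not the driver code) of the ordering rule the commit message describes: periodic OA samples may only be forwarded up to the timestamp of the oldest still-pending CS sample; with no CS samples pending, everything collected so far can be flushed.

    #include <stdint.h>

    struct pending_cs { uint32_t ts; struct pending_cs *next; };

    /* Returns the newest periodic timestamp that is safe to forward. */
    static uint32_t periodic_flush_limit(const struct pending_cs *oldest_pending,
                                         uint32_t newest_periodic_ts)
    {
        /* No pending CS samples: every periodic sample collected so far
         * can be forwarded; no future CS sample can predate them. */
        if (!oldest_pending)
            return newest_periodic_ts;

        /* Otherwise hold back periodic samples newer than the oldest
         * pending CS sample, since the final ordering between the two
         * streams is not yet known. */
        return oldest_pending->ts;
    }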
[Intel-gfx] [PATCH 06/15] drm/i915: Populate ctx ID for periodic OA reports
From: Sourab Gupta

This adds support for populating the ctx id for the periodic OA reports when requested through the corresponding property.

For Gen8, the OA reports themselves contain the ctx ID, which is the one programmed into the HW while submitting workloads; thus it's retrieved from the reports themselves. For Gen7, the OA reports don't have any such field, so we populate this field with the last ctx ID seen while sending CS reports.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/i915_perf.c | 52 +--- 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 8cce8bd..9d23ca1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1838,6 +1838,8 @@ struct i915_oa_ops { u32 ts, u32 max_records); int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, u32 *last_ts); + u32 (*oa_buffer_get_ctx_id)(struct i915_perf_stream *stream, + const u8 *report); }; /* @@ -2194,6 +2196,7 @@ struct drm_i915_private { u32 status; } command_stream_buf; + u32 last_ctx_id; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index a9cf103..2496a4b 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -485,6 +485,46 @@ gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, return num_samples; } +static u32 gen7_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + /* +* OA reports generated in Gen7 don't have the ctx ID information. +* Therefore, just rely on the ctx ID information from the last CS +* sample forwarded +*/ + return dev_priv->perf.last_ctx_id; +} + +static u32 gen8_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + /* The ctx ID present in the OA reports have intel_context::global_id +* present, since this is programmed into the ELSP in execlist mode. +* In non-execlist mode, fall back to retrieving the ctx ID from the +* last saved ctx ID from command stream mode. +*/ + if (i915.enable_execlists) { + u32 ctx_id = *(u32 *)(report + 12); + ctx_id &= 0xf; + return ctx_id; + } else { + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + return dev_priv->perf.last_ctx_id; + } +} + /** * Appends a status record to a userspace read() buffer.
*/ @@ -580,9 +620,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.source = source; } -#warning "FIXME: append_oa_buffer_sample: read ctx ID from report and map that to an intel_context::global_id" if (sample_flags & SAMPLE_CTX_ID) - data.ctx_id = 0; + data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( + stream, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1047,8 +1087,10 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_RCS; - if (sample_flags & SAMPLE_CTX_ID) + if (sample_flags & SAMPLE_CTX_ID) { data.ctx_id = node->ctx_id; + dev_priv->perf.last_ctx_id = node->ctx_id; + } if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -2743,6 +2785,8 @@ void i915_perf_init(struct drm_device *dev) dev_priv->perf.oa.ops.read = gen7_oa_read; dev_priv->perf.oa.ops.oa_buffer_num_samples = gen7_oa_buffer_num_samples_fop_unlocked; + dev_priv->perf.oa.ops.oa_buffer_get_ctx_id = + gen7_oa_buffer_get_ctx_id; dev_priv->perf.oa.timestamp_frequency = 1250; @@ -2760,6 +2804,8 @@ void i915_perf_init(struct drm_device
[Intel-gfx] [PATCH 01/15] drm/i915: Add ctx getparam ioctl parameter to retrieve ctx unique id
From: Sourab Gupta

This patch adds a new ctx getparam ioctl parameter, which userspace can use to retrieve a ctx's unique id. Userspace can use this to map i915 perf samples to their particular contexts, since the samples carry these ctx unique ids. Otherwise userspace has no way of maintaining this association, since it only knows the per-drm-file ctx handles.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_gem_context.c | 3 +++ include/uapi/drm/i915_drm.h | 1 + 2 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index e974451..09f5178 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1001,6 +1001,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, else args->value = to_i915(dev)->ggtt.base.total; break; + case I915_CONTEXT_PARAM_HW_ID: + args->value = ctx->hw_id; + break; default: ret = -EINVAL; break; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 4a1bcfd8..0badc16 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1171,6 +1171,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1 #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE 0x3 +#define I915_CONTEXT_PARAM_HW_ID 0x4 __u64 value; }; -- 1.9.1
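For illustration, a minimal userspace sketch of querying the unique id, assuming the uapi addition above. fd is an open DRM fd and ctx_handle a context created on it; error handling is trimmed.

    #include <stdint.h>
    #include <xf86drm.h>   /* drmIoctl(), from libdrm */
    #include <i915_drm.h>

    /* Query the ctx unique id (hw_id) for a given ctx handle, using the
     * I915_CONTEXT_PARAM_HW_ID parameter added by this patch. */
    static int query_ctx_hw_id(int fd, uint32_t ctx_handle, uint64_t *hw_id)
    {
        struct drm_i915_gem_context_param p = {
            .ctx_id = ctx_handle,
            .param = I915_CONTEXT_PARAM_HW_ID,
        };
        int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);

        if (ret == 0)
            *hw_id = p.value;
        return ret;
    }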
[Intel-gfx] [PATCH 12/15] time: Expose current clocksource in use by timekeeping framework
From: Sourab Gupta

For drivers to be able to use the cross timestamp framework, they need to know the current clocksource being used by kernel timekeeping. This is needed since the callback a driver passes to get_device_system_crosststamp(), in order to synchronously read the device time and system counter value, requires knowledge of the clocksource being used to read the system counter value (as part of struct system_counterval_t).

Signed-off-by: Sourab Gupta
---
 include/linux/timekeeping.h | 5 + kernel/time/timekeeping.c | 12 2 files changed, 17 insertions(+)

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 96f37be..d5a8cd6 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -320,6 +320,11 @@ extern int get_device_system_crosststamp( struct system_device_crosststamp *xtstamp); /* + * Get current clocksource used by system timekeeping framework + */ +struct clocksource *get_current_clocksource(void); + +/* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25c..e92d466 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1133,6 +1133,18 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * get_current_clocksource - Returns the current clocksource in use by tk_core + * + */ +struct clocksource *get_current_clocksource(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->tkr_mono.clock; +} +EXPORT_SYMBOL_GPL(get_current_clocksource); + +/** * do_gettimeofday - Returns the time of day in a timeval * @tv:pointer to the timeval to be set * -- 1.9.1
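To illustrate why the clocksource is needed, here is a hypothetical driver callback for get_device_system_crosststamp(), sketching the role described above: the system counter value it returns must be tagged with the clocksource it was read from. my_read_gpu_time() and my_read_syscounter() are stand-ins for device-specific code, not real functions.

    #include <linux/timekeeping.h>
    #include <linux/clocksource.h>

    extern ktime_t my_read_gpu_time(void *ctx);    /* stand-in */
    extern cycle_t my_read_syscounter(void);       /* stand-in */

    static int my_get_time_fn(ktime_t *device_time,
                              struct system_counterval_t *system_counterval,
                              void *ctx)
    {
        /* Read device time and the raw system counter back-to-back */
        *device_time = my_read_gpu_time(ctx);
        system_counterval->cycles = my_read_syscounter();
        /* Tag the counter value with the clocksource in use, so the
         * timekeeping core can convert it to system time. */
        system_counterval->cs = get_current_clocksource();
        return 0;
    }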
[Intel-gfx] [PATCH 13/15] time: export clocks_calc_mult_shift
From: Sourab Gupta

Exporting clocks_calc_mult_shift is helpful for drivers to calculate the mult/shift values for their clocks, given their frequency. This is particularly useful when such drivers want to associate a timecounter/cyclecounter abstraction with their clock sources, in order to use the cross timestamp infrastructure for syncing device time with system time.

Signed-off-by: Sourab Gupta
---
 kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 56ece14..fef256f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]- * curr_clocksource: -- 1.9.1
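A sketch of the intended use: derive cyclecounter mult/shift for a device clock from its frequency. The 12.5MHz timestamp clock, conversion to nanoseconds, and the ~1 hour maximum interval between conversions are illustrative assumptions, not values mandated by the patch.

    #include <linux/clocksource.h>
    #include <linux/time64.h>

    static void setup_cyclecounter_scale(struct cyclecounter *cc, u32 freq_hz)
    {
        /* from = counter frequency (Hz), to = nanoseconds,
         * maxsec = 3600 bounds the conversion interval without overflow */
        clocks_calc_mult_shift(&cc->mult, &cc->shift, freq_hz,
                               NSEC_PER_SEC, 3600);
    }

    /* e.g. setup_cyclecounter_scale(&cc, 12500000) for a 12.5MHz clock */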
[Intel-gfx] [PATCH 03/15] drm/i915: Framework for capturing command stream based OA reports
From: Sourab Gupta

This patch introduces a framework to enable OA counter reports associated with the Render command stream. We can then associate the reports captured through this mechanism with their corresponding context ids. This can be further extended to associate any other metadata information with the corresponding samples (since the association with the Render command stream gives us the ability to capture this information while inserting the corresponding capture commands into the command stream).

The OA reports generated in this way are associated with a corresponding workload, and thus can be used to delimit the workload (i.e. sample the counters at the workload boundaries), within an ongoing stream of periodic counter snapshots.

There may be usecases wherein we need more than the periodic OA capture mode which is supported currently. The CS based capture mode is primarily used for two usecases:
- Ability to capture system wide metrics, along with the ability to map the reports back to individual contexts (particularly for HSW).
- Ability to inject tags for work, into the reports. This provides visibility into the multiple stages of work within a single context.

The userspace will be able to distinguish between the periodic and CS based OA reports by virtue of the source_info sample field.

The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA counters, and is inserted at BB boundaries. The data thus captured will be stored in a separate buffer, different from the buffer used otherwise for the periodic OA capture mode. The metadata information pertaining to a snapshot is maintained in a list, which also has offsets into the gem buffer object per captured snapshot. In order to track whether the gpu has completed processing the node, a field pertaining to the corresponding gem request is added, which is tracked for completion of the command.

Both periodic and RCS based reports are associated with a single stream (corresponding to the render engine), and are expected to have their samples in sequential order according to their timestamps. Now, since these reports are collected in separate buffers, they are merge sorted at the time of forwarding to userspace during the read call.

v2: Aligning with the non-perf interface (custom drm ioctl based). Also, a few related patches are squashed together for better readability

Signed-off-by: Sourab Gupta
Signed-off-by: Robert Bragg
---
 drivers/gpu/drm/i915/i915_drv.h| 44 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 + drivers/gpu/drm/i915/i915_perf.c | 871 - drivers/gpu/drm/i915/intel_lrc.c | 4 + include/uapi/drm/i915_drm.h| 15 + 5 files changed, 804 insertions(+), 134 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 82622c4..f95b02b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1791,6 +1791,18 @@ struct i915_perf_stream_ops { * The stream will always be disabled before this is called. */ void (*destroy)(struct i915_perf_stream *stream); + + /* +* Routine to emit the commands in the command streamer associated +* with the corresponding gpu engine.
+*/ + void (*command_stream_hook)(struct drm_i915_gem_request *req); +}; + +enum i915_perf_stream_state { + I915_PERF_STREAM_DISABLED, + I915_PERF_STREAM_ENABLE_IN_PROGRESS, + I915_PERF_STREAM_ENABLED, }; struct i915_perf_stream { @@ -1798,11 +1810,15 @@ struct i915_perf_stream { struct list_head link; + enum intel_engine_id engine; u32 sample_flags; int sample_size; struct intel_context *ctx; - bool enabled; + enum i915_perf_stream_state state; + + /* Whether command stream based data collection is enabled */ + bool cs_mode; const struct i915_perf_stream_ops *ops; }; @@ -1818,10 +1834,21 @@ struct i915_oa_ops { u32 ctx_id); void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req); int (*read)(struct i915_perf_stream *stream, - struct i915_perf_read_state *read_state); + struct i915_perf_read_state *read_state, u32 ts); bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); }; +/* + * List element to hold info about the perf sample data associated + * with a particular GPU command stream. + */ +struct i915_perf_cs_data_node { + struct list_head link; + struct drm_i915_gem_request *request; + u32 offset; + u32 ctx_id; +}; + struct drm_i915_private { struct drm_device *dev; struct kmem_cache *objects; @@ -2107,6 +2134,8 @@ struct drm_i915_private { struct ctl_table_header *sysctl_header; struct mutex lock; + + struct mutex streams
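An illustrative sketch (plain C, not the driver implementation) of the merge described in the commit message: the periodic and CS sample lists are each ordered by timestamp, so forwarding repeatedly takes whichever head sample is older, like the merge step of merge sort. Wraparound handling for the 32-bit timestamps is elided here for clarity.

    #include <stddef.h>
    #include <stdint.h>

    struct ts_sample { uint32_t ts; /* ...payload... */ };

    static size_t merge_by_ts(const struct ts_sample *periodic, size_t n_per,
                              const struct ts_sample *cs, size_t n_cs,
                              struct ts_sample *out)
    {
        size_t i = 0, j = 0, k = 0;

        /* Take the older head sample first, favoring periodic on ties */
        while (i < n_per && j < n_cs)
            out[k++] = (periodic[i].ts <= cs[j].ts) ? periodic[i++] : cs[j++];
        while (i < n_per)
            out[k++] = periodic[i++];
        while (j < n_cs)
            out[k++] = cs[j++];
        return k;
    }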
[Intel-gfx] [PATCH 14/15] drm/i915: Mechanism to forward clock monotonic raw time in perf samples
From: Sourab Gupta

Currently, we have the ability to only forward the GPU timestamps in the samples (which are generated via OA reports or PIPE_CONTROL commands inserted in the ring). This limits the ability to correlate these samples with system events. If we scale the GPU timestamps according to the timestamp base/frequency info present in bspec, it is observed that the timestamps drift really quickly from the system time.

An ability is therefore needed to report timestamps in different clock domains, such as CLOCK_MONOTONIC (or _MONO_RAW), in the perf samples to be of more practical use to the userspace. This ability becomes important when we want to correlate/plot GPU events/samples with other system events on the same timeline (e.g. vblank events, or timestamps when work was submitted to kernel, etc.)

The patch here proposes a mechanism to achieve this. The correlation between gpu time and system time is established using the cross timestamp framework. For this purpose, the timestamp clock associated with the command stream is abstracted as a timecounter/cyclecounter, before utilizing the cross timestamp framework to retrieve gpu/system time correlated values. Different such gpu/system time values are then used to detect and correct the error in the published gpu timestamp clock frequency. The userspace can request CLOCK_MONOTONIC_RAW timestamps in samples by requesting the corresponding property while opening the stream.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_dma.c | 2 + drivers/gpu/drm/i915/i915_drv.h | 24 +++- drivers/gpu/drm/i915/i915_perf.c | 273 +++ include/uapi/drm/i915_drm.h | 9 +- 4 files changed, 284 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c index ab1f6c4..01f3559 100644 --- a/drivers/gpu/drm/i915/i915_dma.c +++ b/drivers/gpu/drm/i915/i915_dma.c @@ -1327,6 +1327,8 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv) DRM_DEBUG_DRIVER("can't enable MSI"); } + i915_perf_init_late(dev_priv); + return 0; out_ggtt: diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 9ccac83..d99ea73 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include #include @@ -1825,6 +1828,9 @@ struct i915_perf_stream { /* Whether the OA unit is in use */ bool using_oa; + /* monotonic_raw clk timestamp (in ns) for last sample */ + u64 last_sample_ts; + const struct i915_perf_stream_ops *ops; }; @@ -1869,6 +1875,20 @@ struct i915_perf_cs_data_node { u32 tag; }; +/** + * struct i915_clock_info - describes i915 timestamp clock + * + */ +struct i915_clock_info { + struct cyclecounter cc; + struct timecounter tc; + struct system_device_crosststamp xtstamp; + ktime_t clk_offset; /* Offset (in ns) between monoraw clk and gpu time */ + u32 timestamp_frequency; + u32 resync_period; /* in msecs */ + struct delayed_work clk_sync_work; +}; + struct drm_i915_private { struct drm_device *dev; struct kmem_cache *objects; @@ -2147,6 +2167,8 @@ struct drm_i915_private { struct i915_runtime_pm pm; + struct i915_clock_info ts_clk_info; + struct { bool initialized; @@ -2169,7 +2191,6 @@ struct drm_i915_private { bool periodic; int period_exponent; - int timestamp_frequency; int tail_margin; @@ -3699,6 +3720,7 @@ int i915_parse_cmds(struct intel_engine_cs *engine, /* i915_perf.c */ extern void i915_perf_init(struct drm_device *dev); +extern void i915_perf_init_late(struct drm_i915_private
*dev_priv); extern void i915_perf_fini(struct drm_device *dev); /* i915_suspend.c */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index aa3589e..e340cf9f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -23,6 +23,7 @@ #include #include +#include #include "i915_drv.h" #include "intel_ringbuffer.h" @@ -62,6 +63,9 @@ #define POLL_FREQUENCY 200 #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) +#define MAX_CLK_SYNC_PERIOD (60*MSEC_PER_SEC) +#define INIT_CLK_SYNC_PERIOD (20) /* in msecs */ + static u32 i915_perf_stream_paranoid = true; /* The maximum exponent the hardware accepts is 63 (essentially it selects one @@ -88,13 +92,24 @@ static u32 i915_perf_stream_paranoid = true; #define TS_ADDR_ALIGN 8 #define I915_PERF_TS_SAMPLE_SIZE 8 +/* Published frequency of GT command stream timestamp clock */ +#define FREQUENCY_12_5_MHZ (1250) +#define FREQUENCY_12_0_MHZ (1200) +#define FREQUEN
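To make the abstraction in the commit message concrete, here is a hedged sketch of wrapping the GPU timestamp counter as a cyclecounter with a timecounter on top, which turns raw cycles into a monotonically extended nanosecond value. my_read_timestamp_register() is a stand-in for the actual register read; the 36-bit counter width and the mult/shift parameters are illustrative assumptions.

    #include <linux/clocksource.h>
    #include <linux/timecounter.h>
    #include <linux/ktime.h>

    extern u64 my_read_timestamp_register(void);    /* stand-in */

    static cycle_t read_gpu_ts(const struct cyclecounter *cc)
    {
        return my_read_timestamp_register();
    }

    static void init_gpu_timecounter(struct cyclecounter *cc,
                                     struct timecounter *tc, u32 freq_hz)
    {
        cc->read = read_gpu_ts;
        cc->mask = CLOCKSOURCE_MASK(36);    /* assumed counter width */
        clocks_calc_mult_shift(&cc->mult, &cc->shift, freq_hz,
                               NSEC_PER_SEC, 3600);
        /* Start counting nanoseconds from 'now' */
        timecounter_init(tc, cc, ktime_to_ns(ktime_get()));
    }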
[Intel-gfx] [PATCH 05/15] drm/i915: Handle the overflow condition for command stream buf
From: Sourab Gupta Add a compile time option for detecting the overflow condition of command stream buffer, and not overwriting the old entries in such a case. Also, set a status flag to forward the overflow condition to userspace if overflow is detected. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 75 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7efdfc2..8cce8bd 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2190,6 +2190,8 @@ struct drm_i915_private { struct drm_i915_gem_object *obj; struct i915_vma *vma; u8 *addr; +#define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) + u32 status; } command_stream_buf; struct list_head node_list; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index b53ccf5..a9cf103 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -81,6 +81,9 @@ static u32 i915_perf_stream_paranoid = true; #define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) #define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) +/* For determining the behavior on overflow of command stream samples */ +#define CMD_STREAM_BUF_OVERFLOW_ALLOWED + /* Data common to periodic and RCS based samples */ struct oa_sample_data { u32 source; @@ -182,6 +185,7 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request) mutex_unlock(&dev_priv->perf.streams_lock); } +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED /* * Release some perf entries to make space for a new entry data. We dereference * the associated request before deleting the entry. Also, no need to check for @@ -208,25 +212,26 @@ static void release_some_perf_entries(struct drm_i915_private *dev_priv, break; } } +#endif /* - * Insert the perf entry to the end of the list. This function never fails, - * since it always manages to insert the entry. If the space is exhausted in - * the buffer, it will remove the oldest entries in order to make space. + * Insert the perf entry to the end of the list. If the overwrite of old entries + * is allowed, the function always manages to insert the entry and returns 0. + * If overwrite is not allowed, on detection of overflow condition, an + * appropriate status flag is set, and function returns -ENOSPC. */ -static void insert_perf_entry(struct drm_i915_private *dev_priv, +static int insert_perf_entry(struct drm_i915_private *dev_priv, struct i915_perf_cs_data_node *entry) { struct i915_perf_cs_data_node *first_entry, *last_entry; int max_offset = dev_priv->perf.command_stream_buf.obj->base.size; u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size; + int ret = 0; spin_lock(&dev_priv->perf.node_list_lock); if (list_empty(&dev_priv->perf.node_list)) { entry->offset = 0; - list_add_tail(&entry->link, &dev_priv->perf.node_list); - spin_unlock(&dev_priv->perf.node_list_lock); - return; + goto out; } first_entry = list_first_entry(&dev_priv->perf.node_list, @@ -244,29 +249,49 @@ static void insert_perf_entry(struct drm_i915_private *dev_priv, */ else if (entry_size < first_entry->offset) entry->offset = 0; - /* Insufficient space. 
Overwrite existing old entries */ + /* Insufficient space */ else { +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED u32 target_size = entry_size - first_entry->offset; release_some_perf_entries(dev_priv, target_size); entry->offset = 0; +#else + dev_priv->perf.command_stream_buf.status |= + I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; + ret = -ENOSPC; + goto out_unlock; +#endif } } else { /* Sufficient space available? */ if (last_entry->offset + 2*entry_size < first_entry->offset) entry->offset = last_entry->offset + entry_size; - /* Insufficient space. Overwrite existing old entries */ + /* Insufficient space */ else { +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED u32 target_size = entry_size - (first_entry->offset - last_entry->offset - entry_si
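A condensed sketch of the insert-time decision above: with CMD_STREAM_BUF_OVERFLOW_ALLOWED defined, old entries are released to make room; without it, the overflow status bit is latched and the insert fails with -ENOSPC, leaving existing samples intact. struct cs_buf, space_available() and release_oldest_entries() are stand-ins for the driver's circular-buffer bookkeeping, not the actual code.

    #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0)

    struct cs_buf { u32 status; /* ...head/tail bookkeeping... */ };

    extern int space_available(struct cs_buf *buf, u32 size);          /* stand-in */
    extern void release_oldest_entries(struct cs_buf *buf, u32 size);  /* stand-in */

    static int reserve_entry_space(struct cs_buf *buf, u32 entry_size)
    {
        if (space_available(buf, entry_size))
            return 0;
    #ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED
        release_oldest_entries(buf, entry_size);  /* overwrite old data */
        return 0;
    #else
        buf->status |= I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW;
        return -ENOSPC;                           /* keep old data intact */
    #endif
    }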
[Intel-gfx] [PATCH 10/15] drm/i915: Extract raw GPU timestamps from OA reports to forward in perf samples
From: Sourab Gupta The OA reports contain the least significant 32 bits of the gpu timestamp. This patch enables retrieval of the timestamp field from OA reports, to forward as 64 bit raw gpu timestamps in the perf samples. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_perf.c | 46 ++-- drivers/gpu/drm/i915/i915_reg.h | 4 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 2a31b79..a9a123b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2196,6 +2196,7 @@ struct drm_i915_private { u32 ctx_flexeu0_off; u32 n_pending_periodic_samples; u32 pending_periodic_ts; + u64 last_gpu_ts; struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 4a6fc5e..65b4af6 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -781,6 +781,24 @@ static int append_sample(struct i915_perf_stream *stream, return 0; } +static u64 get_gpu_ts_from_oa_report(struct drm_i915_private *dev_priv, + const u8 *report) +{ + u32 sample_ts = *(u32 *)(report + 4); + u32 delta; + + /* +* NB: We have to assume we're updating last_gpu_ts frequently +* enough that it's never possible to see multiple overflows before +* we compare sample_ts to last_gpu_ts. Since this is significantly +* large duration (~6min for 80ns ts base), we can safely assume so. +*/ + delta = sample_ts - (u32)dev_priv->perf.oa.last_gpu_ts; + dev_priv->perf.oa.last_gpu_ts += delta; + + return dev_priv->perf.oa.last_gpu_ts; +} + static int append_oa_buffer_sample(struct i915_perf_stream *stream, struct i915_perf_read_state *read_state, const u8 *report) @@ -817,10 +835,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_TAG) data.tag = dev_priv->perf.last_tag; - /* Derive timestamp from OA report, after scaling with the ts base */ -#warning "FIXME: append_oa_buffer_sample: derive the timestamp from OA report" + /* Derive timestamp from OA report */ if (sample_flags & SAMPLE_TS) - data.ts = 0; + data.ts = get_gpu_ts_from_oa_report(dev_priv, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1272,6 +1289,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, enum intel_engine_id id = stream->engine; struct sample_data data = { 0 }; u32 sample_flags = stream->sample_flags; + u64 gpu_ts = 0; int ret = 0; if (sample_flags & SAMPLE_OA_REPORT) { @@ -1288,6 +1306,9 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, U32_MAX); if (ret) return ret; + + if (sample_flags & SAMPLE_TS) + gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report); } if (sample_flags & SAMPLE_OA_SOURCE_INFO) @@ -1309,17 +1330,14 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, } if (sample_flags & SAMPLE_TS) { - /* For RCS, if OA samples are also being collected, derive the -* timestamp from OA report, after scaling with the TS base. + /* If OA sampling is enabled, derive the ts from OA report. * Else, forward the timestamp collected via command stream. 
*/ -#warning "FIXME: append_one_cs_sample: derive the timestamp from OA report" - if (sample_flags & SAMPLE_OA_REPORT) - data.ts = 0; - else - data.ts = *(u64 *) + if (!(sample_flags & SAMPLE_OA_REPORT)) + gpu_ts = *(u64 *) (dev_priv->perf.command_stream_buf[id].addr + node->ts_offset); + data.ts = gpu_ts; } return append_sample(stream, read_state, &data); @@ -2055,9 +2073,15 @@ static void i915_ring_stream_enable(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - if (stream->sample_flags & SAMPLE_OA_REPORT) + if (stream->sample_flags & SAMPLE_OA_REPORT) { dev_priv->perf.oa
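A stand-alone sketch of the timestamp-widening technique used in get_gpu_ts_from_oa_report() above: unsigned 32-bit subtraction yields the correct forward delta across a single wraparound, so accumulating deltas into a u64 extends the 32-bit report timestamp, provided updates happen at least once per wrap period (2^32 * 80ns, roughly 343 seconds at an 80ns tick).

    #include <stdint.h>

    static uint64_t extend_ts32(uint64_t *last_ts64, uint32_t sample_ts32)
    {
        /* Modulo-2^32 subtraction: correct across one wraparound */
        uint32_t delta = sample_ts32 - (uint32_t)*last_ts64;

        *last_ts64 += delta;
        return *last_ts64;
    }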
[Intel-gfx] [PATCH 15/15] drm/i915: Support for capturing MMIO register values
From: Sourab Gupta

This patch adds support for capturing MMIO register values through the i915 perf interface. The userspace can request up to 8 MMIO register values to be dumped. The addresses of these registers can be passed through the corresponding property 'value' field while opening the stream. The commands to dump the values of these MMIO registers are then inserted into the ring along with other commands.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_drv.h | 4 + drivers/gpu/drm/i915/i915_perf.c | 173 ++- include/uapi/drm/i915_drm.h | 14 3 files changed, 188 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d99ea73..bfa52dc 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1865,6 +1865,7 @@ struct i915_perf_cs_data_node { u32 start_offset; u32 oa_offset; u32 ts_offset; + u32 mmio_offset; /* buffer size corresponding to this entry */ u32 size; @@ -2186,6 +2187,9 @@ struct drm_i915_private { struct i915_perf_stream *ring_stream[I915_NUM_ENGINES]; wait_queue_head_t poll_wq[I915_NUM_ENGINES]; + u32 num_mmio; + u32 mmio_list[I915_PERF_MMIO_NUM_MAX]; + struct { u32 specific_ctx_id; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index e340cf9f..e661d8d 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -111,6 +111,7 @@ struct sample_data { u64 gpu_ts; u64 clk_monoraw; const u8 *report; + const u8 *mmio; }; /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ @@ -169,6 +170,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_TAG (1<<4) #define SAMPLE_TS (1<<5) #define SAMPLE_CLK_MONO_RAW(1<<6) +#define SAMPLE_MMIO(1<<7) struct perf_open_properties { u32 sample_flags; @@ -401,6 +403,9 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv, sample_ts = true; } + if (sample_flags & SAMPLE_MMIO) + entry_size += 4*dev_priv->perf.num_mmio; + spin_lock(&dev_priv->perf.node_list_lock[id]); if (list_empty(&dev_priv->perf.node_list[id])) { offset = 0; @@ -478,6 +483,10 @@ out: entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN); offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE; } + if (sample_flags & SAMPLE_MMIO) { + entry->mmio_offset = offset; + offset = entry->mmio_offset + 4*dev_priv->perf.num_mmio; + } list_add_tail(&entry->link, &dev_priv->perf.node_list[id]); #ifndef CMD_STREAM_BUF_OVERFLOW_ALLOWED @@ -623,6 +632,66 @@ static int i915_ring_stream_capture_ts(struct drm_i915_gem_request *req, return 0; } +static int i915_ring_stream_capture_mmio(struct drm_i915_gem_request *req, + u32 offset) +{ + struct intel_engine_cs *engine = req->engine; + struct intel_ringbuffer *ringbuf = req->ringbuf; + struct drm_i915_private *dev_priv = engine->dev->dev_private; + int num_mmio = dev_priv->perf.num_mmio; + u32 mmio_addr, addr = 0; + int ret, i; + + ret = intel_ring_begin(req, 4*num_mmio); + if (ret) + return ret; + + mmio_addr = + dev_priv->perf.command_stream_buf[engine->id].vma->node.start + + offset; + + if (i915.enable_execlists) { + for (i = 0; i < num_mmio; i++) { + uint32_t cmd; + + addr = mmio_addr + 4*i; + + cmd = MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT; + + intel_logical_ring_emit(ringbuf, cmd); + intel_logical_ring_emit(ringbuf, + dev_priv->perf.mmio_list[i]); + intel_logical_ring_emit(ringbuf, addr); + intel_logical_ring_emit(ringbuf, 0); + } + intel_logical_ring_advance(ringbuf); + } else { + for (i = 0; i < num_mmio; i++) {
uint32_t cmd; + + addr = mmio_addr + 4*i; + + if (INTEL_INFO(engine->dev)->gen >= 8) + cmd = MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT; + else + cmd = MI_STOR
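An illustrative userspace sketch for the consumer side: interpret the SAMPLE_MMIO payload as num_mmio little-endian dwords, one per register requested at open time. That the mmio array sits at a known offset within a sample record is an assumption here; the exact record layout is defined by the driver's append order.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void print_mmio_values(const uint8_t *mmio_field,
                                  const uint32_t *mmio_addrs, int num_mmio)
    {
        for (int i = 0; i < num_mmio; i++) {
            uint32_t val;

            /* 4 bytes per register, in the order requested at open time */
            memcpy(&val, mmio_field + 4 * i, sizeof(val));
            printf("reg 0x%08x = 0x%08x\n", mmio_addrs[i], val);
        }
    }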
[Intel-gfx] [PATCH 00/15] Framework to collect command stream gpu metrics using i915 perf
From: Sourab Gupta

This series adds a framework for collection of gpu performance metrics associated with the command stream of a particular engine. These metrics include OA reports, timestamps, mmio metrics, etc. These metrics are collected around batchbuffer boundaries.

This work utilizes the underlying infrastructure introduced in Robert Bragg's patches for collecting periodic OA counter snapshots (based on Haswell): https://lists.freedesktop.org/archives/intel-gfx/2016-April/093206.html

This patch set is based on the Gen8+ version of Robert's patches, which can be found here: https://github.com/rib/linux/tree/wip/rib/oa-2016-05-05-nightly These are not yet individually floated in the mailing list, which I hope doesn't lead to any significant loss of clarity in reviewing the work proposed in this patch series.

Compared to the last series I floated earlier (https://lists.freedesktop.org/archives/intel-gfx/2016-April/093645.html), this series incorporates the following changes/fixes, besides rebasing on Robert's latest work (on a later nightly):

* Based on Chris's suggestion, I have tried experimenting with using the cross timestamp framework for the purpose of retrieving tightly coupled device/system timestamps. In our case, this framework enables us to have correlated pairs of gpu+system time which can be used over a period of time to correct the frequency of the timestamp clock, and thus enable us to accurately send system time (_MONO_RAW) as requested to the userspace. The results are generally observed to be quite a bit better with the use of cross timestamps, and the frequency delta gradually tapers down to 0 with increasing correction periods. The use of the cross timestamp framework though requires us to have a cyclecounter/timecounter abstraction for the timestamp clocksource, and further requires a few changes in the kernel timekeeping/clocksource code. I am looking for feedback on the use of this framework and the changes involved.
These patches can be found for viewing at https://github.com/sourabgu/linux/tree/oa-2016-05-05

Sourab Gupta (15):
  drm/i915: Add ctx getparam ioctl parameter to retrieve ctx unique id
  drm/i915: Expose OA sample source to userspace
  drm/i915: Framework for capturing command stream based OA reports
  drm/i915: flush periodic samples, in case of no pending CS sample requests
  drm/i915: Handle the overflow condition for command stream buf
  drm/i915: Populate ctx ID for periodic OA reports
  drm/i915: Add support for having pid output with OA report
  drm/i915: Add support for emitting execbuffer tags through OA counter reports
  drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines
  drm/i915: Extract raw GPU timestamps from OA reports to forward in perf samples
  drm/i915: Support opening multiple concurrent perf streams
  time: Expose current clocksource in use by timekeeping framework
  time: export clocks_calc_mult_shift
  drm/i915: Mechanism to forward clock monotonic raw time in perf samples
  drm/i915: Support for capturing MMIO register values

 drivers/gpu/drm/i915/i915_dma.c|2 + drivers/gpu/drm/i915/i915_drv.h| 117 +- drivers/gpu/drm/i915/i915_gem_context.c|3 + drivers/gpu/drm/i915/i915_gem_execbuffer.c |5 + drivers/gpu/drm/i915/i915_perf.c | 1925 +--- drivers/gpu/drm/i915/i915_reg.h|6 + include/linux/timekeeping.h|5 + include/uapi/drm/i915_drm.h| 79 ++ kernel/time/clocksource.c |1 + kernel/time/timekeeping.c | 12 + 11 files changed, 1958 insertions(+), 201 deletions(-) -- 1.9.1
[Intel-gfx] [PATCH 02/15] drm/i915: Expose OA sample source to userspace
From: Sourab Gupta

This patch exposes a new sample source field to userspace. This field can be populated to specify the origin of the OA report. E.g. for internally triggered reports (non MI_RPC reports), the RPT_ID field has bitfields for specifying the origin such as timer, or render ctx switch, etc. Likewise, this field can be used to specify the source as MI_RPC when such support is added.

Signed-off-by: Sourab Gupta
Signed-off-by: Robert Bragg
---
 drivers/gpu/drm/i915/i915_perf.c | 55 ++-- include/uapi/drm/i915_drm.h | 16 2 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index bb2e44a..a557a82 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -74,6 +74,13 @@ static u32 i915_perf_stream_paranoid = true; */ #define OA_EXPONENT_MAX 31 +#define GEN8_OAREPORT_REASON_TIMER (1<<19) +#define GEN8_OAREPORT_REASON_TRIGGER1 (1<<20) +#define GEN8_OAREPORT_REASON_TRIGGER2 (1<<21) +#define GEN8_OAREPORT_REASON_CTX_SWITCH (1<<22) +#define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) +#define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) + /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ static int zero; static int oa_exponent_max = OA_EXPONENT_MAX; @@ -113,7 +120,8 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { [I915_OA_FORMAT_C4_B8] = { 7, 64 }, }; -#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_SOURCE_INFO (1<<1) struct perf_open_properties { u32 sample_flags; @@ -216,6 +224,27 @@ static int append_oa_sample(struct i915_perf_stream *stream, return -EFAULT; buf += sizeof(header); + if (sample_flags & SAMPLE_OA_SOURCE_INFO) { + enum drm_i915_perf_oa_event_source source; + + if (INTEL_INFO(dev_priv)->gen >= 8) { + u32 reason = *(u32 *)report; + + if (reason & GEN8_OAREPORT_REASON_CTX_SWITCH) + source = + I915_PERF_OA_EVENT_SOURCE_CONTEXT_SWITCH; + else if (reason & GEN8_OAREPORT_REASON_TIMER) + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + else + source = I915_PERF_OA_EVENT_SOURCE_UNDEFINED; + } else + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + + if (copy_to_user(buf, &source, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, report, report_size)) return -EFAULT; @@ -1170,11 +1199,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, int format_size; int ret; - if (!(props->sample_flags & SAMPLE_OA_REPORT)) { - DRM_ERROR("Only OA report sampling supported\n"); - return -EINVAL; - } - if (!dev_priv->perf.oa.ops.init_oa_buffer) { DRM_ERROR("OA unit not supported\n"); return -ENODEV; @@ -1203,8 +1227,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size; - stream->sample_flags |= SAMPLE_OA_REPORT; - stream->sample_size += format_size; + if (props->sample_flags & SAMPLE_OA_REPORT) { + stream->sample_flags |= SAMPLE_OA_REPORT; + stream->sample_size += format_size; + } + + if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { + if (!(props->sample_flags & SAMPLE_OA_REPORT)) { + DRM_ERROR( + "OA source type can't be sampled without OA report"); + return -EINVAL; + } + stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; + stream->sample_size += 4; + } dev_priv->perf.oa.oa_buffer.format_size = format_size; BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0); @@ -1842,6 +1878,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, props->oa_periodic
= true; props->oa_period_exponent = value; break; + case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE: + props->sample_flags |= SAMPLE_OA_SOURCE_INFO; + break; case DRM_I915_PERF_PROP_MAX: BUG(); } dif
[Intel-gfx] [PATCH 11/15] drm/i915: Support opening multiple concurrent perf streams
From: Sourab Gupta

This patch adds support for opening multiple concurrent perf streams for different gpu engines, while restricting each gpu engine to a single open stream. This enables a userspace client to open multiple streams, one per engine, at any time to capture sample data for multiple gpu engines.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_perf.c | 69 ++-- 2 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index a9a123b..9ccac83 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2161,7 +2161,7 @@ struct drm_i915_private { spinlock_t hook_lock; struct hrtimer poll_check_timer; - struct i915_perf_stream *exclusive_stream; + struct i915_perf_stream *ring_stream[I915_NUM_ENGINES]; wait_queue_head_t poll_wq[I915_NUM_ENGINES]; struct { diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 65b4af6..aa3589e 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -944,7 +944,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * an invalid ID. It could be good to annotate these * reports with a _CTX_SWITCH_AWAY reason later. */ - if (!dev_priv->perf.exclusive_stream->ctx || + if (!stream->ctx || dev_priv->perf.oa.specific_ctx_id == ctx_id || dev_priv->perf.oa.oa_buffer.last_ctx_id == ctx_id) { @@ -955,7 +955,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * the switch-away reports with an invalid * context id to be recognisable by userspace. */ - if (dev_priv->perf.exclusive_stream->ctx && + if (stream->ctx && dev_priv->perf.oa.specific_ctx_id != ctx_id) report32[2] = 0x1f; @@ -1596,7 +1596,7 @@ static void i915_ring_stream_destroy(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - BUG_ON(stream != dev_priv->perf.exclusive_stream); + BUG_ON(stream != dev_priv->perf.ring_stream[stream->engine]); if (stream->using_oa) { dev_priv->perf.oa.ops.disable_metric_set(dev_priv); @@ -1610,7 +1610,7 @@ static void i915_ring_stream_destroy(struct i915_perf_stream *stream) if (stream->cs_mode) free_command_stream_buf(dev_priv, stream->engine); - dev_priv->perf.exclusive_stream = NULL; + dev_priv->perf.ring_stream[stream->engine] = NULL; } static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv) @@ -2012,14 +2012,14 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv) { assert_spin_locked(&dev_priv->perf.hook_lock); - if (dev_priv->perf.exclusive_stream->state != + if (dev_priv->perf.ring_stream[RCS]->state != I915_PERF_STREAM_DISABLED) { unsigned long ctx_id = 0; - if (dev_priv->perf.exclusive_stream->ctx) + if (dev_priv->perf.ring_stream[RCS]->ctx) ctx_id = dev_priv->perf.oa.specific_ctx_id; - if (dev_priv->perf.exclusive_stream->ctx == NULL || ctx_id) { + if (dev_priv->perf.ring_stream[RCS]->ctx == NULL || ctx_id) { bool periodic = dev_priv->perf.oa.periodic; u32 period_exponent = dev_priv->perf.oa.period_exponent; u32 report_format = dev_priv->perf.oa.oa_buffer.format; @@ -2144,15 +2144,6 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, SAMPLE_TS); int ret; - /* To avoid the complexity of having to accurately filter -* counter reports and marshal to the appropriate client -* we currently only allow exclusive access -*/ - if (dev_priv->perf.exclusive_stream) { - DRM_ERROR("Stream already in use\n"); -
return -EBUSY; - } - if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) { if (IS_HASWELL(dev_priv->dev)) { DRM_ERROR( @@ -2170,6 +2161,12 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, if (require_oa_unit) { int format_size; +
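A condensed sketch of the relaxed exclusivity rule this patch introduces (not the actual patch flow; locking is elided): instead of one global exclusive stream, each engine has its own slot, so opening fails with -EBUSY only if that engine already has a stream attached.

    static int claim_engine_stream(struct drm_i915_private *dev_priv,
                                   struct i915_perf_stream *stream)
    {
        /* Per-engine exclusivity: only this engine's slot matters */
        if (dev_priv->perf.ring_stream[stream->engine])
            return -EBUSY;

        dev_priv->perf.ring_stream[stream->engine] = stream;
        return 0;
    }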
[Intel-gfx] [PATCH 08/15] drm/i915: Add support for emitting execbuffer tags through OA counter reports
From: Sourab Gupta

This patch enables userspace to specify tags (per workload), provided via the execbuffer ioctl, which can be added to the OA reports to help associate reports with their corresponding workloads. There may be multiple stages within a single context, from a userspace perspective. An ability is needed to individually associate OA reports with their corresponding workloads (execbuffers), which may not be possible solely with the ctx_id or pid information. This patch enables such a mechanism.

In this patch, the upper 32 bits of the rsvd1 field, which were previously unused, are now used to pass in the tag.

Signed-off-by: Sourab Gupta
---
 drivers/gpu/drm/i915/i915_drv.h| 7 -- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 5 ++-- drivers/gpu/drm/i915/i915_perf.c | 38 ++ drivers/gpu/drm/i915/intel_lrc.c | 4 ++-- include/uapi/drm/i915_drm.h| 12 ++ 5 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 60e94e6..9da5007 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1708,6 +1708,7 @@ struct i915_execbuffer_params { struct drm_i915_gem_object *batch_obj; struct intel_context*ctx; struct drm_i915_gem_request *request; + uint32_ttag; }; /* used in computing the new watermarks state */ @@ -1796,7 +1797,7 @@ struct i915_perf_stream_ops { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req); + void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); }; enum i915_perf_stream_state { @@ -1852,6 +1853,7 @@ struct i915_perf_cs_data_node { u32 offset; u32 ctx_id; u32 pid; + u32 tag; }; struct drm_i915_private { @@ -2199,6 +2201,7 @@ struct drm_i915_private { u32 last_ctx_id; u32 last_pid; + u32 last_tag; struct list_head node_list; spinlock_t node_list_lock; } perf; @@ -3563,7 +3566,7 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req); void i915_oa_update_reg_state(struct intel_engine_cs *engine, struct intel_context *ctx, uint32_t *reg_state); -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct drm_device *dev, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 8b759af..a6564a0 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1305,7 +1305,7 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params, if (exec_len == 0) exec_len = params->batch_obj->base.size; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); ret = engine->dispatch_execbuffer(params->request, exec_start, exec_len, @@ -1313,7 +1313,7 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params, if (ret) return ret; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); @@ -1634,6 +1634,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, params->batch_obj = batch_obj; params->ctx = ctx; params->request = req; + params->tag = i915_execbuffer2_get_tag(*args); ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas); err_request: diff --git
a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index bb5356f..902f84f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -89,6 +89,7 @@ struct oa_sample_data { u32 source; u32 ctx_id; u32 pid; + u32 tag; const u8 *report; }; @@ -145,6 +146,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) +#define SAMPLE_TAG (1<<4) struct perf_open_properties { u32 sample_flags; @@ -169,7 +171,8 @@ struct perf_open_properties { * perf mutex lock. */
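For illustration, a userspace sketch of how a per-workload tag could be packed alongside the context handle under this scheme (the helper name below is hypothetical; only the upper-32-bits-of-rsvd1 convention comes from the patch):

#include <stdint.h>
#include <drm/i915_drm.h>

/* Hypothetical helper: keep the ctx handle in the lower 32 bits of
 * rsvd1 (as before this patch) and place the per-workload tag in the
 * previously unused upper 32 bits. */
static void set_execbuf_ctx_and_tag(struct drm_i915_gem_execbuffer2 *eb2,
                                    uint32_t ctx_handle, uint32_t tag)
{
        eb2->rsvd1 = ((uint64_t)tag << 32) | ctx_handle;
}

Each stage of a frame can then pass a distinct tag, so the OA reports belonging to that stage can be told apart later in the sample stream.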
Re: [Intel-gfx] [PATCH 03/15] drm/i915: Framework for capturing command stream based OA reports
On Thu, 2016-06-02 at 11:30 +0530, Martin Peres wrote: > On 02/06/16 08:18, sourab.gu...@intel.com wrote: > > From: Sourab Gupta > > > > This patch introduces a framework to enable OA counter reports associated > > with the Render command stream. We can then associate the reports captured > > through this mechanism with their corresponding context id's. This can be > > further extended to associate any other metadata information with the > > corresponding samples (since the association with the Render command stream > > gives us the ability to capture this information while inserting the > > corresponding capture commands into the command stream). > > > > The OA reports generated in this way are associated with a corresponding > > workload, and thus can be used to delimit the workload (i.e. sample the > > counters at the workload boundaries), within an ongoing stream of periodic > > counter snapshots. > > > > There may be use cases wherein we need more than the periodic OA capture mode > > which is supported currently. This mode is primarily used for two use cases: > > - Ability to capture system wide metrics, along with the ability to map > > the reports back to individual contexts (particularly for HSW). > > - Ability to inject tags for work, into the reports. This provides > > visibility into the multiple stages of work within a single context. > > > > The userspace will be able to distinguish between the periodic and CS based > > OA reports by virtue of the source_info sample field. > > > > The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA > > counters, and is inserted at BB boundaries. > > So, it is possible to trigger a read of a set of counters (all?) from > the pushbuffer? > > If so, I like this because I was wondering how would this work break > when we move to the GuC-submission model. > > Thanks for working on this! > Martin Yeah, we can trigger the capture of a counter snapshot using the MI_REPORT_PERF_COUNT command inserted in the ringbuffer. On encountering this command, the snapshot is captured to an address specified in the command arguments. The exact details of the counters captured (report format etc.) depend on the OA unit configuration done earlier. -Sourab ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
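For reference, inserting such a snapshot request looks roughly like the sketch below, following the ring-emission style used elsewhere in this series (the request/ring setup and the allocation of the report address are elided; 'addr' is assumed to be a GGTT offset into the capture buffer):

/* Sketch: ask the OA unit to write a counter snapshot to 'addr' when
 * the command streamer reaches this point in the ring. */
ret = intel_ring_begin(req, 4);
if (ret)
        return ret;
intel_ring_emit(ring, GEN6_MI_REPORT_PERF_COUNT);
intel_ring_emit(ring, addr | MI_REPORT_PERF_COUNT_GGTT);
intel_ring_emit(ring, report_id); /* written into the report for matching */
intel_ring_emit(ring, MI_NOOP);
intel_ring_advance(ring);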
Re: [Intel-gfx] [PATCH v8 10/12] drm/i915: add oa_event_min_timer_exponent sysctl
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > The minimal sampling period is now configurable via a > dev.i915.oa_min_timer_exponent sysctl parameter. > > Following the precedent set by perf, the default is the minimum that > won't (on its own) exceed the default kernel.perf_event_max_sample_rate > default of 100000 samples/s. > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld > --- > drivers/gpu/drm/i915/i915_perf.c | 42 > > 1 file changed, 30 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_perf.c > b/drivers/gpu/drm/i915/i915_perf.c > index 4e42073..e3c6f51 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -82,6 +82,22 @@ static u32 i915_perf_stream_paranoid = true; > #define INVALID_CTX_ID 0xffffffff > > > +/* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ > +static int oa_exponent_max = OA_EXPONENT_MAX; > + > +/* Theoretically we can program the OA unit to sample every 160ns but don't > + * allow that by default unless root... > + * > + * The period is derived from the exponent as: > + * > + * period = 80ns * 2^(exponent + 1) > + * > + * Referring to perf's kernel.perf_event_max_sample_rate for a precedent > + * (100000 by default); with an OA exponent of 6 we get a period of 10.240 > + * microseconds - just under 100000Hz > + */ > +static u32 i915_oa_min_timer_exponent = 6; For HSW, the timestamp period is 80ns, so the exponent of 6 translates to a sampling rate of ~100000Hz. But the timestamp period may change for other platforms, leading to different values of oa_min_timer_exponent corresponding to a sampling rate of ~100000Hz. Do we plan to have this value platform specific subsequently, or does the guidance value of ~100000Hz min sampling rate not need to be strictly followed? > + > /* XXX: beware if future OA HW adds new report formats that the current > * code assumes all reports have a power-of-two size and ~(size - 1) can > * be used as a mask to align the OA tail pointer. > @@ -1353,21 +1369,14 @@ static int read_properties_unlocked(struct > drm_i915_private *dev_priv, > return -EINVAL; > } > > - /* NB: The exponent represents a period as follows: > - * > - * 80ns * 2^(period_exponent + 1) > - * > - * Theoretically we can program the OA unit to sample > + /* Theoretically we can program the OA unit to sample >* every 160ns but don't allow that by default unless >* root. > - * > - * Referring to perf's > - * kernel.perf_event_max_sample_rate for a precedent > - * (100000 by default); with an OA exponent of 6 we get > - * a period of 10.240 microseconds - just under 100000Hz >*/ > - if (value < 6 && !capable(CAP_SYS_ADMIN)) { > - DRM_ERROR("Minimum OA sampling exponent is 6 > without root privileges\n"); > + if (value < i915_oa_min_timer_exponent && > + !capable(CAP_SYS_ADMIN)) { > + DRM_ERROR("Minimum OA sampling exponent (sysctl > dev.i915.oa_min_timer_exponent) is %u without root privileges\n", > + i915_oa_min_timer_exponent); > return -EACCES; > } > > @@ -1475,6 +1484,15 @@ static struct ctl_table oa_table[] = { >.extra1 = &zero, >.extra2 = &one, >}, > + { > + .procname = "oa_min_timer_exponent", > + .data = &i915_oa_min_timer_exponent, > + .maxlen = sizeof(i915_oa_min_timer_exponent), > + .mode = 0644, > + .proc_handler = proc_dointvec_minmax, > + .extra1 = &zero, > + .extra2 = &oa_exponent_max, > + }, > {} > }; > ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
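To make the arithmetic concrete, here is a sketch of how a platform-aware minimum exponent could be derived from the timestamp period, rather than hard-coding the HSW value of 6 (illustrative only, not part of the patch):

#include <stdint.h>

/* period(exponent) = timestamp_period_ns * 2^(exponent + 1).
 * For HSW (80ns period): exponent 6 gives 10240ns, i.e. ~97656Hz,
 * just under the 100000 samples/s precedent. */
static uint32_t oa_min_timer_exponent(uint64_t timestamp_period_ns,
                                      uint64_t max_rate_hz)
{
        uint32_t exponent = 0;
        uint64_t period_ns = timestamp_period_ns * 2; /* exponent 0 */

        /* smallest exponent whose sampling rate stays below max_rate_hz */
        while (period_ns * max_rate_hz < 1000000000ULL) {
                period_ns *= 2;
                exponent++;
        }
        return exponent;
}

oa_min_timer_exponent(80, 100000) returns 6, matching the default above.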
Re: [Intel-gfx] [PATCH v8 03/12] drm/i915: rename OACONTROL GEN7_OACONTROL
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > OACONTROL changes quite a bit for gen8, with some bits split out into a > per-context OACTXCONTROL register. Rename now before adding more gen7 OA > registers > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld Reviewed-by: Sourab Gupta ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v8 04/12] drm/i915: return EACCES for check_cmd() failures
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > check_cmd() is checking whether a command adheres to certain > restrictions that ensure it's safe to execute within a privileged batch > buffer. Returning false implies a privilege problem, not that the > command is invalid. > > The distinction makes the difference between allowing the buffer to be > executed as an unprivileged batch buffer or returning an EINVAL error to > userspace without executing anything. > > In a case where userspace may want to test whether it can successfully > write to a register that needs privileges the distinction may be > important and an EINVAL error may be considered fatal. > > In particular this is currently true for Mesa, which includes a test for > whether OACONTROL can be written to, but Mesa treats any error when > flushing a batch buffer as fatal, calling exit(1). > > As it is currently Mesa can gracefully handle a failure to write to > OACONTROL if the command parser is disabled, but if we were to remove > OACONTROL from the parser's whitelist then the returned EINVAL would > break Mesa applications as they attempt an OACONTROL write. > > This bumps the command parser version from 7 to 8, as the change is > visible to userspace. > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld Well, looks reasonable to me. Reviewed-by: Sourab Gupta ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v8 02/12] drm/i915: Add i915 perf infrastructure
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > Adds base i915 perf infrastructure for Gen performance metrics. > > This adds a DRM_IOCTL_I915_PERF_OPEN ioctl that takes an array of uint64 > properties to configure a stream of metrics and returns a new fd usable > with standard VFS system calls including read() to read typed and sized > records; ioctl() to enable or disable capture and poll() to wait for > data. > > A stream is opened something like: > > uint64_t properties[] = { > /* Single context sampling */ > DRM_I915_PERF_PROP_CTX_HANDLE,ctx_handle, > > /* Include OA reports in samples */ > DRM_I915_PERF_PROP_SAMPLE_OA, true, > > /* OA unit configuration */ > DRM_I915_PERF_PROP_OA_METRICS_SET,metrics_set_id, > DRM_I915_PERF_PROP_OA_FORMAT, report_format, > DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent, >}; >struct drm_i915_perf_open_param param = { > .flags = I915_PERF_FLAG_FD_CLOEXEC | >I915_PERF_FLAG_FD_NONBLOCK | >I915_PERF_FLAG_DISABLED, > .properties_ptr = (uint64_t)properties, > .num_properties = sizeof(properties) / 16, >}; >int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param); > > Records read all start with a common { type, size } header with > DRM_I915_PERF_RECORD_SAMPLE being of most interest. Sample records > contain an extensible number of fields and it's the > DRM_I915_PERF_PROP_SAMPLE_xyz properties given when opening that > determine what's included in every sample. > > No specific streams are supported yet so any attempt to open a stream > will return an error. > > v2: > use i915_gem_context_get() - Chris Wilson > v3: > update read() interface to avoid passing state struct - Chris Wilson > fix some rebase fallout, with i915-perf init/deinit > v4: > s/DRM_IORW/DRM_IOW/ - Emil Velikov > > Signed-off-by: Robert Bragg > --- > drivers/gpu/drm/i915/Makefile| 3 + > drivers/gpu/drm/i915/i915_drv.c | 4 + > drivers/gpu/drm/i915/i915_drv.h | 91 > drivers/gpu/drm/i915/i915_perf.c | 443 > +++ > include/uapi/drm/i915_drm.h | 67 ++ > 5 files changed, 608 insertions(+) > create mode 100644 drivers/gpu/drm/i915/i915_perf.c > > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile > index 6123400..8d4e25f 100644 > --- a/drivers/gpu/drm/i915/Makefile > +++ b/drivers/gpu/drm/i915/Makefile > @@ -113,6 +113,9 @@ i915-$(CONFIG_DRM_I915_CAPTURE_ERROR) += i915_gpu_error.o > # virtual gpu code > i915-y += i915_vgpu.o > > +# perf code > +i915-y += i915_perf.o > + > ifeq ($(CONFIG_DRM_I915_GVT),y) > i915-y += intel_gvt.o > include $(src)/gvt/Makefile > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index af3559d..685c96e 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -836,6 +836,8 @@ static int i915_driver_init_early(struct drm_i915_private > *dev_priv, > > intel_detect_preproduction_hw(dev_priv); > > + i915_perf_init(dev_priv); > + > return 0; > > err_workqueues: > @@ -849,6 +851,7 @@ static int i915_driver_init_early(struct drm_i915_private > *dev_priv, > */ > static void i915_driver_cleanup_early(struct drm_i915_private *dev_priv) > { > + i915_perf_fini(dev_priv); > i915_gem_load_cleanup(&dev_priv->drm); > i915_workqueues_cleanup(dev_priv); > } > @@ -2556,6 +2559,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = { > DRM_IOCTL_DEF_DRV(I915_GEM_USERPTR, i915_gem_userptr_ioctl, > DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_GETPARAM, > i915_gem_context_getparam_ioctl, DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_SETPARAM, > i915_gem_context_setparam_ioctl,
DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(I915_PERF_OPEN, i915_perf_open_ioctl, > DRM_RENDER_ALLOW), > }; > > static struct drm_driver driver = { > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 5a260db..7a65c0b 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -1767,6 +1767,84 @@ struct intel_wm_config { > bool sprites_scaled; > }; > > +struct i915_perf_stream; > + > +struct i915_perf_stream_ops { > + /* Enables the collection of HW samples, either in response to > + * I915_PERF_IOCTL_ENABLE or implicitly called when stream is > + * opened without I915_PERF_FLAG_DISABLED. > + */ > + void (*enable)(struct i915_perf_stream *stream); > + > + /* Disables the collection of HW samples, either in response to > + * I915_PERF_IOCTL_DISABLE or implicitly called before > + * destroying the stream. > + */ > + void (*disable)(struct i915_perf_stream *stream); > + > + /* Return: true if any i915 perf records are ready to read() > + * for this stream. > + */ >
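To show how the { type, size } framing is consumed, a read-side sketch (buffer size and error handling are arbitrary choices here):

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <drm/i915_drm.h>

/* Sketch: consume one read() worth of i915 perf records from the
 * stream fd returned by DRM_IOCTL_I915_PERF_OPEN. */
static void drain_records(int fd)
{
        uint8_t buf[16 * 1024];
        ssize_t len = read(fd, buf, sizeof(buf));
        size_t offset = 0;

        if (len <= 0)
                return;

        while (offset + sizeof(struct drm_i915_perf_record_header) <= (size_t)len) {
                struct drm_i915_perf_record_header header;

                memcpy(&header, buf + offset, sizeof(header));
                if (header.size == 0)
                        break; /* malformed record */
                if (header.type == DRM_I915_PERF_RECORD_SAMPLE) {
                        /* sample fields follow the header, in the order of
                         * the DRM_I915_PERF_PROP_SAMPLE_xyz flags given at
                         * open time */
                }
                offset += header.size;
        }
}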
Re: [Intel-gfx] [PATCH v8 08/12] drm/i915: advertise available metrics via sysfs
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > Each metric set is given a sysfs entry like: > > /sys/class/drm/card0/metrics/<guid>/id > > This allows userspace to enumerate the specific sets that are available > for the current system. The 'id' file contains an unsigned integer that > can be used to open the associated metric set via > DRM_IOCTL_I915_PERF_OPEN. The <guid> is a globally unique ID for a > specific OA unit register configuration that can be reliably used by > userspace as a key to lookup corresponding counter meta data and > normalization equations. > > The guid registry is currently maintained as part of gputop along with > the XML metric set descriptions and code generation scripts, ref: > > https://github.com/rib/gputop > > gputop-data/guids.xml > > scripts/update-guids.py > > gputop-data/oa-*.xml > > scripts/i915-perf-kernelgen.py > > $ make -C gputop-data -f Makefile.xml SYSFS=1 WHITELIST=RenderBasic > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld Looks good to me. Reviewed-by: Sourab Gupta ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
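A sketch of how userspace might resolve the id for a known GUID (path layout per the commit message; error handling is minimal):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Sketch: read /sys/class/drm/card0/metrics/<guid>/id and return the
 * metric set id to pass to DRM_IOCTL_I915_PERF_OPEN, or 0 on failure. */
static uint64_t lookup_metric_set_id(const char *guid)
{
        char path[256];
        uint64_t id = 0;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/class/drm/card0/metrics/%s/id", guid);
        f = fopen(path, "r");
        if (!f)
                return 0;
        if (fscanf(f, "%" SCNu64, &id) != 1)
                id = 0;
        fclose(f);
        return id;
}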
Re: [Intel-gfx] [PATCH v8 09/12] drm/i915: Add dev.i915.perf_stream_paranoid sysctl option
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > Consistent with the kernel.perf_event_paranoid sysctl option that can > allow non-root users to access system wide cpu metrics, this can > optionally allow non-root users to access system wide OA counter metrics > from Gen graphics hardware. > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld > --- > drivers/gpu/drm/i915/i915_drv.h | 1 + > drivers/gpu/drm/i915/i915_perf.c | 50 > +++- > 2 files changed, 50 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 01438fb..a138f86 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2171,6 +2171,7 @@ struct drm_i915_private { > bool initialized; > > struct kobject *metrics_kobj; > + struct ctl_table_header *sysctl_header; > > struct mutex lock; > struct list_head streams; > diff --git a/drivers/gpu/drm/i915/i915_perf.c > b/drivers/gpu/drm/i915/i915_perf.c > index 8d07c41..4e42073 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -64,6 +64,11 @@ > #define POLL_FREQUENCY 200 > #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) > > +/* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ > +static int zero; > +static int one = 1; > +static u32 i915_perf_stream_paranoid = true; > + > /* The maximum exponent the hardware accepts is 63 (essentially it selects > one > * of the 64bit timestamp bits to trigger reports from) but there's currently > * no known use case for sampling as infrequently as once per 47 thousand > years. > @@ -1207,7 +1212,13 @@ i915_perf_open_ioctl_locked(struct drm_i915_private > *dev_priv, > } > } > > - if (!specific_ctx && !capable(CAP_SYS_ADMIN)) { > + /* Similar to perf's kernel.perf_paranoid_cpu sysctl option > + * we check a dev.i915.perf_stream_paranoid sysctl option > + * to determine if it's ok to access system wide OA counters > + * without CAP_SYS_ADMIN privileges. 
> + */ > + if (!specific_ctx && > + i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) { > DRM_ERROR("Insufficient privileges to open system-wide i915 > perf stream\n"); > ret = -EACCES; > goto err_ctx; > @@ -1454,6 +1465,39 @@ void i915_perf_unregister(struct drm_i915_private > *dev_priv) > dev_priv->perf.metrics_kobj = NULL; > } > > +static struct ctl_table oa_table[] = { > + { > + .procname = "perf_stream_paranoid", > + .data = &i915_perf_stream_paranoid, > + .maxlen = sizeof(i915_perf_stream_paranoid), > + .mode = 0644, > + .proc_handler = proc_dointvec_minmax, > + .extra1 = &zero, > + .extra2 = &one, > + }, > + {} > +}; > + > +static struct ctl_table i915_root[] = { > + { > + .procname = "i915", > + .maxlen = 0, > + .mode = 0555, > + .child = oa_table, > + }, > + {} > +}; > + > +static struct ctl_table dev_root[] = { > + { > + .procname = "dev", > + .maxlen = 0, > + .mode = 0555, > + .child = i915_root, > + }, > + {} > +}; > + > void i915_perf_init(struct drm_i915_private *dev_priv) > { > if (!IS_HASWELL(dev_priv)) > @@ -1484,6 +1528,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv) > dev_priv->perf.oa.n_builtin_sets = > i915_oa_n_builtin_metric_sets_hsw; > > + dev_priv->perf.sysctl_header = register_sysctl_table(dev_root); > + > dev_priv->perf.initialized = true; > } > > @@ -1492,6 +1538,8 @@ void i915_perf_fini(struct drm_i915_private *dev_priv) > if (!dev_priv->perf.initialized) > return; > > + unregister_sysctl_table(dev_priv->perf.sysctl_header); > + > memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops)); > dev_priv->perf.initialized = false; > } Looks fine. Reviewed-by: Sourab Gupta ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v8 05/12] drm/i915: don't whitelist oacontrol in cmd parser
On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > Being able to program OACONTROL from a non-privileged batch buffer is > not sufficient to be able to configure the OA unit. This was originally > allowed to help enable Mesa to expose OA counters via the > INTEL_performance_query extension, but the current implementation based > on programming OACONTROL via a batch buffer isn't able to report useable > data without a more complete OA unit configuration. Mesa handles the > possibility that writes to OACONTROL may not be allowed and so only > advertises the extension after explicitly testing that a write to > OACONTROL succeeds. Based on this; removing OACONTROL from the whitelist > should be ok for userspace. > > Removing this simplifies adding a new kernel api for configuring the OA > unit without needing to consider the possibility that userspace might > trample on OACONTROL state which we'd like to start managing within > the kernel instead. In particular running any Mesa based GL application > currently results in clearing OACONTROL when initializing which would > disable the capturing of metrics. > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld Seems reasonable. Reviewed-by: Sourab Gupta ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 08/15] drm/i915: Add support for emitting execbuffer tags through OA counter reports
From: Sourab Gupta This patch enables userspace to specify tags (per workload), provided via the execbuffer ioctl, which could be added to OA reports, to help associate reports with the corresponding workloads. There may be multiple stages within a single context, from a userspace perspective. An ability is needed to individually associate the OA reports with their corresponding workloads (execbuffers), which may not be possible solely with ctx_id or pid information. This patch enables such a mechanism. In this patch, the upper 32 bits of the rsvd1 field, which were previously unused, are now used to pass in the tag. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h| 6 +++-- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 +++-- drivers/gpu/drm/i915/i915_perf.c | 38 ++ include/uapi/drm/i915_drm.h| 12 ++ 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f250e7b..0f171f8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1814,7 +1814,7 @@ struct i915_perf_stream_ops { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req); + void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); }; enum i915_perf_stream_state { @@ -1873,6 +1873,7 @@ struct i915_perf_cs_data_node { u32 offset; u32 ctx_id; u32 pid; + u32 tag; }; struct drm_i915_private { @@ -2244,6 +2245,7 @@ struct drm_i915_private { u32 last_ctx_id; u32 last_pid; + u32 last_tag; struct list_head node_list; spinlock_t node_list_lock; } perf; @@ -3666,7 +3668,7 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req); void i915_oa_update_reg_state(struct intel_engine_cs *engine, struct i915_gem_context *ctx, uint32_t *reg_state); -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct i915_address_space *vm, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index da502c7..d89787b 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -58,6 +58,7 @@ struct i915_execbuffer_params { struct intel_engine_cs *engine; struct i915_gem_context *ctx; struct drm_i915_gem_request *request; + uint32_ttag; }; struct eb_vmas { @@ -1523,7 +1524,7 @@ execbuf_submit(struct i915_execbuffer_params *params, if (exec_len == 0) exec_len = params->batch->size - params->args_batch_start_offset; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); ret = params->engine->emit_bb_start(params->request, exec_start, exec_len, @@ -1531,7 +1532,7 @@ execbuf_submit(struct i915_execbuffer_params *params, if (ret) return ret; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); @@ -1843,6 +1844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, params->engine= engine; params->dispatch_flags = dispatch_flags; params->ctx = ctx; + params->tag = i915_execbuffer2_get_tag(*args); ret = execbuf_submit(params, args, &eb->vmas); err_request: diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 894d7a6..ca523b1 100644
--- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -255,6 +255,7 @@ struct oa_sample_data { u32 source; u32 ctx_id; u32 pid; + u32 tag; const u8 *report; }; @@ -311,6 +312,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) +#define SAMPLE_TAG (1<<4) struct perf_open_properties { u32 sample_flags; @@ -335,7 +337,8 @@ struct perf_open_properties { * perf mutex lock. */ -void i915_perf_command_stream_hook(struct drm_i915_gem_reque
[Intel-gfx] [PATCH 03/15] drm/i915: Framework for capturing command stream based OA reports
From: Sourab Gupta This patch introduces a framework to enable OA counter reports associated with the Render command stream. We can then associate the reports captured through this mechanism with their corresponding context id's. This can be further extended to associate any other metadata information with the corresponding samples (since the association with the Render command stream gives us the ability to capture this information while inserting the corresponding capture commands into the command stream). The OA reports generated in this way are associated with a corresponding workload, and thus can be used to delimit the workload (i.e. sample the counters at the workload boundaries), within an ongoing stream of periodic counter snapshots. There may be use cases wherein we need more than the periodic OA capture mode which is supported currently. This mode is primarily used for two use cases: - Ability to capture system wide metrics, along with the ability to map the reports back to individual contexts (particularly for HSW). - Ability to inject tags for work into the reports. This provides visibility into the multiple stages of work within a single context. The userspace will be able to distinguish between the periodic and CS based OA reports by virtue of the source_info sample field. The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA counters, and is inserted at BB boundaries. The data thus captured will be stored in a separate buffer, different from the buffer used otherwise for the periodic OA capture mode. The metadata information pertaining to a snapshot is maintained in a list, which also has offsets into the gem buffer object per captured snapshot. In order to track whether the gpu has completed processing the node, a field pertaining to the corresponding gem request is added, which is tracked for completion of the command. Both periodic and RCS based reports are associated with a single stream (corresponding to the render engine), and the samples are expected to be in sequential order according to their timestamps. Now, since these reports are collected in separate buffers, they are merge sorted at the time of forwarding to userspace during the read call. v2: Aligning with the non-perf interface (custom drm ioctl based). Also, a few related patches are squashed together for better readability Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_drv.h| 44 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 + drivers/gpu/drm/i915/i915_perf.c | 895 - include/uapi/drm/i915_drm.h| 15 + 4 files changed, 805 insertions(+), 153 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index a6ac1c3..0561315 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1809,6 +1809,18 @@ struct i915_perf_stream_ops { * The stream will always be disabled before this is called. */ void (*destroy)(struct i915_perf_stream *stream); + + /* +* Routine to emit the commands in the command streamer associated +* with the corresponding gpu engine.
+*/ + void (*command_stream_hook)(struct drm_i915_gem_request *req); +}; + +enum i915_perf_stream_state { + I915_PERF_STREAM_DISABLED, + I915_PERF_STREAM_ENABLE_IN_PROGRESS, + I915_PERF_STREAM_ENABLED, }; struct i915_perf_stream { @@ -1816,11 +1828,16 @@ struct i915_perf_stream { struct list_head link; + enum intel_engine_id engine; u32 sample_flags; int sample_size; struct i915_gem_context *ctx; bool enabled; + enum i915_perf_stream_state state; + + /* Whether command stream based data collection is enabled */ + bool cs_mode; const struct i915_perf_stream_ops *ops; }; @@ -1838,10 +1855,22 @@ struct i915_oa_ops { int (*read)(struct i915_perf_stream *stream, char __user *buf, size_t count, - size_t *offset); + size_t *offset, + u32 ts); bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); }; +/* + * List element to hold info about the perf sample data associated + * with a particular GPU command stream. + */ +struct i915_perf_cs_data_node { + struct list_head link; + struct drm_i915_gem_request *request; + u32 offset; + u32 ctx_id; +}; + struct drm_i915_private { struct drm_device drm; @@ -2149,6 +2178,8 @@ struct drm_i915_private { struct ctl_table_header *sysctl_header; struct mutex lock; + + struct mutex streams_lock; struct list_head streams; spinlock_t hook_lock; @@ -2195,6 +2226,16 @@ struct drm_i915_private { const s
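The merge described in the commit message can be pictured with a self-contained sketch: two timestamp-ordered report streams are interleaved so samples reach userspace in global timestamp order (names and the tie-break rule are illustrative, not the patch's code):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch: pick the next sample to forward, oldest timestamp first.
 * 'cs' holds timestamps of pending CS-captured reports, 'oa' holds
 * those of periodic OA reports; returns false when both are drained. */
static bool next_sample(const uint32_t *cs, size_t n_cs, size_t *i_cs,
                        const uint32_t *oa, size_t n_oa, size_t *i_oa,
                        bool *from_cs, uint32_t *ts)
{
        bool have_cs = *i_cs < n_cs;
        bool have_oa = *i_oa < n_oa;

        if (!have_cs && !have_oa)
                return false;

        if (have_cs && (!have_oa || cs[*i_cs] <= oa[*i_oa])) {
                *from_cs = true;        /* batch-boundary report first */
                *ts = cs[(*i_cs)++];
        } else {
                *from_cs = false;
                *ts = oa[(*i_oa)++];
        }
        return true;
}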
[Intel-gfx] [PATCH 04/15] drm/i915: flush periodic samples, in case of no pending CS sample requests
From: Sourab Gupta When there are no pending CS OA samples, flush the periodic OA samples collected so far. We can safely forward the periodic OA samples in the case we have no pending CS samples, but we can't do so in the case we have pending CS samples, since we don't know what the ordering between pending CS samples and periodic samples will eventually be. If we have no pending CS sample, it won't be possible for a future CS sample to have a timestamp earlier than the current periodic timestamp. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 7 +- drivers/gpu/drm/i915/i915_perf.c | 163 +-- 2 files changed, 129 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0561315..dedb7f8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1856,8 +1856,9 @@ struct i915_oa_ops { char __user *buf, size_t count, size_t *offset, - u32 ts); - bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); + u32 ts, u32 max_records); + int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, + u32 *last_ts); }; /* @@ -2221,6 +2222,8 @@ struct drm_i915_private { u32 gen7_latched_oastatus1; u32 ctx_oactxctrl_off; u32 ctx_flexeu0_off; + u32 n_pending_periodic_samples; + u32 pending_periodic_ts; struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 7bbc757..2ee4711 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -540,13 +540,30 @@ static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv) * pointers. A race here could result in a false positive !empty status which * is acceptable. */ -static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv) +static int +gen8_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, + u32 *last_ts) { int report_size = dev_priv->perf.oa.oa_buffer.format_size; - u32 head = I915_READ(GEN8_OAHEADPTR); - u32 tail = I915_READ(GEN8_OATAILPTR); + u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr; + u32 head = I915_READ(GEN8_OAHEADPTR) & GEN8_OAHEADPTR_MASK; + u32 tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK; + u32 mask = (OA_BUFFER_SIZE - 1); + u32 num_samples; + u8 *report; + + head -= dev_priv->perf.oa.oa_buffer.gtt_offset; + tail -= dev_priv->perf.oa.oa_buffer.gtt_offset; + num_samples = OA_TAKEN(tail, head) / report_size; - return OA_TAKEN(tail, head) < report_size; + /* read the timestamp of the last sample */ + if (num_samples) { + head += report_size*(num_samples - 1); + report = oa_buf_base + (head & mask); + *last_ts = *(u32 *)(report + 4); + } + + return num_samples; } /* NB: This is either called via fops or the poll check hrtimer (atomic ctx) @@ -560,16 +577,32 @@ static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_pr * pointers. A race here could result in a false positive !empty status which * is acceptable.
*/ -static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv) +static int +gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, + u32 *last_ts) { int report_size = dev_priv->perf.oa.oa_buffer.format_size; u32 oastatus2 = I915_READ(GEN7_OASTATUS2); u32 oastatus1 = I915_READ(GEN7_OASTATUS1); u32 head = oastatus2 & GEN7_OASTATUS2_HEAD_MASK; u32 tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK; + u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr; + u32 mask = (OA_BUFFER_SIZE - 1); + int available_size; + u32 num_samples = 0; + u8 *report; - return OA_TAKEN(tail, head) < - dev_priv->perf.oa.tail_margin + report_size; + head -= dev_priv->perf.oa.oa_buffer.gtt_offset; + tail -= dev_priv->perf.oa.oa_buffer.gtt_offset; + available_size = OA_TAKEN(tail, head) - dev_priv->perf.oa.tail_margin; + if (available_size >= report_size) { + num_samples = available_size / report_size; + head += report_size*(num_samples - 1); + report = oa_buf_base + (head & mask); + *last_ts = *(u32 *)(report + 4); + } + + return num_samples; } /** @@ -698,
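The OA_TAKEN() arithmetic used above depends on the OA buffer size being a power of two, so the head/tail distance stays correct across wrap-around. A minimal sketch (the 16MB size is an assumption for illustration):

#include <stdint.h>

#define OA_BUFFER_SIZE (16 * 1024 * 1024) /* must be a power of two */
#define OA_TAKEN(tail, head) (((tail) - (head)) & (OA_BUFFER_SIZE - 1))

/* e.g. head = OA_BUFFER_SIZE - 64 and tail = 192 after a wrap still
 * yields OA_TAKEN(tail, head) == 256 bytes of unread reports. */
static uint32_t num_pending_samples(uint32_t head, uint32_t tail,
                                    uint32_t report_size)
{
        return OA_TAKEN(tail, head) / report_size;
}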
[Intel-gfx] [PATCH 01/15] drm/i915: Add ctx getparam ioctl parameter to retrieve ctx unique id
From: Sourab Gupta This patch adds a new ctx getparam ioctl parameter, which can be used by userspace to retrieve the ctx unique id. This can be used by userspace to map the i915 perf samples with their particular ctx's, since the samples carry the ctx unique ids. Otherwise userspace has no way of maintaining this association, since it only knows the per-drm-file specific ctx handles. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_gem_context.c | 3 +++ include/uapi/drm/i915_drm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index e6616ed..d0efa5e 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1078,6 +1078,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE: args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE); break; + case I915_CONTEXT_PARAM_HW_ID: + args->value = ctx->hw_id; + break; default: ret = -EINVAL; break; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index f63a392..e95f666 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1223,6 +1223,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE0x4 +#define I915_CONTEXT_PARAM_HW_ID 0x5 __u64 value; }; -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
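For illustration, the userspace side of this lookup might look like the following sketch (error handling reduced to the ioctl return code):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Sketch: fetch the ctx unique id for a per-fd ctx handle, so perf
 * samples carrying hw ctx ids can be mapped back to this context. */
static int get_ctx_hw_id(int drm_fd, uint32_t ctx_handle, uint64_t *hw_id)
{
        struct drm_i915_gem_context_param p = {
                .ctx_id = ctx_handle,
                .param = I915_CONTEXT_PARAM_HW_ID,
        };
        int ret = ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);

        if (!ret)
                *hw_id = p.value;
        return ret;
}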
[Intel-gfx] [PATCH 06/15] drm/i915: Populate ctx ID for periodic OA reports
From: Sourab Gupta This adds support for populating the ctx id for the periodic OA reports when requested through the corresponding property. For Gen8, the OA reports themselves contain the ctx ID, which is the one programmed into HW while submitting workloads. Thus it is retrieved from the reports themselves. For Gen7, the OA reports don't have any such field, and we can populate this field with the last seen ctx ID while sending CS reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/i915_perf.c | 52 +--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e9cf939..853cc7db 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1859,6 +1859,8 @@ struct i915_oa_ops { u32 ts, u32 max_records); int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, u32 *last_ts); + u32 (*oa_buffer_get_ctx_id)(struct i915_perf_stream *stream, + const u8 *report); }; /* @@ -2239,6 +2241,7 @@ struct drm_i915_private { u32 status; } command_stream_buf; + u32 last_ctx_id; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index e10e78f..84457f8 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -637,6 +637,46 @@ gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, return num_samples; } +static u32 gen7_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + /* +* OA reports generated in Gen7 don't have the ctx ID information. +* Therefore, just rely on the ctx ID information from the last CS +* sample forwarded +*/ + return dev_priv->perf.last_ctx_id; +} + +static u32 gen8_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + /* The ctx ID present in the OA reports have intel_context::global_id +* present, since this is programmed into the ELSP in execlist mode. +* In non-execlist mode, fall back to retrieving the ctx ID from the +* last saved ctx ID from command stream mode. +*/ + if (i915.enable_execlists) { + u32 ctx_id = *(u32 *)(report + 12); + ctx_id &= 0xf; + return ctx_id; + } else { + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + return dev_priv->perf.last_ctx_id; + } +} + /** * Appends a status record to a userspace read() buffer.
*/ @@ -733,9 +773,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.source = source; } -#warning "FIXME: append_oa_buffer_sample: read ctx ID from report and map that to an intel_context::global_id" if (sample_flags & SAMPLE_CTX_ID) - data.ctx_id = 0; + data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( + stream, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1248,8 +1288,10 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_RCS; - if (sample_flags & SAMPLE_CTX_ID) + if (sample_flags & SAMPLE_CTX_ID) { data.ctx_id = node->ctx_id; + dev_priv->perf.last_ctx_id = node->ctx_id; + } if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -3092,6 +3134,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv) dev_priv->perf.oa.ops.read = gen7_oa_read; dev_priv->perf.oa.ops.oa_buffer_num_samples = gen7_oa_buffer_num_samples_fop_unlocked; + dev_priv->perf.oa.ops.oa_buffer_get_ctx_id = + gen7_oa_buffer_get_ctx_id; dev_priv->perf.oa.timestamp_frequency = 12500000; @@ -3106,6 +3150,8 @@ void i915_per
[Intel-gfx] [PATCH 09/15] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines
From: Sourab Gupta This patch extends the i915 perf framework to handle the perf sample collection for any given gpu engine. Particularly, the support for collecting timestamp sample type is added, which can be requested for any engine. With this, for RCS, timestamps and OA reports can be collected together, and provided to userspace in separate sample fields. For other engines, the capability to collect timestamps is added. Note that still only a single stream instance can be opened at any particular time. That stream may now be opened for any gpu engine, for collection of timestamp samples. This patch doesn't yet add support for opening multiple concurrent streams, though it lays the groundwork for that support to be added subsequently. Part of this groundwork involves having separate command stream buffers, per engine, for holding the samples generated. Likewise for a few other data structures maintaining per-engine state. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 35 ++- drivers/gpu/drm/i915/i915_perf.c | 635 +-- drivers/gpu/drm/i915/i915_reg.h | 2 + include/uapi/drm/i915_drm.h | 7 + 4 files changed, 445 insertions(+), 234 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0f171f8..a05335a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1814,7 +1814,8 @@ struct i915_perf_stream_ops { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); + void (*command_stream_hook)(struct i915_perf_stream *stream, + struct drm_i915_gem_request *req, u32 tag); }; enum i915_perf_stream_state { @@ -1839,6 +1840,9 @@ struct i915_perf_stream { /* Whether command stream based data collection is enabled */ bool cs_mode; + /* Whether the OA unit is in use */ + bool using_oa; + const struct i915_perf_stream_ops *ops; }; @@ -1870,7 +1874,16 @@ struct i915_oa_ops { struct i915_perf_cs_data_node { struct list_head link; struct drm_i915_gem_request *request; - u32 offset; + + /* Offsets into the GEM obj holding the data */ + u32 start_offset; + u32 oa_offset; + u32 ts_offset; + + /* buffer size corresponding to this entry */ + u32 size; + + /* Other metadata */ u32 ctx_id; u32 pid; u32 tag; @@ -2189,14 +2202,14 @@ struct drm_i915_private { spinlock_t hook_lock; - struct { - struct i915_perf_stream *exclusive_stream; - u32 specific_ctx_id; + struct hrtimer poll_check_timer; + struct i915_perf_stream *exclusive_stream; + wait_queue_head_t poll_wq[I915_NUM_ENGINES]; + atomic_t pollin[I915_NUM_ENGINES]; - struct hrtimer poll_check_timer; - wait_queue_head_t poll_wq; - atomic_t pollin; + struct { + u32 specific_ctx_id; bool periodic; int period_exponent; @@ -2241,13 +2254,13 @@ struct drm_i915_private { u8 *addr; #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) u32 status; - } command_stream_buf; + } command_stream_buf[I915_NUM_ENGINES]; u32 last_ctx_id; u32 last_pid; u32 last_tag; - struct list_head node_list; - spinlock_t node_list_lock; + struct list_head node_list[I915_NUM_ENGINES]; + spinlock_t node_list_lock[I915_NUM_ENGINES]; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index ca523b1..516fd54 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -250,12 +250,17 @@ static u32
i915_perf_stream_paranoid = true; /* For determining the behavior on overflow of command stream samples */ #define CMD_STREAM_BUF_OVERFLOW_ALLOWED -/* Data common to periodic and RCS based samples */ -struct oa_sample_data { +#define OA_ADDR_ALIGN 64 +#define TS_ADDR_ALIGN 8 +#define I915_PERF_TS_SAMPLE_SIZE 8 + +/* Data common to all samples (periodic OA / CS based OA / Timestamps) */ +struct sample_data { u32 source; u32 ctx_id; u32 pid; u32 tag; + u64 ts; const u8 *report; }; @@ -313,6 +318,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_CTX_ID (1<<2) #
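The per-engine timestamp capture added here can be sketched much like the MMIO dump later in this series: a store of the ring's timestamp register into the per-engine command stream buffer (assumption: the exact register choice and the handling of the upper 32 timestamp bits follow the patch, which is not shown in full here):

/* Sketch: SRM the engine's TIMESTAMP register into the command stream
 * buffer at 'addr' (assumed TS_ADDR_ALIGN aligned). */
ret = intel_ring_begin(req, 4);
if (ret)
        return ret;
if (INTEL_INFO(dev_priv)->gen >= 8) {
        intel_ring_emit(ring, MI_STORE_REGISTER_MEM_GEN8 |
                              MI_SRM_LRM_GLOBAL_GTT);
        intel_ring_emit_reg(ring, RING_TIMESTAMP(req->engine->mmio_base));
        intel_ring_emit(ring, addr);
        intel_ring_emit(ring, 0); /* upper address dword */
} else {
        intel_ring_emit(ring, MI_STORE_REGISTER_MEM |
                              MI_SRM_LRM_GLOBAL_GTT);
        intel_ring_emit_reg(ring, RING_TIMESTAMP(req->engine->mmio_base));
        intel_ring_emit(ring, addr);
        intel_ring_emit(ring, MI_NOOP);
}
intel_ring_advance(ring);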
[Intel-gfx] [PATCH 07/15] drm/i915: Add support for having pid output with OA report
From: Sourab Gupta This patch introduces flags and adds support for having pid output with the OA reports generated through the RCS commands. When the stream is opened with pid sample type, the pid information is also captured through the command stream samples and forwarded along with the OA reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 48 +++- include/uapi/drm/i915_drm.h | 7 ++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 853cc7db..f250e7b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1872,6 +1872,7 @@ struct i915_perf_cs_data_node { struct drm_i915_gem_request *request; u32 offset; u32 ctx_id; + u32 pid; }; struct drm_i915_private { @@ -2242,6 +2243,7 @@ struct drm_i915_private { } command_stream_buf; u32 last_ctx_id; + u32 last_pid; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 84457f8..894d7a6 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -254,6 +254,7 @@ static u32 i915_perf_stream_paranoid = true; struct oa_sample_data { u32 source; u32 ctx_id; + u32 pid; const u8 *report; }; @@ -309,6 +310,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_OA_REPORT (1<<0) #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) +#define SAMPLE_PID (1<<3) struct perf_open_properties { u32 sample_flags; @@ -484,6 +486,7 @@ static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req) goto out; entry->ctx_id = ctx->hw_id; + entry->pid = current->pid; i915_gem_request_assign(&entry->request, req); addr = dev_priv->perf.command_stream_buf.vma->node.start + @@ -735,6 +738,12 @@ static int append_oa_sample(struct i915_perf_stream *stream, buf += 4; } + if (sample_flags & SAMPLE_PID) { + if (copy_to_user(buf, &data->pid, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, data->report, report_size)) return -EFAULT; @@ -777,6 +786,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( stream, report); + if (sample_flags & SAMPLE_PID) + data.pid = dev_priv->perf.last_pid; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1293,6 +1305,11 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, dev_priv->perf.last_ctx_id = node->ctx_id; } + if (sample_flags & SAMPLE_PID) { + data.pid = node->pid; + dev_priv->perf.last_pid = node->pid; + } + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -2127,6 +2144,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, struct drm_i915_private *dev_priv = stream->dev_priv; bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT | SAMPLE_OA_SOURCE_INFO); + bool require_cs_mode = props->sample_flags & SAMPLE_PID; bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT; int ret; @@ -2268,6 +2286,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, if (props->sample_flags & SAMPLE_CTX_ID) { stream->sample_flags |= SAMPLE_CTX_ID; stream->sample_size += 4; + + /* +* NB: it's meaningful to request SAMPLE_CTX_ID with just CS +* mode or periodic OA mode sampling but we don't allow +* SAMPLE_CTX_ID without either mode +*/ + if (!require_oa_unit) + require_cs_mode = true; + } 
+ + if (require_cs_mode && !props->cs_mode) { + DRM_ERROR("PID sampling requires a ring to be specified"); + ret = -EINVAL; + goto cs_error; } if (props->cs_mode) { @@ -2278,7 +2310,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto cs_error;
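A sketch of opening a stream with pid sampling enabled; DRM_I915_PERF_PROP_ENGINE and DRM_I915_PERF_PROP_SAMPLE_PID are assumed names for the ring-selection and pid properties this series adds, not verified uapi:

uint64_t properties[] = {
        /* cs_mode: attach the stream to the render ring (assumed name) */
        DRM_I915_PERF_PROP_ENGINE, I915_EXEC_RENDER,

        /* include the pid captured at BB boundaries (assumed name) */
        DRM_I915_PERF_PROP_SAMPLE_PID, true,

        /* plus the usual OA configuration */
        DRM_I915_PERF_PROP_SAMPLE_OA, true,
        DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
        DRM_I915_PERF_PROP_OA_FORMAT, report_format,
        DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
};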
[Intel-gfx] [PATCH 15/15] drm/i915: Support for capturing MMIO register values
From: Sourab Gupta This patch adds support for capturing MMIO register values through the i915 perf interface. The userspace can request up to 8 MMIO register values to be dumped. The addresses of these registers can be passed through the corresponding property 'value' field while opening the stream. The commands to dump the values of these MMIO registers are then inserted into the ring along with other commands. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 4 + drivers/gpu/drm/i915/i915_perf.c | 153 ++- include/uapi/drm/i915_drm.h | 14 3 files changed, 168 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 557a124..14cd9cf 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1885,6 +1885,7 @@ struct i915_perf_cs_data_node { u32 start_offset; u32 oa_offset; u32 ts_offset; + u32 mmio_offset; /* buffer size corresponding to this entry */ u32 size; @@ -2230,6 +2231,9 @@ struct drm_i915_private { wait_queue_head_t poll_wq[I915_NUM_ENGINES]; atomic_t pollin[I915_NUM_ENGINES]; + u32 num_mmio; + u32 mmio_list[I915_PERF_MMIO_NUM_MAX]; + struct { u32 specific_ctx_id; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index b11e953..ed6b31f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -277,6 +277,7 @@ struct sample_data { u64 gpu_ts; u64 clk_monoraw; const u8 *report; + const u8 *mmio; }; /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ @@ -335,6 +336,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_TAG (1<<4) #define SAMPLE_TS (1<<5) #define SAMPLE_CLK_MONO_RAW(1<<6) +#define SAMPLE_MMIO(1<<7) struct perf_open_properties { u32 sample_flags; @@ -567,6 +569,9 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv, sample_ts = true; } + if (sample_flags & SAMPLE_MMIO) + entry_size += 4*dev_priv->perf.num_mmio; + spin_lock(&dev_priv->perf.node_list_lock[id]); if (list_empty(&dev_priv->perf.node_list[id])) { offset = 0; @@ -644,6 +649,10 @@ out: entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN); offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE; } + if (sample_flags & SAMPLE_MMIO) { + entry->mmio_offset = offset; + offset = entry->mmio_offset + 4*dev_priv->perf.num_mmio; + } list_add_tail(&entry->link, &dev_priv->perf.node_list[id]); #ifndef CMD_STREAM_BUF_OVERFLOW_ALLOWED @@ -744,6 +753,47 @@ static int i915_ring_stream_capture_ts(struct drm_i915_gem_request *req, return 0; } +static int i915_ring_stream_capture_mmio(struct drm_i915_gem_request *req, + u32 offset) +{ + struct drm_i915_private *dev_priv = req->i915; + enum intel_engine_id id = req->engine->id; + struct intel_ring *ring = req->ring; + int num_mmio = dev_priv->perf.num_mmio; + u32 mmio_addr, addr = 0; + int ret, i; + + ret = intel_ring_begin(req, 4*num_mmio); + if (ret) + return ret; + + mmio_addr = + dev_priv->perf.command_stream_buf[id].vma->node.start + offset; + + for (i = 0; i < num_mmio; i++) { + uint32_t cmd; + + addr = mmio_addr + 4*i; + + if (INTEL_INFO(dev_priv)->gen >= 8) + cmd = MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT; + else + cmd = MI_STORE_REGISTER_MEM | + MI_SRM_LRM_GLOBAL_GTT; + + intel_ring_emit(ring, cmd); + intel_ring_emit(ring, dev_priv->perf.mmio_list[i]); + intel_ring_emit(ring, addr); + if (INTEL_INFO(dev_priv)->gen >= 8) + intel_ring_emit(ring, 0); + else + intel_ring_emit(ring, MI_NOOP); + } + intel_ring_advance(ring); + return 0; +}
static void i915_ring_stream_cs_hook(struct i915_perf_stream *stream, struct drm_i915_gem_request *req, u32 tag) { @@ -784,6 +834,12 @@ static void i915_ring_stream_cs_hook(struct i915_perf_stream *stream, goto err_unref; } + if (sample_flags & SAMPLE_MMIO) { + ret = i915_ring_stream_
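A sketch of how the register list might be requested at open time, per the commit message's "addresses passed through the property 'value' field" description (the property name and the repeat-per-register encoding are assumptions, not verified uapi):

uint64_t properties[] = {
        /* assumed name for the ring-selection property */
        DRM_I915_PERF_PROP_ENGINE, I915_EXEC_RENDER,

        /* one entry per register to dump, up to I915_PERF_MMIO_NUM_MAX */
        DRM_I915_PERF_PROP_SAMPLE_MMIO, 0x2358, /* e.g. RCS TIMESTAMP */
        DRM_I915_PERF_PROP_SAMPLE_MMIO, 0x2360, /* e.g. GEN7_OACONTROL */
};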
[Intel-gfx] [PATCH 05/15] drm/i915: Handle the overflow condition for command stream buf
From: Sourab Gupta Add a compile time option for detecting the overflow condition of command stream buffer, and not overwriting the old entries in such a case. Also, set a status flag to forward the overflow condition to userspace if overflow is detected. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 75 2 files changed, 62 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index dedb7f8..e9cf939 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2235,6 +2235,8 @@ struct drm_i915_private { struct drm_i915_gem_object *obj; struct i915_vma *vma; u8 *addr; +#define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) + u32 status; } command_stream_buf; struct list_head node_list; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2ee4711..e10e78f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -247,6 +247,9 @@ static u32 i915_perf_stream_paranoid = true; #define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) #define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) +/* For determining the behavior on overflow of command stream samples */ +#define CMD_STREAM_BUF_OVERFLOW_ALLOWED + /* Data common to periodic and RCS based samples */ struct oa_sample_data { u32 source; @@ -348,6 +351,7 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *request) mutex_unlock(&dev_priv->perf.streams_lock); } +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED /* * Release some perf entries to make space for a new entry data. We dereference * the associated request before deleting the entry. Also, no need to check for @@ -374,25 +378,26 @@ static void release_some_perf_entries(struct drm_i915_private *dev_priv, break; } } +#endif /* - * Insert the perf entry to the end of the list. This function never fails, - * since it always manages to insert the entry. If the space is exhausted in - * the buffer, it will remove the oldest entries in order to make space. + * Insert the perf entry to the end of the list. If the overwrite of old entries + * is allowed, the function always manages to insert the entry and returns 0. + * If overwrite is not allowed, on detection of overflow condition, an + * appropriate status flag is set, and function returns -ENOSPC. */ -static void insert_perf_entry(struct drm_i915_private *dev_priv, +static int insert_perf_entry(struct drm_i915_private *dev_priv, struct i915_perf_cs_data_node *entry) { struct i915_perf_cs_data_node *first_entry, *last_entry; int max_offset = dev_priv->perf.command_stream_buf.obj->base.size; u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size; + int ret = 0; spin_lock(&dev_priv->perf.node_list_lock); if (list_empty(&dev_priv->perf.node_list)) { entry->offset = 0; - list_add_tail(&entry->link, &dev_priv->perf.node_list); - spin_unlock(&dev_priv->perf.node_list_lock); - return; + goto out; } first_entry = list_first_entry(&dev_priv->perf.node_list, @@ -410,29 +415,49 @@ static void insert_perf_entry(struct drm_i915_private *dev_priv, */ else if (entry_size < first_entry->offset) entry->offset = 0; - /* Insufficient space. 
Overwrite existing old entries */ + /* Insufficient space */ else { +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED u32 target_size = entry_size - first_entry->offset; release_some_perf_entries(dev_priv, target_size); entry->offset = 0; +#else + dev_priv->perf.command_stream_buf.status |= + I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; + ret = -ENOSPC; + goto out_unlock; +#endif } } else { /* Sufficient space available? */ if (last_entry->offset + 2*entry_size < first_entry->offset) entry->offset = last_entry->offset + entry_size; - /* Insufficient space. Overwrite existing old entries */ + /* Insufficient space */ else { +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED u32 target_size = entry_size - (first_entry->offset - last_entry->offset - entry_si
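The placement logic above can be distilled into a self-contained predicate (a sketch of the no-overwrite build's behavior; 'first' is the oldest entry's offset, 'next' is where the newest entry ends):

#include <stdint.h>

/* Returns the offset for a new entry of 'size' bytes in a circular
 * buffer of 'buf_size' bytes, or -1 where the no-overwrite build would
 * flag I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW instead. */
static int64_t place_entry(uint32_t first, uint32_t next,
                           uint32_t size, uint32_t buf_size)
{
        if (next >= first) {                     /* entries not wrapped */
                if (next + 2 * size < buf_size)  /* room before the end */
                        return next;
                if (size < first)                /* room back at offset 0 */
                        return 0;
        } else if (next + 2 * size < first) {    /* wrapped: room in gap */
                return next;
        }
        return -1;                               /* would overflow */
}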
[Intel-gfx] [PATCH 13/15] time: export clocks_calc_mult_shift
From: Sourab Gupta Exporting clocks_calc_mult_shift is helpful for drivers to calculate the mult/shift values for their clocks, given their frequency. This is particularly useful when such drivers may want to associate a timecounter/cyclecounter abstraction with their clock sources, in order to use the cross timestamp infrastructure for syncing device time with system time. Signed-off-by: Sourab Gupta --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6a5a310..e2de743 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -89,6 +89,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) *mult = tmp; *shift = sft; } +EXPORT_SYMBOL_GPL(clocks_calc_mult_shift); /*[Clocksource internal variables]- * curr_clocksource: -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
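The exported mult/shift pair converts device cycles to nanoseconds the same way the timecounter core does, as in this sketch (for an 80ns-period clock the pair effectively multiplies by 80):

#include <stdint.h>

/* ns = (cycles * mult) >> shift, with mult/shift as computed by
 * clocks_calc_mult_shift() for the clock's frequency. */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}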
[Intel-gfx] [PATCH 02/15] drm/i915: Expose OA sample source to userspace
From: Sourab Gupta This patch exposes a new sample source field to userspace. This field can be populated to specify the origin of the OA report. E.g., for internally triggered reports (non-MI_RPC reports), the RPT_ID field has bitfields specifying the origin, such as timer or render ctx switch. Likewise this field can be used to specify the source as MI_RPC when such support is added. Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_perf.c | 55 ++-- include/uapi/drm/i915_drm.h | 16 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index d030cd7..58a1118 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -240,6 +240,13 @@ static u32 i915_perf_stream_paranoid = true; */ #define OA_EXPONENT_MAX 31 +#define GEN8_OAREPORT_REASON_TIMER (1<<19) +#define GEN8_OAREPORT_REASON_TRIGGER1 (1<<20) +#define GEN8_OAREPORT_REASON_TRIGGER2 (1<<21) +#define GEN8_OAREPORT_REASON_CTX_SWITCH (1<<22) +#define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) +#define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) + /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ static int zero; static int oa_exponent_max = OA_EXPONENT_MAX; @@ -279,7 +286,8 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { [I915_OA_FORMAT_C4_B8] = { 7, 64 }, }; -#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_SOURCE_INFO (1<<1) struct perf_open_properties { u32 sample_flags; @@ -385,6 +393,27 @@ static int append_oa_sample(struct i915_perf_stream *stream, return -EFAULT; buf += sizeof(header); + if (sample_flags & SAMPLE_OA_SOURCE_INFO) { + enum drm_i915_perf_oa_event_source source; + + if (INTEL_INFO(dev_priv)->gen >= 8) { + u32 reason = *(u32 *)report; + + if (reason & GEN8_OAREPORT_REASON_CTX_SWITCH) + source = + I915_PERF_OA_EVENT_SOURCE_CONTEXT_SWITCH; + else if (reason & GEN8_OAREPORT_REASON_TIMER) + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + else + source = I915_PERF_OA_EVENT_SOURCE_UNDEFINED; + } else + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + + if (copy_to_user(buf, &source, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, report, report_size)) return -EFAULT; @@ -1453,11 +1482,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, return -EINVAL; } - if (!(props->sample_flags & SAMPLE_OA_REPORT)) { - DRM_ERROR("Only OA report sampling supported\n"); - return -EINVAL; - } - if (!dev_priv->perf.oa.ops.init_oa_buffer) { DRM_ERROR("OA unit not supported\n"); return -ENODEV; @@ -1486,8 +1510,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size; - stream->sample_flags |= SAMPLE_OA_REPORT; - stream->sample_size += format_size; + if (props->sample_flags & SAMPLE_OA_REPORT) { + stream->sample_flags |= SAMPLE_OA_REPORT; + stream->sample_size += format_size; + } + + if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { + if (!(props->sample_flags & SAMPLE_OA_REPORT)) { + DRM_ERROR( + "OA source type can't be sampled without OA report"); + return -EINVAL; + } + stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; + stream->sample_size += 4; + } dev_priv->perf.oa.oa_buffer.format_size = format_size; BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0); @@ -2160,6 +2196,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, props->oa_periodic =
true; props->oa_period_exponent = value; break; + case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE: + props->sample_flags |= SAMPLE_OA_SOURCE_INFO; + break; case DRM_I915_PERF_PROP_MAX: BUG(); } dif
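From userspace, consuming the new field is a matter of walking the read() buffer and pulling the 4-byte source value out of each sample record. A minimal sketch, assuming the sample layout added by this patch (record header, then source, then the raw OA report when both are requested); the record header and SAMPLE record type are from the base i915 perf uapi:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>
    #include <drm/i915_drm.h> /* drm_i915_perf_record_header, enums */

    static void for_each_sample_source(const uint8_t *buf, size_t len)
    {
            size_t offset = 0;

            while (offset + sizeof(struct drm_i915_perf_record_header) <= len) {
                    const struct drm_i915_perf_record_header *hdr =
                            (const void *)(buf + offset);
                    uint32_t source;

                    if (hdr->size < sizeof(*hdr))
                            break; /* malformed record; stop walking */

                    if (hdr->type == DRM_I915_PERF_RECORD_SAMPLE) {
                            /* first sample field when SAMPLE_OA_SOURCE_INFO
                             * was requested at open time */
                            memcpy(&source, buf + offset + sizeof(*hdr),
                                   sizeof(source));

                            if (source == I915_PERF_OA_EVENT_SOURCE_PERIODIC)
                                    ; /* e.g. accumulate periodic deltas */
                    }

                    offset += hdr->size; /* size includes the header */
            }
    }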
[Intel-gfx] [PATCH 00/15] Framework to collect command stream gpu metrics using i915 perf
From: Sourab Gupta

Refloating the series rebased on Robert's latest patchset. Since Robert's patches are being reviewed and this patch series extends his framework to enable multiple concurrent streams to capture command stream based metrics, it would be good to keep this work in perspective. Looking to receive feedback on the series (and possibly r-b's :))

This series adds a framework for collection of gpu performance metrics associated with the command stream of a particular engine. These metrics include OA reports, timestamps, mmio metrics, etc. These metrics are collected around batchbuffer boundaries.

This work utilizes the underlying infrastructure introduced in Robert Bragg's patches for collecting periodic OA counter snapshots (based on Haswell): https://patchwork.freedesktop.org/series/14505/

This patch set is based on the Gen8+ version of Robert's patches, which can be found here: https://github.com/rib/linux/tree/wip/rib/oa-next

In the last series floated earlier (https://patchwork.freedesktop.org/series/6154/), based on Chris's suggestion, I had tried experimenting with using the cross timestamp framework for the purpose of retrieving tightly coupled device/system timestamps. In our case, this framework enables us to have correlated pairs of gpu+system time which can be used over a period of time to correct the frequency of the timestamp clock, and thus enable us to accurately send system time (_MONO_RAW) as requested to userspace. The results are generally observed to be quite a bit better with the use of cross timestamps, and the frequency delta gradually tapers down to 0 with increasing correction periods. The use of the cross timestamp framework, though, requires us to have a cyclecounter/timecounter abstraction for the timestamp clocksource, and further requires a few changes in the kernel timekeeping/clocksource code. I am looking for feedback on the use of this framework and the changes involved.

These patches can be found for viewing at https://github.com/sourabgu/linux/tree/oa-19oct

Sourab Gupta (15):
  drm/i915: Add ctx getparam ioctl parameter to retrieve ctx unique id
  drm/i915: Expose OA sample source to userspace
  drm/i915: Framework for capturing command stream based OA reports
  drm/i915: flush periodic samples, in case of no pending CS sample requests
  drm/i915: Handle the overflow condition for command stream buf
  drm/i915: Populate ctx ID for periodic OA reports
  drm/i915: Add support for having pid output with OA report
  drm/i915: Add support for emitting execbuffer tags through OA counter reports
  drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines
  drm/i915: Extract raw GPU timestamps from OA reports to forward in perf samples
  drm/i915: Support opening multiple concurrent perf streams
  time: Expose current clocksource in use by timekeeping framework
  time: export clocks_calc_mult_shift
  drm/i915: Mechanism to forward clock monotonic raw time in perf samples
  drm/i915: Support for capturing MMIO register values

 drivers/gpu/drm/i915/i915_drv.c            |    2 +
 drivers/gpu/drm/i915/i915_drv.h            |  112 +-
 drivers/gpu/drm/i915/i915_gem_context.c    |    3 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    6 +
 drivers/gpu/drm/i915/i915_perf.c           | 1911 +++-
 drivers/gpu/drm/i915/i915_reg.h            |    6 +
 include/linux/timekeeping.h                |    5 +
 include/uapi/drm/i915_drm.h                |   79 ++
 kernel/time/clocksource.c                  |    1 +
 kernel/time/timekeeping.c                  |   12 +
 10 files changed, 1910 insertions(+), 227 deletions(-)

-- 1.9.1
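As a sketch of the frequency-correction idea described above (the helper name and wrapping are illustrative, not from the patches): given two correlated (gpu ticks, MONO_RAW ns) pairs taken some time apart via the cross timestamp framework, the observed frequency is just the ratio of the deltas, and the residual error shrinks as the correction period grows. div64_u64() is from linux/math64.h:

    static u64 observed_gpu_freq_hz(u64 gpu_t0, u64 sys_ns0,
                                    u64 gpu_t1, u64 sys_ns1)
    {
            u64 gpu_delta = gpu_t1 - gpu_t0;   /* timestamp ticks */
            u64 sys_delta = sys_ns1 - sys_ns0; /* MONO_RAW nanoseconds */

            /* ticks per second observed over the correction period;
             * the longer the period, the smaller the residual error
             * relative to the published bspec frequency. */
            return div64_u64(gpu_delta * NSEC_PER_SEC, sys_delta);
    }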
[Intel-gfx] [PATCH 12/15] time: Expose current clocksource in use by timekeeping framework
From: Sourab Gupta For the drivers to be able to use the cross timestamp framework, they need to know the current clocksource being used by the kernel timekeeping. This is needed since the callback given by the driver to get_device_system_crosststamp(), in order to synchronously read the device time and system counter value, requires knowledge of the clocksource being used to read the system counter value (as a part of struct system_counterval_t). Signed-off-by: Sourab Gupta --- include/linux/timekeeping.h | 5 + kernel/time/timekeeping.c | 12 2 files changed, 17 insertions(+) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 816b754..101aaa3 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -336,6 +336,11 @@ extern int get_device_system_crosststamp( struct system_device_crosststamp *xtstamp); /* + * Get current clocksource used by system timekeeping framework + */ +struct clocksource *get_current_clocksource(void); + +/* * Simultaneously snapshot realtime and monotonic raw clocks */ extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e07fb09..bb1e9c0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1138,6 +1138,18 @@ int get_device_system_crosststamp(int (*get_time_fn) EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** + * get_current_clocksource - Returns the current clocksource in use by tk_core + * + */ +struct clocksource *get_current_clocksource(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->tkr_mono.clock; +} +EXPORT_SYMBOL_GPL(get_current_clocksource); + +/** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set * -- 1.9.1
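To illustrate the intended use (a sketch under stated assumptions, not part of the patch): a driver callback for get_device_system_crosststamp() reads the device clock and the system counter back-to-back, and tags the counter value with the clocksource returned by the new helper. read_gpu_time() below is a hypothetical device-specific register read:

    static int gpu_get_synced_time(ktime_t *device_time,
                                   struct system_counterval_t *sys_counterval,
                                   void *ctx)
    {
            struct drm_i915_private *dev_priv = ctx;

            /* tag the counter value with the clocksource in use, so the
             * core can convert it to system time */
            sys_counterval->cs = get_current_clocksource();
            sys_counterval->cycles =
                    sys_counterval->cs->read(sys_counterval->cs);
            *device_time = ns_to_ktime(read_gpu_time(dev_priv)); /* hypothetical */

            return 0;
    }

    /* ... later, e.g. from the resync work:
     *      struct system_device_crosststamp xtstamp;
     *      ret = get_device_system_crosststamp(gpu_get_synced_time,
     *                                          dev_priv, NULL, &xtstamp);
     */

The two reads should be as close together as possible, since any skew between them becomes a fixed error in the correlated pair.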
[Intel-gfx] [PATCH 14/15] drm/i915: Mechanism to forward clock monotonic raw time in perf samples
From: Sourab Gupta Currently, we have the ability to only forward the GPU timestamps in the samples (which are generated via OA reports or PIPE_CONTROL commands inserted in the ring). This limits the ability to correlate these samples with the system events. If we scale the GPU timestamps according to the timestamp base/frequency info present in bspec, it is observed that the timestamps drift really quickly from the system time. An ability is therefore needed to report timestamps in different clock domains, such as CLOCK_MONOTONIC (or _MONO_RAW), in the perf samples to be of more practical use to the userspace. This ability becomes important when we want to correlate/plot GPU events/samples with other system events on the same timeline (e.g. vblank events, or timestamps when work was submitted to kernel, etc.) The patch here proposes a mechanism to achieve this. The correlation between gpu time and system time is established using the cross timestamp framework. For this purpose, the timestamp clock associated with the command stream is abstracted as timecounter/cyclecounter, before utilizing the cross timestamp framework to retrieve gpu/system time correlated values. Different such gpu/system time values are then used to detect and correct the error in the published gpu timestamp clock frequency. The userspace can request CLOCK_MONOTONIC_RAW timestamps in samples by requesting the corresponding property while opening the stream. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.c | 2 + drivers/gpu/drm/i915/i915_drv.h | 24 +++- drivers/gpu/drm/i915/i915_perf.c | 273 +++ include/uapi/drm/i915_drm.h | 9 +- 4 files changed, 284 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 06c7b55..0dc2384 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1088,6 +1088,8 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv) DRM_DEBUG_DRIVER("can't enable MSI"); } + i915_perf_init_late(dev_priv); + return 0; out_ggtt: diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e912679..557a124 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include #include @@ -1843,6 +1846,9 @@ struct i915_perf_stream { /* Whether the OA unit is in use */ bool using_oa; + /* monotonic_raw clk timestamp (in ns) for last sample */ + u64 last_sample_ts; + const struct i915_perf_stream_ops *ops; }; @@ -1889,6 +1895,20 @@ struct i915_perf_cs_data_node { u32 tag; }; +/** + * struct i915_clock_info - describes i915 timestamp clock + * + */ +struct i915_clock_info { + struct cyclecounter cc; + struct timecounter tc; + struct system_device_crosststamp xtstamp; + ktime_t clk_offset; /* Offset (in ns) between monoraw clk and gpu time */ + u32 timestamp_frequency; + u32 resync_period; /* in msecs */ + struct delayed_work clk_sync_work; +}; + struct drm_i915_private { struct drm_device drm; @@ -2189,6 +2209,8 @@ struct drm_i915_private { struct i915_runtime_pm pm; + struct i915_clock_info ts_clk_info; + struct { bool initialized; @@ -2213,7 +2235,6 @@ struct drm_i915_private { bool periodic; int period_exponent; - int timestamp_frequency; int tail_margin; @@ -3796,6 +3817,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, /* i915_perf.c */ extern void i915_perf_init(struct drm_i915_private *dev_priv); +extern void i915_perf_init_late(struct drm_i915_private *dev_priv);
extern void i915_perf_fini(struct drm_i915_private *dev_priv); extern void i915_perf_register(struct drm_i915_private *dev_priv); extern void i915_perf_unregister(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 8eb80e8..b11e953 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -189,6 +189,7 @@ #include #include +#include #include "i915_drv.h" #include "intel_ringbuffer.h" @@ -228,6 +229,9 @@ #define POLL_FREQUENCY 200 #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) +#define MAX_CLK_SYNC_PERIOD (60*MSEC_PER_SEC) +#define INIT_CLK_SYNC_PERIOD (20) /* in msecs */ + static u32 i915_perf_stream_paranoid = true; /* The maximum exponent the hardware accepts is 63 (essentially it selects one @@ -254,13 +258,24 @@ static u32 i915_perf_stream_paranoid = true; #define TS_ADDR_ALIGN 8 #define I915_PERF_TS_SAMPLE_SIZE 8 +/* Published frequency of GT command stream timestamp c
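A minimal sketch of the timecounter/cyclecounter abstraction described above, assuming a free-running GPU timestamp counter of 36 bits ticking at ts_clk_info.timestamp_frequency. Field names follow struct i915_clock_info from the diff; read_gpu_ts() is hypothetical, and patch 13 in the series exports clocks_calc_mult_shift() for exactly this mult/shift derivation:

    static cycle_t i915_read_gpu_ts(const struct cyclecounter *cc)
    {
            return read_gpu_ts(); /* hypothetical raw timestamp register read */
    }

    static void i915_setup_ts_clock(struct i915_clock_info *clk)
    {
            clk->cc.read = i915_read_gpu_ts;
            clk->cc.mask = CYCLECOUNTER_MASK(36);

            /* derive mult/shift so counter cycles scale to nanoseconds */
            clocks_calc_mult_shift(&clk->cc.mult, &clk->cc.shift,
                                   clk->timestamp_frequency,
                                   NSEC_PER_SEC, 3600);

            /* start the ns-domain timecounter at the current mono time */
            timecounter_init(&clk->tc, &clk->cc, ktime_get_ns());
    }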
[Intel-gfx] [PATCH 10/15] drm/i915: Extract raw GPU timestamps from OA reports to forward in perf samples
From: Sourab Gupta The OA reports contain the least significant 32 bits of the gpu timestamp. This patch enables retrieval of the timestamp field from OA reports, to forward as 64 bit raw gpu timestamps in the perf samples. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_perf.c | 47 ++-- drivers/gpu/drm/i915/i915_reg.h | 4 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index a05335a..119c82b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2241,6 +2241,7 @@ struct drm_i915_private { u32 ctx_flexeu0_off; u32 n_pending_periodic_samples; u32 pending_periodic_ts; + u64 last_gpu_ts; struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 516fd54..b05c41a 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -903,6 +903,24 @@ static int append_sample(struct i915_perf_stream *stream, return 0; } +static u64 get_gpu_ts_from_oa_report(struct drm_i915_private *dev_priv, + const u8 *report) +{ + u32 sample_ts = *(u32 *)(report + 4); + u32 delta; + + /* +* NB: We have to assume we're updating last_gpu_ts frequently +* enough that it's never possible to see multiple overflows before +* we compare sample_ts to last_gpu_ts. Since this is significantly +* large duration (~6min for 80ns ts base), we can safely assume so. +*/ + delta = sample_ts - (u32)dev_priv->perf.oa.last_gpu_ts; + dev_priv->perf.oa.last_gpu_ts += delta; + + return dev_priv->perf.oa.last_gpu_ts; +} + static int append_oa_buffer_sample(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset, const u8 *report) @@ -940,10 +958,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_TAG) data.tag = dev_priv->perf.last_tag; - /* Derive timestamp from OA report, after scaling with the ts base */ -#warning "FIXME: append_oa_buffer_sample: derive the timestamp from OA report" + /* Derive timestamp from OA report */ if (sample_flags & SAMPLE_TS) - data.ts = 0; + data.ts = get_gpu_ts_from_oa_report(dev_priv, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1443,6 +1460,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, enum intel_engine_id id = stream->engine; struct sample_data data = { 0 }; u32 sample_flags = stream->sample_flags; + u64 gpu_ts = 0; int ret = 0; if (sample_flags & SAMPLE_OA_REPORT) { @@ -1459,6 +1477,9 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, sample_ts, U32_MAX); if (ret) return ret; + + if (sample_flags & SAMPLE_TS) + gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report); } if (sample_flags & SAMPLE_OA_SOURCE_INFO) @@ -1480,20 +1501,16 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, } if (sample_flags & SAMPLE_TS) { - /* For RCS, if OA samples are also being collected, derive the -* timestamp from OA report, after scaling with the TS base. + /* If OA sampling is enabled, derive the ts from OA report. * Else, forward the timestamp collected via command stream. 
*/ -#warning "FIXME: append_one_cs_sample: derive the timestamp from OA report" - if (sample_flags & SAMPLE_OA_REPORT) - data.ts = 0; - else - data.ts = *(u64 *) + if (!(sample_flags & SAMPLE_OA_REPORT)) + gpu_ts = *(u64 *) (dev_priv->perf.command_stream_buf[id].addr + node->ts_offset); + data.ts = gpu_ts; } - return append_sample(stream, buf, count, offset, &data); } @@ -2279,9 +2296,15 @@ static void i915_ring_stream_enable(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - if (stream->sample_flags & SAMPLE_OA_REPORT) + if (stream->sample_flags & SAMPLE_OA_REPORT) {
[Intel-gfx] [PATCH 11/15] drm/i915: Support opening multiple concurrent perf streams
From: Sourab Gupta This patch adds support for opening multiple concurrent perf streams for different gpu engines, while retaining the restriction of a single open stream per gpu engine. This enables a userspace client to open multiple streams, one per engine, at any time and capture sample data for multiple gpu engines concurrently. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_perf.c | 69 ++-- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 119c82b..e912679 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2204,7 +2204,7 @@ struct drm_i915_private { struct hrtimer poll_check_timer; - struct i915_perf_stream *exclusive_stream; + struct i915_perf_stream *ring_stream[I915_NUM_ENGINES]; wait_queue_head_t poll_wq[I915_NUM_ENGINES]; atomic_t pollin[I915_NUM_ENGINES]; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index b05c41a..8eb80e8 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1086,7 +1086,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * an invalid ID. It could be good to annotate these * reports with a _CTX_SWITCH_AWAY reason later. */ - if (!dev_priv->perf.exclusive_stream->ctx || + if (!stream->ctx || dev_priv->perf.oa.specific_ctx_id == ctx_id || dev_priv->perf.oa.oa_buffer.last_ctx_id == ctx_id) { @@ -1097,7 +1097,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * the switch-away reports with an invalid * context id to be recognisable by userspace. */ - if (dev_priv->perf.exclusive_stream->ctx && + if (stream->ctx && dev_priv->perf.oa.specific_ctx_id != ctx_id) report32[2] = 0xffffffff; @@ -1763,7 +1763,7 @@ static void i915_ring_stream_destroy(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - BUG_ON(stream != dev_priv->perf.exclusive_stream); + BUG_ON(stream != dev_priv->perf.ring_stream[stream->engine]); if (stream->using_oa) { dev_priv->perf.oa.ops.disable_metric_set(dev_priv); @@ -1777,7 +1777,7 @@ static void i915_ring_stream_destroy(struct i915_perf_stream *stream) if (stream->cs_mode) free_command_stream_buf(dev_priv, stream->engine); - dev_priv->perf.exclusive_stream = NULL; + dev_priv->perf.ring_stream[stream->engine] = NULL; } static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv) @@ -2220,14 +2220,14 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv) { assert_spin_locked(&dev_priv->perf.hook_lock); - if (dev_priv->perf.exclusive_stream->state != + if (dev_priv->perf.ring_stream[RCS]->state != I915_PERF_STREAM_DISABLED) { unsigned long ctx_id = 0; - if (dev_priv->perf.exclusive_stream->ctx) + if (dev_priv->perf.ring_stream[RCS]->ctx) ctx_id = dev_priv->perf.oa.specific_ctx_id; - if (dev_priv->perf.exclusive_stream->ctx == NULL || ctx_id) { + if (dev_priv->perf.ring_stream[RCS]->ctx == NULL || ctx_id) { bool periodic = dev_priv->perf.oa.periodic; u32 period_exponent = dev_priv->perf.oa.period_exponent; u32 report_format = dev_priv->perf.oa.oa_buffer.format; @@ -2366,15 +2366,6 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, SAMPLE_TS); int ret; - /* To avoid the complexity of having to accurately filter -* counter reports and marshal to the appropriate client -* we currently only allow exclusive access -*/ - if (dev_priv->perf.exclusive_stream) { - DRM_ERROR("Stream already in
use\n"); - return -EBUSY; - } - if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) { if (IS_HASWELL(dev_priv)) { DRM_ERROR( @@ -2392,6 +2383,12 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, if (require_oa_unit) { int format_size; +
Re: [Intel-gfx] [PATCH v8 02/12] drm/i915: Add i915 perf infrastructure
On Fri, 2016-11-04 at 06:19 -0700, Robert Bragg wrote: > > > On Fri, Nov 4, 2016 at 8:59 AM, sourab gupta > wrote: > On Thu, 2016-10-27 at 19:14 -0700, Robert Bragg wrote: > > Adds base i915 perf infrastructure for Gen performance > metrics. > > > > This adds a DRM_IOCTL_I915_PERF_OPEN ioctl that takes an > array of uint64 > > properties to configure a stream of metrics and returns a > new fd usable > > with standard VFS system calls including read() to read > typed and sized > > records; ioctl() to enable or disable capture and poll() to > wait for > > data. > > > > A stream is opened something like: > > > > uint64_t properties[] = { > > /* Single context sampling */ > > DRM_I915_PERF_PROP_CTX_HANDLE,ctx_handle, > > > > /* Include OA reports in samples */ > > DRM_I915_PERF_PROP_SAMPLE_OA, true, > > > > /* OA unit configuration */ > > DRM_I915_PERF_PROP_OA_METRICS_SET,metrics_set_id, > > DRM_I915_PERF_PROP_OA_FORMAT, report_format, > > DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent, > >}; > >struct drm_i915_perf_open_param parm = { > > .flags = I915_PERF_FLAG_FD_CLOEXEC | > >I915_PERF_FLAG_FD_NONBLOCK | > >I915_PERF_FLAG_DISABLED, > > .properties_ptr = (uint64_t)properties, > > .num_properties = sizeof(properties) / 16, > >}; > >int fd = drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, > ¶m); > > > > Records read all start with a common { type, size } header > with > > DRM_I915_PERF_RECORD_SAMPLE being of most interest. Sample > records > > contain an extensible number of fields and it's the > > DRM_I915_PERF_PROP_SAMPLE_xyz properties given when opening > that > > determine what's included in every sample. > > > > No specific streams are supported yet so any attempt to open > a stream > > will return an error. > > > > v2: > > use i915_gem_context_get() - Chris Wilson > > v3: > > update read() interface to avoid passing state struct - > Chris Wilson > > fix some rebase fallout, with i915-perf init/deinit > > v4: > > s/DRM_IORW/DRM_IOW/ - Emil Velikov > > > > Signed-off-by: Robert Bragg > > --- > > drivers/gpu/drm/i915/Makefile| 3 + > > drivers/gpu/drm/i915/i915_drv.c | 4 + > > drivers/gpu/drm/i915/i915_drv.h | 91 > > drivers/gpu/drm/i915/i915_perf.c | 443 > +++ > > include/uapi/drm/i915_drm.h | 67 ++ > > 5 files changed, 608 insertions(+) > > create mode 100644 drivers/gpu/drm/i915/i915_perf.c > > > > diff --git a/drivers/gpu/drm/i915/Makefile > b/drivers/gpu/drm/i915/Makefile > > index 6123400..8d4e25f 100644 > > --- a/drivers/gpu/drm/i915/Makefile > > +++ b/drivers/gpu/drm/i915/Makefile > > @@ -113,6 +113,9 @@ i915-$(CONFIG_DRM_I915_CAPTURE_ERROR) += > i915_gpu_error.o > > # virtual gpu code > > i915-y += i915_vgpu.o > > > > +# perf code > > +i915-y += i915_perf.o > > + > > ifeq ($(CONFIG_DRM_I915_GVT),y) > > i915-y += intel_gvt.o > > include $(src)/gvt/Makefile > > diff --git a/drivers/gpu/drm/i915/i915_drv.c > b/drivers/gpu/drm/i915/i915_drv.c > > index af3559d..685c96e 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.c > > +++ b/drivers/gpu/drm/i915/i915_drv.c > > @@ -836,6 +836,8 @@ static int i915_driver_init_early(struct > drm_i915_private *dev_priv, > > > > intel_detect_preproduction_hw(dev_priv); > > > > + i915_perf_init(dev_priv); > > + > > return 0; > > > > err_workqueues: > > @@ -849,6 +851,7 @@ static int i915_driver_init_early(struct >
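For reference, here is a cleaned-up, self-contained version of the open example quoted above (the archive rendering mangled the final "&param" argument into "¶m"). The property names, flags, struct fields and ioctl are taken from the quoted patch; the function wrapper and the explicit num_properties computation are mine:

    #include <stdint.h>
    #include <xf86drm.h>
    #include <drm/i915_drm.h>

    int open_oa_stream(int drm_fd, uint32_t ctx_handle,
                       uint64_t metrics_set_id, uint64_t report_format,
                       uint64_t period_exponent)
    {
            uint64_t properties[] = {
                    /* Single context sampling */
                    DRM_I915_PERF_PROP_CTX_HANDLE, ctx_handle,

                    /* Include OA reports in samples */
                    DRM_I915_PERF_PROP_SAMPLE_OA, 1,

                    /* OA unit configuration */
                    DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set_id,
                    DRM_I915_PERF_PROP_OA_FORMAT, report_format,
                    DRM_I915_PERF_PROP_OA_EXPONENT, period_exponent,
            };
            struct drm_i915_perf_open_param param = {
                    .flags = I915_PERF_FLAG_FD_CLOEXEC |
                             I915_PERF_FLAG_DISABLED,
                    /* properties are (key, value) u64 pairs */
                    .num_properties = sizeof(properties) /
                                      (2 * sizeof(uint64_t)),
                    .properties_ptr = (uintptr_t)properties,
            };

            /* returns a new fd usable with read()/poll()/ioctl() */
            return drmIoctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
    }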
[Intel-gfx] [PATCH v2 06/15] drm/i915: Populate ctx ID for periodic OA reports
From: Sourab Gupta This adds support for populating the ctx id for the periodic OA reports when requested through the corresponding property. For Gen8, the OA reports themselves have the ctx ID, and it is the one programmed into HW while submitting workloads. Thus it's retrieved from the reports themselves. For Gen7, the OA reports don't have any such field, and we can populate this field with the last seen ctx ID while sending CS reports. v2: - Corrected the ctx id mask bits (21 bits instead of 20) (Chris) - removed user-triggerable WARN_ONCE's. (Chris) Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/i915_perf.c | 52 +--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e9cf939..853cc7db 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1859,6 +1859,8 @@ struct i915_oa_ops { u32 ts, u32 max_records); int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, u32 *last_ts); + u32 (*oa_buffer_get_ctx_id)(struct i915_perf_stream *stream, + const u8 *report); }; /* @@ -2239,6 +2241,7 @@ struct drm_i915_private { u32 status; } command_stream_buf; + u32 last_ctx_id; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index e10e78f..7abbf30 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -637,6 +637,46 @@ gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, return num_samples; } +static u32 gen7_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + if (!stream->cs_mode) + DRM_ERROR( + "CTX ID can't be retrieved if command stream mode not enabled"); + + /* +* OA reports generated in Gen7 don't have the ctx ID information. +* Therefore, just rely on the ctx ID information from the last CS +* sample forwarded +*/ + return dev_priv->perf.last_ctx_id; +} + +static u32 gen8_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + /* The ctx ID present in the OA reports is intel_context::global_id, +* since this is programmed into the ELSP in execlist mode. +* In non-execlist mode, fall back to retrieving the ctx ID from the +* last saved ctx ID from command stream mode. +*/ + if (i915.enable_execlists) { + u32 ctx_id = *(u32 *)(report + 12); + ctx_id &= 0x1fffff; + return ctx_id; + } else { + if (!stream->cs_mode) + DRM_ERROR( + "CTX ID can't be retrieved if command stream mode not enabled"); + + return dev_priv->perf.last_ctx_id; + } +} + /** * Appends a status record to a userspace read() buffer.
*/ @@ -733,9 +773,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.source = source; } -#warning "FIXME: append_oa_buffer_sample: read ctx ID from report and map that to an intel_context::global_id" if (sample_flags & SAMPLE_CTX_ID) - data.ctx_id = 0; + data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( + stream, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1248,8 +1288,10 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_RCS; - if (sample_flags & SAMPLE_CTX_ID) + if (sample_flags & SAMPLE_CTX_ID) { data.ctx_id = node->ctx_id; + dev_priv->perf.last_ctx_id = node->ctx_id; + } if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -3092,6 +3134,8 @@ void i915_perf_init(struct drm_i915_private *dev_priv) dev_priv->perf.oa.ops.read = gen7_oa_read; dev_priv->perf.oa.ops.oa_buffer_num_samples = gen7_oa_buffer_num_samples_fop_unlocked; + dev_priv->perf.oa.ops.oa_buffer_get_ctx_id = + gen7_oa_buffer_get_ctx_id;
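Concretely, the Gen8 path above reads the fourth dword of the report and applies a 21-bit mask (per the v2 note); a minimal restatement:

    static u32 gen8_report_ctx_id(const u8 *report)
    {
            /* dword 3 of the OA report carries the hw context ID */
            return *(const u32 *)(report + 12) & ((1 << 21) - 1);
    }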
[Intel-gfx] [PATCH v2 08/15] drm/i915: Add support for emitting execbuffer tags through OA counter reports
From: Sourab Gupta This patch enables userspace to specify tags (per workload), provided via the execbuffer ioctl, which could be added to OA reports, to help associate reports with the corresponding workloads. There may be multiple stages within a single context, from a userspace perspective. An ability is needed to individually associate the OA reports with their corresponding workloads (execbuffers), which may not be possible solely with ctx_id or pid information. This patch enables such a mechanism. In this patch, the upper 32 bits of the rsvd1 field, which were previously unused, are now used to pass in the tag. v2: Corrected the tag extraction macro (Chris) Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h| 6 +++-- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 +++-- drivers/gpu/drm/i915/i915_perf.c | 38 ++ include/uapi/drm/i915_drm.h| 12 ++ 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index f250e7b..0f171f8 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1814,7 +1814,7 @@ struct i915_perf_stream_ops { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req); + void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); }; enum i915_perf_stream_state { @@ -1873,6 +1873,7 @@ struct i915_perf_cs_data_node { u32 offset; u32 ctx_id; u32 pid; + u32 tag; }; struct drm_i915_private { @@ -2244,6 +2245,7 @@ struct drm_i915_private { u32 last_ctx_id; u32 last_pid; + u32 last_tag; struct list_head node_list; spinlock_t node_list_lock; } perf; @@ -3666,7 +3668,7 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req); void i915_oa_update_reg_state(struct intel_engine_cs *engine, struct i915_gem_context *ctx, uint32_t *reg_state); -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct i915_address_space *vm, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index da502c7..d89787b 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -58,6 +58,7 @@ struct i915_execbuffer_params { struct intel_engine_cs *engine; struct i915_gem_context *ctx; struct drm_i915_gem_request *request; + uint32_ttag; }; struct eb_vmas { @@ -1523,7 +1524,7 @@ execbuf_submit(struct i915_execbuffer_params *params, if (exec_len == 0) exec_len = params->batch->size - params->args_batch_start_offset; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); ret = params->engine->emit_bb_start(params->request, exec_start, exec_len, @@ -1531,7 +1532,7 @@ execbuf_submit(struct i915_execbuffer_params *params, if (ret) return ret; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); @@ -1843,6 +1844,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, params->engine= engine; params->dispatch_flags = dispatch_flags; params->ctx = ctx; + params->tag = i915_execbuffer2_get_tag(*args); ret = execbuf_submit(params, args, &eb->vmas); err_request: diff --git a/drivers/gpu/drm/i915/i915_perf.c
b/drivers/gpu/drm/i915/i915_perf.c index 0a13672..18489c2 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -255,6 +255,7 @@ struct oa_sample_data { u32 source; u32 ctx_id; u32 pid; + u32 tag; const u8 *report; }; @@ -311,6 +312,7 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = { #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) +#define SAMPLE_TAG (1<<4) struct perf_open_properties { u32 sample_flags; @@ -335,7 +337,8 @@ struct perf_open_properties { * perf mutex lock. */ -void i915_perf_command_stream_
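From the userspace side, the packing implied by this patch looks roughly as follows (MBENC_STAGE_ID is a hypothetical stage identifier; the matching kernel-side unpacking macro is the i915_execbuffer2_get_tag() referenced in the diff):

    struct drm_i915_gem_execbuffer2 execbuf = { 0 };
    uint64_t tag = MBENC_STAGE_ID; /* hypothetical per-stage tag */

    /* low 32 bits keep the context handle; upper 32 bits carry the tag */
    execbuf.rsvd1 = ctx_handle | (tag << 32);

    /* ... fill in buffers_ptr, buffer_count, flags as usual ... */
    drmIoctl(drm_fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);

Samples emitted around this batch then carry the tag, letting a profiler attribute them to, e.g., the MBEnc stage rather than just the encode context as a whole.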
[Intel-gfx] [PATCH v2 14/15] drm/i915: Mechanism to forward clock monotonic raw time in perf samples
From: Sourab Gupta Currently, we have the ability to only forward the GPU timestamps in the samples (which are generated via OA reports or PIPE_CONTROL commands inserted in the ring). This limits the ability to correlate these samples with the system events. If we scale the GPU timestamps according to the timestamp base/frequency info present in bspec, it is observed that the timestamps drift really quickly from the system time. An ability is therefore needed to report timestamps in different clock domains, such as CLOCK_MONOTONIC (or _MONO_RAW), in the perf samples to be of more practical use to the userspace. This ability becomes important when we want to correlate/plot GPU events/samples with other system events on the same timeline (e.g. vblank events, or timestamps when work was submitted to kernel, etc.) The patch here proposes a mechanism to achieve this. The correlation between gpu time and system time is established using the cross timestamp framework. For this purpose, the timestamp clock associated with the command stream is abstracted as timecounter/cyclecounter, before utilizing the cross timestamp framework to retrieve gpu/system time correlated values. Different such gpu/system time values are then used to detect and correct the error in the published gpu timestamp clock frequency. The userspace can request CLOCK_MONOTONIC_RAW timestamps in samples by requesting the corresponding property while opening the stream. v2: Added i915_driver_init_late() function to capture the new late init phase for perf (Chris) Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.c | 14 ++ drivers/gpu/drm/i915/i915_drv.h | 24 +++- drivers/gpu/drm/i915/i915_perf.c | 273 +++ include/uapi/drm/i915_drm.h | 9 +- 4 files changed, 296 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 06c7b55..2140fa7 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -876,6 +876,18 @@ static void i915_driver_cleanup_early(struct drm_i915_private *dev_priv) i915_workqueues_cleanup(dev_priv); } +/** + * i915_driver_init_late - initialize late stage driver components + * @dev_priv: device private + * + * Setup the driver components, which need to be inited after driver state has + * been registered and device enabled. + */ +static int i915_driver_init_late(struct drm_i915_private *dev_priv) +{ + i915_perf_init_late(dev_priv); + return 0; +} + static int i915_mmio_setup(struct drm_device *dev) { struct drm_i915_private *dev_priv = to_i915(dev); @@ -1254,6 +1266,8 @@ int i915_driver_load(struct pci_dev *pdev, const struct pci_device_id *ent) i915_driver_register(dev_priv); + i915_driver_init_late(dev_priv); + intel_runtime_pm_enable(dev_priv); /* Everything is in place, we can now relax!
*/ diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e912679..557a124 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -42,6 +42,9 @@ #include #include #include +#include +#include +#include #include #include @@ -1843,6 +1846,9 @@ struct i915_perf_stream { /* Whether the OA unit is in use */ bool using_oa; + /* monotonic_raw clk timestamp (in ns) for last sample */ + u64 last_sample_ts; + const struct i915_perf_stream_ops *ops; }; @@ -1889,6 +1895,20 @@ struct i915_perf_cs_data_node { u32 tag; }; +/** + * struct i915_clock_info - decribes i915 timestamp clock + * + */ +struct i915_clock_info { + struct cyclecounter cc; + struct timecounter tc; + struct system_device_crosststamp xtstamp; + ktime_t clk_offset; /* Offset (in ns) between monoraw clk and gpu time */ + u32 timestamp_frequency; + u32 resync_period; /* in msecs */ + struct delayed_work clk_sync_work; +}; + struct drm_i915_private { struct drm_device drm; @@ -2189,6 +2209,8 @@ struct drm_i915_private { struct i915_runtime_pm pm; + struct i915_clock_info ts_clk_info; + struct { bool initialized; @@ -2213,7 +2235,6 @@ struct drm_i915_private { bool periodic; int period_exponent; - int timestamp_frequency; int tail_margin; @@ -3796,6 +3817,7 @@ int intel_engine_cmd_parser(struct intel_engine_cs *engine, /* i915_perf.c */ extern void i915_perf_init(struct drm_i915_private *dev_priv); +extern void i915_perf_init_late(struct drm_i915_private *dev_priv); extern void i915_perf_fini(struct drm_i915_private *dev_priv); extern void i915_perf_register(struct drm_i915_private *dev_priv); extern void i915_perf_unregister(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3e39d69..9bf0f90 100644
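Putting the pieces together, converting a raw GPU timestamp into a CLOCK_MONOTONIC_RAW value would look roughly like the sketch below. timecounter_cyc2time() scales counter cycles to ns on the GPU timeline, and clk_offset (maintained by the resync work) translates that onto the mono-raw timeline; the use of clk_offset here is inferred from the struct comments, so treat it as illustrative:

    static u64 gpu_ts_to_mono_raw_ns(struct drm_i915_private *dev_priv,
                                     u64 gpu_ts)
    {
            struct i915_clock_info *clk = &dev_priv->ts_clk_info;

            /* cycles -> ns on the gpu timeline */
            u64 gpu_ns = timecounter_cyc2time(&clk->tc, gpu_ts);

            /* shift onto the CLOCK_MONOTONIC_RAW timeline */
            return gpu_ns + ktime_to_ns(clk->clk_offset);
    }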
Re: [Intel-gfx] [PATCH 05/15] drm/i915: Handle the overflow condition for command stream buf
On Mon, 2016-11-07 at 03:10 -0800, Matthew Auld wrote: > On 4 November 2016 at 09:30, wrote: > > From: Sourab Gupta > > > > Add a compile time option for detecting the overflow condition of command > > stream buffer, and not overwriting the old entries in such a case. > > Also, set a status flag to forward the overflow condition to userspace if > > overflow is detected. > > > > Signed-off-by: Sourab Gupta > > --- > > drivers/gpu/drm/i915/i915_drv.h | 2 ++ > > drivers/gpu/drm/i915/i915_perf.c | 75 > > > > 2 files changed, 62 insertions(+), 15 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_drv.h > > b/drivers/gpu/drm/i915/i915_drv.h > > index dedb7f8..e9cf939 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.h > > +++ b/drivers/gpu/drm/i915/i915_drv.h > > @@ -2235,6 +2235,8 @@ struct drm_i915_private { > > struct drm_i915_gem_object *obj; > > struct i915_vma *vma; > > u8 *addr; > > +#define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) > > + u32 status; > > } command_stream_buf; > > > > struct list_head node_list; > > diff --git a/drivers/gpu/drm/i915/i915_perf.c > > b/drivers/gpu/drm/i915/i915_perf.c > > index 2ee4711..e10e78f 100644 > > --- a/drivers/gpu/drm/i915/i915_perf.c > > +++ b/drivers/gpu/drm/i915/i915_perf.c > > @@ -247,6 +247,9 @@ static u32 i915_perf_stream_paranoid = true; > > #define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) > > #define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) > > > > +/* For determining the behavior on overflow of command stream samples */ > > +#define CMD_STREAM_BUF_OVERFLOW_ALLOWED > By compile time option I sort of imagined this would be a kconfig > option, otherwise I would be expected to manually hack at this file > and carry around the local change ? Well, I intend to remove the compile time option and make the default behavior allow the buffer to overflow, to be consistent with the behavior of the periodic OA stream (Robert's patchset). Robert, what are your views here? Should the default behavior be to allow overflow (and possibly set a status flag informing userspace of the overflow)?
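For concreteness, the "allow overflow but tell userspace" behaviour being discussed could look like the following sketch on the insertion path (free_space() is a hypothetical helper; the status field and flag are the ones added by the patch):

    if (free_space(&dev_priv->perf.command_stream_buf) < sample_size) {
            /* sticky bit, reported to (and cleared by) the read() path */
            dev_priv->perf.command_stream_buf.status |=
                    I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW;
            /* default under discussion: keep writing, overwriting the
             * oldest unread entries rather than dropping new samples */
    }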
Re: [Intel-gfx] [PATCH v9 05/11] drm/i915: Add 'render basic' Haswell OA unit config
On Mon, 2016-11-07 at 11:49 -0800, Robert Bragg wrote: > Adds a static OA unit, MUX + B Counter configuration for basic render > metrics on Haswell. This is auto generated from an XML > description of metric sets, currently maintained in gputop, ref: > > https://github.com/rib/gputop > > gputop-data/oa-*.xml > > scripts/i915-perf-kernelgen.py > > $ make -C gputop-data -f Makefile.xml SYSFS=0 WHITELIST=RenderBasic > > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld > --- Looks good. Reviewed-by: Sourab Gupta
Re: [Intel-gfx] [PATCH v9 09/11] drm/i915: add dev.i915.oa_max_sample_rate sysctl
On Mon, 2016-11-07 at 11:49 -0800, Robert Bragg wrote: > The maximum OA sampling frequency is now configurable via a > dev.i915.oa_max_sample_rate sysctl parameter. > > Following the precedent set by perf's similar > kernel.perf_event_max_sample_rate the default maximum rate is 100000Hz > > Signed-off-by: Robert Bragg > --- > drivers/gpu/drm/i915/i915_perf.c | 61 > > 1 file changed, 50 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_perf.c > b/drivers/gpu/drm/i915/i915_perf.c > index e51c1d8..1a87fe9 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -82,6 +82,21 @@ static u32 i915_perf_stream_paranoid = true; > #define INVALID_CTX_ID 0xffffffff > > > +/* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate > + * > + * 160ns is the smallest sampling period we can theoretically program the OA > + * unit with on Haswell, corresponding to 6.25MHz. > + */ > +static int oa_sample_rate_hard_limit = 6250000; There's no check for 'oa_sample_rate_hard_limit' anywhere below. > + > +/* Theoretically we can program the OA unit to sample every 160ns but don't > + * allow that by default unless root... > + * > + * The default threshold of 100000Hz is based on perf's similar > + * kernel.perf_event_max_sample_rate sysctl parameter. > + */ > +static u32 i915_oa_max_sample_rate = 100000; > + > /* XXX: beware if future OA HW adds new report formats that the current > * code assumes all reports have a power-of-two size and ~(size - 1) can > * be used as a mask to align the OA tail pointer. > @@ -1314,6 +1329,7 @@ static int read_properties_unlocked(struct > drm_i915_private *dev_priv, > } > > for (i = 0; i < n_props; i++) { > + u64 oa_period, oa_freq_hz; > u64 id, value; > int ret; > > @@ -1359,21 +1375,35 @@ static int read_properties_unlocked(struct > drm_i915_private *dev_priv, > return -EINVAL; > } > > - /* NB: The exponent represents a period as follows: > - * > - * 80ns * 2^(period_exponent + 1) > - * > - * Theoretically we can program the OA unit to sample > + /* Theoretically we can program the OA unit to sample >* every 160ns but don't allow that by default unless >* root. >* > - * Referring to perf's > - * kernel.perf_event_max_sample_rate for a precedent > - * (100000 by default); with an OA exponent of 6 we get > - * a period of 10.240 microseconds -just under 100000Hz > + * On Haswell the period is derived from the exponent > + * as: > + * > + * period = 80ns * 2^(exponent + 1) > + */ > + BUILD_BUG_ON(sizeof(oa_period) != 8); > + oa_period = 80ull * (2ull << value); I assume now that there'll be a platform specific check for 80ull, while programming oa_period, for subsequent Gen8+ platforms, which should be fine. > + > + /* This check is primarily to ensure that oa_period <= > + * UINT32_MAX (before passing to do_div which only > + * accepts a u32 denominator), but we can also skip > + * checking anything < 1Hz which implicitly can't be > + * limited via an integer oa_max_sample_rate.
>*/ > - if (value < 6 && !capable(CAP_SYS_ADMIN)) { > - DRM_ERROR("Minimum OA sampling exponent is 6 > without root privileges\n"); > + if (oa_period <= NSEC_PER_SEC) { > + u64 tmp = NSEC_PER_SEC; > + do_div(tmp, oa_period); > + oa_freq_hz = tmp; > + } else > + oa_freq_hz = 0; > + > + if (oa_freq_hz > i915_oa_max_sample_rate && > + !capable(CAP_SYS_ADMIN)) { > + DRM_ERROR("OA exponent would exceed the max > sampling frequency (sysctl dev.i915.oa_max_sample_rate) %uHz without root > privileges\n", > + i915_oa_max_sample_rate); > return -EACCES; > } > > @@ -1481,6 +1511,15 @@ static struct ctl_table oa_table[] = { >.extra1 = &zero, >.extra2 = &one, >}, > + { > + .procname = "oa_max_sample_rate", > + .data = &i915_oa_max_sample_rate, > + .maxlen = sizeof(i915_oa_max_sample_rate), > + .mode = 0644, > + .proc_handler = proc_dointvec_minma
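Checking the arithmetic for the exponent-6 example mentioned in the comment:

    u64 oa_period  = 80ull * (2ull << 6);      /* 80ns * 2^7 = 10240 ns */
    u64 oa_freq_hz = NSEC_PER_SEC / oa_period; /* 97656 Hz */

    /* 97656 < 100000, so an unprivileged open with exponent 6 passes the
     * default dev.i915.oa_max_sample_rate check; exponent 5 would give
     * 195312 Hz and require CAP_SYS_ADMIN (or a raised sysctl value). */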
Re: [Intel-gfx] [PATCH v9 11/11] drm/i915: Add a kerneldoc summary for i915_perf.c
On Mon, 2016-11-07 at 11:49 -0800, Robert Bragg wrote: > In particular this tries to capture for posterity some of the early > challenges we had with using the core perf infrastructure in case we > ever want to revisit adapting perf for device metrics. > > Cc: Chris Wilson > Signed-off-by: Robert Bragg > Reviewed-by: Matthew Auld > --- Good summary of early challenges faced while adapting core perf. Reviewed-by: Sourab Gupta
Re: [Intel-gfx] [PATCH v9 09/11] drm/i915: add dev.i915.oa_max_sample_rate sysctl
On Tue, 2016-11-08 at 03:47 -0800, Robert Bragg wrote: > > > On Tue, Nov 8, 2016 at 6:19 AM, sourab gupta > wrote: > On Mon, 2016-11-07 at 11:49 -0800, Robert Bragg wrote: > > The maximum OA sampling frequency is now configurable via a > > dev.i915.oa_max_sample_rate sysctl parameter. > > > > Following the precedent set by perf's similar > > kernel.perf_event_max_sample_rate the default maximum rate > is 10Hz > > > > Signed-off-by: Robert Bragg > > --- > > drivers/gpu/drm/i915/i915_perf.c | 61 > > > 1 file changed, 50 insertions(+), 11 deletions(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_perf.c > b/drivers/gpu/drm/i915/i915_perf.c > > index e51c1d8..1a87fe9 100644 > > --- a/drivers/gpu/drm/i915/i915_perf.c > > +++ b/drivers/gpu/drm/i915/i915_perf.c > > @@ -82,6 +82,21 @@ static u32 i915_perf_stream_paranoid = > true; > > #define INVALID_CTX_ID 0x > > > > > > +/* For sysctl proc_dointvec_minmax of > i915_oa_max_sample_rate > > + * > > + * 160ns is the smallest sampling period we can > theoretically program the OA > > + * unit with on Haswell, corresponding to 6.25MHz. > > + */ > > +static int oa_sample_rate_hard_limit = 625; > There's no check for 'oa_sample_rate_hard_limit' anywhere > below. > > > It's in the struct ctl_table oa_table[] declaration of the > "oa_max_sample_rate" paramater, assigned to .extra2 which is > referenced by the proc_dointvec_minmax validation handler for the > parameter. > Ok. Seems fine then. > > > > + > > +/* Theoretically we can program the OA unit to sample every > 160ns but don't > > + * allow that by default unless root... > > + * > > + * The default threshold of 10Hz is based on perf's > similar > > + * kernel.perf_event_max_sample_rate sysctl parameter. > > + */ > > +static u32 i915_oa_max_sample_rate = 10; > > + > > /* XXX: beware if future OA HW adds new report formats that > the current > > * code assumes all reports have a power-of-two size and > ~(size - 1) can > > * be used as a mask to align the OA tail pointer. > > @@ -1314,6 +1329,7 @@ static int > read_properties_unlocked(struct drm_i915_private *dev_priv, > > } > > > > for (i = 0; i < n_props; i++) { > > + u64 oa_period, oa_freq_hz; > > u64 id, value; > > int ret; > > > > @@ -1359,21 +1375,35 @@ static int > read_properties_unlocked(struct drm_i915_private *dev_priv, > > return -EINVAL; > > } > > > > - /* NB: The exponent represents a > period as follows: > > - * > > - * 80ns * 2^(period_exponent + 1) > > - * > > - * Theoretically we can program the OA > unit to sample > > + /* Theoretically we can program the OA > unit to sample > >* every 160ns but don't allow that by > default unless > >* root. > >* > > - * Referring to perf's > > - * kernel.perf_event_max_sample_rate > for a precedent > > - * (10 by default); with an OA > exponent of 6 we get > > - * a period of 10.240 microseconds > -just under 10Hz > > + * On Haswell the period is derived > from the exponent > > + * as: > > + * > > + * period = 80ns * 2^(exponent + 1) > > + */ > > + BUILD_BUG_ON(sizeof(oa_period) != 8); > > + oa_period =
Re: [Intel-gfx] [PATCH v9 06/11] drm/i915: Enable i915 perf stream for Haswell OA unit
On Mon, 2016-11-07 at 11:49 -0800, Robert Bragg wrote: > Gen graphics hardware can be set up to periodically write snapshots of > performance counters into a circular buffer via its Observation > Architecture and this patch exposes that capability to userspace via > the > i915 perf interface. > > v2: >Make sure to initialize ->specific_ctx_id when opening, without >relying on _pin_notify hook, in case ctx already pinned. > v3: >Revert back to pinning ctx upfront when opening stream, removing >need to hook in to pinning and to update OACONTROL on the fly. > > Signed-off-by: Robert Bragg > Signed-off-by: Zhenyu Wang > Cc: Chris Wilson > Reviewed-by: Matthew Auld I have been working for quite some time on extending these interfaces for the use case of multiple concurrent streams (on different engines), and the infrastructure fits these use cases quite well. With Chris' comments addressed, the patch can have my r-b. Reviewed-by: Sourab Gupta
[Intel-gfx] [PATCH 02/11] drm/i915: Constrain intel_context::global_id to 20 bits
From: Robert Bragg This will allow the ID to be given to the HW as the unique context identifier that's written, for example, to the context status buffer on preemption and included in reports written by the OA unit. Cc: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_gem_context.c | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 6f38810..3a90e79 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -90,6 +90,10 @@ #include "i915_drv.h" #include "i915_trace.h" +/* With execlist scheduling we can program our own HW context ID but we + * are limited to 20 bits */ +#define I915_MAX_HW_CTX_ID ((1<<20)-1) + /* This is a HW constraint. The value below is the largest known requirement * I've seen in a spec to date, and that was a workaround for a non-shipping * part. It should be safe to decrease this, but it's more future proof as is. @@ -257,13 +261,8 @@ __create_hw_context(struct drm_device *dev, ctx->file_priv = file_priv; ctx->user_handle = ret; - /* TODO: If required, this global id can be used for programming the hw -* fields too. In that case, we'll have to take care of hw restrictions -* while allocating idr. e.g. for some hw, we may not have full 32 bits -* available. -*/ ret = idr_alloc_cyclic(&dev_priv->global_ctx_idr, - ctx, 0, 0, GFP_KERNEL); + ctx, 0, I915_MAX_HW_CTX_ID, GFP_KERNEL); if (ret < 0) goto err_out; -- 1.9.1
[Intel-gfx] [PATCH 00/11] Framework to collect gpu metrics using i915 perf infrastructure
From: Sourab Gupta

This series adds a framework for collection of gpu performance metrics associated with the command stream of a particular engine. These metrics include OA reports, timestamps, mmio metrics, etc. These metrics are collected around batchbuffer boundaries.

This work utilizes the underlying infrastructure introduced in Robert Bragg's patches for collecting periodic OA counter snapshots (based on Haswell): https://lists.freedesktop.org/archives/intel-gfx/2016-February/086909.html

This patch set is based on the Gen8+ version of Robert's patch series, which can be found here: https://github.com/rib/linux/tree/wip/rib/oa-next

These are not yet individually floated in the mailing list, which I hope doesn't lead to any significant loss of clarity in order to review the work proposed in this patch series.

Compared to the last series sent earlier, this series is based on the drm i915 ioctl based implementation (which can be referred to in Robert's work). As such, the design has been changed (and simplified) due to some earlier core perf assumptions going away. A few salient features are listed below:

* Ability to collect command stream based OA reports on the render engine, in conjunction with the periodic reports generated with the OA unit. These would be collected in separate buffers and forwarded to userspace in the respective timestamp order. The samples are differentiated in userspace by the value of the OA sample source field.

* Ability to collect timestamps and mmio metrics, associated with the command stream of any particular gpu engine. The particular sample metrics to be collected are requested by the userspace client in the properties associated with the stream being opened. The samples generated depend on the original sample flags requested in the stream properties.

* Ability to collect associated metadata information with the samples, such as pid, tags, etc. These are collected at the time of inserting the commands into the command stream of the particular gpu engine, and forwarded along with the samples.

* Multiple streams belonging to different engines can be opened concurrently (while restricting each engine to a single open stream). This allows us to simultaneously open streams belonging to different gpu engines to collect samples belonging to all of them concurrently.

* The different stages of a single workload (belonging to a single context) can be delimited by using the 'execbuffer tagging' mechanism introduced here. For example, for the media pipeline, the CodecHAL encoding stage has a single context and involves multiple stages such as Scaling, ME, MBEnc, PAK, for which there are separate execbuffer calls. There is a need for the generated samples to carry such information, so as to be able to associate them with the particular workload stage. The presence of a tag sample_type, which is passed in by userspace during the execbuffer ioctl, fulfills this requirement.

I am looking for feedback on the design proposed here, particularly pertaining to the mechanics of metrics collection through insertion of commands in the command stream of associated gpu engines, sample generation according to the requested sample flags in stream properties, concurrent operation of different streams to collect the samples from multiple gpu engines, and any such design/implementation aspects per se.
A few open issues which I'm working on include:

* In case both the timestamp and OA sample types are requested for the render engine, the ts information should be derived from the OA report alone, and we should not need to insert separate commands for dumping timestamps. We do, though, need to apply the relevant timestamp base conversion to convert OA timestamps into ns.

* Sample consistency has to be maintained between the periodic OA reports and the ones generated by the command stream. This implies, for example, that if the pid sample_type is requested, the most recent pid collected in the CS samples should be used to populate the relevant field in the periodic samples. Likewise, the 'ctx_id' field needs to be deduced from the periodic OA reports and mapped to 'intel_context::global_id'.

These open issues, though, shouldn't distract us too much from reviewing the general mechanism proposed here; they can be ironed out subsequently if there's general agreement on the design.

Also, one of the prerequisites for this work is the presence of a globally unique id associated with each context. The present context id is specific to a drm fd, and as such it can't be used to globally associate the generated reports with the corresponding context scheduled from userspace. The first few patches in the series introduce the globally unique context id, and subsequent ones introduce the framework for collection of metrics.

Robert Bragg (2): drm/i915: Constrain
[Intel-gfx] [PATCH 01/11] drm/i915: Introduce global id for contexts
From: Sourab Gupta The current context user handles are specific to a drm file instance. There are some use cases which may require a global id for the contexts. For example, a system-level GPU profiler tool may lean on the global context ids to associate performance snapshots with individual contexts. This global id may also be used further in order to provide a unique context id to hw. In this patch, the global ids are allocated from a separate cyclic idr and can be further utilized for any use case described above. v2: According to Chris' suggestion, implemented a separate idr for holding global ids for contexts, as opposed to overloading the file-specific ctx->user_handle for this purpose. This global id can also further be used wherever hw has to be programmed with a unique ctx id, though this patch just introduces the global id as such. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/i915_gem_context.c | 21 + 2 files changed, 24 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 3ea4656..c409c8f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -875,6 +875,7 @@ struct i915_ctx_hang_stats { struct intel_context { struct kref ref; int user_handle; + int global_id; uint8_t remap_slice; struct drm_i915_private *i915; int flags; @@ -1868,6 +1869,8 @@ struct drm_i915_private { bool preserve_bios_swizzle; + struct idr global_ctx_idr; + /* overlay */ struct intel_overlay *overlay; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index a53f591..6f38810 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -256,6 +256,18 @@ __create_hw_context(struct drm_device *dev, ctx->file_priv = file_priv; ctx->user_handle = ret; + + /* TODO: If required, this global id can be used for programming the hw +* fields too. In that case, we'll have to take care of hw restrictions +* while allocating idr. e.g. for some hw, we may not have full 32 bits +* available. +*/ + ret = idr_alloc_cyclic(&dev_priv->global_ctx_idr, + ctx, 0, 0, GFP_KERNEL); + if (ret < 0) + goto err_out; + + ctx->global_id = ret; /* NB: Mark all slices as needing a remap so that when the context first * loads it will restore whatever remap state already exists. If there * is no remap info, it will be a NOP. */ @@ -280,6 +292,7 @@ i915_gem_create_context(struct drm_device *dev, struct drm_i915_file_private *file_priv) { const bool is_global_default_ctx = file_priv == NULL; + struct drm_i915_private *dev_priv = dev->dev_private; struct intel_context *ctx; int ret = 0; @@ -326,6 +339,7 @@ err_unpin: i915_gem_object_ggtt_unpin(ctx->legacy_hw_ctx.rcs_state); err_destroy: idr_remove(&file_priv->context_idr, ctx->user_handle); + idr_remove(&dev_priv->global_ctx_idr, ctx->global_id); i915_gem_context_unreference(ctx); return ERR_PTR(ret); } @@ -389,6 +403,7 @@ int i915_gem_context_init(struct drm_device *dev) dev_priv->hw_context_size = 0; } } + idr_init(&dev_priv->global_ctx_idr); ctx = i915_gem_create_context(dev, NULL); if (IS_ERR(ctx)) { @@ -416,6 +431,8 @@ void i915_gem_context_fini(struct drm_device *dev) struct intel_context *dctx = dev_priv->ring[RCS].default_context; int i; + idr_destroy(&dev_priv->global_ctx_idr); + if (dctx->legacy_hw_ctx.rcs_state) { /* The only known way to stop the gpu from accessing the hw context is * to reset it.
Do this as the very last operation to avoid confusing @@ -478,6 +495,8 @@ static int context_idr_cleanup(int id, void *p, void *data) { struct intel_context *ctx = p; + idr_remove(&ctx->i915->global_ctx_idr, ctx->global_id); + i915_gem_context_unreference(ctx); return 0; } @@ -890,6 +909,7 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data, { struct drm_i915_gem_context_destroy *args = data; struct drm_i915_file_private *file_priv = file->driver_priv; + struct drm_i915_private *dev_priv = dev->dev_private; struct intel_context *ctx; int ret; @@ -907,6 +927,7 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data, } idr_remove(&ctx->file_priv->context_idr, ctx->user_handle); + idr_remove(&dev_priv->global_ct
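With the global idr in place, a kernel-side consumer (e.g. code filtering OA reports by context) can resolve an ID back to its context with a plain idr lookup; a minimal sketch:

    static struct intel_context *
    lookup_ctx_by_global_id(struct drm_i915_private *dev_priv, u32 global_id)
    {
            /* returns NULL if the id has already been released */
            return idr_find(&dev_priv->global_ctx_idr, global_id);
    }

Using idr_alloc_cyclic() (rather than idr_alloc()) also means recently freed ids are not immediately reused, which reduces the chance of misattributing buffered reports that still carry an old context's id.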
[Intel-gfx] [PATCH 03/11] drm/i915: return ctx->global_id from intel_execlists_ctx_id()
From: Robert Bragg The newly added intel_context::global_id is suitable (a globally unique 20 bit ID) for giving to the hardware as a unique context identifier. Compared to using the pinned address of a logical ring context these IDs are constant for the lifetime of a context whereas a context could be repinned at different addresses during its lifetime. Having a stable ID is useful when we need to buffer information associated with a context based on this ID so the association can't be lost. For example the OA unit writes out counter reports to a circular buffer tagged with this ID and we want to be able to accurately filter reports for a specific context, ideally without the added complexity of tracking context re-pinning while the OA buffer may contain reports with older IDs. Cc: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_debugfs.c | 7 --- drivers/gpu/drm/i915/intel_lrc.c| 22 ++ drivers/gpu/drm/i915/intel_lrc.h| 2 +- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 8aab974..ff4a6fe 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1970,6 +1970,7 @@ static int i915_context_status(struct seq_file *m, void *unused) static void i915_dump_lrc_obj(struct seq_file *m, struct intel_engine_cs *ring, + struct intel_context *ctx, struct drm_i915_gem_object *ctx_obj) { struct page *page; @@ -1984,7 +1985,7 @@ static void i915_dump_lrc_obj(struct seq_file *m, } seq_printf(m, "CONTEXT: %s %u\n", ring->name, - intel_execlists_ctx_id(ctx_obj)); + intel_execlists_ctx_id(ctx)); if (!i915_gem_obj_ggtt_bound(ctx_obj)) seq_puts(m, "\tNot bound in GGTT\n"); @@ -2033,7 +2034,7 @@ static int i915_dump_lrc(struct seq_file *m, void *unused) list_for_each_entry(ctx, &dev_priv->context_list, link) { for_each_ring(ring, dev_priv, i) { if (ring->default_context != ctx) - i915_dump_lrc_obj(m, ring, + i915_dump_lrc_obj(m, ring, ctx, ctx->engine[i].state); } } @@ -2112,7 +2113,7 @@ static int i915_execlists(struct seq_file *m, void *data) ctx_obj = head_req->ctx->engine[ring_id].state; seq_printf(m, "\tHead request id: %u\n", - intel_execlists_ctx_id(ctx_obj)); + intel_execlists_ctx_id(head_req->ctx)); seq_printf(m, "\tHead request tail: %u\n", head_req->tail); } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 40bda8d..4789555 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -260,7 +260,7 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists /** * intel_execlists_ctx_id() - get the Execlists Context ID - * @ctx_obj: Logical Ring Context backing object. + * @ctx: LR context * * Do not confuse with ctx->id! Unfortunately we have a name overload * here: the old context ID we pass to userspace as a handler so that @@ -269,15 +269,15 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists * interrupts. * * Return: 20-bits globally unique context ID. + * + * Further the ID given to HW can now be relied on to be constant for + * the lifetime of the context, unlike previously when we used an + * associated logical ring context address (which could be repinned at + * a different address). 
*/ -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj) +u32 intel_execlists_ctx_id(struct intel_context *ctx) { - u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) + - LRC_PPHWSP_PN * PAGE_SIZE; - - /* LRCA is required to be 4K aligned so the more significant 20 bits -* are globally unique */ - return lrca >> 12; + return ctx->global_id; } static bool disable_lite_restore_wa(struct intel_engine_cs *ring) @@ -305,7 +305,7 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx, desc |= GEN8_CTX_L3LLC_COHERENT; desc |= GEN8_CTX_PRIVILEGE; desc |= lrca; - desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT; + desc |= (u64)intel_execlists_ctx_id(ctx) << GEN8_CTX_ID_SHIFT; /* TODO: WaDisableLiteRestore when we start using semaphore * signalling between Command Streamer
[Intel-gfx] [PATCH 04/11] drm/i915: Add ctx getparam ioctl parameter to retrieve ctx global id
From: Sourab Gupta This patch adds a new ctx getparam ioctl parameter, which userspace can use to retrieve the ctx global_id of any particular ctx. Userspace can use this to map i915 perf samples to their particular ctx's, since those samples carry ctx global_ids. Otherwise userspace has no way of maintaining this association, since it only knows the per-drm-file specific ctx handles. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_gem_context.c | 3 +++ include/uapi/drm/i915_drm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 3a90e79..18c545c 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -960,6 +960,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_NO_ZEROMAP: args->value = ctx->flags & CONTEXT_NO_ZEROMAP; break; + case I915_CONTEXT_PARAM_GLOBAL_ID: + args->value = ctx->global_id; + break; default: ret = -EINVAL; break; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 4a67895..e1f13b4 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1131,6 +1131,7 @@ struct drm_i915_gem_context_param { __u64 param; #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1 #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 +#define I915_CONTEXT_PARAM_GLOBAL_ID 0x3 __u64 value; }; -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
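A hedged userspace sketch of exercising the new parameter (assuming libdrm's drmIoctl() and the I915_CONTEXT_PARAM_GLOBAL_ID define added above; the helper name is illustrative):

    #include <errno.h>
    #include <stdint.h>
    #include <string.h>
    #include <xf86drm.h>
    #include <i915_drm.h>

    /* Query the global id of a context created on this fd. 'ctx_id' is the
     * per-fd handle from context create; the returned global id is the one
     * that i915 perf samples would carry.
     */
    static int get_ctx_global_id(int fd, uint32_t ctx_id, uint64_t *global_id)
    {
            struct drm_i915_gem_context_param p;

            memset(&p, 0, sizeof(p));
            p.ctx_id = ctx_id;
            p.param = I915_CONTEXT_PARAM_GLOBAL_ID;

            if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p))
                    return -errno;

            *global_id = p.value;
            return 0;
    }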
[Intel-gfx] [PATCH 10/11] drm/i915: Support opening multiple concurrent perf streams
From: Sourab Gupta This patch adds support for opening multiple concurrent perf streams for different gpu engines, while retaining the restriction that only a single stream can be open for a particular gpu engine at any time. This enables a userspace client to open multiple streams, one per engine, at any time to capture sample data for multiple gpu engines. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_perf.c | 65 +++- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index b1c952c..bf65acb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2086,7 +2086,7 @@ struct drm_i915_private { spinlock_t hook_lock; struct hrtimer poll_check_timer; - struct i915_perf_stream *exclusive_stream; + struct i915_perf_stream *exclusive_stream[I915_NUM_RINGS]; wait_queue_head_t poll_wq[I915_NUM_RINGS]; struct { diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 1d2712d..3eb56d4 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1061,7 +1061,7 @@ static void i915_perf_stream_destroy(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - BUG_ON(stream != dev_priv->perf.exclusive_stream); + BUG_ON(stream != dev_priv->perf.exclusive_stream[stream->ring_id]); if (stream->using_oa) { dev_priv->perf.oa.ops.disable_metric_set(dev_priv); @@ -1075,7 +1075,7 @@ static void i915_perf_stream_destroy(struct i915_perf_stream *stream) if (stream->cs_mode) free_command_stream_buf(dev_priv, stream->ring_id); - dev_priv->perf.exclusive_stream = NULL; + dev_priv->perf.exclusive_stream[stream->ring_id] = NULL; } static void *vmap_oa_buffer(struct drm_i915_gem_object *obj) @@ -1434,17 +1434,17 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv) { assert_spin_locked(&dev_priv->perf.hook_lock); - if (dev_priv->perf.exclusive_stream->enabled) { + if (dev_priv->perf.exclusive_stream[RCS]->enabled) { unsigned long ctx_id = 0; bool pinning_ok = false; - if (dev_priv->perf.exclusive_stream->ctx && + if (dev_priv->perf.exclusive_stream[RCS]->ctx && dev_priv->perf.oa.specific_ctx_id) { ctx_id = dev_priv->perf.oa.specific_ctx_id; pinning_ok = true; } - if (dev_priv->perf.exclusive_stream->ctx == NULL || + if (dev_priv->perf.exclusive_stream[RCS]->ctx == NULL || pinning_ok) { bool periodic = dev_priv->perf.oa.periodic; u32 period_exponent = dev_priv->perf.oa.period_exponent; @@ -1556,14 +1556,6 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream, SAMPLE_TS); int ret; - /* To avoid the complexity of having to accurately filter -* counter reports and marshal to the appropriate client -* we currently only allow exclusive access */ - if (dev_priv->perf.exclusive_stream) { - DRM_ERROR("Stream already in use\n"); - return -EBUSY; - } - /* Ctx Id can be sampled in HSW only through command streamer mode */ if (IS_HASWELL(dev_priv->dev) && (props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) { @@ -1576,6 +1568,13 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream, if (require_oa_unit) { int format_size; + + /* Only allow exclusive access per stream */ + if (dev_priv->perf.exclusive_stream[RCS]) { + DRM_ERROR("Stream:0 already in use\n"); + return -EBUSY; + } + if (!dev_priv->perf.oa.ops.init_oa_buffer) { DRM_ERROR("OA unit not supported\n"); return -ENODEV; @@ -1673,6 +1672,13 @@ static int i915_perf_stream_init(struct
i915_perf_stream *stream, } if (props->cs_mode) { + /* Only allow exclusive access per stream */ + if (dev_priv->perf.exclusive_stream[props->ring_id]) { + DRM_ERROR("Stream:%d already in use\n", props->ring_id); + ret = -EBUSY; + goto cs_error; + } + /* * The
[Intel-gfx] [PATCH 07/11] drm/i915: Add support for having pid output with OA report
From: Sourab Gupta This patch introduces flags and adds support for having pid output with the OA reports generated through the RCS commands. When the stream is opened with pid sample type, the pid information is also captured through the command stream samples and forwarded along with the OA reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_perf.c | 39 ++- include/uapi/drm/i915_drm.h | 7 +++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 44fcbf4..a8b374f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1808,6 +1808,7 @@ struct i915_perf_cs_data_node { struct drm_i915_gem_request *request; u32 offset; u32 ctx_id; + u32 pid; }; struct drm_i915_private { diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2904745..ea331eb 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -55,6 +55,7 @@ static u32 i915_perf_stream_paranoid = true; struct oa_sample_data { u32 source; u32 ctx_id; + u32 pid; const u8 *report; }; @@ -96,6 +97,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_OA_REPORT (1<<0) #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) +#define SAMPLE_PID (1<<3) struct perf_open_properties { @@ -255,6 +257,7 @@ static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req) } entry->ctx_id = ctx->global_id; + entry->pid = current->pid; i915_gem_request_assign(&entry->request, req); insert_perf_entry(dev_priv, entry); @@ -405,6 +408,12 @@ static bool append_oa_sample(struct i915_perf_stream *stream, read_state->buf += 4; } + if (sample_flags & SAMPLE_PID) { + if (copy_to_user(read_state->buf, &data->pid, 4)) + return false; + read_state->buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(read_state->buf, data->report, report_size)) return false; @@ -446,6 +455,10 @@ static bool append_oa_buffer_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_CTX_ID) data.ctx_id = 0; +#warning "FIXME: append_oa_buffer_sample: deduce pid for periodic samples based on most recent RCS pid for ctx" + if (sample_flags & SAMPLE_PID) + data.pid = 0; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -685,6 +698,9 @@ static bool append_oa_rcs_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_CTX_ID) data.ctx_id = node->ctx_id; + if (sample_flags & SAMPLE_PID) + data.pid = node->pid; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1307,6 +1323,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, struct drm_i915_private *dev_priv = stream->dev_priv; bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT | SAMPLE_OA_SOURCE_INFO); + bool require_cs_mode = props->sample_flags & SAMPLE_PID; int format_size; int ret; @@ -1416,8 +1433,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, require_cs_mode = true; } + if (require_cs_mode && !props->cs_mode) { + DRM_ERROR("PID sampling requires a ring to be specified"); + ret = -EINVAL; + goto cs_error; + } + if (props->cs_mode) { - if (!(props->sample_flags & SAMPLE_CTX_ID)) { + /* +* The only time we should allow enabling CS mode if it's not +* strictly required, is if SAMPLE_CTX_ID has been requested +* as it's usable with periodic OA or CS sampling. 
+*/ + if (!require_cs_mode && + !(props->sample_flags & SAMPLE_CTX_ID)) { DRM_ERROR( "Ring given without requesting any CS data to sample"); ret = -EINVAL; @@ -1426,6 +1455,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, stream->cs_mode = true; + if (props->sample_flags & SAMPLE_PID) { + stream->sample_flags |= SAMPLE_PID; +
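To illustrate the resulting stream format, a hypothetical reader-side sketch: optional fields are packed after each record header in a fixed order (source, ctx_id, pid, then the raw OA report), each present only if the corresponding SAMPLE_* flag was requested at open time. The SAMPLE_* values are copied from the patch; the header struct is an assumption modelled on the series:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define SAMPLE_OA_REPORT      (1<<0)
    #define SAMPLE_OA_SOURCE_INFO (1<<1)
    #define SAMPLE_CTX_ID         (1<<2)
    #define SAMPLE_PID            (1<<3)

    struct sample_header {        /* assumed layout */
            uint32_t type;
            uint16_t pad;
            uint16_t size;
    };

    struct parsed_sample {
            uint32_t source, ctx_id, pid;
            const uint8_t *report;
    };

    /* Walk one sample record; returns the start of the next record. */
    static const uint8_t *parse_sample(const uint8_t *buf, uint32_t flags,
                                       size_t report_size,
                                       struct parsed_sample *out)
    {
            const uint8_t *p = buf + sizeof(struct sample_header);

            if (flags & SAMPLE_OA_SOURCE_INFO) {
                    memcpy(&out->source, p, 4);
                    p += 4;
            }
            if (flags & SAMPLE_CTX_ID) {
                    memcpy(&out->ctx_id, p, 4);
                    p += 4;
            }
            if (flags & SAMPLE_PID) {
                    memcpy(&out->pid, p, 4);
                    p += 4;
            }
            if (flags & SAMPLE_OA_REPORT) {
                    out->report = p;
                    p += report_size;
            }
            return p;
    }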
[Intel-gfx] [PATCH 05/11] drm/i915: Expose OA sample source to userspace
From: Sourab Gupta This patch exposes a new sample source field to userspace. This field can be populated to specify the origin of the OA report. For example, for internally triggered reports (non-MI_RPC reports), the RPT_ID field has bitfields specifying the origin, such as timer or render ctx switch. Likewise, this field can be used to specify the source as MI_RPC when such support is added. Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_perf.c | 50 +--- include/uapi/drm/i915_drm.h | 16 + 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index c5447b4..06de4b3 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -44,6 +44,13 @@ static u32 i915_perf_stream_paranoid = true; #define OA_EXPONENT_MAX 0x3f +#define GEN8_OAREPORT_REASON_TIMER (1<<19) +#define GEN8_OAREPORT_REASON_TRIGGER1 (1<<20) +#define GEN8_OAREPORT_REASON_TRIGGER2 (1<<21) +#define GEN8_OAREPORT_REASON_CTX_SWITCH (1<<22) +#define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) +#define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) + /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ static int zero; static int oa_exponent_max = OA_EXPONENT_MAX; @@ -79,7 +86,8 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { [I915_OA_FORMAT_C4_B8] = { 7, 64 }, }; -#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_SOURCE_INFO (1<<1) struct perf_open_properties { @@ -149,6 +157,27 @@ static bool append_oa_sample(struct i915_perf_stream *stream, copy_to_user(read_state->buf, &header, sizeof(header)); read_state->buf += sizeof(header); + if (sample_flags & SAMPLE_OA_SOURCE_INFO) { + enum drm_i915_perf_oa_event_source source; + + if (INTEL_INFO(dev_priv)->gen >= 8) { + u32 reason = *(u32 *)report; + + if (reason & GEN8_OAREPORT_REASON_CTX_SWITCH) + source = + I915_PERF_OA_EVENT_SOURCE_CONTEXT_SWITCH; + else if (reason & GEN8_OAREPORT_REASON_TIMER) + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + else + source = I915_PERF_OA_EVENT_SOURCE_UNDEFINED; + } else + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + + if (copy_to_user(read_state->buf, &source, 4)) + return false; + read_state->buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { copy_to_user(read_state->buf, report, report_size); read_state->buf += report_size; @@ -841,11 +870,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, int format_size; int ret; - if (!(props->sample_flags & SAMPLE_OA_REPORT)) { - DRM_ERROR("Only OA report sampling supported\n"); - return -EINVAL; - } - if (!dev_priv->perf.oa.ops.init_oa_buffer) { DRM_ERROR("OA unit not supported\n"); return -ENODEV; @@ -873,8 +897,15 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size; - stream->sample_flags |= SAMPLE_OA_REPORT; - stream->sample_size += format_size; + if (props->sample_flags & SAMPLE_OA_REPORT) { + stream->sample_flags |= SAMPLE_OA_REPORT; + stream->sample_size += format_size; + } + + if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { + stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; + stream->sample_size += 4; + } dev_priv->perf.oa.oa_buffer.format_size = format_size; BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0); @@ -1480,6 +1511,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, props->oa_periodic = true; props->oa_period_exponent = value; break; + case
DRM_I915_PERF_SAMPLE_OA_SOURCE_PROP: + props->sample_flags |= SAMPLE_OA_SOURCE_INFO; + break; case DRM_I915_PERF_PROP_MAX: BUG(); } diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index e1f13b4..6dbaa6d 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@
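The source derivation itself is compact; a condensed standalone sketch of the Gen8+ reason decoding shown in append_oa_sample() above (defines copied from the patch, function names illustrative):

    #include <stdint.h>

    #define GEN8_OAREPORT_REASON_TIMER      (1<<19)
    #define GEN8_OAREPORT_REASON_CTX_SWITCH (1<<22)

    /* The first dword of a Gen8+ OA report is the RPT_ID field, whose
     * reason bits identify what triggered the report.
     */
    static int oa_report_is_periodic(const uint32_t *report)
    {
            return (report[0] & GEN8_OAREPORT_REASON_TIMER) != 0;
    }

    static int oa_report_is_ctx_switch(const uint32_t *report)
    {
            return (report[0] & GEN8_OAREPORT_REASON_CTX_SWITCH) != 0;
    }

On Gen7 (HSW) there are no reason bits, which is why the patch unconditionally reports I915_PERF_OA_EVENT_SOURCE_PERIODIC there.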
[Intel-gfx] [PATCH 09/11] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines
From: Sourab Gupta This patch extends the i915 perf framework to handle the perf sample collection for any given gpu engine. Particularly, the support for collecting timestamp sample type is added, which can be requested for any engine. The thing to note is that still only a single stream instance can be opened at any particular time, though that stream may now be opened for any gpu engine, for collection of timestamp samples. So, this patch doesn't add support for opening multiple concurrent streams yet, though it lays the groundwork for this support to be added subsequently. Part of this groundwork involves having separate command stream buffers, per engine, for holding the samples generated. Likewise for a few other data structures maintaining per-engine state. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 33 ++- drivers/gpu/drm/i915/i915_perf.c | 578 +++ drivers/gpu/drm/i915/i915_reg.h | 2 + include/uapi/drm/i915_drm.h | 7 + 4 files changed, 431 insertions(+), 189 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index cf86228..b1c952c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1725,6 +1725,7 @@ struct i915_perf_stream { struct list_head link; + enum intel_ring_id ring_id; u32 sample_flags; int sample_size; @@ -1734,6 +1735,9 @@ struct i915_perf_stream { /* Whether command stream based data collection is enabled */ bool cs_mode; + /* Whether the OA unit is in use */ + bool using_oa; + /* Enables the collection of HW samples, either in response to * I915_PERF_IOCTL_ENABLE or implicitly called when stream is * opened without I915_PERF_FLAG_DISABLED */ @@ -1782,7 +1786,8 @@ struct i915_perf_stream { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine.
*/ - void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); + void (*command_stream_hook)(struct i915_perf_stream *stream, + struct drm_i915_gem_request *req, u32 tag); }; struct i915_oa_ops { @@ -1807,7 +1812,16 @@ struct i915_oa_ops { struct i915_perf_cs_data_node { struct list_head link; struct drm_i915_gem_request *request; - u32 offset; + + /* Offsets into the GEM obj holding the data */ + u32 start_offset; + u32 oa_offset; + u32 ts_offset; + + /* buffer size corresponding to this entry */ + u32 size; + + /* Other metadata */ u32 ctx_id; u32 pid; u32 tag; @@ -2071,14 +2085,13 @@ struct drm_i915_private { spinlock_t hook_lock; - struct { - struct i915_perf_stream *exclusive_stream; + struct hrtimer poll_check_timer; + struct i915_perf_stream *exclusive_stream; + wait_queue_head_t poll_wq[I915_NUM_RINGS]; + struct { u32 specific_ctx_id; - struct hrtimer poll_check_timer; - wait_queue_head_t poll_wq; - bool periodic; u32 period_exponent; @@ -2115,10 +2128,10 @@ struct drm_i915_private { struct drm_i915_gem_object *obj; struct i915_vma *vma; u8 *addr; - } command_stream_buf; + } command_stream_buf[I915_NUM_RINGS]; - struct list_head node_list; - spinlock_t node_list_lock; + struct list_head node_list[I915_NUM_RINGS]; + spinlock_t node_list_lock[I915_NUM_RINGS]; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 141f721..1d2712d 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -51,12 +51,17 @@ static u32 i915_perf_stream_paranoid = true; #define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) #define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) -/* Data common to periodic and RCS based samples */ -struct oa_sample_data { +#define OA_ADDR_ALIGN 64 +#define TS_ADDR_ALIGN 8 +#define I915_PERF_TS_SAMPLE_SIZE 8 + +/* Data common to all samples (periodic OA / CS based OA / Timestamps) */ +struct sample_data { u32 source; u32 ctx_id; u32 pid; u32 tag; + u64 ts; const u8 *report; }; @@ -100,6 +105,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) +#define SAMPLE_TS (1<<5)
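A sketch of how a command stream buffer entry could lay out its OA report and timestamp under the alignment constants above (OA_ADDR_ALIGN, TS_ADDR_ALIGN and I915_PERF_TS_SAMPLE_SIZE are from the patch; the helper and its signature are illustrative stand-ins for the driver's insert_perf_entry() logic):

    #include <linux/kernel.h>   /* ALIGN() */
    #include <linux/types.h>

    #define OA_ADDR_ALIGN            64
    #define TS_ADDR_ALIGN            8
    #define I915_PERF_TS_SAMPLE_SIZE 8

    /* Lay out one entry starting at 'start' in the per-engine buffer;
     * returns the total size consumed. OA reports need 64-byte aligned
     * addresses, timestamps 8-byte aligned ones.
     */
    static u32 example_layout_entry(u32 start, u32 report_size,
                                    bool sample_oa, bool sample_ts,
                                    u32 *oa_offset, u32 *ts_offset)
    {
            u32 offset = start;

            if (sample_oa) {
                    *oa_offset = ALIGN(offset, OA_ADDR_ALIGN);
                    offset = *oa_offset + report_size;
            }
            if (sample_ts) {
                    *ts_offset = ALIGN(offset, TS_ADDR_ALIGN);
                    offset = *ts_offset + I915_PERF_TS_SAMPLE_SIZE;
            }
            return offset - start;
    }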
[Intel-gfx] [PATCH 06/11] drm/i915: Framework for capturing command stream based OA reports
From: Sourab Gupta This patch introduces a framework to enable OA counter reports associated with the Render command stream. We can then associate the reports captured through this mechanism with their corresponding context id's. This can be further extended to associate any other metadata information with the corresponding samples (since the association with the Render command stream gives us the ability to capture this information while inserting the corresponding capture commands into the command stream). The OA reports generated in this way are associated with a corresponding workload, and thus can be used to delimit the workload (i.e. sample the counters at the workload boundaries), within an ongoing stream of periodic counter snapshots. There may be use cases wherein we need more than the periodic OA capture mode which is supported currently. This mode is primarily used for two use cases: - Ability to capture system-wide metrics, along with the ability to map the reports back to individual contexts (particularly for HSW). - Ability to inject tags for work into the reports. This provides visibility into the multiple stages of work within a single context. Userspace will be able to distinguish between the periodic and CS based OA reports by virtue of the source_info sample field. The commands to capture OA reports are inserted at BB boundaries. The metadata pertaining to a snapshot is maintained in a list, where each node also holds a corresponding gem request field, which is tracked for completion of the command. Both periodic and RCS based reports are associated with a single stream (corresponding to the render engine), and the samples will be forwarded to userspace in sequential order according to their timestamps. v2: Aligning with the non-perf interface (custom drm ioctl based). Also, a few related patches are squashed together for better readability Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_drv.h| 35 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 + drivers/gpu/drm/i915/i915_perf.c | 733 + drivers/gpu/drm/i915/intel_lrc.c | 4 + include/uapi/drm/i915_drm.h| 15 + 5 files changed, 690 insertions(+), 101 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c409c8f..44fcbf4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1730,6 +1730,9 @@ struct i915_perf_stream { struct intel_context *ctx; bool enabled; + /* Whether command stream based data collection is enabled */ + bool cs_mode; + /* Enables the collection of HW samples, either in response to * I915_PERF_IOCTL_ENABLE or implicitly called when stream is * opened without I915_PERF_FLAG_DISABLED */ @@ -1773,6 +1776,12 @@ struct i915_perf_stream { * * The stream will always be disabled before this is called */ void (*destroy)(struct i915_perf_stream *stream); + + /* +* Routine to emit the commands in the command streamer associated +* with the corresponding gpu engine. +*/ + void (*command_stream_hook)(struct drm_i915_gem_request *req); }; struct i915_oa_ops { @@ -1786,10 +1795,21 @@ struct i915_oa_ops { u32 ctx_id); void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req); void (*read)(struct i915_perf_stream *stream, -struct i915_perf_read_state *read_state); +struct i915_perf_read_state *read_state, u32 ts); bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); }; +/* + * List element to hold info about the perf sample data associated + * with a particular GPU command stream.
+ */ +struct i915_perf_cs_data_node { + struct list_head link; + struct drm_i915_gem_request *request; + u32 offset; + u32 ctx_id; +}; + struct drm_i915_private { struct drm_device *dev; struct kmem_cache *objects; @@ -2042,6 +2062,8 @@ struct drm_i915_private { struct ctl_table_header *sysctl_header; struct mutex lock; + + struct mutex streams_lock; struct list_head streams; spinlock_t hook_lock; @@ -2084,6 +2106,16 @@ struct drm_i915_private { const struct i915_oa_format *oa_formats; int n_builtin_sets; } oa; + + /* Command stream based perf data buffer */ + struct { + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + u8 *addr; + } command_stream_buf; + + struct list_head node_list; + spinlock_t node_list_lock; } perf; /* Abstract the submission mech
[Intel-gfx] [PATCH 08/11] drm/i915: Add support to add execbuffer tags to OA counter reports
From: Sourab Gupta This patch enables userspace to specify tags (per workload), provided via the execbuffer ioctl, which can be added to OA reports, to help associate reports with the corresponding workloads. There may be multiple stages within a single context, from a userspace perspective. An ability is needed to individually associate the OA reports with their corresponding workloads (execbuffers), which may not be possible solely with ctx_id or pid information. This patch enables such a mechanism. In this patch, the upper 32 bits of the rsvd1 field, which were previously unused, are now used to pass in the tag. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h| 6 +++-- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 5 +++-- drivers/gpu/drm/i915/i915_perf.c | 36 +- drivers/gpu/drm/i915/intel_lrc.c | 4 ++-- include/uapi/drm/i915_drm.h| 12 ++ 5 files changed, 52 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index a8b374f..cf86228 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1701,6 +1701,7 @@ struct i915_execbuffer_params { struct drm_i915_gem_object *batch_obj; struct intel_context*ctx; struct drm_i915_gem_request *request; + uint32_ttag; }; struct i915_oa_format { @@ -1781,7 +1782,7 @@ struct i915_perf_stream { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req); + void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); }; struct i915_oa_ops { @@ -1809,6 +1810,7 @@ struct i915_perf_cs_data_node { u32 offset; u32 ctx_id; u32 pid; + u32 tag; }; struct drm_i915_private { @@ -3361,7 +3363,7 @@ void i915_oa_context_pin_notify(struct drm_i915_private *dev_priv, struct intel_context *context); void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req); void i915_oa_update_reg_state(struct intel_engine_cs *ring, uint32_t *reg_state); -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct drm_device *dev, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 6860fca..5e3ed23 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1258,7 +1258,7 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params, exec_start = params->batch_obj_vm_offset + params->args_batch_start_offset; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); ret = ring->dispatch_execbuffer(params->request, exec_start, exec_len, @@ -1266,7 +1266,7 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params, if (ret) return ret; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); @@ -1574,6 +1574,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, params->dispatch_flags = dispatch_flags; params->batch_obj = batch_obj; params->ctx = ctx; + params->tag = i915_execbuffer2_get_tag(*args); ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index ea331eb..141f721 100644 ---
a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -56,6 +56,7 @@ struct oa_sample_data { u32 source; u32 ctx_id; u32 pid; + u32 tag; const u8 *report; }; @@ -98,6 +99,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) +#define SAMPLE_TAG (1<<4) struct perf_open_properties { @@ -123,7 +125,7 @@ struct perf_open_properties * perf mutex lock. */ -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req) +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag) { struct intel_engine_cs *ring = req->ring; struct drm_i915_private *dev_priv =
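On the userspace side the tag packing is trivial; a hedged sketch (the lower 32 bits of rsvd1 already carry the context id via i915_execbuffer2_set_context_id(), and this patch has the kernel read the tag from the upper half via i915_execbuffer2_get_tag()):

    #include <stdint.h>
    #include <i915_drm.h>

    /* Pack a per-workload tag into the upper 32 bits of execbuffer2's
     * rsvd1, alongside the context id in the lower 32 bits.
     */
    static void set_ctx_and_tag(struct drm_i915_gem_execbuffer2 *eb,
                                uint32_t ctx_id, uint32_t tag)
    {
            eb->rsvd1 = ((uint64_t)tag << 32) | ctx_id;
    }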
[Intel-gfx] [PATCH 11/11] drm/i915: Support for capturing MMIO register values
From: Sourab Gupta This patch adds support for capturing MMIO register values through the i915 perf interface. Userspace can request up to 8 MMIO register values to be dumped. The addresses of these registers can be passed through the corresponding property 'value' field while opening the stream. The commands to dump the values of these MMIO registers are then inserted into the ring along with other commands. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 4 + drivers/gpu/drm/i915/i915_perf.c | 177 ++- include/uapi/drm/i915_drm.h | 14 3 files changed, 193 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bf65acb..fcaee75 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1817,6 +1817,7 @@ struct i915_perf_cs_data_node { u32 start_offset; u32 oa_offset; u32 ts_offset; + u32 mmio_offset; /* buffer size corresponding to this entry */ u32 size; @@ -2089,6 +2090,9 @@ struct drm_i915_private { struct i915_perf_stream *exclusive_stream[I915_NUM_RINGS]; wait_queue_head_t poll_wq[I915_NUM_RINGS]; + u32 num_mmio; + u32 mmio_list[I915_PERF_MMIO_NUM_MAX]; + struct { u32 specific_ctx_id; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3eb56d4..45a7f22 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -63,6 +63,7 @@ struct sample_data { u32 tag; u64 ts; const u8 *report; + const u8 *mmio; }; /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ @@ -106,6 +107,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) #define SAMPLE_TS (1<<5) +#define SAMPLE_MMIO(1<<6) struct perf_open_properties { @@ -203,6 +205,9 @@ static void insert_perf_entry(struct drm_i915_private *dev_priv, sample_ts = true; } + if (sample_flags & SAMPLE_MMIO) + entry_size += 4*dev_priv->perf.num_mmio; + spin_lock(&dev_priv->perf.node_list_lock[id]); if (list_empty(&dev_priv->perf.node_list[id])) { offset = 0; @@ -266,6 +271,10 @@ out: entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN); offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE; } + if (sample_flags & SAMPLE_MMIO) { + entry->mmio_offset = offset; + offset = entry->mmio_offset + 4*dev_priv->perf.num_mmio; + } list_add_tail(&entry->link, &dev_priv->perf.node_list[id]); spin_unlock(&dev_priv->perf.node_list_lock[id]); @@ -415,6 +424,72 @@ static int i915_perf_stream_capture_ts_data(struct drm_i915_gem_request *req, return 0; } +static int i915_perf_stream_capture_mmio_data(struct drm_i915_gem_request *req, + u32 offset) +{ + struct intel_engine_cs *ring = req->ring; + struct intel_ringbuffer *ringbuf = req->ringbuf; + struct drm_i915_private *dev_priv = ring->dev->dev_private; + int num_mmio = dev_priv->perf.num_mmio; + u32 mmio_addr, addr = 0; + int ret, i; + + if (i915.enable_execlists) + ret = intel_logical_ring_begin(req, 4*num_mmio); + else + ret = intel_ring_begin(req, 4*num_mmio); + + if (ret) + return ret; + + mmio_addr = + dev_priv->perf.command_stream_buf[ring->id].vma->node.start + + offset; + + if (i915.enable_execlists) { + for (i = 0; i < num_mmio; i++) { + uint32_t cmd; + + addr = mmio_addr + + i * sizeof(dev_priv->perf.mmio_list[i]); + + cmd = MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT; + + intel_logical_ring_emit(ringbuf, cmd); + intel_logical_ring_emit(ringbuf, + dev_priv->perf.mmio_list[i]); + intel_logical_ring_emit(ringbuf, addr); + 
intel_logical_ring_emit(ringbuf, 0); + } + intel_logical_ring_advance(ringbuf); + } else { + for (i = 0; i < num_mmio; i++) { + uint32_t cmd; + + addr = mmio_addr + + i * sizeof(dev_priv->perf.mmio_
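For the reader side, each entry then carries 4 bytes per requested register (up to I915_PERF_MMIO_NUM_MAX) at entry->mmio_offset, written by the MI_STORE_REGISTER_MEM commands above; a small illustrative accessor, assuming a CPU mapping of the command stream buffer:

    #include <stdint.h>
    #include <string.h>

    /* Fetch the i-th captured register value from an entry whose MMIO
     * block starts at 'mmio_offset' within the mapped buffer.
     */
    static uint32_t example_mmio_value(const uint8_t *cs_buf_addr,
                                       uint32_t mmio_offset, unsigned int i)
    {
            uint32_t v;

            memcpy(&v, cs_buf_addr + mmio_offset + 4 * i, sizeof(v));
            return v;
    }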
Re: [Intel-gfx] [PATCH 06/11] drm/i915: Framework for capturing command stream based OA reports
On Wed, 2016-02-17 at 23:00 +0530, Robert Bragg wrote: > Hi Sourab, > > > As Sergio Martinez has started experimenting with this in gputop and > reported seeing lots of ENOSPC errors being reported when reading I > had a look into this and saw a few issues with how we check that > there's data available to read in command stream mode, and a I think > there's a possibility of incorrectly sorting the samples sometimes... Hi Robert, Thanks for spotting this anomaly. I'll have this fixed in the next version of patch set. > > On Tue, Feb 16, 2016 at 5:27 AM, wrote: > From: Sourab Gupta > > > -static bool i915_oa_can_read(struct i915_perf_stream *stream) > +static bool append_oa_rcs_sample(struct i915_perf_stream > *stream, > +struct i915_perf_read_state > *read_state, > +struct i915_perf_cs_data_node > *node) > +{ > + struct drm_i915_private *dev_priv = stream->dev_priv; > + struct oa_sample_data data = { 0 }; > + const u8 *report = > dev_priv->perf.command_stream_buf.addr + > + node->offset; > + u32 sample_flags = stream->sample_flags; > + u32 report_ts; > + > + /* > +* Forward the periodic OA samples which have the > timestamp lower > +* than timestamp of this sample, before forwarding > this sample. > +* This ensures samples read by user are order acc. to > their timestamps > +*/ > + report_ts = *(u32 *)(report + 4); > + dev_priv->perf.oa.ops.read(stream, read_state, > report_ts); > + > + if (sample_flags & SAMPLE_OA_SOURCE_INFO) > + data.source = I915_PERF_OA_EVENT_SOURCE_RCS; > + > + if (sample_flags & SAMPLE_CTX_ID) > + data.ctx_id = node->ctx_id; > + > + if (sample_flags & SAMPLE_OA_REPORT) > + data.report = report; > + > + append_oa_sample(stream, read_state, &data); > + > + return true; > +} > + > +static void oa_rcs_append_reports(struct i915_perf_stream > *stream, > + struct i915_perf_read_state > *read_state) > +{ > + struct drm_i915_private *dev_priv = stream->dev_priv; > + struct i915_perf_cs_data_node *entry, *next; > + > + list_for_each_entry_safe(entry, next, > +&dev_priv->perf.node_list, > link) { > + if (! > i915_gem_request_completed(entry->request, true)) > + break; > + > + if (!append_oa_rcs_sample(stream, read_state, > entry)) > + break; > + > + spin_lock(&dev_priv->perf.node_list_lock); > + list_del(&entry->link); > + spin_unlock(&dev_priv->perf.node_list_lock); > + > + > i915_gem_request_unreference__unlocked(entry->request); > + kfree(entry); > + } > + > + /* Flush any remaining periodic reports */ > + dev_priv->perf.oa.ops.read(stream, read_state, > U32_MAX); > > I don't think we can flush all remaining periodic reports here - at > least not if we have any in-flight MI_RPC commands - in case the next > request to complete might have reports with earlier timestamps than > some of these periodic reports. > > > Even if we have periodic reports available I think we need to throttle > forwarding them based on the command stream requests completing. > > > This is something that userspace should understand when it explicitly > decides to use command stream mode in conjunction with periodic > sampling. > I agree, there shouldn't be any flushing of remaining periodic reports here, instead any periodic reports remaining here should be taken care of during the next processing of command stream samples. > > +} > + > +static bool command_stream_buf_is_empty(struct > i915_perf_stream *stream) > { > struct drm_i915_private *dev_priv = stream->dev
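For clarity, a hedged sketch of the merge rule this discussion converges on: periodic OA reports are forwarded only up to the timestamp of each completed CS sample, and nothing is sorted past the oldest still-in-flight MI_RPC request. All types and helpers below are simplified stand-ins for the driver's structures (e.g. forward_periodic_up_to() stands in for dev_priv->perf.oa.ops.read()):

    #include <linux/list.h>
    #include <linux/types.h>

    struct cs_sample_node {
            struct list_head link;
            u32 timestamp;
            bool completed; /* stands in for i915_gem_request_completed() */
    };

    struct merge_state {
            struct list_head cs_list;
    };

    void forward_periodic_up_to(struct merge_state *s, u32 ts);
    void forward_cs_sample(struct merge_state *s, struct cs_sample_node *n);
    void release_node(struct merge_state *s, struct cs_sample_node *n);

    static void merge_cs_and_periodic(struct merge_state *s)
    {
            struct cs_sample_node *node, *next;

            list_for_each_entry_safe(node, next, &s->cs_list, link) {
                    if (!node->completed)
                            return; /* don't sort past in-flight work */

                    /* Older periodic reports first, then the CS sample. */
                    forward_periodic_up_to(s, node->timestamp);
                    forward_cs_sample(s, node);
                    release_node(s, node);
            }
            /* Deliberately no flush of leftover periodic reports here;
             * they wait for the next completed CS sample so ordering
             * cannot be violated.
             */
    }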
Re: [Intel-gfx] [RFC 1/3] fs: Introduce drmfs pseudo filesystem interfaces
On Thu, 2016-12-01 at 00:07 -0800, Chris Wilson wrote: > On Thu, Dec 01, 2016 at 12:32:31PM +0530, swati.dhin...@intel.com wrote: > > +int drmfs_init(void) > > +{ > > + int retval; > > + > > + retval = sysfs_create_mount_point(kernel_kobj, "drm"); > > + if (retval) > > + return -EINVAL; > > + > > + retval = register_filesystem(&drm_fs_type); > > + if (!retval) > > + drmfs_registered = true; > > + > > + return retval; > > +} > > +EXPORT_SYMBOL(drmfs_init); > > + > > +int drmfs_fini(void) > > +{ > > + int retval; > > + > > + retval = unregister_filesystem(&drm_fs_type); > > + if (retval) > > + return retval; > > + > > + drmfs_registered = false; > > + > > + sysfs_remove_mount_point(kernel_kobj, "drm"); > > +} > > +EXPORT_SYMBOL(drmfs_fini); > > This needs to act like a singleton for multiple DRM drivers, i.e. > add a mutex and use drmfs_registered as a reference count (also then > don't call the entrypoint init/fini). Or alternatively (and probably > better?), simply do init/fini from the DRM module init. > -Chris > Hi Chris, In the second patch, drmfs_init is called by drm_core_init, which should thus be called only once (i.e. during drm module init), and likewise for drmfs_fini which is called during drm_core_exit. Am I missing something here? ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC 1/3] fs: Introduce drmfs pseudo filesystem interfaces
On Thu, 2016-12-01 at 00:11 -0800, Chris Wilson wrote: > On Thu, Dec 01, 2016 at 12:32:31PM +0530, swati.dhin...@intel.com wrote: > > diff --git a/fs/Kconfig b/fs/Kconfig > > index 4bd03a2..7d0ac20 100644 > > --- a/fs/Kconfig > > +++ b/fs/Kconfig > > @@ -200,6 +200,15 @@ config HUGETLBFS > > config HUGETLB_PAGE > > def_bool HUGETLBFS > > > > +config DRMFS > > + bool "Drmfs file system support" > > + depends on DRM > > + help > > + Drmfs is a pseudo file system for drm subsystem output data. > > + > > + drmfs is a filesystem to hold miscellaneous output data from drm > > + subsystems. > > + > > config ARCH_HAS_GIGANTIC_PAGE > > bool > > > > diff --git a/fs/Makefile b/fs/Makefile > > index ed2b632..b34a96e 100644 > > --- a/fs/Makefile > > +++ b/fs/Makefile > > @@ -120,6 +120,7 @@ obj-$(CONFIG_BEFS_FS) += befs/ > > obj-$(CONFIG_HOSTFS) += hostfs/ > > obj-$(CONFIG_CACHEFILES) += cachefiles/ > > obj-$(CONFIG_DEBUG_FS) += debugfs/ > > +obj-$(CONFIG_DRMFS)+= drmfs/ > > A filesystem does not have to live under fs/. Since this is dedicated > and tied to the lifetime of drivers/gpu/drm/drm.ko, we will be happier > with not adding a new MAINTAINERS entry. > -Chris > Ok, agreed. So, should we have the drmfs/ source directory (with its associated files) under drivers/gpu/drm/? Can you please suggest where the associated 'DRMFS' config should be defined? Would drm/Kconfig be a good place? ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC 2/3] drm: Register drmfs filesystem from drm init
On Thu, 2016-12-01 at 00:15 -0800, Chris Wilson wrote: > On Thu, Dec 01, 2016 at 12:32:32PM +0530, swati.dhin...@intel.com wrote: > > diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c > > index 84fcfcb..ead360bd 100644 > > --- a/drivers/gpu/drm/drm_drv.c > > +++ b/drivers/gpu/drm/drm_drv.c > > @@ -688,6 +688,14 @@ int drm_dev_register(struct drm_device *dev, unsigned > > long flags) > > { > > int ret; > > > > +#ifdef CONFIG_DRMFS > > Rule of thumb: avoid #ifdeffry in the body of the code, use headers to > hide conditional compilation. Ok. Will do the requisite changes. > > > + dev->driver->drmfs_root = drmfs_create_dir(dev->driver->name, NULL); > > + if (IS_ERR(dev->driver->drmfs_root)) { > > + DRM_ERROR("Failed to get drmfs root dentry\n"); > > + return PTR_ERR(dev->driver->drmfs_root); > > + } > > Considering use of drmfs is optional, should an error here prevent the > driver from loading? > -Chris Ok. Will remove the return on the error here. ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC 1/3] fs: Introduce drmfs pseudo filesystem interfaces
On Thu, 2016-12-01 at 23:53 -0800, Daniel Vetter wrote: > On Thu, Dec 01, 2016 at 01:44:15PM +0530, swati.dhin...@intel.com wrote: > > From: Swati Dhingra > > > > The patch introduces a new pseudo filesystem type, named 'drmfs' which is > > intended to house the files for the data generated by drm subsystem that > > cannot be accommodated by any of the existing filesystems. > > The filesystem is modelled on the lines of existing pseudo-filesystems such > > as debugfs/tracefs, and borrows ideas from their implementation. > > This filesystem will be appearing at sys/kernel/drm. > > > > A new config 'CONFIG_DRMFS' is introduced to enable/disable the filesystem, > > which is dependent on CONFIG_DRM. > > The filesystem will not be registered standalone during kernel init time, > > instead it is intended to be initialized/registered during drm > > initialization. > > > > The intent for introduction of the filesystem is to act as a location to > > hold > > various kinds of data output from Linux DRM subsystems, which can't really > > fit > > anywhere else into the existing filesystems such as debugfs/sysfs etc. All > > these > > filesystems have their own constraints and are intended to output a > > particular > > type of data such as attributes and small debug parameter data. Due to these > > constraints, there is a need for a new pseudo filesytem, customizable to DRM > > specific requirements and catering to the needs to DRM subsystem components > > > > Signed-off-by: Sourab Gupta > > Signed-off-by: Swati Dhingra > > I thought review feedback was to put that into drm, not under fs/? Also, > needs proper ccing. > -Daniel > Hi Daniel, We'll remove it from fs and add under drm/ in the next version of series. Sorry, forgot to add dri-devel, will add it when we send next version. > > --- > > drivers/gpu/drm/drm_drv.c | 1 + > > fs/Kconfig | 9 + > > fs/Makefile| 1 + > > fs/drmfs/Makefile | 3 + > > fs/drmfs/inode.c | 561 > > + > > include/linux/drmfs.h | 56 + > > include/uapi/linux/magic.h | 3 + > > 7 files changed, 634 insertions(+) > > create mode 100644 fs/drmfs/Makefile > > create mode 100644 fs/drmfs/inode.c > > create mode 100644 include/linux/drmfs.h > > > > diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c > > index 6dbb986..84fcfcb 100644 > > --- a/drivers/gpu/drm/drm_drv.c > > +++ b/drivers/gpu/drm/drm_drv.c > > @@ -27,6 +27,7 @@ > > */ > > > > #include > > +#include > > #include > > #include > > #include > > diff --git a/fs/Kconfig b/fs/Kconfig > > index 4bd03a2..7d0ac20 100644 > > --- a/fs/Kconfig > > +++ b/fs/Kconfig > > @@ -200,6 +200,15 @@ config HUGETLBFS > > config HUGETLB_PAGE > > def_bool HUGETLBFS > > > > +config DRMFS > > + bool "Drmfs file system support" > > + depends on DRM > > + help > > + Drmfs is a pseudo file system for drm subsystem output data. > > + > > + drmfs is a filesystem to hold miscellaneous output data from drm > > + subsystems. 
> > + > > config ARCH_HAS_GIGANTIC_PAGE > > bool > > > > diff --git a/fs/Makefile b/fs/Makefile > > index ed2b632..b34a96e 100644 > > --- a/fs/Makefile > > +++ b/fs/Makefile > > @@ -120,6 +120,7 @@ obj-$(CONFIG_BEFS_FS) += befs/ > > obj-$(CONFIG_HOSTFS) += hostfs/ > > obj-$(CONFIG_CACHEFILES) += cachefiles/ > > obj-$(CONFIG_DEBUG_FS) += debugfs/ > > +obj-$(CONFIG_DRMFS)+= drmfs/ > > obj-$(CONFIG_TRACING) += tracefs/ > > obj-$(CONFIG_OCFS2_FS) += ocfs2/ > > obj-$(CONFIG_BTRFS_FS) += btrfs/ > > diff --git a/fs/drmfs/Makefile b/fs/drmfs/Makefile > > new file mode 100644 > > index 000..ac87e497 > > --- /dev/null > > +++ b/fs/drmfs/Makefile > > @@ -0,0 +1,3 @@ > > +drmfs-objs := inode.o > > + > > +obj-$(CONFIG_DRMFS)+= drmfs.o > > diff --git a/fs/drmfs/inode.c b/fs/drmfs/inode.c > > new file mode 100644 > > index 000..9220705 > > --- /dev/null > > +++ b/fs/drmfs/inode.c > > @@ -0,0 +1,561 @@ > > +/* > > + * Copyright © 2014 Intel Corporation > > + * > > + * Permission is hereby granted, free of charge, to any person obtaining a > > + * copy of this software and
Re: [Intel-gfx] [RFC 2/3] drm: Register drmfs filesystem from drm init
On Thu, 2016-12-01 at 23:57 -0800, Daniel Vetter wrote: > On Thu, Dec 01, 2016 at 01:44:16PM +0530, swati.dhin...@intel.com wrote: > > From: Swati Dhingra > > > > During drm module initialization, drm_core_init initializes the drmfs > > filesystem and register this with kernel. A driver specific directory is > > created > > inside drmfs root, and dentry of this directory is saved for subsequent use > > by the driver (e.g. i915). The driver can then create files/directories > > inside > > this root directory directly. > > In case of i915 driver, the top directory is created at > > '/sys/kernel/drm/i915'. > > > > Signed-off-by: Sourab Gupta > > Signed-off-by: Swati Dhingra > > --- > > drivers/gpu/drm/drm_drv.c | 22 ++ > > include/drm/drm_drv.h | 3 +++ > > 2 files changed, 25 insertions(+) > > > > diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c > > index 84fcfcb..ead360bd 100644 > > --- a/drivers/gpu/drm/drm_drv.c > > +++ b/drivers/gpu/drm/drm_drv.c > > @@ -688,6 +688,14 @@ int drm_dev_register(struct drm_device *dev, unsigned > > long flags) > > { > > int ret; > > > > +#ifdef CONFIG_DRMFS > > + dev->driver->drmfs_root = drmfs_create_dir(dev->driver->name, NULL); > > + if (IS_ERR(dev->driver->drmfs_root)) { > > + DRM_ERROR("Failed to get drmfs root dentry\n"); > > + return PTR_ERR(dev->driver->drmfs_root); > > + } > > +#endif > > Don't do #ifdef in the code, instead provide dummy static inline functions > that do nothing in headers when a feature is disabled. For an example see > CONFIG_DRM_FBDEV_EMULATION in drm_fb_helper.[hc]. > Sorry, Will take take of this going forward. > Also, drmfs here is seriously lacking documentation. E.g. where are we > supposed to put different things related to rendering, modesetting, and > all these issues? You need to add a section in drm-uabi.rst, write > kernel-doc + overview sections for all of this and pull it in. > -Daniel > Ok. Will work on adding requisite documentation for drmfs. Thanks, Sourab > > + > > mutex_lock(&drm_global_mutex); > > > > ret = drm_minor_register(dev, DRM_MINOR_CONTROL); > > @@ -758,6 +766,9 @@ void drm_dev_unregister(struct drm_device *dev) > > drm_minor_unregister(dev, DRM_MINOR_PRIMARY); > > drm_minor_unregister(dev, DRM_MINOR_RENDER); > > drm_minor_unregister(dev, DRM_MINOR_CONTROL); > > +#ifdef CONFIG_DRMFS > > + drmfs_remove(dev->driver->drmfs_root); > > +#endif > > } > > EXPORT_SYMBOL(drm_dev_unregister); > > > > @@ -825,6 +836,9 @@ static void drm_core_exit(void) > > { > > unregister_chrdev(DRM_MAJOR, "drm"); > > debugfs_remove(drm_debugfs_root); > > +#ifdef CONFIG_DRMFS > > + drmfs_fini(); > > +#endif > > drm_sysfs_destroy(); > > idr_destroy(&drm_minors_idr); > > drm_connector_ida_destroy(); > > @@ -845,6 +859,14 @@ static int __init drm_core_init(void) > > goto error; > > } > > > > +#ifdef CONFIG_DRMFS > > + ret = drmfs_init(); > > + if (ret < 0) { > > + DRM_ERROR("Cannot create DRM FS: %d\n", ret); > > + goto error; > > + } > > +#endif > > + > > drm_debugfs_root = debugfs_create_dir("dri", NULL); > > if (!drm_debugfs_root) { > > ret = -ENOMEM; > > diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h > > index aad8bba..34804de 100644 > > --- a/include/drm/drm_drv.h > > +++ b/include/drm/drm_drv.h > > @@ -403,6 +403,9 @@ struct drm_driver { > > > > /* List of devices hanging off this driver with stealth attach. 
*/ > > struct list_head legacy_dev_list; > > + > > + /* drmfs parent directory dentry for this driver */ > > + struct dentry *drmfs_root; > > }; > > > > extern __printf(6, 7) > > -- > > 2.7.4 > > > > ___ > > Intel-gfx mailing list > > Intel-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/intel-gfx > ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC 1/3] fs: Introduce drmfs pseudo filesystem interfaces
On Fri, 2016-12-02 at 14:40 -0800, Roper, Matthew D wrote: > On Thu, Dec 01, 2016 at 01:44:15PM +0530, swati.dhin...@intel.com wrote: > > From: Swati Dhingra > > > > The patch introduces a new pseudo filesystem type, named 'drmfs' which is > > intended to house the files for the data generated by drm subsystem that > > cannot be accommodated by any of the existing filesystems. > > The filesystem is modelled on the lines of existing pseudo-filesystems such > > as debugfs/tracefs, and borrows ideas from their implementation. > > This filesystem will be appearing at sys/kernel/drm. > > > > A new config 'CONFIG_DRMFS' is introduced to enable/disable the filesystem, > > which is dependent on CONFIG_DRM. > > The filesystem will not be registered standalone during kernel init time, > > instead it is intended to be initialized/registered during drm > > initialization. > > > > The intent for introduction of the filesystem is to act as a location to > > hold > > various kinds of data output from Linux DRM subsystems, which can't really > > fit > > anywhere else into the existing filesystems such as debugfs/sysfs etc. All > > these > > filesystems have their own constraints and are intended to output a > > particular > > type of data such as attributes and small debug parameter data. Due to these > > constraints, there is a need for a new pseudo filesytem, customizable to DRM > > specific requirements and catering to the needs to DRM subsystem components > > Am I correct in assuming the data made available via drmfs is not > considered ABI? If we're not going to guarantee a stable ABI and > backward compatibility forever for the items exposed here, then it's > important to state that very explicitly up front so that no userspace > software gets written with the wrong assumption. I'd suggest making an > explicit note one way or the other in the commit message here and > probably in the Kconfig help text as well. > > > Matt > We've intended for drmfs to be ABI as Chris mentioned here: https://lists.freedesktop.org/archives/intel-gfx/2016-December/113245.html The intent is for drmfs to be a stable ABI for the files it's holding. This can be ensured moresoever since it'll be under sole control of drm. Chris, can you correct me if i'm wrong. -sourab > > > > Signed-off-by: Sourab Gupta > > Signed-off-by: Swati Dhingra > > --- > > drivers/gpu/drm/drm_drv.c | 1 + > > fs/Kconfig | 9 + > > fs/Makefile| 1 + > > fs/drmfs/Makefile | 3 + > > fs/drmfs/inode.c | 561 > > + > > include/linux/drmfs.h | 56 + > > include/uapi/linux/magic.h | 3 + > > 7 files changed, 634 insertions(+) > > create mode 100644 fs/drmfs/Makefile > > create mode 100644 fs/drmfs/inode.c > > create mode 100644 include/linux/drmfs.h > > > > diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c > > index 6dbb986..84fcfcb 100644 > > --- a/drivers/gpu/drm/drm_drv.c > > +++ b/drivers/gpu/drm/drm_drv.c > > @@ -27,6 +27,7 @@ > > */ > > > > #include > > +#include > > #include > > #include > > #include > > diff --git a/fs/Kconfig b/fs/Kconfig > > index 4bd03a2..7d0ac20 100644 > > --- a/fs/Kconfig > > +++ b/fs/Kconfig > > @@ -200,6 +200,15 @@ config HUGETLBFS > > config HUGETLB_PAGE > > def_bool HUGETLBFS > > > > +config DRMFS > > + bool "Drmfs file system support" > > + depends on DRM > > + help > > + Drmfs is a pseudo file system for drm subsystem output data. > > + > > + drmfs is a filesystem to hold miscellaneous output data from drm > > + subsystems. 
> > + > > config ARCH_HAS_GIGANTIC_PAGE > > bool > > > > diff --git a/fs/Makefile b/fs/Makefile > > index ed2b632..b34a96e 100644 > > --- a/fs/Makefile > > +++ b/fs/Makefile > > @@ -120,6 +120,7 @@ obj-$(CONFIG_BEFS_FS) += befs/ > > obj-$(CONFIG_HOSTFS) += hostfs/ > > obj-$(CONFIG_CACHEFILES) += cachefiles/ > > obj-$(CONFIG_DEBUG_FS) += debugfs/ > > +obj-$(CONFIG_DRMFS)+= drmfs/ > > obj-$(CONFIG_TRACING) += tracefs/ > > obj-$(CONFIG_OCFS2_FS) += ocfs2/ > > obj-$(CONFIG_BTRFS_FS) += btrfs/ > > diff --git a/fs/drmfs/Makefile b/fs/drmfs/Makefile > > new file mode 100644 > > index 000.
Re: [Intel-gfx] [RFC 0/4] Introduce drmfs pseudo filesystem for drm subsystem
On Mon, 2016-12-05 at 03:06 -0800, Dhingra, Swati wrote: > From: Swati Dhingra > > Currently, we don't have a stable ABI which can be used for the purpose of > providing output debug/logging/crc and other such data from DRM. > The ABIs in current use (filesystems, ioctls, et al.) have their own > constraints and are intended to output a particular type of data. > Few cases in point: > sysfs - stable ABI, but constrained to one textual value per file > debugfs - unstable ABI, free-for-all > ioctls - not as suitable to many single purpose continuous data > dumping, we would very quickly run out of ioctl space; requires more > userspace support than "cat" > device nodes - a real possibility, kernel instantiation is more tricky, > requires udev (+udev.rules) or userspace discovery of the > dynamic major:minor (via sysfs) [mounting a registered > filesystem is easy in comparison] > netlink - stream based, therefore involves numerous copies. > > Debugfs is the lesser among the evils here; we have grown so used to the > convenience and flexibility in presentation that debugfs gives us > (including relayfs inodes) that we want some of that hierarchy in stable user > ABI form. > > Due to these limitations, there is a need for a new pseudo filesystem that > would act as a stable 'free-for-all' ABI, with the hierarchical structure and > thus convenience of debugfs. This will be managed by drm, thus named 'drmfs'. > DRM would register this filesystem to manage a canonical mountpoint, but this > wouldn't limit everyone to only using that pseudofs underneath. > > This can serve to hold various kinds of output data from Linux DRM subsystems, > for files which can't truly fit anywhere else within the existing ABIs but > are present there for lack of a better place. > > In this patch series, we have introduced a pseudo filesystem named 'drmfs' > for now. The filesystem is introduced in the first patch, and the subsequent > patches make use of the filesystem interfaces in the drm driver, making them > available for use by the drm subsystem components, one of which is i915. > We've moved the location of i915 GuC logs from debugfs to drmfs in the last > patch. Subsequently, more files such as pipe_crc, error states, memory > stats, etc. can be moved to this filesystem, if the idea introduced here is > acceptable per se. The filesystem introduced is being used to house the data > generated by the i915 driver in this patch series, but will hopefully be generic > enough to provide scope for usage by any other drm subsystem component. > > The patch series is being floated as RFC to gather feedback on the idea and > infrastructure proposed here and its suitability to address the specific > problem statement/use case. > > TODO: Create documentation. Will do so in next version.
> > v2: fix the BAT failures caused by a missing config check > > v3: Changes made: > - Move the location of drmfs from fs/ to drivers/gpu/drm/ (Chris) > - Move config checks to header (Chris,Daniel) > > > Sourab Gupta (4): > drm: Introduce drmfs pseudo filesystem interfaces > drm: Register drmfs filesystem from drm init > drm: Create driver specific root directory inside drmfs > drm/i915: Creating guc log file in drmfs instead of debugfs > > drivers/gpu/drm/Kconfig| 9 + > drivers/gpu/drm/Makefile | 1 + > drivers/gpu/drm/drm_drv.c | 12 + > drivers/gpu/drm/drmfs.c| 555 > + > drivers/gpu/drm/i915/i915_guc_submission.c | 33 +- > include/drm/drm_drv.h | 3 + > include/drm/drmfs.h| 77 > include/uapi/linux/magic.h | 3 + > 8 files changed, 672 insertions(+), 21 deletions(-) > create mode 100644 drivers/gpu/drm/drmfs.c > create mode 100644 include/drm/drmfs.h > > -- > 2.7.4 > Hi dri-devel folks, Any feedback on the drmfs infrastructure being proposed here? Regards, Sourab ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
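For a concrete sense of the "convenience of debugfs" being asked for here, the driver-facing interface would presumably mirror debugfs_create_file(). The following sketch is an assumption about the shape of the include/drm/drmfs.h API (whose body is elided above), not the series' actual code:

#include <linux/fs.h>

/* Assumed driver-facing helper, mirroring debugfs_create_file() */
struct dentry *drmfs_create_file(const char *name, umode_t mode,
                                 struct dentry *parent, void *data,
                                 const struct file_operations *fops);

/* A component would then expose a file much as it does with debugfs;
 * 'drm_root' stands in for the driver-specific root directory created
 * by patch 3/4, and the fops/data are driver-provided.
 */
static int expose_guc_log(struct dentry *drm_root, void *dev_priv,
                          const struct file_operations *guc_log_fops)
{
        struct dentry *file;

        file = drmfs_create_file("guc_log", 0444, drm_root,
                                 dev_priv, guc_log_fops);
        return file ? 0 : -ENOMEM;
}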
Re: [Intel-gfx] [RFC 0/4] Introduce drmfs pseudo filesystem for drm subsystem
On Mon, 2016-12-12 at 07:33 -0800, Alex Deucher wrote: > On Mon, Dec 12, 2016 at 1:14 AM, sourab gupta wrote: > > On Mon, 2016-12-05 at 03:06 -0800, Dhingra, Swati wrote: > >> From: Swati Dhingra > >> > >> Currently, we don't have a stable ABI which can be used for the purpose of > >> providing output debug/loggging/crc and other such data from DRM. > >> The ABI in current use (filesystems, ioctls, et al.) have their own > >> constraints and are intended to output a particular type of data. > >> Few cases in point: > >> sysfs - stable ABI, but constrained to one textual value per file > >> debugfs - unstable ABI, free-for-all > >> ioctls - not as suitable to many single purpose continuous data > >> dumping, we would very quickly run out ioctl space; requires more > >> userspace support than "cat" > >> device nodes - a real possibilty, kernel instantiation is more tricky, > >> requires udev (+udev.rules) or userspace discovery of the > >> dynamic major:minor (via sysfs) [mounting a registered > >> filesystem is easy in comparison] > >> netlink - stream based, therefore involves numerous copies. > >> > >> Debugfs is the lesser among the evils here, thereby we have grown used to > >> the > >> convenience and flexibility in presentation that debugfs gives us > >> (including relayfs inodes) that we want some of that hierachy in stable > >> user > >> ABI form. > >> > >> Due to these limitations, there is a need for a new pseudo filesytem, that > >> would act as a stable 'free-for-all' ABI, with the heirarchial structure > >> and > >> thus convenience of debugfs. This will be managed by drm, thus named > >> 'drmfs'. > >> DRM would register this filesystem to manage a canonical mountpoint, but > >> this > >> wouldn't limit everyone to only using that pseudofs underneath. > >> > >> This can serve to hold various kinds of output data from Linux DRM > >> subsystems, > >> for the files which can't truely fit anywhere else with existing ABI's but > >> present so, for the lack of a better place. > >> > >> In this patch series, we have introduced a pseudo filesystem named as > >> 'drmfs' > >> for now. The filesystem is introduced in the first patch, and the > >> subsequent > >> patches make use of the filesystem interfaces, in drm driver, and making > >> them > >> available for use by the drm subsystem components, one of which is i915. > >> We've moved the location of i915 GuC logs from debugfs to drmfs in the last > >> patch. Subsequently, more such files such as pipe_crc, error states, memory > >> stats, etc. can be move to this filesystem, if the idea introduced here is > >> acceptable per se. The filesystem introduced is being used to house the > >> data > >> generated by i915 driver in this patch series, but will hopefully be > >> generic > >> enough to provide scope for usage by any other drm subsystem component. > >> > >> The patch series is being floated as RFC to gather feedback on the idea and > >> infrastructure proposed here and it's suitability to address the specific > >> problem statement/use case. > >> > >> TODO: Create documentation. Will do so in next version. 
> >> > >> v2: fix the bat failures caused due to missing config check > >> > >> v3: Changes made: > >> - Move the location of drmfs from fs/ to drivers/gpu/drm/ (Chris) > >> - Moving config checks to header (Chris,Daniel) > >> > >> > >> Sourab Gupta (4): > >> drm: Introduce drmfs pseudo filesystem interfaces > >> drm: Register drmfs filesystem from drm init > >> drm: Create driver specific root directory inside drmfs > >> drm/i915: Creating guc log file in drmfs instead of debugfs > >> > >> drivers/gpu/drm/Kconfig| 9 + > >> drivers/gpu/drm/Makefile | 1 + > >> drivers/gpu/drm/drm_drv.c | 12 + > >> drivers/gpu/drm/drmfs.c| 555 > >> + > >> drivers/gpu/drm/i915/i915_guc_submission.c | 33 +- > >> include/drm/drm_drv.h | 3 + > >> include/drm/drmfs.h| 77 > >> include/u
[Intel-gfx] [PATCH 01/16] drm/i915: Introduce global id for contexts
From: Sourab Gupta The current context user handles are specific to drm file instance. There are some usecases, which may require a global id for the contexts. For e.g. a system level GPU profiler tool may lean upon the global context ids to associate the performance snapshots with individual contexts. This global id may also be used further in order to provide a unique context id to hw. In this patch, the global ids are allocated from a separate cyclic idr and can be further utilized for any usecase described above. v2: According to Chris' suggestion, implemented a separate idr for holding global ids for contexts, as opposed to overloading the file specific ctx->user_handle for this purpose. This global id can also further be used wherever hw has to be programmed with ctx unique id, though this patch just introduces the hw global id as such. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/i915_gem_context.c | 21 + 2 files changed, 24 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 2817a88..cfc135d 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -848,6 +848,7 @@ struct i915_ctx_hang_stats { struct intel_context { struct kref ref; int user_handle; + int global_id; uint8_t remap_slice; struct drm_i915_private *i915; int flags; @@ -1890,6 +1891,8 @@ struct drm_i915_private { bool preserve_bios_swizzle; + struct idr global_ctx_idr; + /* overlay */ struct intel_overlay *overlay; diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index ca7a1c0..9cb124e 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -263,6 +263,18 @@ __create_hw_context(struct drm_device *dev, ctx->file_priv = file_priv; ctx->user_handle = ret; + + /* TODO: If required, this global id can be used for programming the hw +* fields too. In that case, we'll have take care of hw restrictions +* while allocating idr. e.g. for some hw, we may not have full 32 bits +* available. +*/ + ret = idr_alloc_cyclic(&dev_priv->global_ctx_idr, + ctx, 0, 0, GFP_KERNEL); + if (ret < 0) + goto err_out; + + ctx->global_id = ret; /* NB: Mark all slices as needing a remap so that when the context first * loads it will restore whatever remap state already exists. If there * is no remap info, it will be a NOP. */ @@ -287,6 +299,7 @@ i915_gem_create_context(struct drm_device *dev, struct drm_i915_file_private *file_priv) { const bool is_global_default_ctx = file_priv == NULL; + struct drm_i915_private *dev_priv = dev->dev_private; struct intel_context *ctx; int ret = 0; @@ -333,6 +346,7 @@ err_unpin: i915_gem_object_ggtt_unpin(ctx->legacy_hw_ctx.rcs_state); err_destroy: idr_remove(&file_priv->context_idr, ctx->user_handle); + idr_remove(&dev_priv->global_ctx_idr, ctx->global_id); i915_gem_context_unreference(ctx); return ERR_PTR(ret); } @@ -403,6 +417,7 @@ int i915_gem_context_init(struct drm_device *dev) dev_priv->hw_context_size = 0; } } + idr_init(&dev_priv->global_ctx_idr); ctx = i915_gem_create_context(dev, NULL); if (IS_ERR(ctx)) { @@ -425,6 +440,8 @@ void i915_gem_context_fini(struct drm_device *dev) struct intel_context *dctx = dev_priv->kernel_context; int i; + idr_destroy(&dev_priv->global_ctx_idr); + if (dctx->legacy_hw_ctx.rcs_state) { /* The only known way to stop the gpu from accessing the hw context is * to reset it. 
Do this as the very last operation to avoid confusing @@ -480,6 +497,8 @@ static int context_idr_cleanup(int id, void *p, void *data) { struct intel_context *ctx = p; + idr_remove(&ctx->i915->global_ctx_idr, ctx->global_id); + i915_gem_context_unreference(ctx); return 0; } @@ -906,6 +925,7 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data, { struct drm_i915_gem_context_destroy *args = data; struct drm_i915_file_private *file_priv = file->driver_priv; + struct drm_i915_private *dev_priv = dev->dev_private; struct intel_context *ctx; int ret; @@ -926,6 +946,7 @@ int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data, } idr_remove(&ctx->file_priv->context_idr, ctx->user_handle); + idr_remove(&dev_priv->global_ct
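The allocation scheme is worth spelling out: idr_alloc_cyclic() hands out increasing IDs and only wraps around at the end bound, so a just-freed ID is not immediately handed to a new context, which matters when a profiler may still hold samples tagged with the old ID. A minimal sketch of the alloc/lookup pair, assuming i915's types and that callers serialize access to the idr externally (as the driver's locking does):

#include <linux/idr.h>

static DEFINE_IDR(global_ctx_idr); /* stands in for dev_priv->global_ctx_idr */

static int assign_global_id(struct intel_context *ctx)
{
        /* end bound of 0 means "no bound", i.e. up to INT_MAX */
        int id = idr_alloc_cyclic(&global_ctx_idr, ctx, 0, 0, GFP_KERNEL);

        if (id < 0)
                return id; /* -ENOMEM or -ENOSPC */
        ctx->global_id = id;
        return 0;
}

/* ...and the lookup a profiling path could use to resolve an ID found
 * in a hw report back to its context:
 */
static struct intel_context *ctx_for_global_id(int id)
{
        return idr_find(&global_ctx_idr, id);
}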
[Intel-gfx] [PATCH 02/16] drm/i915: Constrain intel_context::global_id to 20 bits
From: Robert Bragg This will allow the ID to be given to the HW as the unique context identifier that's written, for example, to the context status buffer on preemption and included in reports written by the OA unit. Cc: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_gem_context.c | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 9cb124e..6db5e3a 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -90,6 +90,10 @@ #include "i915_drv.h" #include "i915_trace.h" +/* With execlist scheduling we can program our own HW context ID but we + * are limited to 20 bits */ +#define I915_MAX_HW_CTX_ID ((1<<20)-1) + /* This is a HW constraint. The value below is the largest known requirement * I've seen in a spec to date, and that was a workaround for a non-shipping * part. It should be safe to decrease this, but it's more future proof as is. @@ -264,13 +268,8 @@ __create_hw_context(struct drm_device *dev, ctx->file_priv = file_priv; ctx->user_handle = ret; - /* TODO: If required, this global id can be used for programming the hw -* fields too. In that case, we'll have to take care of hw restrictions -* while allocating idr. e.g. for some hw, we may not have full 32 bits -* available. -*/ ret = idr_alloc_cyclic(&dev_priv->global_ctx_idr, - ctx, 0, 0, GFP_KERNEL); + ctx, 0, I915_MAX_HW_CTX_ID, GFP_KERNEL); if (ret < 0) goto err_out; -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 03/16] drm/i915: return ctx->global_id from intel_execlists_ctx_id()
From: Robert Bragg The newly added intel_context::global_id is suitable (a globally unique 20 bit ID) for giving to the hardware as a unique context identifier. Compared to using the pinned address of a logical ring context these IDs are constant for the lifetime of a context whereas a context could be repinned at different addresses during its lifetime. Having a stable ID is useful when we need to buffer information associated with a context based on this ID so the association can't be lost. For example the OA unit writes out counter reports to a circular buffer tagged with this ID and we want to be able to accurately filter reports for a specific context, ideally without the added complexity of tracking context re-pinning while the OA buffer may contain reports with older IDs. Cc: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_debugfs.c | 4 ++-- drivers/gpu/drm/i915/intel_lrc.c| 28 ++-- drivers/gpu/drm/i915/intel_lrc.h| 3 +-- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 931dc60..c172bf5 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2050,7 +2050,7 @@ static void i915_dump_lrc_obj(struct seq_file *m, } seq_printf(m, "CONTEXT: %s %u\n", engine->name, - intel_execlists_ctx_id(ctx, engine)); + intel_execlists_ctx_id(ctx)); if (!i915_gem_obj_ggtt_bound(ctx_obj)) seq_puts(m, "\tNot bound in GGTT\n"); @@ -2171,7 +2171,7 @@ static int i915_execlists(struct seq_file *m, void *data) seq_printf(m, "\t%d requests in queue\n", count); if (head_req) { seq_printf(m, "\tHead request id: %u\n", - intel_execlists_ctx_id(head_req->ctx, engine)); + intel_execlists_ctx_id(head_req->ctx)); seq_printf(m, "\tHead request tail: %u\n", head_req->tail); } diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 452ea0d..1425ede 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -307,21 +307,23 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine) * This is what a descriptor looks like, from LSB to MSB: *bits 0-11:flags, GEN8_CTX_* (cached in ctx_desc_template) *bits 12-31:LRCA, GTT address of (the HWSP of) this context - *bits 32-51:ctx ID, a globally unique tag (the LRCA again!) + *bits 32-51:ctx ID, a globally unique tag (ctx->global_id) *bits 52-63:reserved, may encode the engine ID (for GuC) */ static void intel_lr_context_descriptor_update(struct intel_context *ctx, struct intel_engine_cs *engine) { - uint64_t lrca, desc; + uint64_t lrca, id, desc; lrca = ctx->engine[engine->id].lrc_vma->node.start + LRC_PPHWSP_PN * PAGE_SIZE; - desc = engine->ctx_desc_template; /* bits 0-11 */ - desc |= lrca; /* bits 12-31 */ - desc |= (lrca >> PAGE_SHIFT) << GEN8_CTX_ID_SHIFT; /* bits 32-51 */ + id = ctx->global_id; + + desc = engine->ctx_desc_template; /* bits 0-11 */ + desc |= lrca; /* bits 12-31 */ + desc |= id << GEN8_CTX_ID_SHIFT;/* bits 32-51 */ ctx->engine[engine->id].lrc_desc = desc; } @@ -335,7 +337,6 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx, /** * intel_execlists_ctx_id() - get the Execlists Context ID * @ctx: Context to get the ID for - * @ring: Engine to get the ID for * * Do not confuse with ctx->id! 
Unfortunately we have a name overload * here: the old context ID we pass to userspace as a handler so that @@ -343,15 +344,14 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx, * ELSP so that the GPU can inform us of the context status via * interrupts. * - * The context ID is a portion of the context descriptor, so we can - * just extract the required part from the cached descriptor. - * - * Return: 20-bits globally unique context ID. + * Further the ID given to HW can now be relied on to be constant for + * the lifetime of the context, unlike previously when we used an + * associated logical ring context address (which could be repinned at + * a different address). */ -u32 intel_execlists_ctx_id(struct intel_context *ctx, - struct intel_engine_cs *engine) +u32 intel_execlists_ctx_id(struct intel_context *ctx) { - return intel_lr_context_descriptor(ctx, engine) >> GEN8_CTX_ID
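A worked example of the descriptor layout in the comment above, assuming GEN8_CTX_ID_SHIFT == 32 and the 20-bit global_id bound introduced by the previous patch:

/* bits  0-11: flags (ctx_desc_template)
 * bits 12-31: page-aligned LRCA
 * bits 32-51: ctx->global_id (20 bits)
 */
static u64 make_ctx_descriptor(u64 ctx_desc_template, u64 lrca, u32 global_id)
{
        return ctx_desc_template | lrca | ((u64)global_id << 32);
}

/* read side, e.g. when matching a context-status event or OA report: */
static u32 ctx_id_from_descriptor(u64 desc)
{
        return (desc >> 32) & ((1 << 20) - 1);
}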
[Intel-gfx] [PATCH 04/16] drm/i915: Add ctx getparam ioctl parameter to retrieve ctx global id
From: Sourab Gupta This patch adds a new ctx getparam ioctl parameter, which can be used to retrieve ctx global_id for any particular ctx by userspace. This can be used by userspace to map the i915 perf samples with their particular ctx's, since those would be having ctx global_id's. Otherwise the userspace has no way of maintaining this association, since it has the knowledge of only per-drm file specific ctx handles. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_gem_context.c | 3 +++ include/uapi/drm/i915_drm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 6db5e3a..83d0d34 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -987,6 +987,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, else args->value = to_i915(dev)->ggtt.base.total; break; + case I915_CONTEXT_PARAM_GLOBAL_ID: + args->value = ctx->global_id; + break; default: ret = -EINVAL; break; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 4a1bcfd8..7e24ec4 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1171,6 +1171,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_BAN_PERIOD 0x1 #define I915_CONTEXT_PARAM_NO_ZEROMAP 0x2 #define I915_CONTEXT_PARAM_GTT_SIZE0x3 +#define I915_CONTEXT_PARAM_GLOBAL_ID 0x4 __u64 value; }; -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
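On the userspace side the new parameter is used like any other context getparam. A sketch (the header path assumes libdrm's include directory; only I915_CONTEXT_PARAM_GLOBAL_ID is new in this patch):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <i915_drm.h>

/* Look up the global ID for a per-fd context handle, so perf samples
 * (which carry global IDs) can be attributed to this context.
 */
static int ctx_get_global_id(int drm_fd, uint32_t ctx_handle, uint64_t *id)
{
        struct drm_i915_gem_context_param p;

        memset(&p, 0, sizeof(p));
        p.ctx_id = ctx_handle;                  /* per-fd context handle */
        p.param = I915_CONTEXT_PARAM_GLOBAL_ID; /* added by this patch */

        if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p))
                return -1;

        *id = p.value;
        return 0;
}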
[Intel-gfx] [PATCH 05/16] drm/i915: Expose OA sample source to userspace
From: Sourab Gupta This patch exposes a new sample source field to userspace. This field can be populated to specify the origin of the OA report. For e.g. for internally triggerred reports (non MI_RPC reports), the RPT_ID field has bitfields for specifying the origin such as timer, or render ctx switch, etc. Likewise this field can be used to specify the source as MI_RPC when such support is added. Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_perf.c | 55 ++-- include/uapi/drm/i915_drm.h | 16 2 files changed, 63 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 383a698..f86cd15 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -73,6 +73,13 @@ static u32 i915_perf_stream_paranoid = true; */ #define OA_EXPONENT_MAX 31 +#define GEN8_OAREPORT_REASON_TIMER (1<<19) +#define GEN8_OAREPORT_REASON_TRIGGER1 (1<<20) +#define GEN8_OAREPORT_REASON_TRIGGER2 (1<<21) +#define GEN8_OAREPORT_REASON_CTX_SWITCH (1<<22) +#define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) +#define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) + /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ static int zero; static int oa_exponent_max = OA_EXPONENT_MAX; @@ -112,7 +119,8 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { [I915_OA_FORMAT_C4_B8] = { 7, 64 }, }; -#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_SOURCE_INFO (1<<1) struct perf_open_properties { u32 sample_flags; @@ -214,6 +222,27 @@ static int append_oa_sample(struct i915_perf_stream *stream, return -EFAULT; buf += sizeof(header); + if (sample_flags & SAMPLE_OA_SOURCE_INFO) { + enum drm_i915_perf_oa_event_source source; + + if (INTEL_INFO(dev_priv)->gen >= 8) { + u32 reason = *(u32 *)report; + + if (reason & GEN8_OAREPORT_REASON_CTX_SWITCH) + source = + I915_PERF_OA_EVENT_SOURCE_CONTEXT_SWITCH; + else if (reason & GEN8_OAREPORT_REASON_TIMER) + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + else + source = I915_PERF_OA_EVENT_SOURCE_UNDEFINED; + } else + source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; + + if (copy_to_user(buf, &source, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, report, report_size)) return -EFAULT; @@ -1130,11 +1159,6 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, int format_size; int ret; - if (!(props->sample_flags & SAMPLE_OA_REPORT)) { - DRM_ERROR("Only OA report sampling supported\n"); - return -EINVAL; - } - if (!dev_priv->perf.oa.ops.init_oa_buffer) { DRM_ERROR("OA unit not supported\n"); return -ENODEV; @@ -1163,8 +1187,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size; - stream->sample_flags |= SAMPLE_OA_REPORT; - stream->sample_size += format_size; + if (props->sample_flags & SAMPLE_OA_REPORT) { + stream->sample_flags |= SAMPLE_OA_REPORT; + stream->sample_size += format_size; + } + + if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { + if (!(props->sample_flags & SAMPLE_OA_REPORT)) { + DRM_ERROR( + "OA source type can't be sampled without OA report"); + return -EINVAL; + } + stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; + stream->sample_size += 4; + } dev_priv->perf.oa.oa_buffer.format_size = format_size; BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0); @@ -1817,6 +1853,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, props->oa_periodic 
= true; props->oa_period_exponent = value; break; + case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE: + props->sample_flags |= SAMPLE_OA_SOURCE_INFO; + break; case DRM_I915_PERF_PROP_MAX: BUG(); } dif
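A reader consuming this stream peels the optional source field off each record before the raw report. A sketch of that read-side parsing; struct record_hdr is a stand-in for the record header layout assumed from Robert's base series, so treat the field widths as an assumption:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

struct record_hdr {        /* stand-in for drm_i915_perf_record_header */
        uint32_t type;
        uint16_t pad;
        uint16_t size;     /* total record size, header included */
};

static void parse_stream_buf(const uint8_t *buf, size_t len, int want_source)
{
        while (len >= sizeof(struct record_hdr)) {
                struct record_hdr hdr;
                const uint8_t *payload;

                memcpy(&hdr, buf, sizeof(hdr));
                if (hdr.size < sizeof(hdr) || hdr.size > len)
                        break;                  /* malformed or truncated */

                payload = buf + sizeof(hdr);
                if (want_source) {              /* requested at open time */
                        uint32_t source;

                        memcpy(&source, payload, sizeof(source));
                        /* PERIODIC, CONTEXT_SWITCH, ... per the new enum */
                        printf("source=%u\n", source);
                        payload += sizeof(source);
                }
                /* 'payload' now points at the raw OA report */

                buf += hdr.size;
                len -= hdr.size;
        }
}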
[Intel-gfx] [PATCH 00/16] Framework to collect command stream gpu metrics using i915 perf
From: Sourab Gupta This series adds a framework for collection of gpu performance metrics associated with the command stream of a particular engine. These metrics include OA reports, timestamps, mmio metrics, etc. These metrics are collected around batchbuffer boundaries. This work utilizes the underlying infrastructure introduced in Robert Bragg's patches for collecting periodic OA counter snapshots (based on Haswell): https://lists.freedesktop.org/archives/intel-gfx/2016-April/093206.html This patch set is based on the Gen8+ version of Robert's patches, which can be found here: https://github.com/rib/linux/commits/wip/rib/oa-2016-04-18-nightly These are not yet individually floated in the mailing list, which I hope doesn't lead to any significant loss of clarity when reviewing the work proposed in this patch series. Compared to the last series I floated earlier (https://lists.freedesktop.org/archives/intel-gfx/2016-February/087686.html), this series incorporates the following changes/fixes, besides rebasing on Robert's latest work: * A few refinements related to flushing periodic OA samples in case of no pending CS samples, but not doing so in case there are pending CS samples (queued but requests not yet completed). * For the case of overflow of the command stream buf, we can choose to overwrite old entries or stop collecting more samples. This is right now controlled via a compile-time macro. We can move to either of these behaviors going forward. * The sample consistency is maintained between the periodic OA reports and command stream ones. This implies, e.g., that if the ctx_id/pid sample type is requested, the most recent pid collected in the CS samples is used to populate the relevant field in the periodic samples. * In case both timestamp and OA sample type are requested for the render engine, the raw gpu timestamps are extracted from the OA report only, and we don't need to insert separate commands for retrieving timestamps. * Introduction of a new property to request inclusion of CLOCK_MONOTONIC time in the samples. Being able to correlate gpu events/samples with CLOCK_MONOTONIC is of practical use to userspace, for use cases involving correlation of gpu events with system time. This may, e.g., involve plotting gpu and system events on the same timeline (such as vblank events, or timestamps for when work was submitted to the kernel, etc.). The patch introduces a sync mechanism in order to correlate the gpu timestamps with CLOCK_MONOTONIC time to begin with. This can further be extended for other clock domains. Sync is needed because the published gpu timestamp clock frequency may differ and lead to clock drift. The sync mechanism may be crude right now and may be improved upon going forward, but this is the general thinking behind the introduction of this mechanism. * The gpu raw timestamp can also be forwarded in conjunction with CLOCK_MONOTONIC time, since userspace may have a need for both. E.g. the raw timestamps are exposed to userspace if it uses PIPE_CONTROL + post_sync_op='write timestamp' and userspace may want to correlate these with perf metrics. 
For reference, the patches can be fetched from here: https://github.com/sourabgu/linux/tree/perf-2016-04-19 Robert Bragg (2): drm/i915: Constrain intel_context::global_id to 20 bits drm/i915: return ctx->global_id from intel_execlists_ctx_id() Sourab Gupta (14): drm/i915: Introduce global id for contexts drm/i915: Add ctx getparam ioctl parameter to retrieve ctx global id drm/i915: Expose OA sample source to userspace drm/i915: Framework for capturing command stream based OA reports drm/i915: flush periodic samples, in case of no pending CS sample requests drm/i915: Handle the overflow condition for command stream buf drm/i915: Populate ctx ID for periodic OA reports drm/i915: Add support for having pid output with OA report drm/i915: Add support for emitting execbuffer tags through OA counter reports drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines drm/i915: Extract raw GPU timestamps from OA reports to forward in perf samples drm/i915: Support opening multiple concurrent perf streams drm/i915: Mechanism to forward clock monotonic time in perf samples drm/i915: Support for capturing MMIO register values drivers/gpu/drm/i915/i915_debugfs.c|4 +- drivers/gpu/drm/i915/i915_drv.h| 97 +- drivers/gpu/drm/i915/i915_gem_context.c| 23 + drivers/gpu/drm/i915/i915_gem_execbuffer.c |5 + drivers/gpu/drm/i915/i915_perf.c | 1842 +--- drivers/gpu/drm/i915/i915_reg.h| 16 + drivers/gpu/drm/i915/intel_lrc.c | 32 +- drivers/gpu/drm/i915/intel_lrc.h |3 +- include/uapi/drm/i915_drm.h| 79 ++ 9 files changed, 1907 insertions(+), 194 deletions(-) -- 1.9.1 ___
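The merge-sort mentioned above reduces to an ordinary two-way merge: CS-based and periodic samples live in separate buffers, each internally ordered by GPU timestamp, and are interleaved on read so userspace sees one ordered stream. A self-contained illustration (types and output are illustrative):

#include <stdint.h>
#include <stdio.h>

struct sample { uint32_t ts; const char *origin; };

static void merge_samples(const struct sample *cs, int n_cs,
                          const struct sample *oa, int n_oa)
{
        int i = 0, j = 0;

        while (i < n_cs || j < n_oa) {
                const struct sample *s;

                /* take the CS sample when it is no newer than the
                 * periodic one, or when periodic samples are exhausted
                 */
                if (j >= n_oa || (i < n_cs && cs[i].ts <= oa[j].ts))
                        s = &cs[i++];
                else
                        s = &oa[j++];
                printf("ts=%u origin=%s\n", s->ts, s->origin);
        }
}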
[Intel-gfx] [PATCH 10/16] drm/i915: Add support for having pid output with OA report
From: Sourab Gupta This patch introduces flags and adds support for having pid output with the OA reports generated through the RCS commands. When the stream is opened with pid sample type, the pid information is also captured through the command stream samples and forwarded along with the OA reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 48 +++- include/uapi/drm/i915_drm.h | 7 ++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 53bf148..021b34e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1836,6 +1836,7 @@ struct i915_perf_cs_data_node { struct drm_i915_gem_request *request; u32 offset; u32 ctx_id; + u32 pid; }; struct drm_i915_private { @@ -2177,6 +2178,7 @@ struct drm_i915_private { } command_stream_buf; u32 last_ctx_id; + u32 last_pid; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index a112c20..0d347d2 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -87,6 +87,7 @@ static u32 i915_perf_stream_paranoid = true; struct oa_sample_data { u32 source; u32 ctx_id; + u32 pid; const u8 *report; }; @@ -132,6 +133,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_OA_REPORT (1<<0) #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) +#define SAMPLE_PID (1<<3) struct perf_open_properties { u32 sample_flags; @@ -313,6 +315,7 @@ static void i915_perf_command_stream_hook_oa(struct drm_i915_gem_request *req) goto out; entry->ctx_id = ctx->global_id; + entry->pid = current->pid; i915_gem_request_assign(&entry->request, req); addr = dev_priv->perf.command_stream_buf.vma->node.start + @@ -573,6 +576,12 @@ static int append_oa_sample(struct i915_perf_stream *stream, buf += 4; } + if (sample_flags & SAMPLE_PID) { + if (copy_to_user(buf, &data->pid, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, data->report, report_size)) return -EFAULT; @@ -615,6 +624,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( stream, report); + if (sample_flags & SAMPLE_PID) + data.pid = dev_priv->perf.last_pid; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1043,6 +1055,11 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, dev_priv->perf.last_ctx_id = node->ctx_id; } + if (sample_flags & SAMPLE_PID) { + data.pid = node->pid; + dev_priv->perf.last_pid = node->pid; + } + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1833,6 +1850,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, struct drm_i915_private *dev_priv = stream->dev_priv; bool require_oa_unit = props->sample_flags & (SAMPLE_OA_REPORT | SAMPLE_OA_SOURCE_INFO); + bool require_cs_mode = props->sample_flags & SAMPLE_PID; bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT; int ret; @@ -1959,6 +1977,20 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, if (props->sample_flags & SAMPLE_CTX_ID) { stream->sample_flags |= SAMPLE_CTX_ID; stream->sample_size += 4; + + /* +* NB: it's meaningful to request SAMPLE_CTX_ID with just CS +* mode or periodic OA mode sampling but we don't allow +* SAMPLE_CTX_ID without either mode +*/ + if (!require_oa_unit) + require_cs_mode = true; + 
} + + if (require_cs_mode && !props->cs_mode) { + DRM_ERROR("PID sampling requires a ring to be specified"); + ret = -EINVAL; + goto cs_error; } if (props->cs_mode) { @@ -1969,7 +2001,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, goto cs_error;
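Taken together with the earlier patches, the per-sample layout a reader sees at this point in the series is the following, inferred from append_oa_sample() (each field is present only if the corresponding sample type was requested at open time):

        record header (drm_i915_perf_record_header)
        u32 source    -- SAMPLE_OA_SOURCE_INFO
        u32 ctx_id    -- SAMPLE_CTX_ID
        u32 pid       -- SAMPLE_PID
        raw OA report (format-sized) -- SAMPLE_OA_REPORT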
[Intel-gfx] [PATCH 14/16] drm/i915: Support opening multiple concurrent perf streams
From: Sourab Gupta This patch adds support for opening multiple concurrent perf streams for different gpu engines, while having the restriction to open only a single stream open for a particular gpu engine. This enables userspace client to open multiple streams, one per engine, at any time to capture sample data for multiple gpu engines. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_perf.c | 65 ++-- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 2ac07fb..0923a17 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2141,7 +2141,7 @@ struct drm_i915_private { spinlock_t hook_lock; struct hrtimer poll_check_timer; - struct i915_perf_stream *exclusive_stream; + struct i915_perf_stream *ring_stream[I915_NUM_ENGINES]; wait_queue_head_t poll_wq[I915_NUM_ENGINES]; struct { diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2bf9cf0..abb9d04 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1545,7 +1545,7 @@ static void i915_ring_stream_destroy(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - BUG_ON(stream != dev_priv->perf.exclusive_stream); + BUG_ON(stream != dev_priv->perf.ring_stream[stream->engine]); if (stream->using_oa) { dev_priv->perf.oa.ops.disable_metric_set(dev_priv); @@ -1559,7 +1559,7 @@ static void i915_ring_stream_destroy(struct i915_perf_stream *stream) if (stream->cs_mode) free_command_stream_buf(dev_priv, stream->engine); - dev_priv->perf.exclusive_stream = NULL; + dev_priv->perf.ring_stream[stream->engine] = NULL; } static void *vmap_oa_buffer(struct drm_i915_gem_object *obj) @@ -1983,13 +1983,13 @@ static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv) { assert_spin_locked(&dev_priv->perf.hook_lock); - if (dev_priv->perf.exclusive_stream->enabled) { + if (dev_priv->perf.ring_stream[RCS]->enabled) { unsigned long ctx_id = 0; - if (dev_priv->perf.exclusive_stream->ctx) + if (dev_priv->perf.ring_stream[RCS]->ctx) ctx_id = dev_priv->perf.oa.specific_ctx_id; - if (dev_priv->perf.exclusive_stream->ctx == NULL || ctx_id) { + if (dev_priv->perf.ring_stream[RCS]->ctx == NULL || ctx_id) { bool periodic = dev_priv->perf.oa.periodic; u32 period_exponent = dev_priv->perf.oa.period_exponent; u32 report_format = dev_priv->perf.oa.oa_buffer.format; @@ -2105,15 +2105,6 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, SAMPLE_TS); int ret; - /* To avoid the complexity of having to accurately filter -* counter reports and marshal to the appropriate client -* we currently only allow exclusive access -*/ - if (dev_priv->perf.exclusive_stream) { - DRM_ERROR("Stream already in use\n"); - return -EBUSY; - } - if ((props->sample_flags & SAMPLE_CTX_ID) && !props->cs_mode) { if (IS_HASWELL(dev_priv->dev)) { DRM_ERROR( @@ -2131,6 +2122,12 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, if (require_oa_unit) { int format_size; + /* Only allow exclusive access per stream */ + if (dev_priv->perf.ring_stream[RCS]) { + DRM_ERROR("Stream:0 already in use\n"); + return -EBUSY; + } + if (!dev_priv->perf.oa.ops.init_oa_buffer) { DRM_ERROR("OA unit not supported\n"); return -ENODEV; @@ -2260,6 +2257,13 @@ static int i915_ring_stream_init(struct i915_perf_stream *stream, } if (props->cs_mode) { + /* Only allow exclusive access per stream */ + if 
(dev_priv->perf.ring_stream[props->engine]) { + DRM_ERROR("Stream:%d already in use\n", props->engine); + ret = -EBUSY; + goto cs_error; + } + if (!cs_sample_data) { DRM_ERROR( "Ring given without requesting any CS data to sample"); @@ -2311,7 +2315,7 @@ static int i915_ring_stream_init(struct
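The concurrency model here reduces to a per-engine exclusive slot: one stream per engine, any number of engines at once. An illustrative reduction of the checks added above, assuming the caller holds the perf lock as elsewhere in i915_perf:

static int claim_engine(struct drm_i915_private *dev_priv,
                        struct i915_perf_stream *stream,
                        enum intel_engine_id engine)
{
        if (dev_priv->perf.ring_stream[engine])
                return -EBUSY;  /* that engine already has a stream */

        dev_priv->perf.ring_stream[engine] = stream;
        return 0;
}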
[Intel-gfx] [PATCH 07/16] drm/i915: flush periodic samples, in case of no pending CS sample requests
From: Sourab Gupta When there are no pending CS OA samples, flush the periodic OA samples collected so far. We can safely forward the periodic OA samples in the case we have no pending CS samples, but we can't do so in the case we have pending CS samples, since we don't know what the ordering between pending CS samples and periodic samples will eventually be. If we have no pending CS sample, it won't be possible for future pending CS sample to have timestamps earlier than current periodic timestamp. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 14 ++-- drivers/gpu/drm/i915/i915_perf.c | 173 +-- 2 files changed, 140 insertions(+), 47 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 050df37..bc4fc1b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1760,7 +1760,7 @@ struct i915_perf_stream { /* Return: true if any i915 perf records are ready to read() * for this stream. */ - bool (*can_read)(struct i915_perf_stream *stream); + bool (*can_read_unlocked)(struct i915_perf_stream *stream); /* Call poll_wait, passing a wait queue that will be woken * once there is something ready to read() for the stream @@ -1772,8 +1772,8 @@ struct i915_perf_stream { /* For handling a blocking read, wait until there is something * to ready to read() for the stream. E.g. wait on the same * wait queue that would be passed to poll_wait() until -* ->can_read() returns true (if its safe to call ->can_read() -* without the i915 perf lock held). +* ->can_read_unlocked() returns true (if its safe to call +* ->can_read_unlocked() without the i915 perf lock held). */ int (*wait_unlocked)(struct i915_perf_stream *stream); @@ -1819,8 +1819,10 @@ struct i915_oa_ops { u32 ctx_id); void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req); int (*read)(struct i915_perf_stream *stream, - struct i915_perf_read_state *read_state, u32 ts); - bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); + struct i915_perf_read_state *read_state, + u32 ts, u32 max_records); + int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, + u32 *last_ts); }; /* @@ -2155,6 +2157,8 @@ struct drm_i915_private { u32 gen7_latched_oastatus1; u32 ctx_oactxctrl_off; u32 ctx_flexeu0_off; + u32 n_pending_periodic_samples; + u32 pending_periodic_ts; struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 4adbf26..222de00 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -380,13 +380,30 @@ static void i915_oa_rcs_free_requests(struct drm_i915_private *dev_priv) * pointers. A race here could result in a false positive !empty status which * is acceptable. 
*/ -static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv) +static int +gen8_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, + u32 *last_ts) { int report_size = dev_priv->perf.oa.oa_buffer.format_size; - u32 head = I915_READ(GEN8_OAHEADPTR); - u32 tail = I915_READ(GEN8_OATAILPTR); + u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.addr; + u32 head = I915_READ(GEN8_OAHEADPTR) & GEN8_OAHEADPTR_MASK; + u32 tail = I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK; + u32 mask = (OA_BUFFER_SIZE - 1); + u32 num_samples; + u8 *report; + + head -= dev_priv->perf.oa.oa_buffer.gtt_offset; + tail -= dev_priv->perf.oa.oa_buffer.gtt_offset; + num_samples = OA_TAKEN(tail, head) / report_size; - return OA_TAKEN(tail, head) < report_size; + /* read the timestamp of the last sample */ + if (num_samples) { + head += report_size*(num_samples - 1); + report = oa_buf_base + (head & mask); + *last_ts = *(u32 *)(report + 4); + } + + return num_samples; } /* NB: This is either called via fops or the poll check hrtimer (atomic ctx) @@ -400,15 +417,32 @@ static bool gen8_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_pr * pointers. A race here could result in a false positive !empty status which * is acceptable. */ -static bool gen7_oa_buffer_is_empty_fop_unlocked(struct drm_i915_private *dev_priv) +static int +gen7_oa_buffer_num_
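The rule being implemented is easy to state: periodic samples may be flushed freely only while no CS samples are pending, because a pending CS sample's eventual timestamp may be older than reports already sitting in the OA buffer. As a sketch (illustrative; the list name matches this point in the series):

static bool can_flush_periodic(struct drm_i915_private *dev_priv)
{
        /* No pending CS samples: any future CS sample will carry a
         * newer timestamp than what the OA buffer currently holds, so
         * the periodic samples collected so far can be forwarded.
         */
        return list_empty(&dev_priv->perf.node_list);
}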
[Intel-gfx] [PATCH 11/16] drm/i915: Add support for emitting execbuffer tags through OA counter reports
From: Sourab Gupta This patch enables userspace to specify tags (per workload), provided via execbuffer ioctl, which could be added to OA reports, to help associate reports with the corresponding workloads. There may be multiple stages within a single context, from a userspace perspective. An ability is needed to individually associate the OA reports with their corresponding workloads(execbuffers), which may not be possible solely with ctx_id or pid information. This patch enables such a mechanism. In this patch, upper 32 bits of rsvd1 field, which were previously unused are now being used to pass in the tag. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h| 7 -- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 5 ++-- drivers/gpu/drm/i915/i915_perf.c | 37 ++ drivers/gpu/drm/i915/intel_lrc.c | 4 ++-- include/uapi/drm/i915_drm.h| 12 ++ 5 files changed, 54 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 021b34e..127ccc1 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1705,6 +1705,7 @@ struct i915_execbuffer_params { struct drm_i915_gem_object *batch_obj; struct intel_context*ctx; struct drm_i915_gem_request *request; + uint32_ttag; }; /* used in computing the new watermarks state */ @@ -1805,7 +1806,7 @@ struct i915_perf_stream { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. */ - void (*command_stream_hook)(struct drm_i915_gem_request *req); + void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); }; struct i915_oa_ops { @@ -1837,6 +1838,7 @@ struct i915_perf_cs_data_node { u32 offset; u32 ctx_id; u32 pid; + u32 tag; }; struct drm_i915_private { @@ -2179,6 +2181,7 @@ struct drm_i915_private { u32 last_ctx_id; u32 last_pid; + u32 last_tag; struct list_head node_list; spinlock_t node_list_lock; } perf; @@ -3548,7 +3551,7 @@ void i915_oa_legacy_ctx_switch_notify(struct drm_i915_gem_request *req); void i915_oa_update_reg_state(struct intel_engine_cs *engine, struct intel_context *ctx, uint32_t *reg_state); -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct drm_device *dev, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 89b114b..d10c800 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1313,7 +1313,7 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params, if (exec_len == 0) exec_len = params->batch_obj->base.size; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); ret = engine->dispatch_execbuffer(params->request, exec_start, exec_len, @@ -1321,7 +1321,7 @@ i915_gem_ringbuffer_submission(struct i915_execbuffer_params *params, if (ret) return ret; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags); @@ -1642,6 +1642,7 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data, params->batch_obj = batch_obj; params->ctx = ctx; params->request = req; + params->tag = i915_execbuffer2_get_tag(*args); ret = dev_priv->gt.execbuf_submit(params, args, &eb->vmas); err_request: diff --git a/drivers/gpu/drm/i915/i915_perf.c 
b/drivers/gpu/drm/i915/i915_perf.c index 0d347d2..c921c4d 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -88,6 +88,7 @@ struct oa_sample_data { u32 source; u32 ctx_id; u32 pid; + u32 tag; const u8 *report; }; @@ -134,6 +135,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_OA_SOURCE_INFO (1<<1) #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) +#define SAMPLE_TAG (1<<4) struct perf_open_properties { u32 sample_flags; @@ -158,7 +160,7 @@ struct perf_open_properties { * perf mutex lock.
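On the userspace side, the tag rides in the previously unused upper half of rsvd1, next to the context handle in the lower half (the kernel retrieves it with i915_execbuffer2_get_tag() per the diff above). A sketch of the submit-side packing:

#include <stdint.h>
#include <i915_drm.h>

/* rsvd1 bits 0-31: context handle; bits 32-63: caller-chosen tag that
 * comes back in the OA samples for this execbuffer.
 */
static void execbuf_set_ctx_and_tag(struct drm_i915_gem_execbuffer2 *eb,
                                    uint32_t ctx_handle, uint32_t tag)
{
        eb->rsvd1 = ((uint64_t)tag << 32) | ctx_handle;
}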
[Intel-gfx] [PATCH 06/16] drm/i915: Framework for capturing command stream based OA reports
From: Sourab Gupta This patch introduces a framework to enable OA counter reports associated with the Render command stream. We can then associate the reports captured through this mechanism with their corresponding context IDs. This can be further extended to associate any other metadata information with the corresponding samples (since the association with the Render command stream gives us the ability to capture this information while inserting the corresponding capture commands into the command stream). The OA reports generated in this way are associated with a corresponding workload, and thus can be used to delimit the workload (i.e. sample the counters at the workload boundaries), within an ongoing stream of periodic counter snapshots. There may be use cases wherein we need more than the periodic OA capture mode which is supported currently. This mode is primarily used for two use cases: - Ability to capture system wide metrics, along with the ability to map the reports back to individual contexts (particularly for HSW). - Ability to inject tags for work, into the reports. This provides visibility into the multiple stages of work within a single context. Userspace will be able to distinguish between the periodic and CS based OA reports by virtue of the source_info sample field. The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA counters, and is inserted at BB boundaries. The data thus captured will be stored in a separate buffer, which will be different from the buffer used otherwise for the periodic OA capture mode. The metadata information pertaining to each snapshot is maintained in a list, which also has offsets into the gem buffer object per captured snapshot. In order to track whether the gpu has completed processing the node, a field pertaining to the corresponding gem request is added, which is tracked for completion of the command. Both periodic and RCS based reports are associated with a single stream (corresponding to the render engine), and are expected to have the samples in sequential order according to their timestamps. Now, since these reports are collected in separate buffers, these are merge sorted at the time of forwarding to userspace during the read call. v2: Aligning with the non-perf interface (custom drm ioctl based). Also, a few related patches are squashed together for better readability Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_drv.h| 36 +- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 + drivers/gpu/drm/i915/i915_perf.c | 829 + drivers/gpu/drm/i915/intel_lrc.c | 4 + include/uapi/drm/i915_drm.h| 15 + 5 files changed, 774 insertions(+), 114 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index cfc135d..050df37 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1735,12 +1735,16 @@ struct i915_perf_stream { struct list_head link; + enum intel_engine_id engine; u32 sample_flags; int sample_size; struct intel_context *ctx; bool enabled; + /* Whether command stream based data collection is enabled */ + bool cs_mode; + /* Enables the collection of HW samples, either in response to * I915_PERF_IOCTL_ENABLE or implicitly called when stream is * opened without I915_PERF_FLAG_DISABLED. @@ -1796,6 +1800,12 @@ struct i915_perf_stream { * The stream will always be disabled before this is called. 
*/ void (*destroy)(struct i915_perf_stream *stream); + + /* +* Routine to emit the commands in the command streamer associated +* with the corresponding gpu engine. +*/ + void (*command_stream_hook)(struct drm_i915_gem_request *req); }; struct i915_oa_ops { @@ -1809,10 +1819,21 @@ struct i915_oa_ops { u32 ctx_id); void (*legacy_ctx_switch_unlocked)(struct drm_i915_gem_request *req); int (*read)(struct i915_perf_stream *stream, - struct i915_perf_read_state *read_state); + struct i915_perf_read_state *read_state, u32 ts); bool (*oa_buffer_is_empty)(struct drm_i915_private *dev_priv); }; +/* + * List element to hold info about the perf sample data associated + * with a particular GPU command stream. + */ +struct i915_perf_cs_data_node { + struct list_head link; + struct drm_i915_gem_request *request; + u32 offset; + u32 ctx_id; +}; + struct drm_i915_private { struct drm_device *dev; struct kmem_cache *objects; @@ -2093,6 +2114,8 @@ struct drm_i915_private { struct ctl_table_header *sysctl_header; struct mutex lock; + + struct mutex streams_lock; struct list_head streams; spinl
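The bookkeeping described in the commit message reduces to one node per emitted report, pinned to the request that writes it; the report may be read out only once that request has completed. An illustrative reduction of the insert path (not the patch's verbatim code; the MI_REPORT_PERF_COUNT emission and offset selection are elided):

static int capture_cs_report(struct drm_i915_private *dev_priv,
                             struct drm_i915_gem_request *req, u32 ctx_id)
{
        struct i915_perf_cs_data_node *node;

        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (!node)
                return -ENOMEM;

        node->ctx_id = ctx_id;
        /* hold a reference on the request so we can poll completion */
        i915_gem_request_assign(&node->request, req);

        /* ...pick node->offset in command_stream_buf and emit
         * MI_REPORT_PERF_COUNT targeting vma->node.start + offset...
         */
        spin_lock(&dev_priv->perf.node_list_lock);
        list_add_tail(&node->link, &dev_priv->perf.node_list);
        spin_unlock(&dev_priv->perf.node_list_lock);
        return 0;
}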
[Intel-gfx] [PATCH 08/16] drm/i915: Handle the overflow condition for command stream buf
From: Sourab Gupta Add a compile time option for detecting the overflow condition of command stream buffer, and not overwriting the old entries in such a case. Also, set a status flag to forward the overflow condition to userspace if overflow is detected. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 76 +++- 2 files changed, 62 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bc4fc1b..6bea3bb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2170,6 +2170,8 @@ struct drm_i915_private { struct drm_i915_gem_object *obj; struct i915_vma *vma; u8 *addr; +#define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) + u32 status; } command_stream_buf; struct list_head node_list; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 222de00..147f377 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -80,6 +80,9 @@ static u32 i915_perf_stream_paranoid = true; #define GEN8_OAREPORT_REASON_GO_TRANSITION (1<<23) #define GEN9_OAREPORT_REASON_CLK_RATIO (1<<24) +/* For determining the behavior on overflow of command stream samples */ +#define CMD_STREAM_BUF_OVERFLOW_ALLOWED + /* Data common to periodic and RCS based samples */ struct oa_sample_data { u32 source; @@ -170,6 +173,7 @@ void i915_perf_command_stream_hook(struct drm_i915_gem_request *req) mutex_unlock(&dev_priv->perf.streams_lock); } +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED /* * Release some perf entries to make space for a new entry data. We dereference * the associated request before deleting the entry. Also, no need to check for @@ -196,25 +200,26 @@ static void release_some_perf_entries(struct drm_i915_private *dev_priv, break; } } +#endif /* - * Insert the perf entry to the end of the list. This function never fails, - * since it always manages to insert the entry. If the space is exhausted in - * the buffer, it will remove the oldest entries in order to make space. + * Insert the perf entry to the end of the list. If the overwrite of old entries + * is allowed, the function always manages to insert the entry and returns 0. + * If overwrite is not allowed, on detection of overflow condition, an + * appropriate status flag is set, and function returns -ENOSPC. */ -static void insert_perf_entry(struct drm_i915_private *dev_priv, +static int insert_perf_entry(struct drm_i915_private *dev_priv, struct i915_perf_cs_data_node *entry) { struct i915_perf_cs_data_node *first_entry, *last_entry; int max_offset = dev_priv->perf.command_stream_buf.obj->base.size; u32 entry_size = dev_priv->perf.oa.oa_buffer.format_size; + int ret = 0; spin_lock(&dev_priv->perf.node_list_lock); if (list_empty(&dev_priv->perf.node_list)) { entry->offset = 0; - list_add_tail(&entry->link, &dev_priv->perf.node_list); - spin_unlock(&dev_priv->perf.node_list_lock); - return; + goto out; } first_entry = list_first_entry(&dev_priv->perf.node_list, @@ -232,29 +237,49 @@ static void insert_perf_entry(struct drm_i915_private *dev_priv, */ else if (entry_size < first_entry->offset) entry->offset = 0; - /* Insufficient space. 
Overwrite existing old entries */ + /* Insufficient space */ else { +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED u32 target_size = entry_size - first_entry->offset; release_some_perf_entries(dev_priv, target_size); entry->offset = 0; +#else + dev_priv->perf.command_stream_buf.status |= + I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; + ret = -ENOSPC; + goto out_unlock; +#endif } } else { /* Sufficient space available? */ if (last_entry->offset + 2*entry_size < first_entry->offset) entry->offset = last_entry->offset + entry_size; - /* Insufficient space. Overwrite existing old entries */ + /* Insufficient space */ else { +#ifdef CMD_STREAM_BUF_OVERFLOW_ALLOWED u32 target_size = entry_size - (first_entry->offset - last_entry->offset - entry_size);
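The space check in insert_perf_entry() above reduces to two cases depending on whether the entries have wrapped around the buffer. As a standalone predicate (illustrative; first_off is the oldest entry's offset, last_off the newest's, and each entry occupies entry_size bytes):

static bool cs_buf_has_space(u32 first_off, u32 last_off,
                             u32 entry_size, u32 buf_size)
{
        if (last_off >= first_off)
                /* not wrapped: room after the newest entry, or back at 0? */
                return (last_off + 2 * entry_size < buf_size) ||
                       (entry_size < first_off);

        /* wrapped: the new entry must fit in the gap before the oldest */
        return last_off + 2 * entry_size < first_off;
}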
[Intel-gfx] [PATCH 13/16] drm/i915: Extract raw GPU timestamps from OA reports to forward in perf samples
From: Sourab Gupta The OA reports contain the least significant 32 bits of the gpu timestamp. This patch enables retrieval of the timestamp field from OA reports, to forward as 64 bit raw gpu timestamps in the perf samples. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/i915_perf.c | 44 ++-- drivers/gpu/drm/i915/i915_reg.h | 4 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index bdc7ad4..2ac07fb 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2179,6 +2179,7 @@ struct drm_i915_private { u32 ctx_flexeu0_off; u32 n_pending_periodic_samples; u32 pending_periodic_ts; + u64 last_gpu_ts; struct i915_oa_ops ops; const struct i915_oa_format *oa_formats; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index f1c26e5..2bf9cf0 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -775,6 +775,24 @@ static int append_sample(struct i915_perf_stream *stream, return 0; } +static u64 get_gpu_ts_from_oa_report(struct drm_i915_private *dev_priv, + const u8 *report) +{ + u32 sample_ts = *(u32 *)(report + 4); + u32 delta; + + /* +* NB: We have to assume we're updating last_gpu_ts frequently +* enough that it's never possible to see multiple overflows before +* we compare sample_ts to last_gpu_ts. Since this is significantly +* large duration (~6min for 80ns ts base), we can safely assume so. +*/ + delta = sample_ts - (u32)dev_priv->perf.oa.last_gpu_ts; + dev_priv->perf.oa.last_gpu_ts += delta; + + return dev_priv->perf.oa.last_gpu_ts; +} + static int append_oa_buffer_sample(struct i915_perf_stream *stream, struct i915_perf_read_state *read_state, const u8 *report) @@ -811,10 +829,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_TAG) data.tag = dev_priv->perf.last_tag; - /* Derive timestamp from OA report, after scaling with the ts base */ -#warning "FIXME: append_oa_buffer_sample: derive the timestamp from OA report" + /* Derive timestamp from OA report */ if (sample_flags & SAMPLE_TS) - data.ts = 0; + data.ts = get_gpu_ts_from_oa_report(dev_priv, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1226,6 +1243,7 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, enum intel_engine_id id = stream->engine; struct sample_data data = { 0 }; u32 sample_flags = stream->sample_flags; + u64 gpu_ts = 0; int ret = 0; if (sample_flags & SAMPLE_OA_REPORT) { @@ -1242,6 +1260,9 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, U32_MAX); if (ret) return ret; + + if (sample_flags & SAMPLE_TS) + gpu_ts = get_gpu_ts_from_oa_report(dev_priv, report); } if (sample_flags & SAMPLE_OA_SOURCE_INFO) @@ -1263,17 +1284,14 @@ static int append_one_cs_sample(struct i915_perf_stream *stream, } if (sample_flags & SAMPLE_TS) { - /* For RCS, if OA samples are also being collected, derive the -* timestamp from OA report, after scaling with the TS base. + /* If OA sampling is enabled, derive the ts from OA report. * Else, forward the timestamp collected via command stream. 
*/ -#warning "FIXME: append_one_cs_sample: derive the timestamp from OA report" - if (sample_flags & SAMPLE_OA_REPORT) - data.ts = 0; - else - data.ts = *(u64 *) + if (!(sample_flags & SAMPLE_OA_REPORT)) + gpu_ts = *(u64 *) (dev_priv->perf.command_stream_buf[id].addr + node->ts_offset); + data.ts = gpu_ts; } return append_sample(stream, read_state, &data); @@ -2025,8 +2043,12 @@ static void i915_ring_stream_enable(struct i915_perf_stream *stream) { struct drm_i915_private *dev_priv = stream->dev_priv; - if (stream->sample_flags & SAMPLE_OA_REPORT) + if (stream->sample_flags & SAMPLE_OA_REPORT) { + dev_priv->perf.oa.last_g
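The 32-to-64 bit extension used above is worth having standalone. Unsigned subtraction makes the delta correct across a single 32-bit wraparound; with an 80ns timestamp period a wrap takes 2^32 * 80ns, about 343 seconds, which matches the roughly 6 minute window cited in the cover letter:

#include <stdint.h>

/* Extend a 32-bit GPU timestamp to 64 bits, assuming at most one
 * wraparound between consecutive samples.
 */
static uint64_t extend_gpu_ts(uint64_t *last_ts, uint32_t sample_ts32)
{
        uint32_t delta = sample_ts32 - (uint32_t)*last_ts;

        *last_ts += delta;
        return *last_ts;
}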
[Intel-gfx] [PATCH 15/16] drm/i915: Mechanism to forward clock monotonic time in perf samples
From: Sourab Gupta Currently, we have the ability to only forward the GPU timestamps in the samples (which are generated via OA reports or PIPE_CONTROL commands inserted in the ring). This limits the ability to correlate these samples with the system events. If we scale the GPU timestamps according the timestamp base/frequency info present in bspec, it is observed that the timestamps drift really quickly from the system time. An ability is therefore needed to report timestamps in different clock domains, such as CLOCK_MONOTONIC, in the perf samples to be of more practical use to the userspace. This ability becomes important when we want to correlate/plot GPU events/samples with other system events on the same timeline (e.g. vblank events, or timestamps when work was submitted to kernel, etc.) The patch here proposes a mechanism to achieve this. The gpu time and CLOCK_MONOTONIC system time are correlated to detect and correct the error in published gpu timestamp clock frequency. The userspace can request CLOCK_MONOTONIC in samples by requesting the corresponding property while opening the stream. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 12 ++- drivers/gpu/drm/i915/i915_perf.c | 218 --- drivers/gpu/drm/i915/i915_reg.h | 10 ++ include/uapi/drm/i915_drm.h | 9 +- 4 files changed, 230 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0923a17..e6a1a93 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1749,6 +1749,9 @@ struct i915_perf_stream { /* Whether the OA unit is in use */ bool using_oa; + /* monotonic clk ts for last sample */ + u64 last_sample_ts; + /* Enables the collection of HW samples, either in response to * I915_PERF_IOCTL_ENABLE or implicitly called when stream is * opened without I915_PERF_FLAG_DISABLED. @@ -2144,6 +2147,14 @@ struct drm_i915_private { struct i915_perf_stream *ring_stream[I915_NUM_ENGINES]; wait_queue_head_t poll_wq[I915_NUM_ENGINES]; + /* Timekeeping Info */ + u64 clk_mono; /* last monotonic clk value */ + u64 gpu_time; /* last gpu time value */ + s64 clk_offset; /* Offset between clk mono and gpu time */ + u32 timestamp_frequency; + u32 resync_period; /* in msecs */ + struct delayed_work clk_sync_work; + struct { u32 specific_ctx_id; @@ -2152,7 +2163,6 @@ struct drm_i915_private { bool periodic; int period_exponent; - int timestamp_frequency; int tail_margin; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index abb9d04..af9ec93 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -61,6 +61,12 @@ #define POLL_FREQUENCY 200 #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) +/* Max period for clock synchronization. Defined as 25 seconds, as this is seen + * to give best results. 
+ */ +#define MAX_CLK_SYNC_PERIOD (25*MSEC_PER_SEC) +#define INIT_CLK_SYNC_PERIOD (20) /* in msecs */ + static u32 i915_perf_stream_paranoid = true; /* The maximum exponent the hardware accepts is 63 (essentially it selects one @@ -93,7 +99,8 @@ struct sample_data { u32 ctx_id; u32 pid; u32 tag; - u64 ts; + u64 gpu_ts; + u64 clk_mono; const u8 *report; }; @@ -142,6 +149,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) #define SAMPLE_TS (1<<5) +#define SAMPLE_CLK_MONO(1<<6) struct perf_open_properties { u32 sample_flags; @@ -232,7 +240,7 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv, if (stream->sample_flags & SAMPLE_OA_REPORT) entry_size += dev_priv->perf.oa.oa_buffer.format_size; - else if (sample_flags & SAMPLE_TS) { + else if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) { /* * XXX: Since TS data can anyways be derived from OA report, so * no need to capture it for RCS engine, if capture oa data is @@ -501,7 +509,7 @@ static void i915_ring_stream_cs_hook(struct i915_perf_stream *stream, ret = i915_ring_stream_capture_oa(req, entry->oa_offset); if (ret) goto err_unref; - } else if (sample_flags & SAMPLE_TS) { + } else if (sample_flags & (SAMPLE_TS|SAMPLE_CLK_MONO)) { /* * XXX: Since TS data can anyways be derived from OA report, so * no nee
[Intel-gfx] [PATCH 09/16] drm/i915: Populate ctx ID for periodic OA reports
From: Sourab Gupta This adds support for populating the ctx id for the periodic OA reports when requested through the corresponding property. For Gen8, the OA reports itself have the ctx ID and it is the one programmed into HW while submitting workloads. Thus it's retrieved from reports itself. For Gen7, the OA reports don't have any such field, and we can populate this field with the last seen ctx ID while sending CS reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 3 +++ drivers/gpu/drm/i915/i915_perf.c | 52 +--- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 6bea3bb..53bf148 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1823,6 +1823,8 @@ struct i915_oa_ops { u32 ts, u32 max_records); int (*oa_buffer_num_samples)(struct drm_i915_private *dev_priv, u32 *last_ts); + u32 (*oa_buffer_get_ctx_id)(struct i915_perf_stream *stream, + const u8 *report); }; /* @@ -2174,6 +2176,7 @@ struct drm_i915_private { u32 status; } command_stream_buf; + u32 last_ctx_id; struct list_head node_list; spinlock_t node_list_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 147f377..a112c20 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -476,6 +476,46 @@ gen7_oa_buffer_num_samples_fop_unlocked(struct drm_i915_private *dev_priv, return num_samples; } +static u32 gen7_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + /* +* OA reports generated in Gen7 don't have the ctx ID information. +* Therefore, just rely on the ctx ID information from the last CS +* sample forwarded +*/ + return dev_priv->perf.last_ctx_id; +} + +static u32 gen8_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + /* The ctx ID present in the OA reports have intel_context::global_id +* present, since this is programmed into the ELSP in execlist mode. +* In non-execlist mode, fall back to retrieving the ctx ID from the +* last saved ctx ID from command stream mode. +*/ + if (i915.enable_execlists) { + u32 ctx_id = *(u32 *)(report + 12); + ctx_id &= 0xf; + return ctx_id; + } else { + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + return dev_priv->perf.last_ctx_id; + } +} + /** * Appends a status record to a userspace read() buffer. 
*/ @@ -571,9 +611,9 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, data.source = source; } -#warning "FIXME: append_oa_buffer_sample: read ctx ID from report and map that to an intel_context::global_id" if (sample_flags & SAMPLE_CTX_ID) - data.ctx_id = 0; + data.ctx_id = dev_priv->perf.oa.ops.oa_buffer_get_ctx_id( + stream, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -998,8 +1038,10 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_RCS; - if (sample_flags & SAMPLE_CTX_ID) + if (sample_flags & SAMPLE_CTX_ID) { data.ctx_id = node->ctx_id; + dev_priv->perf.last_ctx_id = node->ctx_id; + } if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -2717,6 +2759,8 @@ void i915_perf_init(struct drm_device *dev) dev_priv->perf.oa.ops.read = gen7_oa_read; dev_priv->perf.oa.ops.oa_buffer_num_samples = gen7_oa_buffer_num_samples_fop_unlocked; + dev_priv->perf.oa.ops.oa_buffer_get_ctx_id = + gen7_oa_buffer_get_ctx_id; dev_priv->perf.oa.oa_formats = hsw_oa_formats; @@ -2732,6 +2776,8 @@ void i915_perf_init(struct drm_device
[Intel-gfx] [PATCH 12/16] drm/i915: Extend i915 perf framework for collecting timestamps on all gpu engines
From: Sourab Gupta This patch extends the i915 perf framework to handle the perf sample collection for any given gpu engine. Particularly, the support for collecting timestamp sample type is added, which can be requested for any engine. With this, for RCS, timestamps and OA reports can be collected together, and provided to userspace in separate sample fields. For other engines, the capabilility to collect timestamps is added. The thing to note is that, still only a single stream instance can be opened at any particular time. Though that stream may now be opened for any gpu engine, for collection of timestamp samples. So, this patch doesn't add the support to open multiple concurrent streams, as yet. Though it lays the groundwork for this support to be added susequently. Part of this groundwork involves having separate command stream buffers, per engine, for holding the samples generated. Likewise for a few other data structures maintaining per-engine state. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 29 +- drivers/gpu/drm/i915/i915_perf.c | 650 ++- drivers/gpu/drm/i915/i915_reg.h | 2 + include/uapi/drm/i915_drm.h | 7 + 4 files changed, 469 insertions(+), 219 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 127ccc1..bdc7ad4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1746,6 +1746,9 @@ struct i915_perf_stream { /* Whether command stream based data collection is enabled */ bool cs_mode; + /* Whether the OA unit is in use */ + bool using_oa; + /* Enables the collection of HW samples, either in response to * I915_PERF_IOCTL_ENABLE or implicitly called when stream is * opened without I915_PERF_FLAG_DISABLED. @@ -1806,7 +1809,8 @@ struct i915_perf_stream { * Routine to emit the commands in the command streamer associated * with the corresponding gpu engine. 
*/ - void (*command_stream_hook)(struct drm_i915_gem_request *req, u32 tag); + void (*command_stream_hook)(struct i915_perf_stream *stream, + struct drm_i915_gem_request *req, u32 tag); }; struct i915_oa_ops { @@ -1835,7 +1839,16 @@ struct i915_oa_ops { struct i915_perf_cs_data_node { struct list_head link; struct drm_i915_gem_request *request; - u32 offset; + + /* Offsets into the GEM obj holding the data */ + u32 start_offset; + u32 oa_offset; + u32 ts_offset; + + /* buffer size corresponding to this entry */ + u32 size; + + /* Other metadata */ u32 ctx_id; u32 pid; u32 tag; @@ -2127,9 +2140,11 @@ struct drm_i915_private { spinlock_t hook_lock; - struct { - struct i915_perf_stream *exclusive_stream; + struct hrtimer poll_check_timer; + struct i915_perf_stream *exclusive_stream; + wait_queue_head_t poll_wq[I915_NUM_ENGINES]; + struct { u32 specific_ctx_id; struct hrtimer poll_check_timer; @@ -2177,13 +2192,13 @@ struct drm_i915_private { u8 *addr; #define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) u32 status; - } command_stream_buf; + } command_stream_buf[I915_NUM_ENGINES]; u32 last_ctx_id; u32 last_pid; u32 last_tag; - struct list_head node_list; - spinlock_t node_list_lock; + struct list_head node_list[I915_NUM_ENGINES]; + spinlock_t node_list_lock[I915_NUM_ENGINES]; } perf; /* Abstract the submission mechanism (legacy ringbuffer or execlists) away */ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index c921c4d..f1c26e5 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -83,12 +83,17 @@ static u32 i915_perf_stream_paranoid = true; /* For determining the behavior on overflow of command stream samples */ #define CMD_STREAM_BUF_OVERFLOW_ALLOWED -/* Data common to periodic and RCS based samples */ -struct oa_sample_data { +#define OA_ADDR_ALIGN 64 +#define TS_ADDR_ALIGN 8 +#define I915_PERF_TS_SAMPLE_SIZE 8 + +/* Data common to all samples (periodic OA / CS based OA / Timestamps) */ +struct sample_data { u32 source; u32 ctx_id; u32 pid; u32 tag; + u64 ts; const u8 *report; }; @@ -136,6 +141,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) +#define SAMPLE_TS (1<<5) struct perf_open_pr
[Intel-gfx] [PATCH 16/16] drm/i915: Support for capturing MMIO register values
From: Sourab Gupta This patch adds support for capturing MMIO register values through i915 perf interface. The userspace can request upto 8 MMIO register values to be dumped. The addresses of these registers can be passed through the corresponding property 'value' field while opening the stream. The commands to dump the values of these MMIO registers are then inserted into the ring alongwith other commands. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 4 + drivers/gpu/drm/i915/i915_perf.c | 179 ++- include/uapi/drm/i915_drm.h | 14 +++ 3 files changed, 194 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index e6a1a93..eb7e26b 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1847,6 +1847,7 @@ struct i915_perf_cs_data_node { u32 start_offset; u32 oa_offset; u32 ts_offset; + u32 mmio_offset; /* buffer size corresponding to this entry */ u32 size; @@ -2155,6 +2156,9 @@ struct drm_i915_private { u32 resync_period; /* in msecs */ struct delayed_work clk_sync_work; + u32 num_mmio; + u32 mmio_list[I915_PERF_MMIO_NUM_MAX]; + struct { u32 specific_ctx_id; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index af9ec93..aacc892 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -102,6 +102,7 @@ struct sample_data { u64 gpu_ts; u64 clk_mono; const u8 *report; + const u8 *mmio; }; /* for sysctl proc_dointvec_minmax of i915_oa_min_timer_exponent */ @@ -150,6 +151,7 @@ static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = { #define SAMPLE_TAG (1<<4) #define SAMPLE_TS (1<<5) #define SAMPLE_CLK_MONO(1<<6) +#define SAMPLE_MMIO(1<<7) struct perf_open_properties { u32 sample_flags; @@ -250,6 +252,9 @@ static int insert_perf_entry(struct drm_i915_private *dev_priv, sample_ts = true; } + if (sample_flags & SAMPLE_MMIO) + entry_size += 4*dev_priv->perf.num_mmio; + spin_lock(&dev_priv->perf.node_list_lock[id]); if (list_empty(&dev_priv->perf.node_list[id])) { offset = 0; @@ -327,6 +332,10 @@ out: entry->ts_offset = ALIGN(entry->ts_offset, TS_ADDR_ALIGN); offset = entry->ts_offset + I915_PERF_TS_SAMPLE_SIZE; } + if (sample_flags & SAMPLE_MMIO) { + entry->mmio_offset = offset; + offset = entry->mmio_offset + 4*dev_priv->perf.num_mmio; + } list_add_tail(&entry->link, &dev_priv->perf.node_list[id]); #ifndef CMD_STREAM_BUF_OVERFLOW_ALLOWED @@ -479,6 +488,72 @@ static int i915_ring_stream_capture_ts(struct drm_i915_gem_request *req, return 0; } +static int i915_ring_stream_capture_mmio(struct drm_i915_gem_request *req, + u32 offset) +{ + struct intel_engine_cs *engine = req->engine; + struct intel_ringbuffer *ringbuf = req->ringbuf; + struct drm_i915_private *dev_priv = engine->dev->dev_private; + int num_mmio = dev_priv->perf.num_mmio; + u32 mmio_addr, addr = 0; + int ret, i; + + if (i915.enable_execlists) + ret = intel_logical_ring_begin(req, 4*num_mmio); + else + ret = intel_ring_begin(req, 4*num_mmio); + + if (ret) + return ret; + + mmio_addr = + dev_priv->perf.command_stream_buf[engine->id].vma->node.start + + offset; + + if (i915.enable_execlists) { + for (i = 0; i < num_mmio; i++) { + uint32_t cmd; + + addr = mmio_addr + + i * sizeof(dev_priv->perf.mmio_list[i]); + + cmd = MI_STORE_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT; + + intel_logical_ring_emit(ringbuf, cmd); + intel_logical_ring_emit(ringbuf, + dev_priv->perf.mmio_list[i]); + intel_logical_ring_emit(ringbuf, addr); + intel_logical_ring_emit(ringbuf, 
0); + } + intel_logical_ring_advance(ringbuf); + } else { + for (i = 0; i < num_mmio; i++) { + uint32_t cmd; + + addr = mmio_addr + + i * sizeof(dev_priv->perf.mmio_list[i]);
Re: [Intel-gfx] [PATCH 15/16] drm/i915: Mechanism to forward clock monotonic time in perf samples
On Sat, 2016-04-23 at 01:19 +0530, Chris Wilson wrote: > On Fri, Apr 22, 2016 at 05:04:04PM +0530, sourab.gu...@intel.com wrote: > > +static u64 get_current_gpu_ts(struct drm_i915_private *dev_priv) > > +{ > > + return ((u64)I915_READ(GT_TIMESTAMP_COUNT_UDW) << 32) | > > + I915_READ(GT_TIMESTAMP_COUNT); > > return I915_READ64_2x32(GT_TIMESTAMP_COUNT, GT_TIMESTAMP_COUNT_UDW); Thanks for pointing out. Will make this change. > > > +static void i915_perf_get_clock(struct drm_i915_private *dev_priv, > > + u64 *clk_mono, u64 *gpu_time, u64 *gpu_ts) > > +{ > > + u64 remainder, ts_interval = NSEC_PER_SEC; > > + u32 gpu_freq = dev_priv->perf.timestamp_frequency; > > + unsigned long flags; > > + > > + local_irq_save(flags); > > + *clk_mono = ktime_get_mono_fast_ns(); > > + *gpu_ts = get_current_gpu_ts(dev_priv); > > + local_irq_restore(flags); > > + > > + remainder = do_div(ts_interval, gpu_freq); > > + remainder *= *gpu_ts; > > + do_div(remainder, gpu_freq); > > + > > + *gpu_time = ((*gpu_ts) * ts_interval) + remainder; > > +} > > + > > +static void i915_perf_clock_sync_work(struct work_struct *work) > > +{ > > Have you looked at cross-timestamps? I was looking at the cross-timestamp patch set (https://lkml.org/lkml/2016/1/4/541), but I'm not entirely sure the cross timestamp stuff is solving the same problem. The cross timestamp stuff is trying to "synchronously capture system/device timestamp(s)". While here, we have a log of many recorded device timestamps and want to correlate all of those with system timestamp (_CLOCK_MONOTONOC for now). As we aren't reading system/device timestamps together for all timestamps we have, our problem is more about deriving an accurate idea of timestamp frequency to improve the accuracy when correlating between the synchronization points. Maybe the cross timestamp stuff will help to periodically read tightly correlated timestamp pairs synchronously which can be used as basis for correlating the logged timestamps based on our derived timestamp frequency. This idea can be evaluated once the cross timestamp patches land in kernel. > -Chris > ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC 0/4] Introduce drmfs pseudo filesystem for drm subsystem
generated by i915 driver in this patch series, > >> but will hopefully be generic enough to provide scope for usage by any > >> other drm subsystem component. > >> > >> The patch series is being floated as RFC to gather feedback on the idea and > >> infrastructure proposed here and it's suitability to address the specific > >> problem statement/use case. > >> > >> v2: fix the bat failures caused due to missing config check > >> > >> v3: Changes made: > >> - Move the location of drmfs from fs/ to drivers/gpu/drm/ (Chris) > >> - Moving config checks to header (Chris,Daniel) > >> > >> v4: Added the kernel Documentaion (using Sphinx). > >> > >> Sourab Gupta (4): > >> drm: Introduce drmfs pseudo filesystem interfaces > >> drm: Register drmfs filesystem from drm init > >> drm: Create driver specific root directory inside drmfs > >> drm/i915: Creating guc log file in drmfs instead of debugfs > >> > >> Documentation/gpu/drm-uapi.rst | 76 > >> drivers/gpu/drm/Kconfig| 9 + > >> drivers/gpu/drm/Makefile | 1 + > >> drivers/gpu/drm/drm_drv.c | 26 ++ > >> drivers/gpu/drm/drmfs.c| 566 > >> ++ > >> drivers/gpu/drm/i915/i915_guc_submission.c | 33 +- > >> include/drm/drm_drv.h | 3 + > >> include/drm/drmfs.h| 77 > >> include/uapi/linux/magic.h | 3 + > >> 9 files changed, 773 insertions(+), 21 deletions(-) > >> create mode 100644 drivers/gpu/drm/drmfs.c > >> create mode 100644 include/drm/drmfs.h > > -- > Jani Nikula, Intel Open Source Technology Center ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [RFC 0/4] Introduce drmfs pseudo filesystem for drm subsystem
On Mon, 2016-12-19 at 17:15 -0800, Laurent Pinchart wrote: > Hi Swati, > > On Monday 19 Dec 2016 16:12:22 swati.dhin...@intel.com wrote: > > From: Swati Dhingra > > > > Currently, we don't have a stable ABI which can be used for the purpose of > > providing output debug/loggging/crc and other such data from DRM. > > The ABI in current use (filesystems, ioctls, et al.) have their own > > constraints and are intended to output a particular type of data. > > Few cases in point: > > sysfs - stable ABI, but constrained to one textual value per file > > debugfs - unstable ABI, free-for-all > > ioctls- not as suitable to many single purpose continuous data > > dumping, we would very quickly run out ioctl space; requires more > > userspace support than "cat" > > device nodes - a real possibilty, kernel instantiation is more tricky, > > requires udev (+udev.rules) or userspace discovery of the > > dynamic major:minor (via sysfs) [mounting a registered > > filesystem is easy in comparison] > > netlink - stream based, therefore involves numerous copies. > > > > Debugfs is the lesser among the evils here, thereby we have grown used to > > the convenience and flexibility in presentation that debugfs gives us > > (including relayfs inodes) that we want some of that hierachy in stable user > > ABI form. > > Seriously, why ? A subsystem growing its own file system sounds so wrong. It > seems that you want to have all the benefits of a stable ABI without going > through the standardization effort that this requires. I can see so many ways > that drmfs could be abused, with drivers throwing in new data with little or > no review. You'll need very compelling arguments to convince me. > Hi Laurent, Can you please let us know how to address the standardization issues? As per our (limited) knowledge, drmfs seemed to be the most suitable solution for exposing usecases such as microprocessor logs in i915, and possibly other such usecases, which ideally can't fit in with sysfs/debugfs/ioctls due to reasons mentioned above. But having said that, standardization requires a lot more effort (defining the constraints etc.), which we're not familiar with, frankly. Can you please provide your views on how to proceed as such, since the idea seemed worth pursuing to us (It's a drm based filesystem, whose existence depends on drm, and directory contents solely controlled by the corresponding drm driver - as such the contents shouldn't be controllable by an external driver). Or, should this be dropped, if the idea of a subsystem having its own filesystem is fundamentally wrong to its core? Regards, Sourab > > Due to these limitations, there is a need for a new pseudo filesytem, that > > would act as a stable 'free-for-all' ABI, with the heirarchial structure and > > thus convenience of debugfs. This will be managed by drm, thus named > > 'drmfs'. DRM would register this filesystem to manage a canonical > > mountpoint, but this wouldn't limit everyone to only using that pseudofs > > underneath. > > > > This can serve to hold various kinds of output data from Linux DRM > > subsystems, for the files which can't truely fit anywhere else with > > existing ABI's but present so, for the lack of a better place. > > > > In this patch series, we have introduced a pseudo filesystem named as > > 'drmfs' for now. 
The filesystem is introduced in the first patch, and the > > subsequent patches make use of the filesystem interfaces, in drm driver, > > and making them available for use by the drm subsystem components, one of > > which is i915. We've moved the location of i915 GuC logs from debugfs to > > drmfs in the last patch. Subsequently, more such files such as pipe_crc, > > error states, memory stats, etc. can be move to this filesystem, if the > > idea introduced here is acceptable per se. The filesystem introduced is > > being used to house the data generated by i915 driver in this patch series, > > but will hopefully be generic enough to provide scope for usage by any > > other drm subsystem component. > > > > The patch series is being floated as RFC to gather feedback on the idea and > > infrastructure proposed here and it's suitability to address the specific > > problem statement/use case. > > > > v2: fix the bat failures caused due to missing config check > > > > v3: Changes made: > > - Move the location of drmfs from fs/ to drivers/gpu/drm/ (Chris) > > - Moving config checks to header (Chris,Daniel) > >
Re: [Intel-gfx] [PATCH 03/12] drm/i915: Framework for capturing command stream based OA reports and ctx id info.
On Wed, Aug 2, 2017 at 2:28 AM, Lionel Landwerlin < lionel.g.landwer...@intel.com> wrote: > On 01/08/17 19:05, sourab gupta wrote: > > > > On Tue, Aug 1, 2017 at 2:59 PM, Kamble, Sagar A > wrote: > >> >> >> -Original Message- >> From: Landwerlin, Lionel G >> Sent: Monday, July 31, 2017 9:16 PM >> To: Kamble, Sagar A ; >> intel-gfx@lists.freedesktop.org >> Cc: Sourab Gupta >> Subject: Re: [Intel-gfx] [PATCH 03/12] drm/i915: Framework for capturing >> command stream based OA reports and ctx id info. >> >> On 31/07/17 08:59, Sagar Arun Kamble wrote: >> > From: Sourab Gupta >> > >> > This patch introduces a framework to capture OA counter reports >> associated >> > with Render command stream. We can then associate the reports captured >> > through this mechanism with their corresponding context id's. This can >> be >> > further extended to associate any other metadata information with the >> > corresponding samples (since the association with Render command stream >> > gives us the ability to capture these information while inserting the >> > corresponding capture commands into the command stream). >> > >> > The OA reports generated in this way are associated with a corresponding >> > workload, and thus can be used the delimit the workload (i.e. sample the >> > counters at the workload boundaries), within an ongoing stream of >> periodic >> > counter snapshots. >> > >> > There may be usecases wherein we need more than periodic OA capture mode >> > which is supported currently. This mode is primarily used for two >> usecases: >> > - Ability to capture system wide metrics, alongwith the ability to >> map >> >the reports back to individual contexts (particularly for HSW). >> > - Ability to inject tags for work, into the reports. This provides >> >visibility into the multiple stages of work within single >> context. >> > >> > The userspace will be able to distinguish between the periodic and CS >> based >> > OA reports by the virtue of source_info sample field. >> > >> > The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA >> > counters, and is inserted at BB boundaries. >> > The data thus captured will be stored in a separate buffer, which will >> > be different from the buffer used otherwise for periodic OA capture >> mode. >> > The metadata information pertaining to snapshot is maintained in a list, >> > which also has offsets into the gem buffer object per captured snapshot. >> > In order to track whether the gpu has completed processing the node, >> > a field pertaining to corresponding gem request is added, which is >> tracked >> > for completion of the command. >> > >> > Both periodic and CS based reports are associated with a single stream >> > (corresponding to render engine), and it is expected to have the samples >> > in the sequential order according to their timestamps. Now, since these >> > reports are collected in separate buffers, these are merge sorted at the >> > time of forwarding to userspace during the read call. >> > >> > v2: Aligning with the non-perf interface (custom drm ioctl based). Also, >> > few related patches are squashed together for better readability >> > >> > v3: Updated perf sample capture emit hook name. Reserving space upfront >> > in the ring for emitting sample capture commands and using >> > req->fence.seqno for tracking samples. Added SRCU protection for >> streams. >> > Changed the stream last_request tracking to resv object. (Chris) >> > Updated perf.sample_lock spin_lock usage to avoid softlockups. 
Moved >> > stream to global per-engine structure. (Sagar) >> > Update unpin and put in the free routines to i915_vma_unpin_and_release. >> > Making use of perf stream cs_buffer vma resv instead of separate resv >> obj. >> > Pruned perf stream vma resv during gem_idle. (Chris) >> > Changed payload field ctx_id to u64 to keep all sample data aligned at 8 >> > bytes. (Lionel) >> > stall/flush prior to sample capture is not added. Do we need to give >> this >> > control to user to select whether to stall/flush at each sample? >> > >> > Signed-off-by: Sourab Gupta >> > Signed-off-by: Robert Bragg >> > Signed-off-by: Sagar Arun Kamble >> > --- >> > drivers/gpu/drm/i915/i915_drv.h
[Intel-gfx] [PATCH 0/8] Collect command stream based OA reports using i915 perf
From: Sourab Gupta This series adds framework for collection of OA reports associated with the render command stream, which are collected around batchbuffer boundaries. Refloating the series rebased on Robert's latest patch set for 'Enabling OA unit for Gen 8 and 9 in i915 perf', which can be found here: https://patchwork.freedesktop.org/series/20084/ Since Robert's patches are being reviewed and this patch series extends his framework to collect command stream based OA metrics, it would be good to keep this work in perspective. Looking to receive feedback (and possibly r-b's :)) on the series. Since the OA reports collected associated with the render command stream, this also gives us the ability to collect other metadata such as ctx_id, pid, etc. with the samples, and thus we can establish the association of samples collected with the corresponding process/workload. These patches can be found for viewing at https://github.com/sourabgu/linux/tree/oa-6march2017 Sourab Gupta (8): drm/i915: Add ctx getparam ioctl parameter to retrieve ctx unique id drm/i915: Expose OA sample source to userspace drm/i915: Framework for capturing command stream based OA reports drm/i915: flush periodic samples, in case of no pending CS sample requests drm/i915: Inform userspace about command stream OA buf overflow drm/i915: Populate ctx ID for periodic OA reports drm/i915: Add support for having pid output with OA report drm/i915: Add support for emitting execbuffer tags through OA counter reports drivers/gpu/drm/i915/i915_drv.h| 125 ++- drivers/gpu/drm/i915/i915_gem_context.c|3 + drivers/gpu/drm/i915/i915_gem_execbuffer.c |6 + drivers/gpu/drm/i915/i915_perf.c | 1149 include/uapi/drm/i915_drm.h| 49 ++ 5 files changed, 1184 insertions(+), 148 deletions(-) -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 1/8] drm/i915: Add ctx getparam ioctl parameter to retrieve ctx unique id
From: Sourab Gupta This patch adds a new ctx getparam ioctl parameter, which can be used to retrieve ctx unique id by userspace. This can be used by userspace to map the OA reports received in the i915 perf samples with their associated ctx's (The OA reports have the hw ctx ID information embedded for Gen8+). Otherwise the userspace has no way of maintaining this association, since it has the knowledge of only per-drm file specific ctx handles. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_gem_context.c | 3 +++ include/uapi/drm/i915_drm.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index baceca1..b6d2125 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -1062,6 +1062,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_BANNABLE: args->value = i915_gem_context_is_bannable(ctx); break; + case I915_CONTEXT_PARAM_HW_ID: + args->value = ctx->hw_id; + break; default: ret = -EINVAL; break; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 03b8338..835e711 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1291,6 +1291,7 @@ struct drm_i915_gem_context_param { #define I915_CONTEXT_PARAM_GTT_SIZE0x3 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE0x4 #define I915_CONTEXT_PARAM_BANNABLE0x5 +#define I915_CONTEXT_PARAM_HW_ID 0x6 __u64 value; }; -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 8/8] drm/i915: Add support for emitting execbuffer tags through OA counter reports
From: Sourab Gupta This patch enables userspace to specify tags (per workload), provided via execbuffer ioctl, which could be added to OA reports, to help associate reports with the corresponding workloads. There may be multiple stages within a single context, from a userspace perspective. An ability is needed to individually associate the OA reports with their corresponding workloads(execbuffers), which may not be possible solely with ctx_id or pid information. This patch enables such a mechanism. In this patch, upper 32 bits of rsvd1 field, which were previously unused are now being used to pass in the tag. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h| 17 +++-- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 6 +++-- drivers/gpu/drm/i915/i915_perf.c | 40 +- include/uapi/drm/i915_drm.h| 12 + 4 files changed, 65 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index fa1d3fc..414afa5 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1982,7 +1982,8 @@ struct i915_perf_stream_ops { * the batch buffer. */ void (*command_stream_hook)(struct i915_perf_stream *stream, - struct drm_i915_gem_request *request); + struct drm_i915_gem_request *request, + u32 tag); }; enum i915_perf_stream_state { @@ -2171,6 +2172,17 @@ struct i915_perf_cs_sample { * submitted, pertaining to this perf sample */ u32 pid; + + /** +* @tag: Tag associated with workload, for which the perf sample is +* being collected. +* +* Userspace can specify tags (provided via execbuffer ioctl), which +* can be associated with the perf samples, and be used to functionally +* distinguish different workload stages, and associate samples with +* these different stages. +*/ + u32 tag; }; struct intel_cdclk_state { @@ -2627,6 +2639,7 @@ struct drm_i915_private { u32 last_cmd_stream_ctx_id; u32 last_pid; + u32 last_tag; struct list_head cs_samples; spinlock_t sample_lock; } perf; @@ -3690,7 +3703,7 @@ void i915_oa_init_reg_state(struct intel_engine_cs *engine, void i915_oa_update_reg_state(struct intel_engine_cs *engine, struct i915_gem_context *ctx, uint32_t *reg_state); -void i915_perf_command_stream_hook(struct drm_i915_gem_request *req); +void i915_perf_command_stream_hook(struct drm_i915_gem_request *req, u32 tag); /* i915_gem_evict.c */ int __must_check i915_gem_evict_something(struct i915_address_space *vm, diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index 7af32c97..b42d47e 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -59,6 +59,7 @@ struct i915_execbuffer_params { struct intel_engine_cs *engine; struct i915_gem_context *ctx; struct drm_i915_gem_request *request; + uint32_ttag; }; struct eb_vmas { @@ -1441,7 +1442,7 @@ static void eb_export_fence(struct drm_i915_gem_object *obj, if (exec_len == 0) exec_len = params->batch->size - params->args_batch_start_offset; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); ret = params->engine->emit_bb_start(params->request, exec_start, exec_len, @@ -1449,7 +1450,7 @@ static void eb_export_fence(struct drm_i915_gem_object *obj, if (ret) return ret; - i915_perf_command_stream_hook(params->request); + i915_perf_command_stream_hook(params->request, params->tag); i915_gem_execbuffer_move_to_active(vmas, params->request); @@ -1791,6 +1792,7 @@ static void eb_export_fence(struct drm_i915_gem_object *obj, 
params->engine= engine; params->dispatch_flags = dispatch_flags; params->ctx = ctx; + params->tag = i915_execbuffer2_get_tag(*args); trace_i915_gem_request_queue(params->request, dispatch_flags); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 6e8af2d..759865e 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -290,6 +290,7 @@ struct oa_sample_data { u32 source; u32 ctx_id; u32 pid; + u32 tag; const u8 *report; }; @@ -344,6 +345,7 @@ struct oa_sample_data { #defin
[Intel-gfx] [PATCH 5/8] drm/i915: Inform userspace about command stream OA buf overflow
From: Sourab Gupta Considering how we don't currently give userspace control over the OA buffer size and always configure a large 16MB buffer, then a buffer overflow does anyway likely indicate that something has gone quite badly wrong. Here we set a status flag to detect overflow and inform userspace of the report_lost condition accordingly. This is in line with the behavior of the periodic OA buffer. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 2 ++ drivers/gpu/drm/i915/i915_perf.c | 15 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d0f43e9..0f2a552 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2609,6 +2609,8 @@ struct drm_i915_private { struct { struct i915_vma *vma; u8 *vaddr; +#define I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW (1<<0) + u32 status; } command_stream_buf; struct list_head cs_samples; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index d1d9853..2841d0a 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -478,6 +478,8 @@ static void insert_perf_sample(struct drm_i915_private *dev_priv, else { u32 target_size = sample_size - first->offset; + dev_priv->perf.command_stream_buf.status |= + I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; release_perf_samples(dev_priv, target_size); sample->offset = 0; } @@ -491,6 +493,8 @@ static void insert_perf_sample(struct drm_i915_private *dev_priv, (first->offset - last->offset - sample_size); + dev_priv->perf.command_stream_buf.status |= + I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; release_perf_samples(dev_priv, target_size); sample->offset = last->offset + sample_size; } @@ -1577,6 +1581,17 @@ static int oa_rcs_append_reports(struct i915_perf_stream *stream, struct i915_perf_cs_sample *entry, *next; LIST_HEAD(free_list); int ret = 0; + u32 status = dev_priv->perf.command_stream_buf.status; + + if (unlikely(status & I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW)) { + ret = append_oa_status(stream, buf, count, offset, + DRM_I915_PERF_RECORD_OA_BUFFER_LOST); + if (ret) + return ret; + + dev_priv->perf.command_stream_buf.status &= + ~I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; + } spin_lock(&dev_priv->perf.sample_lock); if (list_empty(&dev_priv->perf.cs_samples)) { -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 2/8] drm/i915: Expose OA sample source to userspace
From: Sourab Gupta This patch exposes a new sample source field to userspace. This field can be populated to specify the origin of the OA report. Currently, the OA samples are being generated only periodically, and hence there's only source flag enum definition right now, but there are other means of generating OA samples, such as via MI_RPC commands. The OA_SOURCE sample type is introducing a mechanism (for userspace) to distinguish various OA reports generated via different sources. Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_perf.c | 18 ++ include/uapi/drm/i915_drm.h | 14 ++ 2 files changed, 32 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 4b1db73..540c5b2 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -324,6 +324,7 @@ }; #define SAMPLE_OA_REPORT (1<<0) +#define SAMPLE_OA_SOURCE_INFO (1<<1) /** * struct perf_open_properties - for validated properties given to open a stream @@ -659,6 +660,15 @@ static int append_oa_sample(struct i915_perf_stream *stream, return -EFAULT; buf += sizeof(header); + if (sample_flags & SAMPLE_OA_SOURCE_INFO) { + enum drm_i915_perf_oa_event_source source = + I915_PERF_OA_EVENT_SOURCE_PERIODIC; + + if (copy_to_user(buf, &source, 4)) + return -EFAULT; + buf += 4; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, report, report_size)) return -EFAULT; @@ -2030,6 +2040,11 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, stream->sample_flags |= SAMPLE_OA_REPORT; stream->sample_size += format_size; + if (props->sample_flags & SAMPLE_OA_SOURCE_INFO) { + stream->sample_flags |= SAMPLE_OA_SOURCE_INFO; + stream->sample_size += 4; + } + dev_priv->perf.oa.oa_buffer.format_size = format_size; if (WARN_ON(dev_priv->perf.oa.oa_buffer.format_size == 0)) return -EINVAL; @@ -2814,6 +2829,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, props->oa_periodic = true; props->oa_period_exponent = value; break; + case DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE: + props->sample_flags |= SAMPLE_OA_SOURCE_INFO; + break; default: MISSING_CASE(id); DRM_DEBUG("Unknown i915 perf property ID\n"); diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 835e711..c597e36 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1312,6 +1312,12 @@ enum drm_i915_oa_format { I915_OA_FORMAT_MAX /* non-ABI */ }; +enum drm_i915_perf_oa_event_source { + I915_PERF_OA_EVENT_SOURCE_PERIODIC, + + I915_PERF_OA_EVENT_SOURCE_MAX /* non-ABI */ +}; + enum drm_i915_perf_property_id { /** * Open the stream for a specific context handle (as used with @@ -1346,6 +1352,13 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_OA_EXPONENT, + /** +* The value of this property set to 1 requests inclusion of sample +* source field to be given to userspace. The sample source field +* specifies the origin of OA report. +*/ + DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; @@ -1411,6 +1424,7 @@ enum drm_i915_perf_record_type { * struct { * struct drm_i915_perf_record_header header; * +* { u32 source_info; } && DRM_I915_PERF_PROP_SAMPLE_OA_SOURCE * { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA * }; */ -- 1.9.1 ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH 4/8] drm/i915: flush periodic samples, in case of no pending CS sample requests
From: Sourab Gupta When there are no pending CS OA samples, flush the periodic OA samples collected so far. We can safely forward the periodic OA samples in the case we have no pending CS samples, but we can't do so in the case we have pending CS samples, since we don't know what the ordering between pending CS samples and periodic samples will eventually be. If we have no pending CS sample, it won't be possible for future pending CS sample to have timestamps earlier than current periodic timestamp. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 10 ++- drivers/gpu/drm/i915/i915_perf.c | 175 +-- 2 files changed, 139 insertions(+), 46 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0b70052..d0f43e9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2110,10 +2110,11 @@ struct i915_oa_ops { char __user *buf, size_t count, size_t *offset, - u32 ts); + u32 ts, + u32 max_reports); /** -* @oa_buffer_check: Check for OA buffer data + update tail +* @oa_buffer_num_reports: Return number of OA reports + update tail * * This is either called via fops or the poll check hrtimer (atomic * ctx) without any locks taken. @@ -2126,7 +2127,8 @@ struct i915_oa_ops { * here, which will be handled gracefully - likely resulting in an * %EAGAIN error for userspace. */ - bool (*oa_buffer_check)(struct drm_i915_private *dev_priv); + u32 (*oa_buffer_num_reports)(struct drm_i915_private *dev_priv, + u32 *last_ts); }; /* @@ -2589,6 +2591,8 @@ struct drm_i915_private { u32 gen7_latched_oastatus1; u32 ctx_oactxctrl_off; u32 ctx_flexeu0_off; + u32 n_pending_periodic_samples; + u32 pending_periodic_ts; /* The RPT_ID/reason field for Gen8+ includes a bit * to determine if the CTX ID in the report is valid diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3321dad..d1d9853 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -583,7 +583,7 @@ static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv) } /** - * gen8_oa_buffer_check_unlocked - check for data and update tail ptr state + * gen8_oa_buffer_num_reports_unlocked - check for data and update tail ptr state * @dev_priv: i915 device instance * * This is either called via fops (for blocking reads in user ctx) or the poll @@ -596,7 +596,7 @@ static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv) * the pointers time to 'age' before they are made available for reading. * (See description of OA_TAIL_MARGIN_NSEC above for further details.) * - * Besides returning true when there is data available to read() this function + * Besides returning num of reports when there is data available to read() it * also has the side effect of updating the oa_buffer.tails[], .aging_timestamp * and .aged_tail_idx state used for reading. * @@ -604,14 +604,15 @@ static void i915_oa_rcs_release_samples(struct drm_i915_private *dev_priv) * only called while the stream is enabled, while the global OA configuration * can't be modified. 
* - * Returns: %true if the OA buffer contains data, else %false + * Returns: number of samples available to read */ -static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv) +static u32 gen8_oa_buffer_num_reports_unlocked( + struct drm_i915_private *dev_priv, u32 *last_ts) { int report_size = dev_priv->perf.oa.oa_buffer.format_size; unsigned long flags; unsigned int aged_idx; - u32 head, hw_tail, aged_tail, aging_tail; + u32 head, hw_tail, aged_tail, aging_tail, num_reports = 0; u64 now; /* We have to consider the (unlikely) possibility that read() errors @@ -652,6 +653,13 @@ static bool gen8_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv) if (aging_tail != INVALID_TAIL_PTR && ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) > OA_TAIL_MARGIN_NSEC)) { + u32 mask = (OA_BUFFER_SIZE - 1); + u32 gtt_offset = i915_ggtt_offset( + dev_priv->perf.oa.oa_buffer.vma); + u32 head = (dev_priv->perf.oa.oa_buffer.head - gtt_offset) + & mask; + u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr; + u32 *report32; aged_idx ^= 1; dev_priv-&g
[Intel-gfx] [PATCH 6/8] drm/i915: Populate ctx ID for periodic OA reports
From: Sourab Gupta This adds support for populating the ctx id for the periodic OA reports when requested through the corresponding property. For Gen8, the OA reports itself have the ctx ID and it is the one programmed into HW while submitting workloads. Thus it's retrieved from reports itself. For Gen7, the OA reports don't have any such field, and we can populate this field with the last seen ctx ID while sending CS reports. Signed-off-by: Sourab Gupta --- drivers/gpu/drm/i915/i915_drv.h | 7 ++ drivers/gpu/drm/i915/i915_perf.c | 53 +++- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0f2a552..7a6dcb3 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2129,6 +2129,12 @@ struct i915_oa_ops { */ u32 (*oa_buffer_num_reports)(struct drm_i915_private *dev_priv, u32 *last_ts); + + /** +* @get_ctx_id: Retrieve the ctx_id associated with the (periodic) OA +* report. +*/ + u32 (*get_ctx_id)(struct i915_perf_stream *stream, const u8 *report); }; /* @@ -2613,6 +2619,7 @@ struct drm_i915_private { u32 status; } command_stream_buf; + u32 last_cmd_stream_ctx_id; struct list_head cs_samples; spinlock_t sample_lock; } perf; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 2841d0a..208179f 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -850,6 +850,46 @@ static u32 gen7_oa_buffer_num_reports_unlocked( return aged_tail == INVALID_TAIL_PTR ? 0 : num_reports; } +static u32 gen7_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + /* +* OA reports generated in Gen7 don't have the ctx ID information. +* Therefore, just rely on the ctx ID information from the last CS +* sample forwarded +*/ + return dev_priv->perf.last_cmd_stream_ctx_id; +} + +static u32 gen8_oa_buffer_get_ctx_id(struct i915_perf_stream *stream, + const u8 *report) +{ + struct drm_i915_private *dev_priv = stream->dev_priv; + + /* The ctx ID present in the OA reports have intel_context::hw_id +* present, since this is programmed into the ELSP in execlist mode. +* In non-execlist mode, fall back to retrieving the ctx ID from the +* last saved ctx ID from command stream mode. +*/ + if (i915.enable_execlists) { + u32 *report32 = (void *)report; + u32 ctx_id = report32[2] & 0x1f; + return ctx_id; + } else { + if (!stream->cs_mode) + WARN_ONCE(1, + "CTX ID can't be retrieved if command stream mode not enabled"); + + return dev_priv->perf.last_cmd_stream_ctx_id; + } +} + /** * append_oa_status - Appends a status record to a userspace read() buffer. 
* @stream: An i915-perf stream opened for OA metrics @@ -963,18 +1003,15 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, char __user *buf, size_t count, size_t *offset, const u8 *report) { + struct drm_i915_private *dev_priv = stream->dev_priv; u32 sample_flags = stream->sample_flags; struct oa_sample_data data = { 0 }; if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_PERIODIC; - /* -* FIXME: append_oa_buffer_sample: read ctx ID from report and map -* that to a intel_context::hw_id" -*/ if (sample_flags & SAMPLE_CTX_ID) - data.ctx_id = 0; + data.ctx_id = dev_priv->perf.oa.ops.get_ctx_id(stream, report); if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1547,8 +1584,10 @@ static int append_oa_rcs_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_OA_SOURCE_INFO) data.source = I915_PERF_OA_EVENT_SOURCE_RCS; - if (sample_flags & SAMPLE_CTX_ID) + if (sample_flags & SAMPLE_CTX_ID) { data.ctx_id = node->ctx_id; + dev_priv->perf.last_cmd_stream_ctx_id = node->ctx_id; + } if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -3794,6 +3833
[Intel-gfx] [PATCH 3/8] drm/i915: Framework for capturing command stream based OA reports
From: Sourab Gupta This patch introduces a framework to capture OA counter reports associated with Render command stream. We can then associate the reports captured through this mechanism with their corresponding context id's. This can be further extended to associate any other metadata information with the corresponding samples (since the association with Render command stream gives us the ability to capture these information while inserting the corresponding capture commands into the command stream). The OA reports generated in this way are associated with a corresponding workload, and thus can be used the delimit the workload (i.e. sample the counters at the workload boundaries), within an ongoing stream of periodic counter snapshots. There may be usecases wherein we need more than periodic OA capture mode which is supported currently. This mode is primarily used for two usecases: - Ability to capture system wide metrics, alongwith the ability to map the reports back to individual contexts (particularly for HSW). - Ability to inject tags for work, into the reports. This provides visibility into the multiple stages of work within single context. The userspace will be able to distinguish between the periodic and CS based OA reports by the virtue of source_info sample field. The command MI_REPORT_PERF_COUNT can be used to capture snapshots of OA counters, and is inserted at BB boundaries. The data thus captured will be stored in a separate buffer, which will be different from the buffer used otherwise for periodic OA capture mode. The metadata information pertaining to snapshot is maintained in a list, which also has offsets into the gem buffer object per captured snapshot. In order to track whether the gpu has completed processing the node, a field pertaining to corresponding gem request is added, which is tracked for completion of the command. Both periodic and RCS based reports are associated with a single stream (corresponding to render engine), and it is expected to have the samples in the sequential order according to their timestamps. Now, since these reports are collected in separate buffers, these are merge sorted at the time of forwarding to userspace during the read call. v2: Aligining with the non-perf interface (custom drm ioctl based). Also, few related patches are squashed together for better readability Signed-off-by: Sourab Gupta Signed-off-by: Robert Bragg --- drivers/gpu/drm/i915/i915_drv.h| 88 ++- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 + drivers/gpu/drm/i915/i915_perf.c | 888 - include/uapi/drm/i915_drm.h| 15 + 4 files changed, 861 insertions(+), 134 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index ef104ff5..0b70052 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1972,6 +1972,23 @@ struct i915_perf_stream_ops { * The stream will always be disabled before this is called. */ void (*destroy)(struct i915_perf_stream *stream); + + /* +* @command_stream_hook: Emit the commands in the command streamer +* for a particular gpu engine. +* +* The commands are inserted to capture the perf sample data at +* specific points during workload execution, such as before and after +* the batch buffer. 
+*/ + void (*command_stream_hook)(struct i915_perf_stream *stream, + struct drm_i915_gem_request *request); +}; + +enum i915_perf_stream_state { + I915_PERF_STREAM_DISABLED, + I915_PERF_STREAM_ENABLE_IN_PROGRESS, + I915_PERF_STREAM_ENABLED, }; /** @@ -1989,6 +2006,10 @@ struct i915_perf_stream { struct list_head link; /** +* @engine: GPU engine associated with this particular stream +*/ + enum intel_engine_id engine; + /** * @sample_flags: Flags representing the `DRM_I915_PERF_PROP_SAMPLE_*` * properties given when opening a stream, representing the contents * of a single sample as read() by userspace. @@ -2009,11 +2030,25 @@ struct i915_perf_stream { struct i915_gem_context *ctx; /** -* @enabled: Whether the stream is currently enabled, considering -* whether the stream was opened in a disabled state and based -* on `I915_PERF_IOCTL_ENABLE` and `I915_PERF_IOCTL_DISABLE` calls. +* @state: Current stream state, which can be either disabled, enabled, +* or enable_in_progress, while considering whether the stream was +* opened in a disabled state and based on `I915_PERF_IOCTL_ENABLE` and +* `I915_PERF_IOCTL_DISABLE` calls. */ - bool enabled; + enum i915_perf_stream_state state; + + /** +* @cs_mode: Whether command stream based perf sample collection is +* enabled for this s