On 04/05, Robert Bragg wrote:
> Enables access to OA unit metrics for BDW, CHV, SKL and BXT which all
> share (more-or-less) the same OA unit design.
> 
> Of particular note in comparison to Haswell: some OA unit HW config
> state has become per-context state and as a consequence it is somewhat
> more complicated to manage synchronous state changes from the cpu while
> there's no guarantee of what context (if any) is currently actively
> running on the gpu.
> 
> The periodic sampling frequency which can be particularly useful for
> system-wide analysis (as opposed to command stream synchronised
> MI_REPORT_PERF_COUNT commands) is perhaps the most surprising state to
> have become per-context saved and restored (while the OABUFFER
> destination is still a shared, system-wide resource).
> 
> This support for gen8+ takes care to consider a number of timing
> challenges involved in synchronously updating per-context state
> primarily by programming all config state from the cpu and updating all
> current and saved contexts synchronously while the OA unit is still
> disabled.
> 
> The driver intentionally avoids depending on command streamer
> programming to update OA state considering the lack of synchronization
> between the automatic loading of OACTXCONTROL state (that includes the
> periodic sampling state and enable state) on context restore and the
> parsing of any general purpose BB the driver can control. I.e. this
> implementation is careful to avoid the possibility of a context restore
> temporarily enabling any out-of-date periodic sampling state. In
> addition to the risk of transiently-out-of-date state being loaded
> automatically; there are also internal HW latencies involved in the
> loading of MUX configurations which would be difficult to account for
> from the command streamer (and we only want to enable the unit once
> the MUX configuration is complete).
> 
> Since the Gen8+ OA unit design no longer supports clock gating the unit
> off for a single given context (which effectively stopped any progress
> of counters while any other context was running) and instead supports
> tagging OA reports with a context ID for filtering on the CPU, it means
> we can no longer hide the system-wide progress of counters from a
> non-privileged application only interested in metrics for its own
> context. Although we could theoretically try and subtract the progress
> of other contexts before forwarding reports via read() we aren't in a
> position to filter reports captured via MI_REPORT_PERF_COUNT commands.
> As a result, for Gen8+, we always require the
> dev.i915.perf_stream_paranoid to be unset for any access to OA metrics
> if not root.
> 
> Signed-off-by: Robert Bragg <rob...@sixbynine.org>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  45 +-
>  drivers/gpu/drm/i915/i915_gem_context.h |   1 +
>  drivers/gpu/drm/i915/i915_perf.c        | 938 
> +++++++++++++++++++++++++++++---
>  drivers/gpu/drm/i915/i915_reg.h         |  22 +
>  drivers/gpu/drm/i915/intel_lrc.c        |   5 +
>  include/uapi/drm/i915_drm.h             |  19 +-
>  6 files changed, 937 insertions(+), 93 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 9c37b73ac7ac..3a22b6fd0ee6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2067,9 +2067,17 @@ struct i915_oa_ops {
>       void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
>  
>       /**
> -      * @enable_metric_set: Applies any MUX configuration to set up the
> -      * Boolean and Custom (B/C) counters that are part of the counter
> -      * reports being sampled. May apply system constraints such as
> +      * @select_metric_set: The auto generated code that checks whether a
> +      * requested OA config is applicable to the system and if so sets up
> +      * the mux, oa and flex eu register config pointers according to the
> +      * current dev_priv->perf.oa.metrics_set.
> +      */
> +     int (*select_metric_set)(struct drm_i915_private *dev_priv);
> +
> +     /**
> +      * @enable_metric_set: Selects and applies any MUX configuration to set
> +      * up the Boolean and Custom (B/C) counters that are part of the
> +      * counter reports being sampled. May apply system constraints such as
>        * disabling EU clock gating as required.
>        */
>       int (*enable_metric_set)(struct drm_i915_private *dev_priv);
> @@ -2100,20 +2108,13 @@ struct i915_oa_ops {
>                   size_t *offset);
>  
>       /**
> -      * @oa_buffer_check: Check for OA buffer data + update tail
> -      *
> -      * This is either called via fops or the poll check hrtimer (atomic
> -      * ctx) without any locks taken.
> +      * @oa_hw_tail_read: read the OA tail pointer register
>        *
> -      * It's safe to read OA config state here unlocked, assuming that this
> -      * is only called while the stream is enabled, while the global OA
> -      * configuration can't be modified.
> -      *
> -      * Efficiency is more important than avoiding some false positives
> -      * here, which will be handled gracefully - likely resulting in an
> -      * %EAGAIN error for userspace.
> +      * In particular this enables us to share all the fiddly code for
> +      * handling the OA unit tail pointer race that affects multiple
> +      * generations.
>        */
> -     bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
> +     u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
>  };
>  
>  struct intel_cdclk_state {
> @@ -2475,6 +2476,7 @@ struct drm_i915_private {
>                       struct {
>                               struct i915_vma *vma;
>                               u8 *vaddr;
> +                             u32 last_ctx_id;
>                               int format;
>                               int format_size;
>  
> @@ -2544,6 +2546,14 @@ struct drm_i915_private {
>                       } oa_buffer;
>  
>                       u32 gen7_latched_oastatus1;
> +                     u32 ctx_oactxctrl_offset;
> +                     u32 ctx_flexeu0_offset;
> +
> +                     /* The RPT_ID/reason field for Gen8+ includes a bit
> +                      * to determine if the CTX ID in the report is valid
> +                      * but the specific bit differs between Gen 8 and 9
> +                      */
> +                     u32 gen8_valid_ctx_bit;
>  
>                       struct i915_oa_ops ops;
>                       const struct i915_oa_format *oa_formats;
> @@ -2854,6 +2864,8 @@ intel_info(const struct drm_i915_private *dev_priv)
>  #define IS_KBL_ULX(dev_priv) (INTEL_DEVID(dev_priv) == 0x590E || \
>                                INTEL_DEVID(dev_priv) == 0x5915 || \
>                                INTEL_DEVID(dev_priv) == 0x591E)
> +#define IS_SKL_GT2(dev_priv) (IS_SKYLAKE(dev_priv) && \
> +                              (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0010)
>  #define IS_SKL_GT3(dev_priv) (IS_SKYLAKE(dev_priv) && \
>                                (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
>  #define IS_SKL_GT4(dev_priv) (IS_SKYLAKE(dev_priv) && \
> @@ -3605,6 +3617,9 @@ i915_gem_context_lookup_timeline(struct 
> i915_gem_context *ctx,
>  
>  int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>                        struct drm_file *file);
> +void i915_oa_update_reg_state(struct intel_engine_cs *engine,
> +                           struct i915_gem_context *ctx,
> +                           uint32_t *reg_state);
>  
>  /* i915_gem_evict.c */
>  int __must_check i915_gem_evict_something(struct i915_address_space *vm,
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
> b/drivers/gpu/drm/i915/i915_gem_context.h
> index 4af2ab94558b..3f4ce73bea43 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -151,6 +151,7 @@ struct i915_gem_context {
>               u64 lrc_desc;
>               int pin_count;
>               bool initialised;
> +             atomic_t oa_state_dirty;
>       } engine[I915_NUM_ENGINES];
>  
>       /** ring_size: size for allocating the per-engine ring buffer */
> diff --git a/drivers/gpu/drm/i915/i915_perf.c 
> b/drivers/gpu/drm/i915/i915_perf.c
> index 3277a52ce98e..98eb6415b63a 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -196,6 +196,12 @@
>  
>  #include "i915_drv.h"
>  #include "i915_oa_hsw.h"
> +#include "i915_oa_bdw.h"
> +#include "i915_oa_chv.h"
> +#include "i915_oa_sklgt2.h"
> +#include "i915_oa_sklgt3.h"
> +#include "i915_oa_sklgt4.h"
> +#include "i915_oa_bxt.h"
>  
>  /* HW requires this to be a power of two, between 128k and 16M, though driver
>   * is currently generally designed assuming the largest 16M size is used such
> @@ -215,7 +221,7 @@
>   *
>   * Although this can be observed explicitly while copying reports to 
> userspace
>   * by checking for a zeroed report-id field in tail reports, we want to 
> account
> - * for this earlier, as part of the _oa_buffer_check to avoid lots of 
> redundant
> + * for this earlier, as part of the oa_buffer_check to avoid lots of 
> redundant
>   * read() attempts.
>   *
>   * In effect we define a tail pointer for reading that lags the real tail
> @@ -237,7 +243,7 @@
>   * indicates that an updated tail pointer is needed.
>   *
>   * Most of the implementation details for this workaround are in
> - * gen7_oa_buffer_check_unlocked() and gen7_appand_oa_reports()
> + * oa_buffer_check_unlocked() and _append_oa_reports()
>   *
>   * Note for posterity: previously the driver used to define an effective tail
>   * pointer that lagged the real pointer by a 'tail margin' measured in bytes
> @@ -272,6 +278,13 @@ static u32 i915_perf_stream_paranoid = true;
>  
>  #define INVALID_CTX_ID 0xffffffff
>  
> +/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
> +#define OAREPORT_REASON_MASK           0x3f
> +#define OAREPORT_REASON_SHIFT          19
> +#define OAREPORT_REASON_TIMER          (1<<0)
> +#define OAREPORT_REASON_CTX_SWITCH     (1<<3)
> +#define OAREPORT_REASON_CLK_RATIO      (1<<5)
> +
>  
>  /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
>   *
> @@ -303,6 +316,13 @@ static struct i915_oa_format 
> hsw_oa_formats[I915_OA_FORMAT_MAX] = {
>       [I915_OA_FORMAT_C4_B8]      = { 7, 64 },
>  };
>  
> +static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
> +     [I915_OA_FORMAT_A12]                = { 0, 64 },
> +     [I915_OA_FORMAT_A12_B8_C8]          = { 2, 128 },
> +     [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
> +     [I915_OA_FORMAT_C4_B8]              = { 7, 64 },
> +};
> +
>  #define SAMPLE_OA_REPORT      (1<<0)
>  
>  /**
> @@ -332,8 +352,20 @@ struct perf_open_properties {
>       int oa_period_exponent;
>  };
>  
> +static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
> +{
> +     return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
> +}
> +
> +static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
> +{
> +     u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
> +
> +     return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> +}
> +
>  /**
> - * gen7_oa_buffer_check_unlocked - check for data and update tail ptr state
> + * oa_buffer_check_unlocked - check for data and update tail ptr state
>   * @dev_priv: i915 device instance
>   *
>   * This is either called via fops (for blocking reads in user ctx) or the 
> poll
> @@ -356,12 +388,11 @@ struct perf_open_properties {
>   *
>   * Returns: %true if the OA buffer contains data, else %false
>   */
> -static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
> +static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
>  {
>       int report_size = dev_priv->perf.oa.oa_buffer.format_size;
>       unsigned long flags;
>       unsigned int aged_idx;
> -     u32 oastatus1;
>       u32 head, hw_tail, aged_tail, aging_tail;
>       u64 now;
>  
> @@ -381,8 +412,7 @@ static bool gen7_oa_buffer_check_unlocked(struct 
> drm_i915_private *dev_priv)
>       aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
>       aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;
>  
> -     oastatus1 = I915_READ(GEN7_OASTATUS1);
> -     hw_tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
> +     hw_tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);
>  
>       /* The tail pointer increases in 64 byte increments,
>        * not in report_size steps...
> @@ -404,6 +434,7 @@ static bool gen7_oa_buffer_check_unlocked(struct 
> drm_i915_private *dev_priv)
>       if (aging_tail != INVALID_TAIL_PTR &&
>           ((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
>            OA_TAIL_MARGIN_NSEC)) {
> +
>               aged_idx ^= 1;
>               dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
>  
> @@ -553,6 +584,284 @@ static int append_oa_sample(struct i915_perf_stream 
> *stream,
>   *
>   * Returns: 0 on success, negative error code on failure.
>   */
> +static int gen8_append_oa_reports(struct i915_perf_stream *stream,
> +                               char __user *buf,
> +                               size_t count,
> +                               size_t *offset)
> +{
> +     struct drm_i915_private *dev_priv = stream->dev_priv;
> +     int report_size = dev_priv->perf.oa.oa_buffer.format_size;
> +     u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
> +     u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
> +     u32 mask = (OA_BUFFER_SIZE - 1);
> +     size_t start_offset = *offset;
> +     unsigned long flags;
> +     unsigned int aged_tail_idx;
> +     u32 head, tail;
> +     u32 taken;
> +     int ret = 0;
> +
> +     if (WARN_ON(!stream->enabled))
> +             return -EIO;
> +
> +     spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
> +
> +     head = dev_priv->perf.oa.oa_buffer.head;
> +     aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
> +     tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
> +
> +     spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
> +
> +     /* An invalid tail pointer here means we're still waiting for the poll
> +      * hrtimer callback to give us a pointer
> +      */
> +     if (tail == INVALID_TAIL_PTR)
> +             return -EAGAIN;
> +
> +     /* NB: oa_buffer.head/tail include the gtt_offset which we don't want
> +      * while indexing relative to oa_buf_base.
> +      */
> +     head -= gtt_offset;
> +     tail -= gtt_offset;
> +
> +     /* An out of bounds or misaligned head or tail pointer implies a driver
> +      * bug since we validate + align the tail pointers we read from the
> +      * hardware and we are in full control of the head pointer which should
> +      * only be incremented by multiples of the report size (notably also
> +      * all a power of two).
> +      */
> +     if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
> +                   tail > OA_BUFFER_SIZE || tail % report_size,
> +                   "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
> +                   head, tail))
> +             return -EIO;
> +
> +
> +     for (/* none */;
> +          (taken = OA_TAKEN(tail, head));
> +          head = (head + report_size) & mask) {
> +             u8 *report = oa_buf_base + head;
> +             u32 *report32 = (void *)report;
> +             u32 ctx_id;
> +             u32 reason;
> +
> +             /* All the report sizes factor neatly into the buffer
> +              * size so we never expect to see a report split
> +              * between the beginning and end of the buffer.
> +              *
> +              * Given the initial alignment check a misalignment
> +              * here would imply a driver bug that would result
> +              * in an overrun.
> +              */
> +             if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
> +                     DRM_ERROR("Spurious OA head ptr: non-integral report 
> offset\n");
> +                     break;
> +             }
> +
> +             /* The reason field includes flags identifying what
> +              * triggered this specific report (mostly timer
> +              * triggered or e.g. due to a context switch).
> +              *
> +              * This field is never expected to be zero so we can
> +              * check that the report isn't invalid before copying
> +              * it to userspace...
> +              */
> +             reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
> +                       OAREPORT_REASON_MASK);
> +             if (reason == 0) {
> +                     if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
> +                             DRM_NOTE("Skipping spurious, invalid OA 
> report\n");
> +                     continue;
> +             }
> +
> +             /* XXX: Just keep the lower 21 bits for now since I'm not
> +              * entirely sure if the HW touches any of the higher bits in
> +              * this field
> +              */
> +             ctx_id = report32[2] & 0x1fffff;
> +
> +             /* Squash whatever is in the CTX_ID field if it's
> +              * marked as invalid to be sure we avoid
> +              * false-positive, single-context filtering below...
> +              */
> +             if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
> +                     ctx_id = report32[2] = INVALID_CTX_ID;
> +
> +             /* NB: For Gen 8 the OA unit no longer supports clock gating
> +              * off for a specific context and the kernel can't securely
> +              * stop the counters from updating as system-wide / global
> +              * values.
> +              *
> +              * Automatic reports now include a context ID so reports can be
> +              * filtered on the cpu but it's not worth trying to
> +              * automatically subtract/hide counter progress for other
> +              * contexts while filtering since we can't stop userspace
> +              * issuing MI_REPORT_PERF_COUNT commands which would still
> +              * provide a side-band view of the real values.
> +              *
> +              * To allow userspace (such as Mesa/GL_INTEL_performance_query)
> +              * to normalize counters for a single filtered context then it
> +              * needs be forwarded bookend context-switch reports so that it
> +              * can track switches in between MI_REPORT_PERF_COUNT commands
> +              * and can itself subtract/ignore the progress of counters
> +              * associated with other contexts. Note that the hardware
> +              * automatically triggers reports when switching to a new
> +              * context which are tagged with the ID of the newly active
> +              * context. To avoid the complexity (and likely fragility) of
> +              * reading ahead while parsing reports to try and minimize
> +              * forwarding redundant context switch reports (i.e. between
> +              * other, unrelated contexts) we simply elect to forward them
> +              * all.
> +              *
> +              * We don't rely solely on the reason field to identify context
> +              * switches since it's not-uncommon for periodic samples to
> +              * identify a switch before any 'context switch' report.
> +              */
> +             if (!dev_priv->perf.oa.exclusive_stream->ctx ||
> +                 dev_priv->perf.oa.specific_ctx_id == ctx_id ||
> +                 (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
> +                  dev_priv->perf.oa.specific_ctx_id) ||
> +                 reason & OAREPORT_REASON_CTX_SWITCH) {
> +
> +                     /* While filtering for a single context we avoid
> +                      * leaking the IDs of other contexts.
> +                      */
> +                     if (dev_priv->perf.oa.exclusive_stream->ctx &&
> +                         dev_priv->perf.oa.specific_ctx_id != ctx_id) {
> +                             report32[2] = INVALID_CTX_ID;
> +                     }
> +
> +                     ret = append_oa_sample(stream, buf, count, offset,
> +                                            report);
> +                     if (ret)
> +                             break;
> +
> +                     dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
> +             }
> +
> +             /* The above reason field sanity check is based on
> +              * the assumption that the OA buffer is initially
> +              * zeroed and we reset the field after copying so the
> +              * check is still meaningful once old reports start
> +              * being overwritten.
> +              */
> +             report32[0] = 0;
> +     }
> +
> +     if (start_offset != *offset) {
> +             spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
> +
> +             /* We removed the gtt_offset for the copy loop above, indexing
> +              * relative to oa_buf_base so put back here...
> +              */
> +             head += gtt_offset;
> +
> +             I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
> +             dev_priv->perf.oa.oa_buffer.head = head;
> +
> +             spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, 
> flags);
> +     }
> +
> +     return ret;
> +}
> +
> +/**
> + * gen8_oa_read - copy status records then buffered OA reports
> + * @stream: An i915-perf stream opened for OA metrics
> + * @buf: destination buffer given by userspace
> + * @count: the number of bytes userspace wants to read
> + * @offset: (inout): the current position for writing into @buf
> + *
> + * Checks OA unit status registers and if necessary appends corresponding
> + * status records for userspace (such as for a buffer full condition) and 
> then
> + * initiate appending any buffered OA reports.
> + *
> + * Updates @offset according to the number of bytes successfully copied into
> + * the userspace buffer.
> + *
> + * NB: some data may be successfully copied to the userspace buffer
> + * even if an error is returned, and this is reflected in the
> + * updated @offset.
> + *
> + * Returns: zero on success or a negative error code
> + */
> +static int gen8_oa_read(struct i915_perf_stream *stream,
> +                     char __user *buf,
> +                     size_t count,
> +                     size_t *offset)
> +{
> +     struct drm_i915_private *dev_priv = stream->dev_priv;
> +     u32 oastatus;
> +     int ret;
> +
> +     if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
> +             return -EIO;
> +
> +     oastatus = I915_READ(GEN8_OASTATUS);
> +
> +     /* We treat OABUFFER_OVERFLOW as a significant error:
> +      *
> +      * Although theoretically we could handle this more gracefully
> +      * sometimes, some Gens don't correctly suppress certain
> +      * automatically triggered reports in this condition and so we
> +      * have to assume that old reports are now being trampled
> +      * over.
> +      *
> +      * Considering how we don't currently give userspace control
> +      * over the OA buffer size and always configure a large 16MB
> +      * buffer, then a buffer overflow does anyway likely indicate
> +      * that something has gone quite badly wrong.
> +      */
> +     if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
> +             ret = append_oa_status(stream, buf, count, offset,
> +                                    DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
> +             if (ret)
> +                     return ret;
> +
> +             DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
> +                       dev_priv->perf.oa.period_exponent);
> +
> +             dev_priv->perf.oa.ops.oa_disable(dev_priv);
> +             dev_priv->perf.oa.ops.oa_enable(dev_priv);
> +
> +             /* Note: .oa_enable() is expected to re-init the oabuffer
> +              * and reset GEN8_OASTATUS for us
> +              */
> +             oastatus = I915_READ(GEN8_OASTATUS);
> +     }
> +
> +     if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
> +             ret = append_oa_status(stream, buf, count, offset,
> +                                    DRM_I915_PERF_RECORD_OA_REPORT_LOST);
If ret != 0, shouldn't we bail out here rather than fall through and clear the status bit?

> +             if (ret == 0) {
> +                     I915_WRITE(GEN8_OASTATUS,
> +                                oastatus & ~GEN8_OASTATUS_REPORT_LOST);
> +             }
> +     }
> +
> +     return gen8_append_oa_reports(stream, buf, count, offset);
> +}

<SNIP>

> +/*
> + * From Broadwell PRM, 3D-Media-GPGPU -> Register State Context
> + *
> + * MMIO reads or writes to any of the registers listed in the
> + * “Register State Context image” subsections through HOST/IA
> + * MMIO interface for debug purposes must follow the steps below:
> + *
> + * - SW should set the Force Wakeup bit to prevent GT from entering C6.
> + * - Write 0x2050[31:0] = 0x00010001 (disable sequence).
> + * - Disable IDLE messaging in CS (Write 0x2050[31:0] = 0x00010001).
> + * - BDW:  Poll/Wait for register bits of 0x22AC[6:0] turn to 0x30 value.
> + * - SKL+: Poll/Wait for register bits of 0x22A4[6:0] turn to 0x30 value.
> + * - Read/Write to desired MMIO registers.
> + * - Enable IDLE messaging in CS (Write 0x2050[31:0] = 0x00010000).
> + * - Force Wakeup bit should be reset to enable C6 entry.
> + *
> + * XXX: don't nest or overlap calls to this API, it has no ref
> + * counting to track how many entities require the RCS to be
> + * blocked from being idle.
> + */
> +static int gen8_begin_ctx_mmio(struct drm_i915_private *dev_priv)
> +{
> +     i915_reg_t fsm_reg = dev_priv->info.gen > 8 ?
INTEL_GEN?

> +             GEN9_RCS_FE_FSM2 : GEN6_RCS_PWR_FSM;
> +     int ret = 0;
> +
> +     /* There's only no active context while idle in execlist mode
> +      * (though we shouldn't be using this in any other case)
> +      */
So there may not be an active context? What happens if we do the mmio
and there is no active context? 

> +     if (WARN_ON(!i915.enable_execlists))
> +             return -EINVAL;
> +
> +     intel_uncore_forcewake_get(dev_priv, FORCEWAKE_RENDER);
> +
> +     I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
> +                _MASKED_BIT_ENABLE(GEN6_PSMI_SLEEP_MSG_DISABLE));
> +
> +     /* Note: we don't currently have a good handle on the maximum
> +      * latency for this wake up so while we only need to hold rcs
> +      * busy from process context we at least keep the waiting
> +      * interruptible...
> +      */
> +     ret = wait_for((I915_READ(fsm_reg) & 0x3f) == 0x30, 50);
intel_wait_for_register(dev_priv, fsm_reg, 0x3f, 0x30, 50), and maybe a
DRM_ERROR timeout message?

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to