Re: [Intel-gfx] [PATCH 2/2] i915/guc: Add Kabylake GuC Loading

2016-07-01 Thread Michel Thierry

On 6/30/2016 5:37 PM, Rodrigo Vivi wrote:

From: Peter Antoine 

This patch adds the loading of the GuC for Kabylake.
It loads a 9.14 firmware.

Hello, in case you need a fresh r-b for v3:



v2: Fix commit message
v3: Fix major/minor var names to match -nightly. (Rodrigo)

Cc: Christophe Prigent 
Signed-off-by: Peter Antoine 
Signed-off-by: Michel Thierry 
Reviewed-by: Rodrigo Vivi 
Signed-off-by: Rodrigo Vivi 


Reviewed-by: Michel Thierry  (v3)


---
 drivers/gpu/drm/i915/intel_guc_loader.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c 
b/drivers/gpu/drm/i915/intel_guc_loader.c
index 4f6311a..d80b617 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -65,6 +65,9 @@ MODULE_FIRMWARE(I915_SKL_GUC_UCODE);
 #define I915_BXT_GUC_UCODE "i915/bxt_guc_ver8_7.bin"
 MODULE_FIRMWARE(I915_BXT_GUC_UCODE);

+#define I915_KBL_GUC_UCODE "i915/kbl_guc_ver9_14.bin"
+MODULE_FIRMWARE(I915_KBL_GUC_UCODE);
+
 /* User-friendly representation of an enum */
 const char *intel_guc_fw_status_repr(enum intel_guc_fw_status status)
 {
@@ -698,6 +701,10 @@ void intel_guc_init(struct drm_device *dev)
fw_path = I915_BXT_GUC_UCODE;
guc_fw->guc_fw_major_wanted = 8;
guc_fw->guc_fw_minor_wanted = 7;
+   } else if (IS_KABYLAKE(dev)) {
+   fw_path = I915_KBL_GUC_UCODE;
+   guc_fw->guc_fw_major_wanted = 9;
+   guc_fw->guc_fw_minor_wanted = 14;
} else {
fw_path = ""; /* unknown device */
}


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Fix compilation (panel orientation x enum plane rename).

2017-12-04 Thread Michel Thierry

On 12/4/2017 4:04 PM, Rodrigo Vivi wrote:

When commit '82daca297506 ("drm/i915: Add "panel orientation"
property to the panel connector, v6.")' was done and tested
by CI, commit 'ed15030d7ab0 ("drm/i915: s/enum plane/enum
i9xx_plane_id/")' wasn't there yet.

Because of this race, the second patch got merged first, so the first one
broke i915 compilation. Thanks to Michel this was found quickly.

Cc: Michel Thierry 
Cc: Daniel Vetter 
Cc: Hans de Goede 
Suggested-by: Michel Thierry 
Fixes: 82daca297506 ("drm/i915: Add "panel orientation" property to the panel 
connector, v6.")
Signed-off-by: Rodrigo Vivi 
---
  drivers/gpu/drm/i915/intel_dsi.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_dsi.c b/drivers/gpu/drm/i915/intel_dsi.c
index 1b60df3c14a0..f67d321376e4 100644
--- a/drivers/gpu/drm/i915/intel_dsi.c
+++ b/drivers/gpu/drm/i915/intel_dsi.c
@@ -1670,7 +1670,7 @@ static int intel_dsi_get_panel_orientation(struct 
intel_connector *connector)
  {
struct drm_i915_private *dev_priv = to_i915(connector->base.dev);
int orientation = DRM_MODE_PANEL_ORIENTATION_NORMAL;
-   enum plane plane;
+   enum i9xx_plane_id plane;
u32 val;
  
  	if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) {




Reviewed-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/execlists: Cache ELSP register offset

2017-12-07 Thread Michel Thierry

On 07/12/17 12:45, Chris Wilson wrote:

Currently on every submission, we recalculate the ELSP register offset
for the engine, after chasing the pointers to find the iomem base. Since
this is fixed for the lifetime of the driver record the offset in the
execlists struct.

In practice the difference is negligible, it just happens to remove 27
bytes of eyesore pointer dancing from next to the hottest instruction
(which is itself due to stalling for a cache miss) in perf profiles of
the execlists_submission_tasklet().

Signed-off-by: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc: Mika Kuoppala 
---
  drivers/gpu/drm/i915/intel_lrc.c| 12 ++--
  drivers/gpu/drm/i915/intel_ringbuffer.h |  5 +
  2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2a8160f603ab..93b5ce6307af 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -431,8 +431,7 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
  static void execlists_submit_ports(struct intel_engine_cs *engine)
  {
 struct execlist_port *port = engine->execlists.port;
-   u32 __iomem *elsp =
-   engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+   u32 __iomem *elsp = engine->execlists.elsp;
 unsigned int n;

 for (n = execlists_num_ports(&engine->execlists); n--; ) {


Since you're moving this, probably it can now use engine->execlists.elsp 
directly (as inject_preempt_context does), i.e.:


---
@@ -431,7 +431,6 @@ static inline void elsp_write(u64 desc, u32 __iomem 
*elsp)

 static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
struct execlist_port *port = engine->execlists.port;
-   u32 __iomem *elsp = engine->execlists.elsp;
unsigned int n;

for (n = execlists_num_ports(&engine->execlists); n--; ) {
@@ -457,7 +456,7 @@ static void execlists_submit_ports(struct 
intel_engine_cs *engine)

desc = 0;
}

-   elsp_write(desc, elsp);
+   elsp_write(desc, engine->execlists.elsp);
}
execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
 }
---

Anyway,

Reviewed-by: Michel Thierry 


@@ -496,8 +495,6 @@ static void inject_preempt_context(struct intel_engine_cs 
*engine)
  {
 struct intel_context *ce =
 &engine->i915->preempt_context->engine[engine->id];
-   u32 __iomem *elsp =
-   engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 unsigned int n;

 GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID);
@@ -510,9 +507,9 @@ static void inject_preempt_context(struct intel_engine_cs 
*engine)

 GEM_TRACE("\n");
 for (n = execlists_num_ports(&engine->execlists); --n; )
-   elsp_write(0, elsp);
+   elsp_write(0, engine->execlists.elsp);

-   elsp_write(ce->lrc_desc, elsp);
+   elsp_write(ce->lrc_desc, engine->execlists.elsp);
 execlists_clear_active(&engine->execlists, EXECLISTS_ACTIVE_HWACK);
  }

@@ -1509,6 +1506,9 @@ static int gen8_init_common_ring(struct intel_engine_cs 
*engine)
 execlists->csb_head = -1;
 execlists->active = 0;

+   execlists->elsp =
+   dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+
 /* After a GPU reset, we may have requests to replay */
 if (execlists->first)
 tasklet_schedule(&execlists->tasklet);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c68ab3ead83c..183165b9b3fb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -199,6 +199,11 @@ struct intel_engine_execlists {
  */
 bool no_priolist;

+   /**
+* @elsp: the ExecList Submission Port register
+*/
+   u32 __iomem *elsp;
+
 /**
  * @port: execlist port states
  *
--
2.15.1



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH i-g-t] tests/gem_reset_stats: Fix retrieval of hangcheck stats expectation

2017-12-08 Thread Michel Thierry

On 08/12/17 09:07, Antonio Argenziano wrote:



On 08/12/17 08:46, Chris Wilson wrote:

Quoting Antonio Argenziano (2017-12-08 16:27:17)

The test expected IOCTL 'I915_GET_RESET_STATS' would return an error
when not root. That is no longer true in the driver and therefore

I would add the commit that changed the behaviour,

...when not root. This is no longer true in the driver since commit 
4c9c0d09741d ("drm/i915: Fix retrieval of hangcheck stats") and therefore...



the test was incorrectly failing.

Cc: Michel Thierry 
Cc: Arkadiusz Hiler 
Signed-off-by: Antonio Argenziano 
---
   tests/gem_reset_stats.c | 22 +++---
   1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/tests/gem_reset_stats.c b/tests/gem_reset_stats.c
index edc40767..83c91f0f 100644
--- a/tests/gem_reset_stats.c
+++ b/tests/gem_reset_stats.c
@@ -605,10 +605,7 @@ static void test_reset_count(const struct 
intel_execution_engine *e,

  c2 = get_reset_count(fd, ctx);

-   if (ctx == 0)
-   igt_assert(c2 == -EPERM);
-   else
-   igt_assert(c2 == 0);
+   igt_assert(c2 == 0);
  }

  igt_waitchildren();
@@ -619,6 +616,11 @@ static void test_reset_count(const struct 
intel_execution_engine *e,
  close(fd);
   }

+static int __get_reset_stats(int fd, struct local_drm_i915_reset_stats *rs)
+{
+   return drmIoctl(fd, GET_RESET_STATS_IOCTL, &rs);
+}
+
   static int _test_params(int fd, int ctx, uint32_t flags, uint32_t pad)
   {
  struct local_drm_i915_reset_stats rs;
@@ -644,10 +646,16 @@ static void _check_param_ctx(const int fd, const int ctx, 
const cap_t cap)
  const uint32_t bad = rand() + 1;

  if (ctx == 0) {
-   if (cap == root)
  igt_assert_eq(_test_params(fd, ctx, 0, 0), 0);


Spurious indenting leftover.


-   else
-   igt_assert_eq(_test_params(fd, ctx, 0, 0), -EPERM);
+   if (cap != root) {


So what are you expecting to happen if you do happen to be root? Is this
test redundant, which is why you skipped it?


Yes, I think it is redundant because the only expectation for root is
for the IOCTL to be successful as it is for non root users (that is why
I left the first assert to be run unconditionally), and, even if root is
supposed to get the correct reset_count value, unless I am missing
something, that test is not in the scope of this subtest.

-Antonio


-Chris


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 5/8] drm/i915/guc: Extract doorbell creation from client allocation

2017-12-13 Thread Michel Thierry

On 13/12/17 04:50, Winiarski, Michal wrote:

Full GPU reset causes GuC to be reset. This means that every time we're
doing a reset, we need to talk to GuC and tell it about doorbells.
Let's separate the communication part (create_doorbell) from our
internal bookkeeping (reserve_doorbell) so that we can cleanly separate
the initialization done at module load from reinitialization done at
reset in the following patch.
While I'm here, let's also add a proper (although slightly asymmetric)
cleanup that doesn't try to communicate with GuC after it's already
gone, getting rid of "expected" warnings caused by GuC action failures
on module unload.

Note that I've also removed one of the tests (bitmap out of sync), since
it doesn't make much sense anymore - bitmaps are now not expected to
change during the lifetime of a client.

Signed-off-by: Michał Winiarski 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Michal Wajdeczko 
Cc: Michel Thierry 
---
  drivers/gpu/drm/i915/intel_guc_submission.c | 151 
  drivers/gpu/drm/i915/selftests/intel_guc.c  | 110 +---
  2 files changed, 88 insertions(+), 173 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c 
b/drivers/gpu/drm/i915/intel_guc_submission.c
index 8f4b274d66a7..c74e78b6ba41 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -88,7 +88,7 @@ static inline bool is_high_priority(struct intel_guc_client 
*client)
 client->priority == GUC_CLIENT_PRIORITY_HIGH);
  }

-static int __reserve_doorbell(struct intel_guc_client *client)
+static int reserve_doorbell(struct intel_guc_client *client)
  {
 unsigned long offset;
 unsigned long end;
@@ -120,7 +120,7 @@ static int __reserve_doorbell(struct intel_guc_client 
*client)
 return 0;
  }

-static void __unreserve_doorbell(struct intel_guc_client *client)
+static void unreserve_doorbell(struct intel_guc_client *client)
  {
 GEM_BUG_ON(client->doorbell_id == GUC_DOORBELL_INVALID);

@@ -188,32 +188,21 @@ static bool has_doorbell(struct intel_guc_client *client)
 return test_bit(client->doorbell_id, client->guc->doorbell_bitmap);
  }

-static int __create_doorbell(struct intel_guc_client *client)
+static void __create_doorbell(struct intel_guc_client *client)
  {
 struct guc_doorbell_info *doorbell;
-   int err;

 doorbell = __get_doorbell(client);
 doorbell->db_status = GUC_DOORBELL_ENABLED;
 doorbell->cookie = 0;
-
-   err = __guc_allocate_doorbell(client->guc, client->stage_id);
-   if (err) {
-   doorbell->db_status = GUC_DOORBELL_DISABLED;
-   DRM_ERROR("Couldn't create client %u doorbell: %d\n",
- client->stage_id, err);
-   }
-
-   return err;
  }


__create_doorbell isn't creating anything, and now it is just changing 
the db status, but that has nothing to do with this patch.


Reviewed-by: Michel Thierry 



-static int __destroy_doorbell(struct intel_guc_client *client)
+static void __destroy_doorbell(struct intel_guc_client *client)
  {
 struct drm_i915_private *dev_priv = guc_to_i915(client->guc);
 struct guc_doorbell_info *doorbell;
 u16 db_id = client->doorbell_id;

-   GEM_BUG_ON(db_id >= GUC_DOORBELL_INVALID);

 doorbell = __get_doorbell(client);
 doorbell->db_status = GUC_DOORBELL_DISABLED;
@@ -225,50 +214,42 @@ static int __destroy_doorbell(struct intel_guc_client 
*client)
  */
 if (wait_for_us(!(I915_READ(GEN8_DRBREGL(db_id)) & GEN8_DRB_VALID), 
10))
 WARN_ONCE(true, "Doorbell never became invalid after 
disable\n");
-
-   return __guc_deallocate_doorbell(client->guc, client->stage_id);
  }

  static int create_doorbell(struct intel_guc_client *client)
  {
 int ret;

-   ret = __reserve_doorbell(client);
-   if (ret)
-   return ret;
-
 __update_doorbell_desc(client, client->doorbell_id);
+   __create_doorbell(client);

-   ret = __create_doorbell(client);
-   if (ret)
-   goto err;
+   ret = __guc_allocate_doorbell(client->guc, client->stage_id);
+   if (ret) {
+   __destroy_doorbell(client);
+   __update_doorbell_desc(client, GUC_DOORBELL_INVALID);
+   DRM_ERROR("Couldn't create client %u doorbell: %d\n",
+ client->stage_id, ret);
+   return ret;
+   }

 return 0;
-
-err:
-   __update_doorbell_desc(client, GUC_DOORBELL_INVALID);
-   __unreserve_doorbell(client);
-   return ret;
  }

  static int destroy_doorbell(struct intel_guc_client *client)
  {
-   int err;
+   int ret;

 GEM_BUG_ON(!has_doorbell(client));

-   /* 

Re: [Intel-gfx] [PATCH 6/8] drm/i915/guc: Extract clients allocation to submission_init

2017-12-13 Thread Michel Thierry

On 13/12/17 04:50, Michał Winiarski wrote:

We can now move the clients allocation to submission_init path, rather
than keeping the condition inside submission_enable called on every
reset.

Signed-off-by: Michał Winiarski 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Michal Wajdeczko 
---


Reviewed-by: Michel Thierry 


  drivers/gpu/drm/i915/intel_guc_submission.c | 33 ++---
  1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c 
b/drivers/gpu/drm/i915/intel_guc_submission.c
index c74e78b6ba41..488110602e7e 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -1149,6 +1149,10 @@ int intel_guc_submission_init(struct intel_guc *guc)
 goto err_log;
 GEM_BUG_ON(!guc->ads_vma);

+   ret = guc_clients_create(guc);
+   if (ret)
+   return ret;
+
 for_each_engine(engine, dev_priv, id) {
 guc->preempt_work[id].engine = engine;
 INIT_WORK(&guc->preempt_work[id].work, inject_preempt_context);
@@ -1172,6 +1176,7 @@ void intel_guc_submission_fini(struct intel_guc *guc)
 for_each_engine(engine, dev_priv, id)
 cancel_work_sync(&guc->preempt_work[id].work);

+   guc_clients_destroy(guc);
 guc_ads_destroy(guc);
 intel_guc_log_destroy(guc);
 guc_stage_desc_pool_destroy(guc);
@@ -1277,28 +1282,18 @@ int intel_guc_submission_enable(struct intel_guc *guc)
  sizeof(struct guc_wq_item) *
  I915_NUM_ENGINES > GUC_WQ_SIZE);

-   /*
-* We're being called on both module initialization and on reset,
-* until this flow is changed, we're using regular client presence to
-* determine which case are we in, and whether we should allocate new
-* clients or just reset their workqueues.
-*/
-   if (!guc->execbuf_client) {
-   err = guc_clients_create(guc);
-   if (err)
-   return err;
-   } else {
-   guc_reset_wq(guc->execbuf_client);
-   guc_reset_wq(guc->preempt_client);
-   }
+   GEM_BUG_ON(!guc->execbuf_client);
+
+   guc_reset_wq(guc->execbuf_client);
+   guc_reset_wq(guc->preempt_client);

 err = intel_guc_sample_forcewake(guc);
 if (err)
-   goto err_free_clients;
+   return err;

 err = guc_clients_doorbell_init(guc);
 if (err)
-   goto err_free_clients;
+   return err;

 /* Take over from manual control of ELSP (execlists) */
 guc_interrupts_capture(dev_priv);
@@ -1315,10 +1310,6 @@ int intel_guc_submission_enable(struct intel_guc *guc)
 }

 return 0;
-
-err_free_clients:
-   guc_clients_destroy(guc);
-   return err;
  }

  void intel_guc_submission_disable(struct intel_guc *guc)
@@ -1332,8 +1323,6 @@ void intel_guc_submission_disable(struct intel_guc *guc)

 /* Revert back to manual ELSP submission */
 intel_engines_reset_default_submission(dev_priv);
-
-   guc_clients_destroy(guc);
  }

  #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
--
2.14.3

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 7/8] drm/i915/guc: Extract doorbell verification into a function

2017-12-13 Thread Michel Thierry

On 13/12/17 07:23, Michal Wajdeczko wrote:

On Wed, 13 Dec 2017 13:50:45 +0100, Michał Winiarski
 wrote:


We have the selftest that's checking doorbell create/destroy, so there's
no need to check all doorbells delaying the reset every time.
We do want to have that extra sanity check at module load/unload though.

Signed-off-by: Michał Winiarski 
Cc: Chris Wilson 
Cc: Joonas Lahtinen 
Cc: Michal Wajdeczko 
---


Reviewed-by: Michal Wajdeczko 



Reviewed-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Skip an engine reset if it recovered before our preparations

2017-12-15 Thread Michel Thierry

Hi,

On 12/15/2017 3:52 PM, Chris Wilson wrote:

At the beginning of a reset, we disable the submission method and find
the stuck request. We expect to find a stuck request for we have
declared the engine stalled. However, if we find no active request, the
engine must have recovered from its stall before we could issue a reset,
so let the engine continue on without a reset. If the engine is truly
stuck, we will be back soon enough with the next reset attempt.

Signed-off-by: Chris Wilson 
Cc: Michel Thierry 
Cc: Mika Kuoppala 
---
  drivers/gpu/drm/i915/i915_drv.c | 14 +++---
  1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ca9f4b2862eb..6f24435ddffe 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2011,19 +2011,19 @@ int i915_reset_engine(struct intel_engine_cs *engine, 
unsigned int flags)
  
  	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
  
-	if (!(flags & I915_RESET_QUIET)) {

-   dev_notice(engine->i915->drm.dev,
-  "Resetting %s after gpu hang\n", engine->name);
-   }
-   error->reset_engine_count[engine->id]++;
-
active_request = i915_gem_reset_prepare_engine(engine);
-   if (IS_ERR(active_request)) {
+   if (IS_ERR_OR_NULL(active_request)) {
DRM_DEBUG_DRIVER("Previous reset failed, promote to full 
reset\n");
ret = PTR_ERR(active_request);


Will a static checker complain about PTR_ERR(NULL)?
And the DRM_DEBUG_DRIVER message isn't correct in that case either.


goto out;
}
  
+	if (!(flags & I915_RESET_QUIET)) {

+   dev_notice(engine->i915->drm.dev,
+  "Resetting %s after gpu hang\n", engine->name);
+   }
+   error->reset_engine_count[engine->id]++;
+
if (!engine->i915->guc.execbuf_client)
ret = intel_gt_reset_engine(engine->i915, engine);
else


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Skip an engine reset if it recovered before our preparations

2017-12-15 Thread Michel Thierry

On 12/15/2017 4:16 PM, Chris Wilson wrote:

Quoting Michel Thierry (2017-12-16 00:02:47)

Hi,

On 12/15/2017 3:52 PM, Chris Wilson wrote:

At the beginning of a reset, we disable the submission method and find
the stuck request. We expect to find a stuck request for we have
declared the engine stalled. However, if we find no active request, the
engine must have recovered from its stall before we could issue a reset,
so let the engine continue on without a reset. If the engine is truly
stuck, we will be back soon enough with the next reset attempt.

Signed-off-by: Chris Wilson 
Cc: Michel Thierry 
Cc: Mika Kuoppala 
---
   drivers/gpu/drm/i915/i915_drv.c | 14 +++---
   1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ca9f4b2862eb..6f24435ddffe 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2011,19 +2011,19 @@ int i915_reset_engine(struct intel_engine_cs *engine, 
unsigned int flags)
   
   GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
   
- if (!(flags & I915_RESET_QUIET)) {

- dev_notice(engine->i915->drm.dev,
-"Resetting %s after gpu hang\n", engine->name);
- }
- error->reset_engine_count[engine->id]++;
-
   active_request = i915_gem_reset_prepare_engine(engine);
- if (IS_ERR(active_request)) {
+ if (IS_ERR_OR_NULL(active_request)) {
   DRM_DEBUG_DRIVER("Previous reset failed, promote to full 
reset\n");
   ret = PTR_ERR(active_request);


Will a static checker complain about PTR_ERR(NULL)?


It shouldn't. PTR_ERR(NULL) -> 0 is one of the valid tricks of PTR_ERR.


And the DRM_DEBUG_DRIVER message isn't correct in that case either.


Bah, I was betting on those who read this would know that the full chip
reset was pardoned. If you want, we can just remove the debug.


Yes, the problem is sometimes we only get logs without knowing the code. 
I would vote to either remove it or change it to just say 'reset skipped'.


-Michel
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2] drm/i915: Skip an engine reset if it recovered before our preparations

2017-12-15 Thread Michel Thierry

On 12/15/2017 4:22 PM, Chris Wilson wrote:

At the beginning of a reset, we disable the submission method and find
the stuck request. We expect to find a stuck request for we have
declared the engine stalled. However, if we find no active request, the
engine must have recovered from its stall before we could issue a reset,
so let the engine continue on without a reset. If the engine is truly
stuck, we will be back soon enough with the next reset attempt.

v2: Remove the stale debug message.

Signed-off-by: Chris Wilson 
Cc: Michel Thierry 
Cc: Mika Kuoppala 
---


Reviewed-by: Michel Thierry 


  drivers/gpu/drm/i915/i915_drv.c | 14 +++---
  1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ca9f4b2862eb..6d39fdf2b604 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2011,19 +2011,19 @@ int i915_reset_engine(struct intel_engine_cs *engine, 
unsigned int flags)
  
  	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
  
+	active_request = i915_gem_reset_prepare_engine(engine);

+   if (IS_ERR_OR_NULL(active_request)) {
+   /* Either the previous reset failed, or we pardon the reset. */
+   ret = PTR_ERR(active_request);
+   goto out;
+   }
+
if (!(flags & I915_RESET_QUIET)) {
dev_notice(engine->i915->drm.dev,
   "Resetting %s after gpu hang\n", engine->name);
}
error->reset_engine_count[engine->id]++;
  
-	active_request = i915_gem_reset_prepare_engine(engine);

-   if (IS_ERR(active_request)) {
-   DRM_DEBUG_DRIVER("Previous reset failed, promote to full 
reset\n");
-   ret = PTR_ERR(active_request);
-   goto out;
-   }
-
if (!engine->i915->guc.execbuf_client)
ret = intel_gt_reset_engine(engine->i915, engine);
else


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Restore the kernel context after a GPU reset on an idle engine

2017-12-15 Thread Michel Thierry

On 12/15/2017 4:03 PM, Chris Wilson wrote:

Part of the system requirement for powersaving is that we always have
a context loaded. Upon boot and resume, we load the kernel_context to
ensure that some valid state is set before powersaving kicks in, we
should do so after a full GPU reset as well. We only need to do so for
an idle engine, as any active engines will restart by executing the
stuck request, loading its context, for the idle engine we create a
new request to load the kernel_context instead.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Joonas Lahtinen 
---
  drivers/gpu/drm/i915/i915_gem.c | 9 +
  1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4a7f5579a7a5..189725a8fed6 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3119,6 +3119,15 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
 ctx = fetch_and_zero(&engine->last_retired_context);
 if (ctx)
 engine->context_unpin(engine, ctx);
+
+   if (list_empty(&engine->timeline->requests)) {
+   struct drm_i915_gem_request *rq;
+
+   rq = i915_gem_request_alloc(engine,
+   dev_priv->kernel_context);
+   if (!IS_ERR(rq))
+   __i915_add_request(rq, false);
+   }
 }

 i915_gem_restore_fences(dev_priv);


It shouldn't hurt and if it fixes something,

Reviewed-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 3/3] drm/i915/selftests: Fix up igt_reset_engine

2017-12-18 Thread Michel Thierry

On 17/12/17 05:28, Chris Wilson wrote:

Now that we skip a per-engine reset on an idle engine, we need to update
the selftest to take that into account. In the process, we find that we
were not stressing the per-engine reset very hard, so add those missing
active resets.

v2: Actually test i915_reset_engine() by loading it with requests.

Fixes: f6ba181ada55 ("drm/i915: Skip an engine reset if it recovered before our 
preparations")
Signed-off-by: Chris Wilson 
Cc: Michel Thierry 
Cc: Mika Kuoppala 



Reviewed-by: Michel Thierry 

And all these subtests passed with and without GuC in SKL.


---
  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 314 ++-
  1 file changed, 250 insertions(+), 64 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index f98546b8a7fa..c8a756e2139f 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -132,6 +132,12 @@ static int emit_recurse_batch(struct hang *h,
*batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = upper_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno;
+   *batch++ = MI_ARB_CHECK;
+
+   memset(batch, 0, 1024);
+   batch += 1024 / sizeof(*batch);
+
+   *batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
*batch++ = lower_32_bits(vma->node.start);
*batch++ = upper_32_bits(vma->node.start);
@@ -140,6 +146,12 @@ static int emit_recurse_batch(struct hang *h,
*batch++ = 0;
*batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno;
+   *batch++ = MI_ARB_CHECK;
+
+   memset(batch, 0, 1024);
+   batch += 1024 / sizeof(*batch);
+
+   *batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
*batch++ = lower_32_bits(vma->node.start);
} else if (INTEL_GEN(i915) >= 4) {
@@ -147,12 +159,24 @@ static int emit_recurse_batch(struct hang *h,
*batch++ = 0;
*batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno;
+   *batch++ = MI_ARB_CHECK;
+
+   memset(batch, 0, 1024);
+   batch += 1024 / sizeof(*batch);
+
+   *batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
*batch++ = lower_32_bits(vma->node.start);
} else {
*batch++ = MI_STORE_DWORD_IMM;
*batch++ = lower_32_bits(hws_address(hws, rq));
*batch++ = rq->fence.seqno;
+   *batch++ = MI_ARB_CHECK;
+
+   memset(batch, 0, 1024);
+   batch += 1024 / sizeof(*batch);
+
+   *batch++ = MI_ARB_CHECK;
*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
*batch++ = lower_32_bits(vma->node.start);
}
@@ -234,6 +258,16 @@ static void hang_fini(struct hang *h)
i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
  }
  
+static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)

+{
+   return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
+  rq->fence.seqno),
+10) &&
+wait_for(i915_seqno_passed(hws_seqno(h, rq),
+   rq->fence.seqno),
+ 1000));
+}
+
  static int igt_hang_sanitycheck(void *arg)
  {
struct drm_i915_private *i915 = arg;
@@ -296,6 +330,9 @@ static void global_reset_lock(struct drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
  
+	pr_debug("%s: current gpu_error=%08lx\n",

+__func__, i915->gpu_error.flags);
+
while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
wait_event(i915->gpu_error.reset_queue,
   !test_bit(I915_RESET_BACKOFF,
@@ -353,54 +390,127 @@ static int igt_global_reset(void *arg)
return err;
  }
  
-static int igt_reset_engine(void *arg)

+static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
  {
-   struct drm_i915_private *i915 = arg;
struct intel_engine_cs *engine;
enum intel_engine_id id;
-   unsigned int reset_count, reset_engine_count;
+   struct hang h;
int err = 0;
  
-	/* Check that we can issue a global GPU and engine reset */

+   /* Check that we can issue an engine reset on an idle engine (no-op) */
  
  	if (!intel_has_reset_engine(i915))

return 0;
  
+	if (active) {

+   mutex_lock(&i915->drm

Re: [Intel-gfx] [PATCH] drm/i915: Avoid context dereference inside execlists_submission_tasklet

2017-12-19 Thread Michel Thierry

On 12/19/2017 2:09 PM, Chris Wilson wrote:

A lesson that has to be relearnt over and over again is that the request
does not keep a reference to the context and so we cannot freely
dereference the context from inside the execlists_submission_tasklet. In
particular, we try to do so in the new GEM_TRACE() so convert those over
to the port->context_id we keep for GEM debugging. This means the
tracing now depends on DRM_I915_DEBUG_GEM.



Even before the port->context_id dependency, I don't think many people 
would enable DRM_I915_TRACE_GEM without DRM_I915_DEBUG_GEM.



Fixes: bccd3b831185 ("drm/i915: Use trace_printk to provide a death rattle for 
GEM")
References: https://bugs.freedesktop.org/show_bug.cgi?id=104066
References: https://bugs.freedesktop.org/show_bug.cgi?id=104162
References: https://bugs.freedesktop.org/show_bug.cgi?id=104242
References: https://bugs.freedesktop.org/show_bug.cgi?id=104310
Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Joonas Lahtinen 
Cc: Tvrtko Ursulin 
---
  drivers/gpu/drm/i915/Kconfig.debug | 2 +-
  drivers/gpu/drm/i915/intel_lrc.c   | 6 +++---
  2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig.debug 
b/drivers/gpu/drm/i915/Kconfig.debug
index fa36491495b1..c846b250b9c4 100644
--- a/drivers/gpu/drm/i915/Kconfig.debug
+++ b/drivers/gpu/drm/i915/Kconfig.debug
@@ -29,7 +29,6 @@ config DRM_I915_DEBUG
 select SW_SYNC # signaling validation framework (igt/syncobj*)
 select DRM_I915_SW_FENCE_DEBUG_OBJECTS
 select DRM_I915_SELFTEST
-   select DRM_I915_TRACE_GEM
  default n
  help
Choose this option to turn on extra driver debugging that may affect
@@ -55,6 +54,7 @@ config DRM_I915_TRACE_GEM
 bool "Insert extra ftrace output from the GEM internals"
 select TRACING
 default n
+   depends on DRM_I915_DEBUG_GEM
 help
   Enable additional and verbose debugging output that will spam
   ordinary tests, but may be vital for post-mortem debugging when
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index eee718e3f371..64d49d5054b9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -449,7 +449,7 @@ static void execlists_submit_ports(struct intel_engine_cs 
*engine)

 GEM_TRACE("%s in[%d]:  ctx=%d.%d, seqno=%x\n",
   engine->name, n,
- rq->ctx->hw_id, count,
+ port[n].context_id, count,
   rq->global_seqno);
 } else {
 GEM_BUG_ON(!n);
@@ -861,7 +861,7 @@ static void execlists_submission_tasklet(unsigned long data)
  */

 status = READ_ONCE(buf[2 * head]); /* maybe mmio! */
-   GEM_TRACE("%s csb[%dd]: status=0x%08x:0x%08x\n",
+   GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n",
   engine->name, head,
   status, buf[2*head + 1]);

@@ -905,7 +905,7 @@ static void execlists_submission_tasklet(unsigned long data)
 rq = port_unpack(port, &count);
 GEM_TRACE("%s out[0]: ctx=%d.%d, seqno=%x\n",
   engine->name,
- rq->ctx->hw_id, count,
+ port->context_id, count,
   rq->global_seqno);
         GEM_BUG_ON(count == 0);
 if (--count == 0) {
--
2.15.1



Reviewed-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/icl: Correctly clear lost ctx-switch interrupts across reset for Gen11

2018-04-24 Thread Michel Thierry

On 4/24/2018 2:39 PM, Oscar Mateo wrote:

Interrupt handling in Gen11 is quite different from previous platforms.

v2: Rebased (Michel)
v3: Rebased with wiggle
v4: Rebased, remove TODO warning correctly (Daniele)
v5: Rebased, made gen11_gtiir const while at it (Michel)
v6: Rebased
v7: Adapt to the style currently in upstream

Suggested-by: Michel Thierry 
Signed-off-by: Rodrigo Vivi 
Signed-off-by: Michel Thierry 
Signed-off-by: Oscar Mateo 
Cc: Tvrtko Ursulin 
Cc: Daniele Ceraolo Spurio 
Cc: Mika Kuoppala 
---
  drivers/gpu/drm/i915/i915_irq.c  |  6 ++--
  drivers/gpu/drm/i915/intel_drv.h |  3 ++
  drivers/gpu/drm/i915/intel_lrc.c | 60 
  3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 96547e0..f9bc3aa 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -247,9 +247,9 @@ void i915_hotplug_interrupt_update(struct drm_i915_private 
*dev_priv,
  gen11_gt_engine_identity(struct drm_i915_private * const i915,
 const unsigned int bank, const unsigned int bit);
  
-static bool gen11_reset_one_iir(struct drm_i915_private * const i915,

-   const unsigned int bank,
-   const unsigned int bit)
+bool gen11_reset_one_iir(struct drm_i915_private * const i915,
+const unsigned int bank,
+const unsigned int bit)
  {
void __iomem * const regs = i915->regs;
u32 dw;
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 58868b9..9bba035 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1333,6 +1333,9 @@ void intel_pch_fifo_underrun_irq_handler(struct 
drm_i915_private *dev_priv,
  void intel_check_pch_fifo_underruns(struct drm_i915_private *dev_priv);
  
  /* i915_irq.c */

+bool gen11_reset_one_iir(struct drm_i915_private * const i915,
+const unsigned int bank,
+const unsigned int bit);
  void gen5_enable_gt_irq(struct drm_i915_private *dev_priv, uint32_t mask);
  void gen5_disable_gt_irq(struct drm_i915_private *dev_priv, uint32_t mask);
  void gen6_mask_pm_irq(struct drm_i915_private *dev_priv, u32 mask);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2d6572a..7ea5f36 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -789,22 +789,9 @@ static void execlists_dequeue(struct intel_engine_cs 
*engine)
  
  static void clear_gtiir(struct intel_engine_cs *engine)

  {
-   static const u8 gtiir[] = {
-   [RCS]  = 0,
-   [BCS]  = 0,
-   [VCS]  = 1,
-   [VCS2] = 1,
-   [VECS] = 3,
-   };
struct drm_i915_private *dev_priv = engine->i915;
int i;
  
-	/* TODO: correctly reset irqs for gen11 */

-   if (WARN_ON_ONCE(INTEL_GEN(engine->i915) >= 11))
-   return;
-
-   GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));
-
/*
 * Clear any pending interrupt state.
 *
@@ -812,13 +799,50 @@ static void clear_gtiir(struct intel_engine_cs *engine)
 * double buffered, and so if we only reset it once there may
 * still be an interrupt pending.
 */
-   for (i = 0; i < 2; i++) {
-   I915_WRITE(GEN8_GT_IIR(gtiir[engine->id]),
+   if (INTEL_GEN(dev_priv) >= 11) {
+   static const struct {
+   u8 bank;
+   u8 bit;
+   } gen11_gtiir[] = {
+   [RCS] = {0, GEN11_RCS0},
+   [BCS] = {0, GEN11_BCS},
+   [_VCS(0)] = {1, GEN11_VCS(0)},
+   [_VCS(1)] = {1, GEN11_VCS(1)},
+   [_VCS(2)] = {1, GEN11_VCS(2)},
+   [_VCS(3)] = {1, GEN11_VCS(3)},
+   [_VECS(0)] = {1, GEN11_VECS(0)},
+   [_VECS(1)] = {1, GEN11_VECS(1)},
+   };

bit,bank values are correct so

Reviewed-by: Michel Thierry 


+   unsigned long irqflags;
+
+   GEM_BUG_ON(engine->id >= ARRAY_SIZE(gen11_gtiir));
+
+   spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
+   for (i = 0; i < 2; i++) {
+   gen11_reset_one_iir(dev_priv,
+   gen11_gtiir[engine->id].bank,
+   gen11_gtiir[engine->id].bit);
+   }
+   spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags);
+   } else {
+   static const u8 gtiir[] = {
+   [RCS]  = 0,
+   [BCS]  = 0,
+   [VCS]  = 1,
+   [VCS2] = 1,
+   [VECS] = 3

Re: [Intel-gfx] [PATCH libdrm] intel: add support for ICL 11

2018-04-25 Thread Michel Thierry

On 04/25/2018 05:09 PM, Paulo Zanoni wrote:

Add the PCI IDs and the basic code to enable ICL.  This is the current
PCI ID list in our documentation.

Kernel commit: d55cb4fa2cf0 ("drm/i915/icl: Add the ICL PCI IDs")

v2: Michel provided a fix to IS_9XX that was broken by rebase bot.
v3: Fix double definition of PCI IDs, update IDs according to bspec
 and keep them in the same order and rebase (Lucas)

Cc: Michel Thierry 
Signed-off-by: Paulo Zanoni 
Signed-off-by: Rodrigo Vivi 
Signed-off-by: Lucas De Marchi 
---
  intel/intel_bufmgr_gem.c |  2 ++
  intel/intel_chipset.h| 27 ++-
  intel/intel_decode.c |  4 +++-
  3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 5c47a46f..8c3a4b20 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -3660,6 +3660,8 @@ drm_intel_bufmgr_gem_init(int fd, int batch_size)
bufmgr_gem->gen = 9;
else if (IS_GEN10(bufmgr_gem->pci_device))
bufmgr_gem->gen = 10;
+   else if (IS_GEN11(bufmgr_gem->pci_device))
+   bufmgr_gem->gen = 11;
else {
free(bufmgr_gem);
bufmgr_gem = NULL;
diff --git a/intel/intel_chipset.h b/intel/intel_chipset.h
index ba2e3ac1..32b2c48f 100644
--- a/intel/intel_chipset.h
+++ b/intel/intel_chipset.h
@@ -257,6 +257,16 @@
  #define PCI_CHIP_CANNONLAKE_120x5A44
  #define PCI_CHIP_CANNONLAKE_130x5A4C
  
+#define PCI_CHIP_ICELAKE_11_0		0x8A50

+#define PCI_CHIP_ICELAKE_11_1  0x8A51
+#define PCI_CHIP_ICELAKE_11_2  0x8A5C
+#define PCI_CHIP_ICELAKE_11_3  0x8A5D
+#define PCI_CHIP_ICELAKE_11_4  0x8A52
+#define PCI_CHIP_ICELAKE_11_5  0x8A5A
+#define PCI_CHIP_ICELAKE_11_6  0x8A5B
+#define PCI_CHIP_ICELAKE_11_7  0x8A71
+#define PCI_CHIP_ICELAKE_11_8  0x8A70
+


matches what we have in the kernel's i915_pciids.h


  #define IS_MOBILE(devid)  ((devid) == PCI_CHIP_I855_GM || \
 (devid) == PCI_CHIP_I915_GM || \
 (devid) == PCI_CHIP_I945_GM || \
@@ -538,6 +548,20 @@
  
  #define IS_GEN10(devid)		(IS_CANNONLAKE(devid))
  
+#define IS_ICELAKE_11(devid)	((devid) == PCI_CHIP_ICELAKE_11_0 || \

+(devid) == PCI_CHIP_ICELAKE_11_1 || \
+(devid) == PCI_CHIP_ICELAKE_11_2 || \
+(devid) == PCI_CHIP_ICELAKE_11_3 || \
+(devid) == PCI_CHIP_ICELAKE_11_4 || \
+(devid) == PCI_CHIP_ICELAKE_11_5 || \
+(devid) == PCI_CHIP_ICELAKE_11_6 || \
+(devid) == PCI_CHIP_ICELAKE_11_7 || \
+(devid) == PCI_CHIP_ICELAKE_11_8)
+
+#define IS_ICELAKE(devid)  (IS_ICELAKE_11(devid))
+
+#define IS_GEN11(devid)(IS_ICELAKE_11(devid))
+
  #define IS_9XX(dev)   (IS_GEN3(dev) || \
 IS_GEN4(dev) || \
 IS_GEN5(dev) || \
@@ -545,6 +569,7 @@
 IS_GEN7(dev) || \
 IS_GEN8(dev) || \
 IS_GEN9(dev) || \
-IS_GEN10(dev))
+IS_GEN10(dev) || \
+IS_GEN11(dev))
  
  #endif /* _INTEL_CHIPSET_H */

diff --git a/intel/intel_decode.c b/intel/intel_decode.c
index bc7b04b8..b24861b1 100644
--- a/intel/intel_decode.c
+++ b/intel/intel_decode.c
@@ -3823,7 +3823,9 @@ drm_intel_decode_context_alloc(uint32_t devid)
ctx->devid = devid;
ctx->out = stdout;
  
-	if (IS_GEN10(devid))

+   if (IS_GEN11(devid))
+   ctx->gen = 11;
+   else if (IS_GEN10(devid))
ctx->gen = 10;
else if (IS_GEN9(devid))
    ctx->gen = 9;



Reviewed-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/lrc: Scrub the GPU state of the guilty hanging request

2018-04-27 Thread Michel Thierry

On 4/27/2018 12:32 PM, Chris Wilson wrote:

Previously, we just reset the ring register in the context image such
that we could skip over the broken batch and emit the closing
breadcrumb. However, on resume the context image and GPU state would be
reloaded, which may have been left in an inconsistent state by the
reset. The presumption was that at worst it would just cause another
reset and skip again until it recovered, however it seems just as likely
to cause an unrecoverable hang. Instead of risking loading an incomplete
context image, restore it back to the default state.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Michał Winiarski 
Cc: Michel Thierry 
Cc: Tvrtko Ursulin 
---
  drivers/gpu/drm/i915/intel_lrc.c | 24 +---
  1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ce23d5116482..422b05290ed6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
  struct i915_request *request)
  {
struct intel_engine_execlists * const execlists = &engine->execlists;
-   struct intel_context *ce;
unsigned long flags;
+   u32 *regs;
  
  	GEM_TRACE("%s request global=%x, current=%d\n",

  engine->name, request ? request->global_seqno : 0,
@@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
 * future request will be after userspace has had the opportunity
 * to recreate its own state.
 */
-   ce = &request->ctx->engine[engine->id];
-   execlists_init_reg_state(ce->lrc_reg_state,
-request->ctx, engine, ce->ring);
+   regs = request->ctx->engine[engine->id].lrc_reg_state;
+   if (engine->default_state) {
+   void *defaults;
+
+   defaults = i915_gem_object_pin_map(engine->default_state,
+  I915_MAP_WB);
+   if (!IS_ERR(defaults)) {
+   memcpy(regs,
+  defaults + LRC_HEADER_PAGES * PAGE_SIZE,
+  engine->context_size);

Hi,

The context_size takes the PP_HWSP page into account; do we also need
to rewrite the PP_HWSP (or just the logical state)?


Also regs is already pointing to the start of the logical state
(vaddr + LRC_STATE_PN * PAGE_SIZE).

So if we want to overwrite from the PP_HWSP, then regs is not the right 
offset, or if we only want to change the logical state then it should be 
from 'defaults +  LRC_STATE_PN * PAGE_SIZE'.


-Michel


+   i915_gem_object_unpin_map(engine->default_state);
+   }
+   }
+   execlists_init_reg_state(regs, request->ctx, engine, request->ring);
  
  	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */

-   ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
-   i915_ggtt_offset(ce->ring->vma);
-   ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+   regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+   regs[CTX_RING_HEAD + 1] = request->postfix;
  
  	request->ring->head = request->postfix;

intel_ring_update_space(request->ring);


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2] drm/i915/lrc: Scrub the GPU state of the guilty hanging request

2018-04-27 Thread Michel Thierry

On 4/27/2018 1:24 PM, Chris Wilson wrote:

Previously, we just reset the ring register in the context image such
that we could skip over the broken batch and emit the closing
breadcrumb. However, on resume the context image and GPU state would be
reloaded, which may have been left in an inconsistent state by the
reset. The presumption was that at worst it would just cause another
reset and skip again until it recovered, however it seems just as likely
to cause an unrecoverable hang. Instead of risking loading an incomplete
context image, restore it back to the default state.

v2: Fix up off-by-one from including the ppHWSP in with the register
state.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Michał Winiarski 
Cc: Michel Thierry 
Cc: Tvrtko Ursulin 


Reviewed-by: Michel Thierry 

Does it need a 'Fixes:' tag or have a bugzilla reference?

---
  drivers/gpu/drm/i915/intel_lrc.c | 24 +---
  1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ce23d5116482..01750a4c2f3f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
  struct i915_request *request)
  {
struct intel_engine_execlists * const execlists = &engine->execlists;
-   struct intel_context *ce;
unsigned long flags;
+   u32 *regs;
  
  	GEM_TRACE("%s request global=%x, current=%d\n",

  engine->name, request ? request->global_seqno : 0,
@@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
 * future request will be after userspace has had the opportunity
 * to recreate its own state.
 */
-   ce = &request->ctx->engine[engine->id];
-   execlists_init_reg_state(ce->lrc_reg_state,
-request->ctx, engine, ce->ring);
+   regs = request->ctx->engine[engine->id].lrc_reg_state;
+   if (engine->default_state) {
+   void *defaults;
+
+   defaults = i915_gem_object_pin_map(engine->default_state,
+  I915_MAP_WB);
+   if (!IS_ERR(defaults)) {
+   memcpy(regs, /* skip restoring to the vanilla PPHWSP */
+  defaults + LRC_STATE_PN * PAGE_SIZE,
+  engine->context_size - PAGE_SIZE);
+   i915_gem_object_unpin_map(engine->default_state);
+   }
+   }
+   execlists_init_reg_state(regs, request->ctx, engine, request->ring);
  
  	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */

-   ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
-   i915_ggtt_offset(ce->ring->vma);
-   ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+   regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+   regs[CTX_RING_HEAD + 1] = request->postfix;
  
  	request->ring->head = request->postfix;

intel_ring_update_space(request->ring);


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2] drm/i915/lrc: Scrub the GPU state of the guilty hanging request

2018-04-27 Thread Michel Thierry

On 4/27/2018 1:35 PM, Chris Wilson wrote:

Quoting Michel Thierry (2018-04-27 21:27:46)

On 4/27/2018 1:24 PM, Chris Wilson wrote:

Previously, we just reset the ring register in the context image such
that we could skip over the broken batch and emit the closing
breadcrumb. However, on resume the context image and GPU state would be
reloaded, which may have been left in an inconsistent state by the
reset. The presumption was that at worst it would just cause another
reset and skip again until it recovered, however it seems just as likely
to cause an unrecoverable hang. Instead of risking loading an incomplete
context image, restore it back to the default state.

v2: Fix up off-by-one from including the ppHWSP in with the register
state.

Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Michał Winiarski 
Cc: Michel Thierry 
Cc: Tvrtko Ursulin 


Reviewed-by: Michel Thierry 

Does it need a 'Fixes:' tag or have a bugzilla reference?


I suspect it's rare enough that the unrecoverable hang might not be
recognisable in bugzilla. I was just looking at

https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_4108/fi-bsw-n3050/dmesg0.log

trying to think of ways how the reset might appear to work but the
recovery fail with

<7>[  521.765114] missed_breadcrumb vecs0 missed breadcrumb at 
intel_breadcrumbs_hangcheck+0x5a/0x80 [i915]
<7>[  521.765176] missed_breadcrumb   current seqno e4e, last e4f, 
hangcheck e4e [2048 ms], inflight 1
<7>[  521.765191] missed_breadcrumb   Reset count: 0 (global 0)
<7>[  521.765206] missed_breadcrumb   Requests:
<7>[  521.765223] missed_breadcrumb   first  e4f [9b82:e4f] prio=0 
@ 3766ms: gem_sync[3107]/0
<7>[  521.765239] missed_breadcrumb   last   e4f [9b82:e4f] prio=0 
@ 3766ms: gem_sync[3107]/0
<7>[  521.765256] missed_breadcrumb   active e4f [9b82:e4f] prio=0 
@ 3766ms: gem_sync[3107]/0
<7>[  521.765274] missed_breadcrumb   [head 3900, postfix 3930, 
tail 3948, batch 0x_00042000]
<7>[  521.765289] missed_breadcrumb   ring->start:  0x008ef000
<7>[  521.765301] missed_breadcrumb   ring->head:   0x38f8
<7>[  521.765313] missed_breadcrumb   ring->tail:   0x3948
<7>[  521.765325] missed_breadcrumb   ring->emit:   0x3950
<7>[  521.765337] missed_breadcrumb   ring->space:  0x2618
<7>[  521.765372] missed_breadcrumb   RING_START: 0x008ef000
<7>[  521.765389] missed_breadcrumb   RING_HEAD:  0x38f8
<7>[  521.765404] missed_breadcrumb   RING_TAIL:  0x3948
<7>[  521.765422] missed_breadcrumb   RING_CTL:   0x3001
<7>[  521.765438] missed_breadcrumb   RING_MODE:  0x
<7>[  521.765453] missed_breadcrumb   RING_IMR: fefe
<7>[  521.765473] missed_breadcrumb   ACTHD:  0x_022039b8
<7>[  521.765492] missed_breadcrumb   BBADDR: 0x_00042004
<7>[  521.765511] missed_breadcrumb   DMA_FADDR: 0x_008f28f8
<7>[  521.765537] missed_breadcrumb   IPEIR: 0x
<7>[  521.765552] missed_breadcrumb   IPEHR: 0x1111
<7>[  521.765570] missed_breadcrumb   Execlist status: 0x00044032 0002
<7>[  521.765586] missed_breadcrumb   Execlist CSB read 1 [1 cached], write 
2 [2 from hws], interrupt posted? no, tasklet queued? no (enabled)
<7>[  521.765604] missed_breadcrumb   Execlist CSB[2]: 0x0001 
[0x0001 in hwsp], context: 0 [0 in hwsp]
<7>[  521.765619] missed_breadcrumb   ELSP[0] count=1, rq: e4f 
[9b82:e4f] prio=0 @ 3767ms: gem_sync[3107]/0
<7>[  521.765632] missed_breadcrumb   ELSP[1] idle
<7>[  521.765645] missed_breadcrumb   HW active? 0x1
<7>[  521.765660] missed_breadcrumb   E e4f [9b82:e4f] prio=0 @ 
3767ms: gem_sync[3107]/0
<7>[  521.765670] missed_breadcrumb   Queue priority: -2147483648
<7>[  521.765684] missed_breadcrumb   gem_sync [3112] waiting for e4f
<7>[  521.765697] missed_breadcrumb IRQ? 0x1 (breadcrumbs? yes) (execlists? no)
<7>[  521.765707] missed_breadcrumb HWSP:
<7>[  521.765723] missed_breadcrumb     
    
<7>[  521.765733] missed_breadcrumb *
<7>[  521.765747] missed_breadcrumb 0040 0001  0018 
0002 0001  0018 0002
<7>[  521.765760] missed_breadcrumb 0060 0001  0018 
0002    0002
<7>[  521.765774] missed_breadcrumb 0080    
    
<7>[  521.765784] missed_breadcrumb *
<7>[  521.765809] missed_breadcrumb 00c0 0e4e   
  

Re: [Intel-gfx] [PATCH v4] drm/i915/lrc: Scrub the GPU state of the guilty hanging request

2018-04-30 Thread Michel Thierry

On 04/28/2018 04:15 AM, Chris Wilson wrote:

Previously, we just reset the ring register in the context image such
that we could skip over the broken batch and emit the closing
breadcrumb. However, on resume the context image and GPU state would be
reloaded, which may have been left in an inconsistent state by the
reset. The presumption was that at worst it would just cause another
reset and skip again until it recovered, however it seems just as likely
to cause an unrecoverable hang. Instead of risking loading an incomplete
context image, restore it back to the default state.

v2: Fix up off-by-one from including the ppHWSP in with the register
state.
v3: Use a ring local to compact a few lines.
v4: Beware setting the ring local before checking for a NULL request.


Didn't you want to set the ring local after this check?
if (!request || request->fence.error != -EIO)

This is identical to v2.


Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Michał Winiarski 
Cc: Michel Thierry 
Cc: Tvrtko Ursulin 
Reviewed-by: Michel Thierry  #v2
---
  drivers/gpu/drm/i915/intel_lrc.c | 24 +---
  1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ce23d5116482..513aee6b3634 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1804,8 +1804,8 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
  struct i915_request *request)
  {
struct intel_engine_execlists * const execlists = &engine->execlists;
-   struct intel_context *ce;
unsigned long flags;
+   u32 *regs;
  
  	GEM_TRACE("%s request global=%x, current=%d\n",

  engine->name, request ? request->global_seqno : 0,
@@ -1855,14 +1855,24 @@ static void reset_common_ring(struct intel_engine_cs 
*engine,
 * future request will be after userspace has had the opportunity
 * to recreate its own state.
 */
-   ce = &request->ctx->engine[engine->id];
-   execlists_init_reg_state(ce->lrc_reg_state,
-request->ctx, engine, ce->ring);
+   regs = request->ctx->engine[engine->id].lrc_reg_state;
+   if (engine->default_state) {
+   void *defaults;
+
+   defaults = i915_gem_object_pin_map(engine->default_state,
+  I915_MAP_WB);
+   if (!IS_ERR(defaults)) {
+   memcpy(regs, /* skip restoring the vanilla PPHWSP */
+  defaults + LRC_STATE_PN * PAGE_SIZE,
+  engine->context_size - PAGE_SIZE);
+   i915_gem_object_unpin_map(engine->default_state);
+   }
+   }
+   execlists_init_reg_state(regs, request->ctx, engine, request->ring);
  
  	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */

-   ce->lrc_reg_state[CTX_RING_BUFFER_START+1] =
-   i915_ggtt_offset(ce->ring->vma);
-   ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix;
+   regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+   regs[CTX_RING_HEAD + 1] = request->postfix;
  
  	request->ring->head = request->postfix;

intel_ring_update_space(request->ring);


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/guc: Assert we have the doorbell before setting it up

2018-05-01 Thread Michel Thierry

On 5/1/2018 12:52 AM, Chris Wilson wrote:

As our early doorbell is split between early allocation and a late setup
after we have a channel to the GuC, it may happen due to a lapse of
programmer judgement that we try to setup an invalid doorbell. Make use
of our has_doorbell() function to check the doorbell does exist for the
client before we try and tell the guc about it. In doing so, we prevent
the compiler from warning about the otherwise unused function in some
configurations.



Looks ok to me, but the new place has_doorbell is called is inside a 
GEM_BUG_ON...

So the warning will still be there when CONFIG_DRM_I915_DEBUG_GEM=n, right?

(btw, until late last year, there were more users of that function).


Reported-by: Matthias Kaehlcke 
Signed-off-by: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Cc: Michał Winiarski 
Cc: Michal Wajdeczko 
Cc: Michel Thierry 
---
  drivers/gpu/drm/i915/intel_guc_submission.c | 22 +++--
  1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c 
b/drivers/gpu/drm/i915/intel_guc_submission.c
index 6e6ed0f46bd3..c6bb5bebddfc 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -124,9 +124,17 @@ static int reserve_doorbell(struct intel_guc_client 
*client)
return 0;
  }
  
+static bool has_doorbell(struct intel_guc_client *client)

+{
+   if (client->doorbell_id == GUC_DOORBELL_INVALID)
+   return false;
+
+   return test_bit(client->doorbell_id, client->guc->doorbell_bitmap);
+}
+
  static void unreserve_doorbell(struct intel_guc_client *client)
  {
-   GEM_BUG_ON(client->doorbell_id == GUC_DOORBELL_INVALID);
+   GEM_BUG_ON(!has_doorbell(client));
  
  	__clear_bit(client->doorbell_id, client->guc->doorbell_bitmap);

client->doorbell_id = GUC_DOORBELL_INVALID;
@@ -184,14 +192,6 @@ static struct guc_doorbell_info *__get_doorbell(struct 
intel_guc_client *client)
return client->vaddr + client->doorbell_offset;
  }
  
-static bool has_doorbell(struct intel_guc_client *client)

-{
-   if (client->doorbell_id == GUC_DOORBELL_INVALID)
-   return false;
-
-   return test_bit(client->doorbell_id, client->guc->doorbell_bitmap);
-}
-
  static void __create_doorbell(struct intel_guc_client *client)
  {
struct guc_doorbell_info *doorbell;
@@ -207,7 +207,6 @@ static void __destroy_doorbell(struct intel_guc_client 
*client)
struct guc_doorbell_info *doorbell;
u16 db_id = client->doorbell_id;
  
-

doorbell = __get_doorbell(client);
doorbell->db_status = GUC_DOORBELL_DISABLED;
doorbell->cookie = 0;
@@ -224,6 +223,9 @@ static int create_doorbell(struct intel_guc_client *client)
  {
int ret;
  
+	if (WARN_ON(!has_doorbell(client)))

+   return -ENODEV; /* internal setup error, should never happen */
+
__update_doorbell_desc(client, client->doorbell_id);
__create_doorbell(client);
  


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/guc: Assert we have the doorbell before setting it up

2018-05-02 Thread Michel Thierry

On 05/02/2018 02:11 AM, Chris Wilson wrote:

Quoting Michel Thierry (2018-05-01 15:21:53)

On 5/1/2018 12:52 AM, Chris Wilson wrote:

As our early doorbell is split between early allocation and a late setup
after we have a channel to the GuC, it may happen due to a lapse of
programmer judgement that we try to setup an invalid doorbell. Make use
of our has_doorbell() function to check the doorbell does exist for the
client before we try and tell the guc about it. In doing so, we prevent
the compiler from warning about the otherwise unused function in some
configurations.



Looks ok to me, but the new place has_doorbell is called is inside a
GEM_BUG_ON...
So the warning will still be there when CONFIG_DRM_I915_DEBUG_GEM=n, right?



@@ -224,6 +223,9 @@ static int create_doorbell(struct intel_guc_client *client)
   {
   int ret;
   
+ if (WARN_ON(!has_doorbell(client)))

+ return -ENODEV; /* internal setup error, should never happen */


Sorry, somehow I read that line as GEM_WARN_ON...



This is the one I added to make sure we had at least one user. If it
weren't for the compiler warning I'd be happy for this to be
GEM_BUG_ON() as well.
-Chris



Reviewed-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/execlists: Drop preemption arbitrations points along the ring

2018-05-03 Thread Michel Thierry

On 5/3/2018 12:54 PM, Chris Wilson wrote:

Limit the arbitration (where preemption may occur) to inside the batch,
and prevent it from happening on the pipecontrols/flushes we use to
write the breadcrumb seqno. Once the user batch is complete, we have
nothing left to do but serialise and emit the breadcrumb; switching
contexts at this point is futile so don't.

Signed-off-by: Chris Wilson 
Cc: Michał Winiarski 
Cc: Michel Thierry 
Cc: Joonas Lahtinen 
Reviewed-by: Tvrtko Ursulin 
---

Michał and Michel,
   please take a look and see if you can think of any objections.


No objections, I was only thinking what would happen if we arm the 
watchdog (and we decide timeout != reset the engine) and forgot to write 
a reply. For the record, I think it'd be ok anyway.


Reviewed-by: Michel Thierry 


-Chris

---
  drivers/gpu/drm/i915/intel_lrc.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 3d747d1c3d4d..9f3cce022b2d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1933,7 +1933,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
rq->ctx->ppgtt->pd_dirty_rings &= 
~intel_engine_flag(rq->engine);
}
  
-	cs = intel_ring_begin(rq, 4);

+   cs = intel_ring_begin(rq, 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
  
@@ -1962,6 +1962,9 @@ static int gen8_emit_bb_start(struct i915_request *rq,

(flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0);
*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
+
+   *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+   *cs++ = MI_NOOP;
intel_ring_advance(rq, cs);
  
  	return 0;

@@ -2104,7 +2107,7 @@ static void gen8_emit_breadcrumb(struct i915_request 
*request, u32 *cs)
cs = gen8_emit_ggtt_write(cs, request->global_seqno,
  intel_hws_seqno_address(request->engine));
*cs++ = MI_USER_INTERRUPT;
-   *cs++ = MI_NOOP;
+   *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
request->tail = intel_ring_offset(request, cs);
assert_ring_tail_valid(request->ring, request->tail);
  
@@ -2120,7 +2123,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)

cs = gen8_emit_ggtt_write_rcs(cs, request->global_seqno,
  intel_hws_seqno_address(request->engine));
*cs++ = MI_USER_INTERRUPT;
-   *cs++ = MI_NOOP;
+   *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
request->tail = intel_ring_offset(request, cs);
assert_ring_tail_valid(request->ring, request->tail);
  


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/2] drm/i915/execlists: Use rmb() to order CSB reads

2018-05-08 Thread Michel Thierry

On 05/08/2018 09:30 AM, Chris Wilson wrote:

We assume that the CSB is written using the normal ringbuffer
coherency protocols, as outlined in kernel/events/ring_buffer.c:

 *   (HW)  (DRIVER)
 *
 *   if (LOAD ->data_tail) {LOAD ->data_head
 *  (A) smp_rmb()   (C)
 *  STORE $data LOAD $data
 *  smp_wmb()   (B) smp_mb()(D)
 *  STORE ->data_head   STORE ->data_tail
 *   }

So we assume that the HW fulfils its ordering requirements (B), and so
we should use a complementary rmb (C) to ensure that our read of its
WRITE pointer is completed before we start accessing the data.

The final mb (D) is implied by the uncached mmio we perform to inform
the HW of our READ pointer.

References: https://bugs.freedesktop.org/show_bug.cgi?id=105064
References: https://bugs.freedesktop.org/show_bug.cgi?id=105888
References: https://bugs.freedesktop.org/show_bug.cgi?id=106185
Fixes: 767a983ab255 ("drm/i915/execlists: Read the context-status HEAD from the 
HWSP")
References: 61bf9719fa17 ("drm/i915/cnl: Use mmio access to context status 
buffer")
Suggested-by: Mika Kuoppala 
Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Joonas Lahtinen 
Cc: Tvrtko Ursulin 
Cc: Michał Winiarski 
Cc: Rafael Antognolli 
Cc: Michel Thierry 
Cc: Timo Aaltonen 
Tested-by: Timo Aaltonen 


Acked-by: Michel Thierry 


---
  drivers/gpu/drm/i915/intel_lrc.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 911f288f78aa..8977600f0d81 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -992,6 +992,7 @@ static void execlists_submission_tasklet(unsigned long data)
  
  			head = execlists->csb_head;

tail = READ_ONCE(buf[write_idx]);
+   rmb(); /* Hopefully paired with a wmb() in HW */
}
GEM_TRACE("%s cs-irq head=%d [%d%s], tail=%d [%d%s]\n",
  engine->name,


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/2] Revert "drm/i915/cnl: Use mmio access to context status buffer"

2018-05-08 Thread Michel Thierry

On 05/08/2018 09:30 AM, Chris Wilson wrote:

In the previous patch (to include a rmb() after reading the CSB WRITE
pointer from the HWSP) we believe we have fixed the underlying bug, and
so can re-enable using the HWSP on Cannonlake.

This reverts commit 61bf9719fa17 ("drm/i915/cnl: Use mmio access to
context status buffer").

References: https://bugs.freedesktop.org/show_bug.cgi?id=105888
References: https://bugs.freedesktop.org/show_bug.cgi?id=106185
References: 61bf9719fa17 ("drm/i915/cnl: Use mmio access to context status 
buffer")
Signed-off-by: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Joonas Lahtinen 
Cc: Tvrtko Ursulin 
Cc: Michał Winiarski 
Cc: Rafael Antognolli 
Cc: Michel Thierry 
Cc: Timo Aaltonen 
Tested-by: Timo Aaltonen 


Acked-by: Michel Thierry 


---
  drivers/gpu/drm/i915/intel_engine_cs.c | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 70325e0824e3..8303e05b0c7d 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -470,9 +470,6 @@ static bool csb_force_mmio(struct drm_i915_private *i915)
if (intel_vgpu_active(i915) && !intel_vgpu_has_hwsp_emulation(i915))
return true;
  
-	if (IS_CANNONLAKE(i915))

-   return true;
-
return false;
  }
  


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Annotate timeline lock nesting

2018-05-08 Thread Michel Thierry

On 05/08/2018 08:35 AM, Chris Wilson wrote:

CI noticed

<4>[   23.430701] 
<4>[   23.430706] WARNING: possible recursive locking detected
<4>[   23.430713] 4.17.0-rc4-CI-CI_DRM_4156+ #1 Not tainted
<4>[   23.430720] 
<4>[   23.430725] systemd-udevd/169 is trying to acquire lock:
<4>[   23.430732] (ptrval) (&(&timeline->lock)->rlock){}, at: 
move_to_timeline+0x48/0x12c [i915]
<4>[   23.430888]
   but task is already holding lock:
<4>[   23.430894] (ptrval) (&(&timeline->lock)->rlock){}, at: 
i915_request_submit+0x1a/0x40 [i915]
<4>[   23.430995]
   other info that might help us debug this:
<4>[   23.431002]  Possible unsafe locking scenario:

<4>[   23.431007]CPU0
<4>[   23.431010]
<4>[   23.431013]   lock(&(&timeline->lock)->rlock);
<4>[   23.431021]   lock(&(&timeline->lock)->rlock);
<4>[   23.431028]
*** DEADLOCK ***

<4>[   23.431036]  May be due to missing lock nesting notation

<4>[   23.431044] 5 locks held by systemd-udevd/169:
<4>[   23.431049]  #0: (ptrval) (&dev->mutex){}, at: 
__driver_attach+0x42/0xe0
<4>[   23.431065]  #1: (ptrval) (&dev->mutex){}, at: 
__driver_attach+0x50/0xe0
<4>[   23.431078]  #2: (ptrval) (&dev->struct_mutex){+.+.}, at: 
i915_gem_init+0xca/0x630 [i915]
<4>[   23.431174]  #3: (ptrval) (rcu_read_lock){}, at: 
submit_notify+0x35/0x124 [i915]
<4>[   23.431271]  #4: (ptrval) (&(&timeline->lock)->rlock){}, at: 
i915_request_submit+0x1a/0x40 [i915]
<4>[   23.431369]
   stack backtrace:
<4>[   23.431377] CPU: 0 PID: 169 Comm: systemd-udevd Not tainted 
4.17.0-rc4-CI-CI_DRM_4156+ #1
<4>[   23.431385] Hardware name: Dell Inc. OptiPlex GX280   
/0G8310, BIOS A04 02/09/2005
<4>[   23.431394] Call Trace:
<4>[   23.431403]  dump_stack+0x67/0x9b

...

<4>[   23.432765] R13: 561a47296450 R14: 0002 R15: 
561a472a4b30

but did not report it as an issue as it only occurred during the first
module on boot. This is due to the removal of the distinct global
timeline, and its separate lock class. So instead mark up the expected
nesting. An alternative would be to define a separate lock class for the
engine, but since we only expect to have a single point of nesting, we
can avoid having multiple lock classes for the struct.

Fixes: a89d1f921c15 ("drm/i915: Split i915_gem_timeline into individual 
timelines")
Signed-off-by: Chris Wilson 
Cc: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc: Joonas Lahtinen 


Tested-by: Michel Thierry 


---
  drivers/gpu/drm/i915/i915_request.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index f336942229cf..8928894dd9c7 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -502,7 +502,7 @@ static void move_to_timeline(struct i915_request *request,
GEM_BUG_ON(request->timeline == &request->engine->timeline);
lockdep_assert_held(&request->engine->timeline.lock);
  
-	spin_lock(&request->timeline->lock);

+   spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
list_move_tail(&request->link, &timeline->requests);
spin_unlock(&request->timeline->lock);
  }


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/selftests: Create mock_engine() under struct_mutex

2018-05-08 Thread Michel Thierry

On 05/08/2018 02:10 PM, Chris Wilson wrote:

Calling mock_engine() calls i915_timeline_init() and that requires
struct_mutex to be held as it adds itself to the global list of
timelines. This error was introduced by commit a89d1f921c15 ("drm/i915:
Split i915_gem_timeline into individual timelines") but the issue was
masked in CI by the earlier lockdep spam.

Fixes: a89d1f921c15 ("drm/i915: Split i915_gem_timeline into individual 
timelines")
Signed-off-by: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc: Joonas Lahtinen 
Cc: Michel Thierry 


Double checked that mock_ring (the other caller of i915_timeline_init) 
is covered by this same lock.


Reviewed-by: Michel Thierry 


---
  drivers/gpu/drm/i915/selftests/mock_gem_device.c | 9 ++---
  1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c 
b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 4b6622c6986a..94baedfa0f74 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -229,18 +229,20 @@ struct drm_i915_private *mock_gem_device(void)
INIT_LIST_HEAD(&i915->gt.closed_vma);
  
  	mutex_lock(&i915->drm.struct_mutex);

+
mock_init_ggtt(i915);
-   mutex_unlock(&i915->drm.struct_mutex);
  
  	mkwrite_device_info(i915)->ring_mask = BIT(0);

i915->engine[RCS] = mock_engine(i915, "mock", RCS);
if (!i915->engine[RCS])
-   goto err_priorities;
+   goto err_unlock;
  
  	i915->kernel_context = mock_context(i915, NULL);

if (!i915->kernel_context)
goto err_engine;
  
+	mutex_unlock(&i915->drm.struct_mutex);

+
WARN_ON(i915_gemfs_init(i915));
  
  	return i915;

@@ -248,7 +250,8 @@ struct drm_i915_private *mock_gem_device(void)
  err_engine:
for_each_engine(engine, i915, id)
mock_engine_free(engine);
-err_priorities:
+err_unlock:
+   mutex_unlock(&i915->drm.struct_mutex);
kmem_cache_destroy(i915->priorities);
  err_dependencies:
kmem_cache_destroy(i915->dependencies);


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH] drm/i915/gen9: Add WaClearHIZ_WM_CHICKEN3 for bxt and glk

2018-05-10 Thread Michel Thierry
Factor in clear values wherever required while updating destination
min/max.

References: HSDES#160184
Signed-off-by: Michel Thierry 
Cc: mesa-...@lists.freedesktop.org
Cc: Mika Kuoppala 
Cc: Oscar Mateo 
---
 drivers/gpu/drm/i915/i915_reg.h  | 3 +++
 drivers/gpu/drm/i915/intel_workarounds.c | 4 
 2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 085928c9005e..ee11f6ed217a 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7251,6 +7251,9 @@ enum {
 #define SLICE_ECO_CHICKEN0 _MMIO(0x7308)
 #define   PIXEL_MASK_CAMMING_DISABLE   (1 << 14)
 
+#define GEN9_WM_CHICKEN3   _MMIO(0x5588)
+#define   GEN9_FACTOR_IN_CLR_VAL_HIZ   (1 << 9)
+
 /* WaCatErrorRejectionIssue */
 #define GEN7_SQ_CHICKEN_MBCUNIT_CONFIG _MMIO(0x9030)
 #define  GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB  (1<<11)
diff --git a/drivers/gpu/drm/i915/intel_workarounds.c 
b/drivers/gpu/drm/i915/intel_workarounds.c
index ec9d340fcb00..054e1dee7899 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/intel_workarounds.c
@@ -270,6 +270,10 @@ static int gen9_ctx_workarounds_init(struct 
drm_i915_private *dev_priv)
GEN9_PREEMPT_GPGPU_LEVEL_MASK,
GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 
+   /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
+   if (IS_GEN9_LP(dev_priv))
+   WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
+
return 0;
 }
 
-- 
2.17.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915/gen9: Add WaClearHIZ_WM_CHICKEN3 for bxt and glk

2018-05-11 Thread Michel Thierry

On 5/11/2018 5:43 AM, Mika Kuoppala wrote:

Chris Wilson  writes:


Quoting Mika Kuoppala (2018-05-11 10:56:49)

Michel Thierry  writes:


Factor in clear values wherever required while updating destination
min/max.

References: HSDES#160184
Signed-off-by: Michel Thierry 
Cc: mesa-...@lists.freedesktop.org
Cc: Mika Kuoppala 
Cc: Oscar Mateo 


Reviewed-by: Mika Kuoppala 


Cc: stable?


Yes, we should.



I think so too, although stable doesn't have the workaround refactoring 
yet, the change will be in intel_engine_cs.c instead.



-Mika


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [BACKPORT v4.17-rc5] drm/i915/gen9: Add WaClearHIZ_WM_CHICKEN3 for bxt and glk

2018-05-14 Thread Michel Thierry
Factor in clear values wherever required while updating destination
min/max.

References: HSDES#160184
Signed-off-by: Michel Thierry 
Cc: mesa-...@lists.freedesktop.org
Cc: Mika Kuoppala 
Cc: Oscar Mateo 
Reviewed-by: Mika Kuoppala 
Signed-off-by: Chris Wilson 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20180510200708.18097-1-michel.thie...@intel.com
Cc: sta...@vger.kernel.org
Cc: Joonas Lahtinen 
---
 drivers/gpu/drm/i915/i915_reg.h| 3 +++
 drivers/gpu/drm/i915/intel_engine_cs.c | 4 
 2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e6a8c0ee7df1..8a69a9275e28 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7326,6 +7326,9 @@ enum {
 #define SLICE_ECO_CHICKEN0 _MMIO(0x7308)
 #define   PIXEL_MASK_CAMMING_DISABLE   (1 << 14)
 
+#define GEN9_WM_CHICKEN3   _MMIO(0x5588)
+#define   GEN9_FACTOR_IN_CLR_VAL_HIZ   (1 << 9)
+
 /* WaCatErrorRejectionIssue */
 #define GEN7_SQ_CHICKEN_MBCUNIT_CONFIG _MMIO(0x9030)
 #define  GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB  (1<<11)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4ba139c27fba..f7c25828d3bb 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1149,6 +1149,10 @@ static int gen9_init_workarounds(struct intel_engine_cs 
*engine)
WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK,
GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
 
+   /* WaClearHIZ_WM_CHICKEN3:bxt,glk */
+   if (IS_GEN9_LP(dev_priv))
+   WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
+
/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
if (ret)
-- 
2.17.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [BACKPORT v4.17-rc5] drm/i915/gen9: Add WaClearHIZ_WM_CHICKEN3 for bxt and glk

2018-05-15 Thread Michel Thierry

On 5/15/2018 10:13 AM, Jani Nikula wrote:

On Mon, 14 May 2018, Michel Thierry  wrote:

Factor in clear values wherever required while updating destination
min/max.


Hi Michel, please elaborate what the intention here is.



Hi Jani, isn't the intention of all the workarounds to prevent gpu hangs?


BR,
Jani.





References: HSDES#160184
Signed-off-by: Michel Thierry 
Cc: mesa-...@lists.freedesktop.org
Cc: Mika Kuoppala 
Cc: Oscar Mateo 
Reviewed-by: Mika Kuoppala 
Signed-off-by: Chris Wilson 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20180510200708.18097-1-michel.thie...@intel.com
Cc: sta...@vger.kernel.org
Cc: Joonas Lahtinen 
---
  drivers/gpu/drm/i915/i915_reg.h| 3 +++
  drivers/gpu/drm/i915/intel_engine_cs.c | 4 
  2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e6a8c0ee7df1..8a69a9275e28 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7326,6 +7326,9 @@ enum {
  #define SLICE_ECO_CHICKEN0_MMIO(0x7308)
  #define   PIXEL_MASK_CAMMING_DISABLE  (1 << 14)
  
+#define GEN9_WM_CHICKEN3			_MMIO(0x5588)

+#define   GEN9_FACTOR_IN_CLR_VAL_HIZ   (1 << 9)
+
  /* WaCatErrorRejectionIssue */
  #define GEN7_SQ_CHICKEN_MBCUNIT_CONFIG_MMIO(0x9030)
  #define  GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB (1<<11)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4ba139c27fba..f7c25828d3bb 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1149,6 +1149,10 @@ static int gen9_init_workarounds(struct intel_engine_cs 
*engine)
WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK,
GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
  
+	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */

+   if (IS_GEN9_LP(dev_priv))
+   WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
+
/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
if (ret)



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [BACKPORT v4.17-rc5] drm/i915/gen9: Add WaClearHIZ_WM_CHICKEN3 for bxt and glk

2018-05-15 Thread Michel Thierry

On 5/15/2018 11:17 AM, Jani Nikula wrote:

On Tue, 15 May 2018, Michel Thierry  wrote:

On 5/15/2018 10:13 AM, Jani Nikula wrote:

On Mon, 14 May 2018, Michel Thierry  wrote:

Factor in clear values wherever required while updating destination
min/max.


Hi Michel, please elaborate what the intention here is.



Hi Jani, isn't the intention of all the workarounds to prevent gpu
hangs?


Err, sorry for the riddles, I meant with [BACKPORT v4.17-rc5] etc. :)


No worries,


Is this in dinq already? Commit id?

It was merged only a couple of days ago,

https://cgit.freedesktop.org/drm-tip/commit/?id=0c79f9cb77eae28d48a4f9fc1b3341aacbbd260c

Joonas asked me to backport it (stable doesn't have the 
intel_workarounds refactor yet).




BR,
Jani.






BR,
Jani.





References: HSDES#160184
Signed-off-by: Michel Thierry 
Cc: mesa-...@lists.freedesktop.org
Cc: Mika Kuoppala 
Cc: Oscar Mateo 
Reviewed-by: Mika Kuoppala 
Signed-off-by: Chris Wilson 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20180510200708.18097-1-michel.thie...@intel.com
Cc: sta...@vger.kernel.org
Cc: Joonas Lahtinen 
---
   drivers/gpu/drm/i915/i915_reg.h| 3 +++
   drivers/gpu/drm/i915/intel_engine_cs.c | 4 
   2 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e6a8c0ee7df1..8a69a9275e28 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7326,6 +7326,9 @@ enum {
   #define SLICE_ECO_CHICKEN0   _MMIO(0x7308)
   #define   PIXEL_MASK_CAMMING_DISABLE (1 << 14)
   
+#define GEN9_WM_CHICKEN3			_MMIO(0x5588)

+#define   GEN9_FACTOR_IN_CLR_VAL_HIZ   (1 << 9)
+
   /* WaCatErrorRejectionIssue */
   #define GEN7_SQ_CHICKEN_MBCUNIT_CONFIG   _MMIO(0x9030)
   #define  GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB(1<<11)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4ba139c27fba..f7c25828d3bb 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1149,6 +1149,10 @@ static int gen9_init_workarounds(struct intel_engine_cs 
*engine)
WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_GPGPU_LEVEL_MASK,
GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
   
+	/* WaClearHIZ_WM_CHICKEN3:bxt,glk */

+   if (IS_GEN9_LP(dev_priv))
+   WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ);
+
/* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */
ret = wa_ring_whitelist_reg(engine, GEN9_CTX_PREEMPT_REG);
if (ret)





___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/2] drm/i915/guc: Fix sleep under spinlock during reset

2017-04-12 Thread Michel Thierry

On 12/04/17 08:58, Chris Wilson wrote:

On Wed, Apr 12, 2017 at 04:48:42PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Looks like intel_guc_reset had the ability to sleep under the
uncore spinlock since forever but it wasn't detected until the
recent changes annotated the wait for register with might_sleep.

I have fixed it by removing holding of the uncore spinlock over
the call to gen6_hw_domain_reset, since I do not see that is
really needed. But there is always a possibility I am missing
some nasty detail so please double check.


Afaik, no we are not using the uncore.lock here to serialise resets so
yes we should be safe in dropping it.

Will the guc be coming under the same hw semaphore as gen8 per-engine
resets?


A bit unrelated, but should intel_guc_reset be intel_reset_guc instead?
Here we're trying to reset the microcontroller, not asking guc to do a 
reset.

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v5 12/18] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-13 Thread Michel Thierry


On 13/04/17 17:16, Daniele Ceraolo Spurio wrote:



On 24/03/17 18:30, Michel Thierry wrote:

From: Arun Siluvery 

GuC expects a list of registers from the driver which are saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save and
restore. This is not an issue in case of engine reset as driver
initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including
resubmission
of hung workload), it is necessary to provide this list, otherwise GuC
won't
be able to schedule further workloads after a reset. This is the minimal
set of registers identified for things to work as expected but if we see
any new issues, this register list can be expanded.

Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 50
+-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 991e76e10f82..2445af96aa71 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1015,6 +1015,22 @@ static void guc_policies_init(struct
guc_policies *policies)
 policies->is_valid = 1;
 }

+/*
+ * In this macro it is highly unlikely to exceed max value but even
if we did
+ * it is not an error so just throw a warning and continue. Only side
effect
+ * in continuing further means some registers won't be added to
save/restore
+ * list.
+ */
+#define GUC_ADD_MMIO_REG_ADS(node, reg_addr, _flags)\
+do {\
+u32 __count = node->number_of_registers;\
+if (WARN_ON(__count >= GUC_REGSET_MAX_REGISTERS))\
+continue;\
+node->registers[__count].offset = reg_addr.reg;\
+node->registers[__count].flags = (_flags);\
+node->number_of_registers++;\
+} while (0)
+
 static int guc_ads_create(struct intel_guc *guc)
 {
 struct drm_i915_private *dev_priv = guc_to_i915(guc);
@@ -1047,10 +1063,42 @@ static int guc_ads_create(struct intel_guc *guc)

 /* MMIO reg state */
 for_each_engine(engine, dev_priv, id) {
+u32 flags;
+struct guc_mmio_regset *eng_reg =
+&blob->reg_state.engine_reg[engine->guc_id];
+
+/*
+ * Provide a list of registers to be saved/restored during gpu
+ * reset. This is mainly required for Media reset (aka watchdog
+ * timeout) which is completely under the control of GuC
+ * (resubmission of hung workload is handled inside GuC).
+ */
+flags = GUC_REGSET_POWERCYCLE |


This flag doesn't really do anything inside GuC, I guess it is probably
a remnant of some functionality that has been removed.


+GUC_REGSET_ENGINERESET |
+GUC_REGSET_SAVE_CURRENT_VALUE;
+
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_HEAD(engine->mmio_base),
+ flags);
+
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_TAIL(engine->mmio_base),
+ flags);


Aren't head & tail context save/restored? there should be no need to
have GuC restore them. Also, won't restoring them manually generate
activity on the engine?


+
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_HWS_PGA(engine->mmio_base),
+ flags);
+
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_MODE_GEN7(engine),
+ (flags | GUC_REGSET_MASKED));


GUC_REGSET_MASKED doesn't do anything either (surprise!). Since this is


s/doesn't do anything/guc is full of bugs/  :smirk:


actually something that is required, I've chatted with the GuC guys and
they said they're going to look into re-adding proper functionality. In
the meantime the suggestion is to use GUC_REGSET_SAVE_DEFAULT_VALUE, in
which case the guc after the reset will write whatever we set in
node->registers[__count].value and we can put an already masked value in
there.

Thanks,
Daniele



Thanks, I'll add it to the changes in the next version.


+
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_IMR(engine->mmio_base),
+ flags);
+
+DRM_DEBUG_DRIVER("%s register save/restore count: %u\n",
+ engine->name, eng_reg->number_of_registers);
+
+ /* Nothing to be white listed for now. */
 blob->reg_state.white_list[engine->guc_id].mmio_start =
 engine->mmio_base + GUC_MMIO_WHITE_LIST_START;

-/* Nothing to be saved or restored for now. */
 blob->reg_state.white_list[engine->guc_id].count = 0;
 }



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v5 17/18] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-14 Thread Michel Thierry



On 14/04/17 09:05, Daniele Ceraolo Spurio wrote:



On 24/03/17 18:30, Michel Thierry wrote:

Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 60000 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h |  1 +
 drivers/gpu/drm/i915/i915_gem_context.c | 78
+
 drivers/gpu/drm/i915/i915_gem_context.h | 20 +
 drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
 drivers/gpu/drm/i915/intel_lrc.c|  2 +-
 include/uapi/drm/i915_drm.h |  1 +
 6 files changed, 108 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h
b/drivers/gpu/drm/i915/i915_drv.h
index b43c37a911bb..1741584d858f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1039,6 +1039,7 @@ struct i915_gpu_state {
 int ban_score;
 int active;
 int guilty;
+int watchdog_threshold;
 } context;

 struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
b/drivers/gpu/drm/i915/i915_gem_context.c
index edbed85a1c88..f5c126c0c681 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
 return ctx;
 }

+/* Return the timer count threshold in microseconds. */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+  struct drm_i915_gem_context_param *args)
+{
+struct drm_i915_private *dev_priv = ctx->i915;
+struct intel_engine_cs *engine;
+enum intel_engine_id id;
+u32 threshold_in_us[I915_NUM_ENGINES];
+
+if (!dev_priv->engine[VCS]->emit_start_watchdog)
+return -ENODEV;
+
+for_each_engine(engine, dev_priv, id) {
+struct intel_context *ce = &ctx->engine[id];
+
+threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
+}
+
+mutex_unlock(&dev_priv->drm.struct_mutex);
+if (__copy_to_user(u64_to_user_ptr(args->value),
+   &threshold_in_us,
+   sizeof(threshold_in_us))) {
+mutex_lock(&dev_priv->drm.struct_mutex);
+return -EFAULT;
+}
+mutex_lock(&dev_priv->drm.struct_mutex);
+args->size = sizeof(threshold_in_us);
+
+return 0;
+}
+
+/*
+ * Based on time out value in microseconds (us) calculate
+ * timer count thresholds needed based on core frequency.
+ * Watchdog can be disabled by setting it to 0.
+ */
+int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
+  struct drm_i915_gem_context_param *args)
+{
+struct drm_i915_private *dev_priv = ctx->i915;
+struct intel_engine_cs *engine;
+enum intel_engine_id id;
+u32 threshold_in_us[I915_NUM_ENGINES];
+
+if (!dev_priv->engine[VCS]->emit_start_watchdog)
+return -ENODEV;
+else if (args->size < sizeof(threshold_in_us))
+return -EINVAL;


won't we break userspace with this check if we ever get more engines on
a new platform and thus bump I915_NUM_ENGINES?

Thanks,
Daniele



There's a v3 of this patch with Chris feedback,
https://patchwork.freedesktop.org/patch/148805/
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v5 14/18] drm/i915: Watchdog timeout: Pass GuC shared data structure during param load

2017-04-17 Thread Michel Thierry


On 17/04/17 14:28, Daniele Ceraolo Spurio wrote:



On 24/03/17 18:30, Michel Thierry wrote:

For watchdog / media reset, the firmware must know the address of the
shared
data page (the first page of the default context).

This information should be in DWORD 9 of the GUC_CTL structure.

Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/intel_guc_fwif.h   | 2 +-
 drivers/gpu/drm/i915/intel_guc_loader.c | 8 
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index b627206b8f56..5db3def5f74e 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -135,7 +135,7 @@
 #define   GUC_ADS_ADDR_SHIFT11
 #define   GUC_ADS_ADDR_MASK0xf800

-#define GUC_CTL_RSRVD9
+#define GUC_CTL_SHARED_DATA9

 #define GUC_CTL_MAX_DWORDS(SOFT_SCRATCH_COUNT - 2) /* [1..14] */

diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c
b/drivers/gpu/drm/i915/intel_guc_loader.c
index 7d92321f8731..afa584864cb5 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -119,6 +119,7 @@ static void guc_params_init(struct
drm_i915_private *dev_priv)
 {
 struct intel_guc *guc = &dev_priv->guc;
 u32 params[GUC_CTL_MAX_DWORDS];
+struct i915_gem_context *ctx;
 int i;

 memset(&params, 0, sizeof(params));
@@ -167,6 +168,13 @@ static void guc_params_init(struct
drm_i915_private *dev_priv)
 params[GUC_CTL_FEATURE] &= ~GUC_CTL_DISABLE_SCHEDULER;
 }

+/*
+ * For watchdog / media reset, GuC must know the address of the
shared
+ * data page, which is the first page of the default context.
+ */
+ctx = dev_priv->kernel_context;
+params[GUC_CTL_SHARED_DATA] =
i915_ggtt_offset(ctx->engine[RCS].state);


Since we use this page in several places (here, in the engine reset h2g
and if I'm not mistaken also in the preemption h2g), is it worth saving
its ggtt offset inside the guc struct to make things cleaner?



In suspend/resume too.




+
 I915_WRITE(SOFT_SCRATCH(0), 0);

 for (i = 0; i < GUC_CTL_MAX_DWORDS; i++)


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 04/20] drm/i915/tdr: Modify error handler for per engine hang recovery

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

This is a preparatory patch which modifies error handler to do per engine
hang recovery. The actual patch which implements this sequence follows
later in the series. The aim is to prepare existing recovery function to
adapt to this new function where applicable (which fails at this point
because core implementation is lacking) and continue recovery using legacy
full gpu reset.

A helper function is also added to query the availability of engine
reset.

The error events behaviour that are used to notify user of reset are
adapted to engine reset such that it doesn't break users listening to these
events. In legacy we report an error event, a reset event before resetting
the gpu and a reset done event marking the completion of reset. The same
behaviour is adapted but reset event is only dispatched once even when
multiple engines are hung. Finally once reset is complete we send reset
done event as usual.

Note that this implementation of engine reset is for i915 directly
submitting to the ELSP, where the driver manages the hang detection,
recovery and resubmission. With GuC submission these tasks are shared
between driver and firmware; i915 will still be responsible for detecting a
hang, and when it does it will have to request GuC to reset that Engine and
remind the firmware about the outstanding submissions. This will be
added in different patch.

v2: rebase, advertise engine reset availability in platform definition,
add note about GuC submission.
v3: s/*engine_reset*/*reset_engine*/. (Chris)
Handle reset as 2 level resets, by first going to engine only and falling
back to full/chip reset as needed, i.e. reset_engine will need the
struct_mutex.
v4: Pass the engine mask to i915_reset. (Chris)
v5: Rebase, update selftests.
v6: Rebase, prepare for mutex-less reset engine.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Ian Lister 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 15 +++
 drivers/gpu/drm/i915/i915_drv.h |  6 ++
 drivers/gpu/drm/i915/i915_irq.c | 30 +-
 drivers/gpu/drm/i915/i915_pci.c |  5 -
 drivers/gpu/drm/i915/intel_ringbuffer.h | 16 
 drivers/gpu/drm/i915/intel_uncore.c | 11 +++
 6 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index cc7393e65e99..e03d0643dbd6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1800,6 +1800,8 @@ void i915_reset(struct drm_i915_private *dev_priv)
if (!test_bit(I915_RESET_HANDOFF, &error->flags))
return;
 
+   DRM_DEBUG_DRIVER("resetting chip\n");
+
/* Clear any previous failed attempts at recovery. Time to try again. */
if (!i915_gem_unset_wedged(dev_priv))
goto wakeup;
@@ -1863,6 +1865,19 @@ void i915_reset(struct drm_i915_private *dev_priv)
goto finish;
 }
 
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ */
+int i915_reset_engine(struct intel_engine_cs *engine)
+{
+   /* FIXME: replace me with engine reset sequence */
+   return -ENODEV;
+}
+
 static int i915_pm_suspend(struct device *kdev)
 {
struct pci_dev *pdev = to_pci_dev(kdev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e06af46f5a57..7bc5f552add7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -814,6 +814,7 @@ struct intel_csr {
func(has_ddi); \
func(has_decoupled_mmio); \
func(has_dp_mst); \
+   func(has_reset_engine); \
func(has_fbc); \
func(has_fpga_dbg); \
func(has_full_ppgtt); \
@@ -1616,6 +1617,9 @@ struct i915_gpu_error {
 #define I915_RESET_HANDOFF 1
 #define I915_WEDGED(BITS_PER_LONG - 1)
 
+   /* if available, engine-specific reset is tried before full gpu reset */
+   u32 reset_engine_mask;
+
/**
 * Waitqueue to signal when a hang is detected. Used to for waiters
 * to release the struct_mutex for the reset to procede.
@@ -3019,6 +3023,8 @@ extern void i915_driver_unload(struct drm_device *dev);
 extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
 extern void i915_reset(struct drm_i915_private *dev_priv);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
+extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(s

[Intel-gfx] [PATCH v6 01/20] drm/i915: Fix stale comment about I915_RESET_IN_PROGRESS flag

2017-04-18 Thread Michel Thierry
It has been replaced by I915_RESET_BACKOFF / I915_RESET_HANDOFF.

Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b5c81de102e8..e06af46f5a57 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1568,7 +1568,7 @@ struct i915_gpu_error {
 *
 * This is a counter which gets incremented when reset is triggered,
 *
-* Before the reset commences, the I915_RESET_IN_PROGRESS bit is set
+* Before the reset commences, the I915_RESET_BACKOFF bit is set
 * meaning that any waiters holding onto the struct_mutex should
 * relinquish the lock immediately in order for the reset to start.
 *
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 08/20] drm/i915/tdr: Export per-engine reset count info to debugfs

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

A new variable is added to export the reset counts to debugfs, this
includes full gpu reset and engine reset count. This is useful for tests
where they are expected to trigger reset; these counts are checked before
and after the test to ensure the same.

v2: Include reset engine count in i915_engine_info too (Chris).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_debugfs.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470177b5..6444c1a9bd22 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1403,6 +1403,23 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
return 0;
 }
 
+static int i915_reset_info(struct seq_file *m, void *unused)
+{
+   struct drm_i915_private *dev_priv = node_to_i915(m->private);
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+
+   seq_printf(m, "full gpu reset = %u\n", i915_reset_count(error));
+
+   for_each_engine(engine, dev_priv, id) {
+   seq_printf(m, "%s = %u\n", engine->name,
+  i915_reset_engine_count(error, engine));
+   }
+
+   return 0;
+}
+
 static int ironlake_drpc_info(struct seq_file *m)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
@@ -3242,6 +3259,7 @@ static int i915_display_info(struct seq_file *m, void 
*unused)
 static int i915_engine_info(struct seq_file *m, void *unused)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
struct intel_engine_cs *engine;
enum intel_engine_id id;
 
@@ -3265,6 +3283,8 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
   engine->hangcheck.seqno,
   jiffies_to_msecs(jiffies - 
engine->hangcheck.action_timestamp),
   engine->timeline->inflight_seqnos);
+   seq_printf(m, "\tReset count: %d\n",
+  i915_reset_engine_count(error, engine));
 
rcu_read_lock();
 
@@ -4777,6 +4797,7 @@ static const struct drm_info_list i915_debugfs_list[] = {
{"i915_huc_load_status", i915_huc_load_status_info, 0},
{"i915_frequency_info", i915_frequency_info, 0},
{"i915_hangcheck_info", i915_hangcheck_info, 0},
+   {"i915_reset_info", i915_reset_info, 0},
{"i915_drpc_info", i915_drpc_info, 0},
{"i915_emon_status", i915_emon_status, 0},
{"i915_ring_freq_table", i915_ring_freq_table, 0},
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 07/20] drm/i915/tdr: Add engine reset count to error state

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

The driver maintains a count of how many times a given engine has been reset;
it is useful to capture this in the error state as well. It gives an idea of
how the engine was coping with the workloads it was executing before this
error state.

A follow-up patch will provide this information in debugfs.

v2: s/engine_reset/reset_engine/ (Chris)
Define count as unsigned int (Tvrtko)

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c   | 2 ++
 drivers/gpu/drm/i915/i915_drv.h   | 8 
 drivers/gpu/drm/i915/i915_gpu_error.c | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 634893cd93b3..974be1fa77f9 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1935,6 +1935,8 @@ int i915_reset_engine(struct intel_engine_cs *engine)
if (ret)
goto error;
 
+   error->reset_engine_count[engine->id]++;
+
 wakeup:
enable_irq(dev_priv->drm.irq);
intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index eb3e5bcda478..71c34f15be64 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -982,6 +982,7 @@ struct i915_gpu_state {
enum intel_engine_hangcheck_action hangcheck_action;
struct i915_address_space *vm;
int num_requests;
+   u32 reset_count;
 
/* position of active request inside the ring */
u32 rq_head, rq_post, rq_tail;
@@ -1619,6 +1620,7 @@ struct i915_gpu_error {
 
/* if available, engine-specific reset is tried before full gpu reset */
u32 reset_engine_mask;
+   u32 reset_engine_count[I915_NUM_ENGINES];
 
/**
 * Waitqueue to signal when a hang is detected. Used to for waiters
@@ -3442,6 +3444,12 @@ static inline u32 i915_reset_count(struct i915_gpu_error 
*error)
return READ_ONCE(error->reset_count);
 }
 
+static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
+ struct intel_engine_cs *engine)
+{
+   return READ_ONCE(error->reset_engine_count[engine->id]);
+}
+
 int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
   unsigned int engine_mask);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 14e2064b7653..a2ffb1ef2cfa 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -463,6 +463,7 @@ static void error_print_engine(struct 
drm_i915_error_state_buf *m,
err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
   ee->hangcheck_timestamp,
   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
+   err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
@@ -1244,6 +1245,8 @@ static void error_record_engine_registers(struct 
i915_gpu_state *error,
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
ee->hangcheck_action = engine->hangcheck.action;
ee->hangcheck_stalled = engine->hangcheck.stalled;
+   ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
+ engine);
 
if (USES_PPGTT(dev_priv)) {
int i;
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 03/20] drm/i915: Update i915.reset to handle engine resets

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

In preparation for engine reset work update this parameter to handle more
than one type of reset. Default at the moment is still full gpu reset.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_params.c | 6 +++---
 drivers/gpu/drm/i915/i915_params.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_params.c 
b/drivers/gpu/drm/i915/i915_params.c
index b6a7e363d076..045cadb77285 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -46,7 +46,7 @@ struct i915_params i915 __read_mostly = {
.prefault_disable = 0,
.load_detect_test = 0,
.force_reset_modeset_test = 0,
-   .reset = true,
+   .reset = 1,
.error_capture = true,
.invert_brightness = 0,
.disable_display = 0,
@@ -115,8 +115,8 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
"Override/Ignore selection of SDVO panel mode in the VBT "
"(-2=ignore, -1=auto [default], index in VBT BIOS table)");
 
-module_param_named_unsafe(reset, i915.reset, bool, 0600);
-MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
+module_param_named_unsafe(reset, i915.reset, int, 0600);
+MODULE_PARM_DESC(reset, "Attempt GPU resets (0=disabled, 1=full gpu reset 
[default], 2=engine reset)");
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 module_param_named(error_capture, i915.error_capture, bool, 0600);
diff --git a/drivers/gpu/drm/i915/i915_params.h 
b/drivers/gpu/drm/i915/i915_params.h
index 34148cc8637c..febbfdbd30bd 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -51,6 +51,7 @@
func(int, use_mmio_flip); \
func(int, mmio_debug); \
func(int, edp_vswing); \
+   func(int, reset); \
func(unsigned int, inject_load_failure); \
/* leave bools at the end to not create holes */ \
func(bool, alpha_support); \
@@ -60,7 +61,6 @@
func(bool, prefault_disable); \
func(bool, load_detect_test); \
func(bool, force_reset_modeset_test); \
-   func(bool, reset); \
func(bool, error_capture); \
func(bool, disable_display); \
func(bool, verbose_state_checks); \
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 02/20] drm/i915: Rename gen8_(un)request_engine_reset to gen8_reset_engine_start/cancel

2017-04-18 Thread Michel Thierry
As all other functions related to resetting engines are using
reset_engine.

v2: remove _request_ and use start/cancel instead (Chris)

Cc: Chris Wilson 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/intel_uncore.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index a10d8863b0a9..07a722f74fa1 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1683,7 +1683,7 @@ int intel_wait_for_register(struct drm_i915_private 
*dev_priv,
return ret;
 }
 
-static int gen8_request_engine_reset(struct intel_engine_cs *engine)
+static int gen8_reset_engine_start(struct intel_engine_cs *engine)
 {
struct drm_i915_private *dev_priv = engine->i915;
int ret;
@@ -1702,7 +1702,7 @@ static int gen8_request_engine_reset(struct 
intel_engine_cs *engine)
return ret;
 }
 
-static void gen8_unrequest_engine_reset(struct intel_engine_cs *engine)
+static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
 {
struct drm_i915_private *dev_priv = engine->i915;
 
@@ -1717,14 +1717,14 @@ static int gen8_reset_engines(struct drm_i915_private 
*dev_priv,
unsigned int tmp;
 
for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-   if (gen8_request_engine_reset(engine))
+   if (gen8_reset_engine_start(engine))
goto not_ready;
 
return gen6_reset_engines(dev_priv, engine_mask);
 
 not_ready:
for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-   gen8_unrequest_engine_reset(engine);
+   gen8_reset_engine_cancel(engine);
 
return -EIO;
 }
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 06/20] drm/i915: Skip reset request if there is one already

2017-04-18 Thread Michel Thierry
From: Mika Kuoppala 

To perform engine reset we first disable engine to capture its state. This
is done by issuing a reset request. Because we are reusing existing
infrastructure, again when we actually reset an engine, reset function
checks engine mask and issues reset request again which is unnecessary. To
avoid this we check if the engine is already prepared, if so we just exit
from that point.

Cc: Chris Wilson 
Signed-off-by: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/intel_uncore.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index 3ebba6b2dd74..120fb440bb8b 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1686,10 +1686,15 @@ int intel_wait_for_register(struct drm_i915_private 
*dev_priv,
 static int gen8_reset_engine_start(struct intel_engine_cs *engine)
 {
struct drm_i915_private *dev_priv = engine->i915;
+   const i915_reg_t reset_ctrl = RING_RESET_CTL(engine->mmio_base);
+   const u32 ready = RESET_CTL_REQUEST_RESET | RESET_CTL_READY_TO_RESET;
int ret;
 
-   I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
- _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
+   /* If engine has been already prepared, we can shortcut here */
+   if ((I915_READ_FW(reset_ctrl) & ready) == ready)
+   return 0;
+
+   I915_WRITE_FW(reset_ctrl, _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
 
ret = intel_wait_for_register_fw(dev_priv,
 RING_RESET_CTL(engine->mmio_base),
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 05/20] drm/i915/tdr: Add support for per engine reset recovery

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

This change implements support for per-engine reset as an initial, less
intrusive hang recovery option to be attempted before falling back to the
legacy full GPU reset recovery mode if necessary. This is only supported
from Gen8 onwards.

Hangchecker determines which engines are hung and invokes error handler to
recover from it. Error handler schedules recovery for each of those engines
that are hung. The recovery procedure is as follows,
 - identifies the request that caused the hang and it is dropped
 - force engine to idle: this is done by issuing a reset request
 - reset and re-init engine
 - restart submissions to the engine

If engine reset fails then we fall back to the heavyweight full gpu reset,
which resets all engines and reinitializes the complete state of HW and SW.

v2: Rebase.
v3: s/*engine_reset*/*reset_engine*/; freeze engine and irqs before
calling i915_gem_reset_engine (Chris).
v4: Rebase, modify i915_gem_reset_prepare to use a ring mask and
reuse the function for reset_engine.
v5: intel_reset_engine_start/cancel instead of request/unrequest_reset.
v6: Clean up reset_engine function to not require mutex, i.e. no need to call
revoke/restore_fences and _retire_requests (Chris).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 76 --
 drivers/gpu/drm/i915/i915_drv.h | 12 +++-
 drivers/gpu/drm/i915/i915_gem.c | 97 +++--
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 drivers/gpu/drm/i915/intel_uncore.c | 20 +++
 5 files changed, 158 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index e03d0643dbd6..634893cd93b3 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1810,7 +1810,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
 
pr_notice("drm/i915: Resetting chip after gpu hang\n");
disable_irq(dev_priv->drm.irq);
-   ret = i915_gem_reset_prepare(dev_priv);
+   ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
if (ret) {
DRM_ERROR("GPU recovery failed\n");
intel_gpu_reset(dev_priv, ALL_ENGINES);
@@ -1852,7 +1852,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
i915_queue_hangcheck(dev_priv);
 
 finish:
-   i915_gem_reset_finish(dev_priv);
+   i915_gem_reset_finish(dev_priv, ALL_ENGINES);
enable_irq(dev_priv->drm.irq);
 
 wakeup:
@@ -1871,11 +1871,79 @@ void i915_reset(struct drm_i915_private *dev_priv)
  *
  * Reset a specific GPU engine. Useful if a hang is detected.
  * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ *  - identifies the request that caused the hang and it is dropped
+ *  - force engine to idle: this is done by issuing a reset request
+ *  - reset engine
+ *  - restart submissions to the engine
  */
 int i915_reset_engine(struct intel_engine_cs *engine)
 {
-   /* FIXME: replace me with engine reset sequence */
-   return -ENODEV;
+   int ret;
+   struct drm_i915_private *dev_priv = engine->i915;
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+   GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
+
+   DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
+
+   /*
+* We need to first idle the engine by issuing a reset request,
+* then perform soft reset and re-initialize hw state, for all of
+* this GT power need to be awake so ensure it does throughout the
+* process
+*/
+   intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+
+   disable_irq(dev_priv->drm.irq);
+   ret = i915_gem_reset_prepare_engine(engine);
+   if (ret) {
+   DRM_ERROR("Previous reset failed - promote to full reset\n");
+   goto error;
+   }
+
+   /*
+* the request that caused the hang is stuck on elsp, identify the
+* active request and drop it, adjust head to skip the offending
+* request to resume executing remaining requests in the queue.
+*/
+   i915_gem_reset_engine(engine);
+
+   /* forcing engine to idle */
+   ret = intel_reset_engine_start(engine);
+   if (ret) {
+   DRM_ERROR("Failed to disable %s\n", engine->name);
+   goto error;
+   }
+
+   /* finally, reset engine */
+   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+   if (ret) {
+   DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+   intel_reset_engine_cancel(engine);
+   goto error;
+   }
+
+   /* be sure the request reset bit gets cleared */
+   intel_reset_engine_cancel(engine);
+
+   i915_gem_

[Intel-gfx] [PATCH v6 00/20] Gen8+ engine-reset

2017-04-18 Thread Michel Thierry
These patches add the reset-engine feature from Gen8. This is also
referred to as Timeout detection and recovery (TDR). It complements the
full gpu reset feature available in i915, but it resets only a
particular engine instead of all engines, thus providing a lightweight
engine reset and recovery mechanism.

Thanks to recent changes merged, this implementation is now not only for
execlists, but for GuC based submission too; it is still limited from
Gen8 onwards. I have also included the changes for watchdog timeout
detection. The GuC related patches are functional, but can be seen as RFC.

Timeout detection relies on the existing hangcheck, which remains the same;
main changes are to the recovery mechanism. Once we detect a hang on a
particular engine we identify the request that caused the hang, skip the
request and adjust head pointers to allow the execution to proceed
normally. After some cleanup, submissions are restarted to process
remaining work queued to that engine.

If engine reset fails to recover engine correctly then we fallback to full
gpu reset.

We can argue about the effectiveness of reset-engine vs full reset when
more than one ring is hung, but the benefits of just resetting one engine
are reduced when the driver has to do it multiple times.

v2: ELSP queue request tracking and reset path changes to handle incomplete
requests during reset. Thanks to Chris Wilson for providing these patches.

v3: Let the waiter keep handling the full gpu reset if it already has the
lock; point out that GuC submission needs a different method to restart
workloads after the engine reset completes.

v4: Handle reset as a 2-level reset, by first trying an engine-only reset and
falling back to full/chip reset as needed, i.e. reset_engine will need the
struct_mutex.

v5: Rebased after reset flag split in 2, add GuC support, include watchdog
detection patches, addressing comments from prev RFC.

v6: Mutex-less reset engine. Updates in watchdog abi and guc whitelist &
register-restore fixes (including an old patch from Daniele).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Daniele Ceraolo Spurio 

Arun Siluvery (7):
  drm/i915: Update i915.reset to handle engine resets
  drm/i915/tdr: Modify error handler for per engine hang recovery
  drm/i915/tdr: Add support for per engine reset recovery
  drm/i915/tdr: Add engine reset count to error state
  drm/i915/tdr: Export per-engine reset count info to debugfs
  drm/i915/tdr: Enable Engine reset and recovery support
  drm/i915/guc: Provide register list to be saved/restored during engine
reset

Daniele Ceraolo Spurio (1):
  drm/i915/guc: fix mmio whitelist mmio_start offset and add reminder

Michel Thierry (11):
  drm/i915: Fix stale comment about I915_RESET_IN_PROGRESS flag
  drm/i915: Rename gen8_(un)request_engine_reset to
gen8_reset_engine_start/cancel
  drm/i915: Add engine reset count in get-reset-stats ioctl
  drm/i915/selftests: reset engine self tests
  drm/i915/guc: Add support for reset engine using GuC commands
  drm/i915: Watchdog timeout: Pass GuC shared data structure during
param load
  drm/i915: Watchdog timeout: IRQ handler for gen8+
  drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+
  drm/i915: Watchdog timeout: DRM kernel interface to set the timeout
  drm/i915: Watchdog timeout: Include threshold value in error state
  drm/i915: Watchdog timeout: Export media reset count from GuC to
debugfs

Mika Kuoppala (1):
  drm/i915: Skip reset request if there is one already

 drivers/gpu/drm/i915/i915_debugfs.c  |  43 +++
 drivers/gpu/drm/i915/i915_drv.c  | 104 ++-
 drivers/gpu/drm/i915/i915_drv.h  |  63 -
 drivers/gpu/drm/i915/i915_gem.c  |  97 --
 drivers/gpu/drm/i915/i915_gem_context.c  | 116 -
 drivers/gpu/drm/i915/i915_gem_context.h  |   4 +
 drivers/gpu/drm/i915/i915_gem_request.c  |   2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c|  14 +-
 drivers/gpu/drm/i915/i915_guc_submission.c   | 128 +--
 drivers/gpu/drm/i915/i915_irq.c  |  43 ++-
 drivers/gpu/drm/i915/i915_params.c   |   6 +-
 drivers/gpu/drm/i915/i915_params.h   |   2 +-
 drivers/gpu/drm/i915/i915_pci.c  |   5 +-
 drivers/gpu/drm/i915/i915_reg.h  |   6 +
 drivers/gpu/drm/i915/intel_guc_fwif.h|  27 +++-
 drivers/gpu/drm/i915/intel_guc_loader.c  |  11 ++
 drivers/gpu/drm/i915/intel_hangcheck.c   |  13 +-
 drivers/gpu/drm/i915/intel_lrc.c | 156 ++-
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  24 
 drivers/gpu/drm/i915/intel_uc.h  |   3 +
 drivers/gpu/drm/i915/intel_uncore.c  |  43 ++-
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 147 +
 include/uapi/drm/i915_drm.h  |

[Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-18 Thread Michel Thierry
Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 60000 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

Note, UABI engine ids and i915 engine ids are different, and this patch
uses the i915 ones. Some kind of mapping table [1] is required if we
decide to use the UABI engine ids.

[1] 
http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-ch...@chris-wilson.co.uk

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

v3: Add a way to get array size, short-cut to disable all thresholds,
return EFAULT / EINVAL as needed. Move the capture of the threshold
value in the error state into a new patch. BXT has a different
timestamp base (because why not?).

Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h |  29 +
 drivers/gpu/drm/i915/i915_gem_context.c | 102 
 drivers/gpu/drm/i915/intel_lrc.c|   5 +-
 include/uapi/drm/i915_drm.h |   1 +
 4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 203f2112dd18..f65a236fddef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
return &vm->timeline.engine[engine->id];
 }
 
+/*
+ * BDW & SKL+ Timestamp timer resolution = 0.080 uSec,
+ * or 12500000 counts per second, or ~12 counts per microsecond.
+ *
+ * But Broxton Timestamp timer resolution is different, 0.052 uSec,
+ * or 19200000 counts per second, or ~19 counts per microsecond.
+ */
+#define SKL_TIMESTAMP_CNTS_PER_USEC 12
+#define BXT_TIMESTAMP_CNTS_PER_USEC 19
+#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \
+  BXT_TIMESTAMP_CNTS_PER_USEC : \
+  SKL_TIMESTAMP_CNTS_PER_USEC)
+static inline u32
+watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)
+{
+   return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv);
+}
+
+static inline u32
+watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
+{
+   u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv);
+
+   if (overflows_type(threshold, u32))
+   return -EINVAL;
+
+   return threshold;
+}
+
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index edbed85a1c88..85a6467a25a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev)
return ctx;
 }
 
+/* Return the timer count threshold in microseconds. */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+ struct drm_i915_gem_context_param *args)
+{
+   struct drm_i915_private *dev_priv = ctx->i915;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u32 threshold_in_us[I915_NUM_ENGINES];
+
+   if (args->size == 0)
+   goto out;
+
+   if (args->size < sizeof(threshold_in_us))
+   return -EFAULT;
+
+   if (!dev_priv->engine[VCS]->emit_start_watchdog)
+   return -ENODEV;
+
+   for_each_engine(engine, dev_priv, id) {
+   struct intel_context *ce = &ctx->engine[id];
+
+   threshold_in_us[id] = watchdog_to_us(dev_priv,
+ce->watchdog_threshold);
+   }
+
+   mutex_unlock(&dev_priv->drm.struct_mutex);
+   if (__copy_to_user(u64_to_user_ptr(args->value),
+  &threshold_in_us,
+  sizeof(threshold_in_us))) {
+   mutex_lock(&dev_priv->drm.struct_mutex);
+   return -EFAULT;
+   }
+   mutex_lock(&dev_priv->drm.struct_mutex);
+
+out:
+   args->size = sizeof(threshold_in_us);
+
+   return 0;
+}
+
+/*
+ * Based on time out value in microseconds (us) calculate
+ * timer count thresholds needed based on core frequency.
+ * Watchdog can be disabled by setting it to 0.
+ */
+int i915_gem_context_

[Intel-gfx] [PATCH v6 15/20] drm/i915: Watchdog timeout: Pass GuC shared data structure during param load

2017-04-18 Thread Michel Thierry
For watchdog / media reset, the firmware must know the address of the shared
data page (the first page of the default context).

This information should be in DWORD 9 of the GUC_CTL structure.

v2: Use guc_ggtt_offset (Chris).
Store the ggtt offset of the default ctx as it is needed for
suspend/resume/reset (Daniele).

Cc: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 21 ++---
 drivers/gpu/drm/i915/intel_guc_fwif.h  |  2 +-
 drivers/gpu/drm/i915/intel_guc_loader.c| 11 +++
 drivers/gpu/drm/i915/intel_uc.h|  2 ++
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index c8067aeab6f4..58d6c570a188 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1364,7 +1364,6 @@ void i915_guc_submission_reenable_engine(struct 
intel_engine_cs *engine)
 int intel_guc_suspend(struct drm_i915_private *dev_priv)
 {
struct intel_guc *guc = &dev_priv->guc;
-   struct i915_gem_context *ctx;
u32 data[3];
 
if (guc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
@@ -1372,13 +1371,11 @@ int intel_guc_suspend(struct drm_i915_private *dev_priv)
 
gen9_disable_guc_interrupts(dev_priv);
 
-   ctx = dev_priv->kernel_context;
-
data[0] = INTEL_GUC_ACTION_ENTER_S_STATE;
/* any value greater than GUC_POWER_D0 */
data[1] = GUC_POWER_D1;
-   /* first page is shared data with GuC */
-   data[2] = guc_ggtt_offset(ctx->engine[RCS].state);
+   /* first page of default ctx is shared data with GuC */
+   data[2] = guc->shared_data_offset;
 
return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
@@ -1390,7 +1387,6 @@ int intel_guc_suspend(struct drm_i915_private *dev_priv)
 int intel_guc_resume(struct drm_i915_private *dev_priv)
 {
struct intel_guc *guc = &dev_priv->guc;
-   struct i915_gem_context *ctx;
u32 data[3];
 
if (guc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
@@ -1399,12 +1395,10 @@ int intel_guc_resume(struct drm_i915_private *dev_priv)
if (i915.guc_log_level >= 0)
gen9_enable_guc_interrupts(dev_priv);
 
-   ctx = dev_priv->kernel_context;
-
data[0] = INTEL_GUC_ACTION_EXIT_S_STATE;
data[1] = GUC_POWER_D0;
-   /* first page is shared data with GuC */
-   data[2] = guc_ggtt_offset(ctx->engine[RCS].state);
+   /* first page of default ctx is shared data with GuC */
+   data[2] = guc->shared_data_offset;
 
return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
@@ -1413,14 +1407,11 @@ int i915_guc_reset_engine(struct intel_engine_cs 
*engine)
 {
struct drm_i915_private *dev_priv = engine->i915;
struct intel_guc *guc = &dev_priv->guc;
-   struct i915_gem_context *ctx;
u32 data[7];
 
if (!i915.enable_guc_submission)
return 0;
 
-   ctx = dev_priv->kernel_context;
-
/*
 * The affected context report is populated by GuC and is provided
 * to the driver using the shared page. We request for it but don't
@@ -1432,8 +1423,8 @@ int i915_guc_reset_engine(struct intel_engine_cs *engine)
data[3] = 0;
data[4] = 0;
data[5] = guc->execbuf_client->stage_id;
-   /* first page is shared data with GuC */
-   data[6] = guc_ggtt_offset(ctx->engine[RCS].state);
+   /* first page of default ctx is shared data with GuC */
+   data[6] = guc->shared_data_offset;
 
return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 081f2cf614e6..a2d0cba2f8b9 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -135,7 +135,7 @@
 #define   GUC_ADS_ADDR_SHIFT   11
 #define   GUC_ADS_ADDR_MASK0xf800
 
-#define GUC_CTL_RSRVD  9
+#define GUC_CTL_SHARED_DATA9
 
 #define GUC_CTL_MAX_DWORDS (SOFT_SCRATCH_COUNT - 2) /* [1..14] */
 
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c 
b/drivers/gpu/drm/i915/intel_guc_loader.c
index d9045b6e897b..8cd5c2bf9510 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -108,6 +108,7 @@ static void guc_params_init(struct drm_i915_private 
*dev_priv)
 {
struct intel_guc *guc = &dev_priv->guc;
u32 params[GUC_CTL_MAX_DWORDS];
+   struct i915_gem_context *ctx;
int i;
 
memset(&params, 0, sizeof(params));
@@ -156,6 +157,16 @@ static void guc_params_init(struct drm_i915_private 
*dev_priv)
params[GUC_CTL_FEATURE] &= ~GUC_CTL_DISABLE_SCHEDULER;
}
 
+   /*
+* For watchdog / media

[Intel-gfx] [PATCH v6 20/20] drm/i915: Watchdog timeout: Export media reset count from GuC to debugfs

2017-04-18 Thread Michel Thierry
From firmware v8.8, GuC provides the count of media engine resets
(watchdog timeout). This information is available in the GuC shared
context data struct, which resides in the first page of the default
(kernel) lrc context.

Since GuC handled engine resets are transparent for kernel and user,
provide a simple debugfs entry to see the number of times media reset
has happened.

v2: Remove unnecessary struct_mutex, _get_dirty_page and kmap_atomic;
use READ_ONCE. (Chris)

Cc: Chris Wilson 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 22 ++
 drivers/gpu/drm/i915/intel_guc_fwif.h | 18 ++
 2 files changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 6444c1a9bd22..35ce771c8b8f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1403,6 +1403,26 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
return 0;
 }
 
+static u32 i915_watchdog_reset_count(struct drm_i915_private *dev_priv)
+{
+   struct i915_gem_context *ctx;
+   struct page *page;
+   struct guc_shared_ctx_data *guc_shared_data;
+   u32 guc_media_reset_count;
+
+   if (!i915.enable_guc_submission)
+   return 0;
+
+   ctx = dev_priv->kernel_context;
+   page = i915_gem_object_get_page(ctx->engine[RCS].state->obj,
+   LRC_GUCSHR_PN);
+   guc_shared_data = kmap(page);
+   guc_media_reset_count = READ_ONCE(guc_shared_data->media_reset_count);
+   kunmap(page);
+
+   return guc_media_reset_count;
+}
+
 static int i915_reset_info(struct seq_file *m, void *unused)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
@@ -1411,6 +1431,8 @@ static int i915_reset_info(struct seq_file *m, void 
*unused)
enum intel_engine_id id;
 
seq_printf(m, "full gpu reset = %u\n", i915_reset_count(error));
+   seq_printf(m, "GuC watchdog/media reset = %u\n",
+  i915_watchdog_reset_count(dev_priv));
 
for_each_engine(engine, dev_priv, id) {
seq_printf(m, "%s = %u\n", engine->name,
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index a2d0cba2f8b9..e45987f7aa50 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -502,6 +502,24 @@ union guc_log_control {
u32 value;
 } __packed;
 
+/* GuC Shared Context Data Struct */
+struct guc_shared_ctx_data {
+   u32 addr_of_last_preempted_data_low;
+   u32 addr_of_last_preempted_data_high;
+   u32 addr_of_last_preempted_data_high_tmp;
+   u32 padding;
+   u32 is_mapped_to_proxy;
+   u32 proxy_ctx_id;
+   u32 engine_reset_ctx_id;
+   u32 media_reset_count;
+   u32 reserved[8];
+   u32 uk_last_ctx_switch_reason;
+   u32 was_reset;
+   u32 lrca_gpu_addr;
+   u32 execlist_ctx;
+   u32 reserved1[32];
+} __packed;
+
 /* This Action will be programmed in C180 - SOFT_SCRATCH_O_REG */
 enum intel_guc_action {
INTEL_GUC_ACTION_DEFAULT = 0x0,
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 19/20] drm/i915: Watchdog timeout: Include threshold value in error state

2017-04-18 Thread Michel Thierry
Save the watchdog threshold (in us) as part of the engine state.

Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h   |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 11 +++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f65a236fddef..02ee26199cfe 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1022,6 +1022,7 @@ struct i915_gpu_state {
int ban_score;
int active;
int guilty;
+   int watchdog_threshold;
} context;
 
struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index a2ffb1ef2cfa..1b1a49bc0c3c 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -388,9 +388,10 @@ static void error_print_context(struct 
drm_i915_error_state_buf *m,
const char *header,
const struct drm_i915_error_context *ctx)
 {
-   err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d 
active %d\n",
+   err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d 
active %d, watchdog %dus\n",
   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
-  ctx->ban_score, ctx->guilty, ctx->active);
+  ctx->ban_score, ctx->guilty, ctx->active,
+  watchdog_to_us(m->i915, ctx->watchdog_threshold));
 }
 
 static void error_print_engine(struct drm_i915_error_state_buf *m,
@@ -1344,7 +1345,8 @@ static void error_record_engine_execlists(struct 
intel_engine_cs *engine,
 }
 
 static void record_context(struct drm_i915_error_context *e,
-  struct i915_gem_context *ctx)
+  struct i915_gem_context *ctx,
+  u32 engine_id)
 {
if (ctx->pid) {
struct task_struct *task;
@@ -1363,6 +1365,7 @@ static void record_context(struct drm_i915_error_context 
*e,
e->ban_score = ctx->ban_score;
e->guilty = ctx->guilty_count;
e->active = ctx->active_count;
+   e->watchdog_threshold = ctx->engine[engine_id].watchdog_threshold;
 }
 
 static void request_record_user_bo(struct drm_i915_gem_request *request,
@@ -1426,7 +1429,7 @@ static void i915_gem_record_rings(struct drm_i915_private 
*dev_priv,
ee->vm = request->ctx->ppgtt ?
&request->ctx->ppgtt->base : &ggtt->base;
 
-   record_context(&ee->context, request->ctx);
+   record_context(&ee->context, request->ctx, engine->id);
 
/* We need to copy these to an anonymous buffer
 * as the simplest method to avoid being overwritten
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 17/20] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+

2017-04-18 Thread Michel Thierry
Emit the required commands into the ring buffer for starting and
stopping the watchdog timer before/after batch buffer start during
batch buffer submission.

v2: Support watchdog threshold per context engine, merge lri commands,
and move watchdog commands emission to emit_bb_start. Request space of
combined start_watchdog, bb_start and stop_watchdog to avoid any error
after emitting bb_start.

Signed-off-by: Tomas Elf 
Signed-off-by: Ian Lister 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_gem_context.h |  4 ++
 drivers/gpu/drm/i915/intel_lrc.c| 81 -
 drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
b/drivers/gpu/drm/i915/i915_gem_context.h
index 4af2ab94558b..88700bdbb4e1 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -150,6 +150,10 @@ struct i915_gem_context {
u32 *lrc_reg_state;
u64 lrc_desc;
int pin_count;
+   /** watchdog_threshold: hw watchdog threshold value,
+* in clock counts
+*/
+   u32 watchdog_threshold;
bool initialised;
} engine[I915_NUM_ENGINES];
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2263b9fb9b50..7a202e73ce9b 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1310,6 +1310,8 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request 
*req,
  const unsigned int flags)
 {
u32 *cs;
+   u32 num_dwords;
+   bool watchdog_running = false;
int ret;
 
/* Don't rely in hw updating PDPs, specially in lite-restore.
@@ -1329,10 +1331,29 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
req->ctx->ppgtt->pd_dirty_rings &= 
~intel_engine_flag(req->engine);
}
 
-   cs = intel_ring_begin(req, 4);
+   /* bb_start only */
+   num_dwords = 4;
+
+   /* check if watchdog will be required */
+   if (req->ctx->engine[req->engine->id].watchdog_threshold != 0) {
+   if (!req->engine->emit_start_watchdog ||
+   !req->engine->emit_stop_watchdog)
+   return -EINVAL;
+
+   /* + start_watchdog (6) + stop_watchdog (4) */
+   num_dwords += 10;
+   watchdog_running = true;
+   }
+
+   cs = intel_ring_begin(req, num_dwords);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
+   if (watchdog_running) {
+   /* Start watchdog timer */
+   cs = req->engine->emit_start_watchdog(req, cs);
+   }
+
/* FIXME(BDW): Address space and security selectors. */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
@@ -1340,8 +1361,13 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
*cs++ = MI_NOOP;
-   intel_ring_advance(req, cs);
 
+   if (watchdog_running) {
+   /* Cancel watchdog timer */
+   cs = req->engine->emit_stop_watchdog(req, cs);
+   }
+
+   intel_ring_advance(req, cs);
return 0;
 }
 
@@ -1508,6 +1534,49 @@ static void gen8_watchdog_irq_handler(unsigned long data)
intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
+static u32 *gen8_emit_start_watchdog(struct drm_i915_gem_request *req, u32 *cs)
+{
+   struct intel_engine_cs *engine = req->engine;
+   struct i915_gem_context *ctx = req->ctx;
+   struct intel_context *ce = &ctx->engine[engine->id];
+
+   /* XXX: no watchdog support in BCS engine */
+   GEM_BUG_ON(engine->id == BCS);
+
+   /*
+* watchdog register must never be programmed to zero. This would
+* cause the watchdog counter to exceed and not allow the engine to
+* go into IDLE state
+*/
+   GEM_BUG_ON(ce->watchdog_threshold == 0);
+
+   /* Set counter period */
+   *cs++ = MI_LOAD_REGISTER_IMM(2);
+   *cs++ = i915_mmio_reg_offset(RING_THRESH(engine->mmio_base));
+   *cs++ = ce->watchdog_threshold;
+   /* Start counter */
+   *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
+   *cs++ = GEN8_WATCHDOG_ENABLE;
+   *cs++ = MI_NOOP;
+
+   return cs;
+}
+
+static u32 *gen8_emit_stop_watchdog(struct drm_i915_gem_request *req, u32 *cs)
+{
+   struct intel_engine_cs *engine = req->engine;
+
+   /* XXX: no watchdog support in BCS engine */
+   GEM_BUG_ON(engine->id == BCS);
+
+   *cs++ = MI_LOAD_REGISTER_IMM(2);
+   *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->

[Intel-gfx] [PATCH v6 13/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

GuC expects a list of registers from the driver which are saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save and
restore. This is not an issue in case of engine reset as driver initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including resubmission
of hung workload), it is necessary to provide this list, otherwise GuC won't
be able to schedule further workloads after a reset. This is the minimal
set of registers identified for things to work as expected but if we see
any new issues, this register list can be expanded.

v2: REGSET_MASKED is too difficult for GuC, use REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers (Daniele).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 60 +-
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 1ea36a88d2fb..d772718861df 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1003,6 +1003,24 @@ static void guc_policies_init(struct guc_policies 
*policies)
policies->is_valid = 1;
 }
 
+/*
+ * In this macro it is highly unlikely to exceed max value but even if we did
+ * it is not an error so just throw a warning and continue. Only side effect
+ * in continuing further means some registers won't be added to save/restore
+ * list.
+ */
+#define GUC_ADD_MMIO_REG_ADS(node, reg_addr, _flags, defvalue) \
+   do {\
+   u32 __count = node->number_of_registers;\
+   if (WARN_ON(__count >= GUC_REGSET_MAX_REGISTERS))   \
+   continue;   \
+   node->registers[__count].offset = reg_addr.reg; \
+   node->registers[__count].flags = (_flags);  \
+   if (defvalue)   \
+   node->registers[__count].value = (defvalue);\
+   node->number_of_registers++;\
+   } while (0)
+
 static int guc_ads_create(struct intel_guc *guc)
 {
struct drm_i915_private *dev_priv = guc_to_i915(guc);
@@ -1016,6 +1034,7 @@ static int guc_ads_create(struct intel_guc *guc)
u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE];
} __packed *blob;
struct intel_engine_cs *engine;
+   struct i915_workarounds *workarounds = &dev_priv->workarounds;
enum intel_engine_id id;
u32 base;
 
@@ -1035,6 +1054,39 @@ static int guc_ads_create(struct intel_guc *guc)
 
/* MMIO reg state */
for_each_engine(engine, dev_priv, id) {
+   u32 i;
+   struct guc_mmio_regset *eng_reg =
+   &blob->reg_state.engine_reg[engine->guc_id];
+
+   /*
+* Provide a list of registers to be saved/restored during gpu
+* reset. This is mainly required for Media reset (aka watchdog
+* timeout) which is completely under the control of GuC
+* (resubmission of hung workload is handled inside GuC).
+*/
+   GUC_ADD_MMIO_REG_ADS(eng_reg, RING_HWS_PGA(engine->mmio_base),
+GUC_REGSET_ENGINERESET |
+GUC_REGSET_SAVE_CURRENT_VALUE, 0);
+
+   /*
+* Workaround the guc issue with masked registers, note that
+* at this point guc submission is still disabled and the mode
+* register doesnt have the irq_steering bit set, which we
+* need to fwd irqs to GuC.
+*/
+   GUC_ADD_MMIO_REG_ADS(eng_reg, RING_MODE_GEN7(engine),
+GUC_REGSET_ENGINERESET |
+GUC_REGSET_SAVE_DEFAULT_VALUE,
+I915_READ(RING_MODE_GEN7(engine)) |
+GFX_INTERRUPT_STEERING | (0x<<16));
+
+   GUC_ADD_MMIO_REG_ADS(eng_reg, RING_IMR(engine->mmio_base),
+GUC_REGSET_ENGINERESET |
+GUC_REGSET_SAVE_CURRENT_VALUE, 0);
+
+   DRM_DEBUG_DRIVER("%s register save/restore count: %u\n",
+engine->name, eng_reg->number_of_registe

[Intel-gfx] [PATCH v6 11/20] drm/i915/selftests: reset engine self tests

2017-04-18 Thread Michel Thierry
Check that we can reset specific engines, also check the fallback to
full reset if something didn't work.

v2: rebase.

Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 147 +++
 1 file changed, 147 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index aa31d6c0cdfb..f64fa0e4bb40 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -322,6 +322,56 @@ static int igt_global_reset(void *arg)
return err;
 }
 
+static int igt_reset_engine(void *arg)
+{
+   struct drm_i915_private *i915 = arg;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   unsigned int reset_count, reset_engine_count;
+   int err = 0;
+
+   /* Check that we can issue a global GPU and engine reset */
+
+   if (!intel_has_gpu_reset(i915))
+   return 0;
+
+   if (!intel_has_reset_engine(i915))
+   return 0;
+
+   set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+
+   for_each_engine(engine, i915, id) {
+   reset_count = i915_reset_count(&i915->gpu_error);
+   reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
+engine);
+
+   err = i915_reset_engine(engine);
+   if (err) {
+   pr_err("i915_reset_engine failed\n");
+   break;
+   }
+
+   if (i915_reset_count(&i915->gpu_error) != reset_count) {
+   pr_err("Full GPU reset recorded! (engine reset 
expected)\n");
+   err = -EINVAL;
+   break;
+   }
+
+   if (i915_reset_engine_count(&i915->gpu_error, engine) ==
+   reset_engine_count) {
+   pr_err("No %s engine reset recorded!\n", engine->name);
+   err = -EINVAL;
+   break;
+   }
+   }
+
+   clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+   if (i915_terminally_wedged(&i915->gpu_error))
+   err = -EIO;
+
+   return err;
+}
+
 static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
 {
u32 reset_count;
@@ -526,13 +576,110 @@ static int igt_reset_queue(void *arg)
return err;
 }
 
+static int igt_render_engine_reset_fallback(void *arg)
+{
+   struct drm_i915_private *i915 = arg;
+   struct intel_engine_cs *engine = i915->engine[RCS];
+   struct hang h;
+   struct drm_i915_gem_request *rq;
+   unsigned int reset_count, reset_engine_count;
+   int err = 0;
+
+   /* Check that we can issue a global GPU and engine reset */
+
+   if (!intel_has_gpu_reset(i915))
+   return 0;
+
+   if (!intel_has_reset_engine(i915))
+   return 0;
+
+   set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+   mutex_lock(&i915->drm.struct_mutex);
+
+   err = hang_init(&h, i915);
+   if (err)
+   goto unlock;
+
+   rq = hang_create_request(&h, engine, i915->kernel_context);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto fini;
+   }
+
+   i915_gem_request_get(rq);
+   __i915_add_request(rq, true);
+
+   /* make reset engine fail */
+   rq->fence.error = -EIO;
+
+   if (!wait_for_hang(&h, rq)) {
+   pr_err("Failed to start request %x\n", rq->fence.seqno);
+   err = -EIO;
+   goto fini;
+   }
+
+   reset_engine_count = i915_reset_engine_count(&i915->gpu_error, engine);
+   reset_count = fake_hangcheck(rq);
+
+   err = i915_reset_engine(engine);
+   if (err) {
+   pr_err("i915_reset_engine failed\n");
+   goto fini;
+   }
+
+   if (i915_reset_engine_count(&i915->gpu_error, engine) !=
+   reset_engine_count) {
+   pr_err("render engine reset recorded! (full reset expected)\n");
+   err = -EINVAL;
+   goto fini;
+   }
+
+   if (i915_reset_count(&i915->gpu_error) == reset_count) {
+   pr_err("No full GPU reset recorded!\n");
+   err = -EINVAL;
+   goto fini;
+   }
+
+   /*
+* by using fence.error = -EIO, full reset sets the wedged flag, do one
+* more full reset to re-enable the hw.
+*/
+   if (i915_terminally_wedged(&i915->gpu_error)) {
+   rq->fence.error = 0;
+
+   set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
+   i915_reset(i915);
+   GEM_BUG_ON(test_bit(I915_RESET_HAND

[Intel-gfx] [PATCH v6 10/20] drm/i915: Add engine reset count in get-reset-stats ioctl

2017-04-18 Thread Michel Thierry
Users/tests relying on the total reset count will start seeing a smaller
number since most of the hangs can be handled by engine reset.
Note that if engine x is reset, context a running on engine y will be unaware
and unaffected.

To start the discussion, include just a total engine reset count. If it
is deemed useful, it can be extended to report each engine separately.

v2: s/engine_reset/reset_engine/, use union in uapi to not break compatibility.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_gem_context.c | 14 +++---
 include/uapi/drm/i915_drm.h |  6 +-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index 8bd0c4966913..edbed85a1c88 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1133,9 +1133,11 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device 
*dev,
struct drm_i915_private *dev_priv = to_i915(dev);
struct drm_i915_reset_stats *args = data;
struct i915_gem_context *ctx;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
int ret;
 
-   if (args->flags || args->pad)
+   if (args->flags)
return -EINVAL;
 
if (args->ctx_id == DEFAULT_CONTEXT_HANDLE && !capable(CAP_SYS_ADMIN))
@@ -1151,10 +1153,16 @@ int i915_gem_context_reset_stats_ioctl(struct 
drm_device *dev,
return PTR_ERR(ctx);
}
 
-   if (capable(CAP_SYS_ADMIN))
+   if (capable(CAP_SYS_ADMIN)) {
args->reset_count = i915_reset_count(&dev_priv->gpu_error);
-   else
+   for_each_engine(engine, dev_priv, id)
+   args->reset_engine_count +=
+   i915_reset_engine_count(&dev_priv->gpu_error,
+   engine);
+   } else {
args->reset_count = 0;
+   args->reset_engine_count = 0;
+   }
 
args->batch_active = ctx->guilty_count;
args->batch_pending = ctx->active_count;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f24a80d2d42e..fadedefba6db 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1278,7 +1278,11 @@ struct drm_i915_reset_stats {
/* Number of batches lost pending for execution, for this context */
__u32 batch_pending;
 
-   __u32 pad;
+   union {
+   __u32 pad;
+   /* Engine resets since boot/module reload, for all contexts */
+   __u32 reset_engine_count;
+   };
 };
 
 struct drm_i915_gem_userptr {
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 14/20] drm/i915/guc: Add support for reset engine using GuC commands

2017-04-18 Thread Michel Thierry
This patch adds per engine reset and recovery (TDR) support when GuC is
used to submit workloads to GPU.

In the case of i915 direct submission to ELSP, the driver manages hang
detection, recovery and resubmission. With GuC submission these tasks
are shared between driver and GuC. i915 is still responsible for detecting
a hang, and when it does it only requests GuC to reset that Engine. GuC
internally manages acquiring forcewake and idling the engine before actually
resetting it.

Once the reset is successful, i915 takes over again and handles resubmission.
The scheduler in i915 knows which requests are pending so after resetting
an engine, pending workloads/requests are resubmitted again.

v2: s/i915_guc_request_engine_reset/i915_guc_reset_engine/ to match the
non-guc function names.

Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c| 43 +-
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_guc_submission.c | 48 ++
 drivers/gpu/drm/i915/intel_guc_fwif.h  |  6 
 drivers/gpu/drm/i915/intel_lrc.c   |  5 ++--
 drivers/gpu/drm/i915/intel_uc.h|  1 +
 drivers/gpu/drm/i915/intel_uncore.c|  5 
 7 files changed, 88 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 974be1fa77f9..b7e2fa8a0036 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1910,23 +1910,34 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 */
i915_gem_reset_engine(engine);
 
-   /* forcing engine to idle */
-   ret = intel_reset_engine_start(engine);
-   if (ret) {
-   DRM_ERROR("Failed to disable %s\n", engine->name);
-   goto error;
-   }
+   if (!dev_priv->guc.execbuf_client) {
+   /* forcing engine to idle */
+   ret = intel_reset_engine_start(engine);
+   if (ret) {
+   DRM_ERROR("Failed to disable %s\n", engine->name);
+   goto error;
+   }
 
-   /* finally, reset engine */
-   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
-   if (ret) {
-   DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+   /* finally, reset engine */
+   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+   if (ret) {
+   DRM_ERROR("Failed to reset %s, ret=%d\n",
+ engine->name, ret);
+   intel_reset_engine_cancel(engine);
+   goto error;
+   }
+
+   /* be sure the request reset bit gets cleared */
intel_reset_engine_cancel(engine);
-   goto error;
-   }
 
-   /* be sure the request reset bit gets cleared */
-   intel_reset_engine_cancel(engine);
+   } else {
+   ret = i915_guc_reset_engine(engine);
+   if (ret) {
+   DRM_ERROR("GuC failed to reset %s, ret=%d\n",
+ engine->name, ret);
+   goto error;
+   }
+   }
 
i915_gem_reset_finish_engine(engine);
 
@@ -1935,6 +1946,10 @@ int i915_reset_engine(struct intel_engine_cs *engine)
if (ret)
goto error;
 
+   /* for guc too */
+   if (dev_priv->guc.execbuf_client)
+   i915_guc_submission_reenable_engine(engine);
+
error->reset_engine_count[engine->id]++;
 
 wakeup:
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 71c34f15be64..5f2345fbff44 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3029,6 +3029,7 @@ extern int i915_reset_engine(struct intel_engine_cs 
*engine);
 extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_reset_engine_start(struct intel_engine_cs *engine);
 extern void intel_reset_engine_cancel(struct intel_engine_cs *engine);
+extern int i915_guc_reset_engine(struct intel_engine_cs *engine);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index d772718861df..c8067aeab6f4 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1338,6 +1338,25 @@ void i915_guc_submission_disable(struct drm_i915_private 
*dev_priv)
guc->execbuf_client = NULL;
 }
 
+void i915_guc_submission_reenable_engine(struct intel_engine_cs *engine)
+{
+  

[Intel-gfx] [PATCH v6 16/20] drm/i915: Watchdog timeout: IRQ handler for gen8+

2017-04-18 Thread Michel Thierry
*** General ***

Watchdog timeout (or "media engine reset") is a feature that allows
userland applications to enable hang detection on individual batch buffers.
The detection mechanism itself is mostly bound to the hardware and the only
thing that the driver needs to do to support this form of hang detection
is to implement the interrupt handling support as well as watchdog command
emission before and after the emitted batch buffer start instruction in the
ring buffer.

The principle of the hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a
particular batch buffer and the driver is in the process of emitting the
batch buffer start instruction into the ring buffer it also emits a
watchdog timer start instruction before and a watchdog timer cancellation
instruction after the batch buffer start instruction in the ring buffer.

2. Once the GPU execution reaches the watchdog timer start instruction
the hardware watchdog counter is started by the hardware. The counter
keeps counting until either reaching a previously configured threshold
value or the timer cancellation instruction is executed.

2a. If the counter reaches the threshold value the hardware fires a
watchdog interrupt that is picked up by the watchdog interrupt handler.
This means that a hang has been detected and the driver needs to deal with
it the same way it would deal with an engine hang detected by the periodic
hang checker. The only difference between the two is that we already blamed
the active request (to ensure an engine reset).

2b. If the batch buffer completes and the execution reaches the watchdog
cancellation instruction before the watchdog counter reaches its
threshold value the watchdog is cancelled and nothing more comes of it.
No hang is detected.

Note about future interaction with preemption: Preemption could happen
in a command sequence prior to watchdog counter getting disabled,
resulting in watchdog being triggered following preemption. The driver will
need to explicitly disable the watchdog counter as part of the
preemption sequence.

*** This patch introduces: ***

1. IRQ handler code for watchdog timeout allowing direct hang recovery
based on hardware-driven hang detection, which then integrates directly
with the hang recovery path. This is independent of having per-engine reset
or just full gpu reset.

2. Watchdog specific register information.

Currently the render engine and all available media engines support
watchdog timeout (VECS is only supported in GEN9). The specifications allude
to the BCS engine being supported but that is currently not supported by
this commit.

Note that the value to stop the counter is different between render and
non-render engines in GEN8; GEN9 onwards it's the same.

v2: Move irq handler to tasklet, arm watchdog for a 2nd time to check
against false-positives.

v3: Don't use high priority tasklet, use engine_last_submit while
checking for false-positives. From GEN9 onwards, the stop counter bit is
the same for all engines.

Signed-off-by: Tomas Elf 
Signed-off-by: Ian Lister 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h |  4 ++
 drivers/gpu/drm/i915/i915_irq.c | 13 ++-
 drivers/gpu/drm/i915/i915_reg.h |  6 +++
 drivers/gpu/drm/i915/intel_hangcheck.c  | 13 +--
 drivers/gpu/drm/i915/intel_lrc.c| 69 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++
 6 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5f2345fbff44..203f2112dd18 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1608,6 +1608,9 @@ struct i915_gpu_error {
 * inspect the bit and do the reset directly, otherwise the worker
 * waits for the struct_mutex.
 *
+* #I915_RESET_WATCHDOG - When hw detects a hang before us, we can use
+* I915_RESET_WATCHDOG to report the hang detection cause accurately.
+*
 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
 * i915_gem_request_alloc(), this bit is checked and the sequence
@@ -1616,6 +1619,7 @@ struct i915_gpu_error {
unsigned long flags;
 #define I915_RESET_BACKOFF 0
 #define I915_RESET_HANDOFF 1
+#define I915_RESET_WATCHDOG2
 #define I915_WEDGED(BITS_PER_LONG - 1)
 
/* if available, engine-specific reset is tried before full gpu reset */
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ab1d77ab0977..39ed432db19e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1370,6 +1370,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 
iir, int test_shift)
 
if (tasklet)
tasklet

[Intel-gfx] [PATCH v6 09/20] drm/i915/tdr: Enable Engine reset and recovery support

2017-04-18 Thread Michel Thierry
From: Arun Siluvery 

This feature is made available only from Gen8; for previous gen devices
the driver uses the legacy full gpu reset.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_params.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_params.c 
b/drivers/gpu/drm/i915/i915_params.c
index 045cadb77285..14e2c2e57f96 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -46,7 +46,7 @@ struct i915_params i915 __read_mostly = {
.prefault_disable = 0,
.load_detect_test = 0,
.force_reset_modeset_test = 0,
-   .reset = 1,
+   .reset = 2,
.error_capture = true,
.invert_brightness = 0,
.disable_display = 0,
@@ -116,7 +116,7 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
"(-2=ignore, -1=auto [default], index in VBT BIOS table)");
 
 module_param_named_unsafe(reset, i915.reset, int, 0600);
-MODULE_PARM_DESC(reset, "Attempt GPU resets (0=disabled, 1=full gpu reset 
[default], 2=engine reset)");
+MODULE_PARM_DESC(reset, "Attempt GPU resets (0=disabled, 1=full gpu reset, 
2=engine reset [default])");
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 module_param_named(error_capture, i915.error_capture, bool, 0600);
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 12/20] drm/i915/guc: fix mmio whitelist mmio_start offset and add reminder

2017-04-18 Thread Michel Thierry
From: Daniele Ceraolo Spurio 

The mmio_start offset for the whitelist is the first FORCE_TO_NONPRIV
register the GuC can use to restore the provided whitelist when an
engine reset via GuC (which we still don't support) is triggered.

We're currently adding the mmio_base of the engine to the absolute
address of the RCS version of the register, which results in the wrong
offset. Fix it by using the definition we already have instead of
re-defining it in the GuC FW header.

Also add a comment to avoid future issues with FORCE_TO_NONPRIV
registers, which are also used by the workaround framework.

v2: improve comment (Michal), move comment about save/restore because it
is not related to the mmio_white_list field.

v3: rebase/resurrect.

Signed-off-by: Daniele Ceraolo Spurio 
Cc: Michał Winiarski 
Cc: Michal Wajdeczko 
Cc: Arkadiusz Hiler 
Cc: Oscar Mateo 
Reviewed-by: Michał Winiarski  (v2)
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 11 +--
 drivers/gpu/drm/i915/intel_guc_fwif.h  |  1 -
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 1642fff9cf13..1ea36a88d2fb 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1036,10 +1036,17 @@ static int guc_ads_create(struct intel_guc *guc)
/* MMIO reg state */
for_each_engine(engine, dev_priv, id) {
blob->reg_state.white_list[engine->guc_id].mmio_start =
-   engine->mmio_base + GUC_MMIO_WHITE_LIST_START;
+   
i915_mmio_reg_offset(RING_FORCE_TO_NONPRIV(engine->mmio_base, 0));
 
-   /* Nothing to be saved or restored for now. */
+   /*
+* Note: if the GuC whitelist management is enabled, the values
+* should be filled using the workaround framework to avoid
+* inconsistencies with the handling of FORCE_TO_NONPRIV
+* registers.
+*/
blob->reg_state.white_list[engine->guc_id].count = 0;
+
+   /* Nothing to be saved or restored for now. */
}
 
/*
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 6156845641a3..e6f8079df94a 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -394,7 +394,6 @@ struct guc_policies {
 #define GUC_REGSET_SAVE_CURRENT_VALUE  0x10
 
 #define GUC_REGSET_MAX_REGISTERS   25
-#define GUC_MMIO_WHITE_LIST_START  0x24d0
 #define GUC_MMIO_WHITE_LIST_MAX12
 #define GUC_S3_SAVE_SPACE_PAGES10
 
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 17/20] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+

2017-04-18 Thread Michel Thierry


On 18/04/17 14:20, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:32PM -0700, Michel Thierry wrote:

@@ -1329,10 +1331,29 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
req->ctx->ppgtt->pd_dirty_rings &= 
~intel_engine_flag(req->engine);
}

-   cs = intel_ring_begin(req, 4);
+   /* bb_start only */
+   num_dwords = 4;
+
+   /* check if watchdog will be required */
+   if (req->ctx->engine[req->engine->id].watchdog_threshold != 0) {
+   if (!req->engine->emit_start_watchdog ||
+   !req->engine->emit_stop_watchdog)
+   return -EINVAL;


This is still a bug in the context setparam to get to this point without
a watchdog.



This can't happen (threshold != 0 && no emit_watchdog func), 
i915_gem_context_set_watchdog returns -ENODEV if vcs's 
emit_start_watchdog is not defined (the assumption is if the vcs has it, 
rcs does too).


I can remove it, if that's what you mean.

But re i915_gem_context_set_watchdog, I think maybe it should return 
ENODEV when there's no watchdog and the user is trying to get the array 
size (args->size == 0), and don't give false hopes.



+
+   /* + start_watchdog (6) + stop_watchdog (4) */
+   num_dwords += 10;
+   watchdog_running = true;
+   }
+static u32 *gen8_emit_stop_watchdog(struct drm_i915_gem_request *req, u32 *cs)
+{
+   struct intel_engine_cs *engine = req->engine;
+
+   /* XXX: no watchdog support in BCS engine */
+   GEM_BUG_ON(engine->id == BCS);
+
+   *cs++ = MI_LOAD_REGISTER_IMM(2);
+   *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
+   *cs++ = get_watchdog_disable(engine);
+   *cs++ = MI_NOOP;


Oops.


_context_set_watchdog also rejects if threshold[BCS] != 0.


-Chris


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 04/20] drm/i915/tdr: Modify error handler for per engine hang recovery

2017-04-18 Thread Michel Thierry



On 18/04/17 14:40, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:19PM -0700, Michel Thierry wrote:

From: Arun Siluvery 

This is a preparatory patch which modifies error handler to do per engine
hang recovery. The actual patch which implements this sequence follows
later in the series. The aim is to prepare existing recovery function to
adapt to this new function where applicable (which fails at this point
because core implementation is lacking) and continue recovery using legacy
full gpu reset.

A helper function is also added to query the availability of engine
reset.

The error events behaviour that are used to notify user of reset are
adapted to engine reset such that it doesn't break users listening to these
events. In legacy we report an error event, a reset event before resetting
the gpu and a reset done event marking the completion of reset. The same
behaviour is adapted but reset event is only dispatched once even when
multiple engines are hung. Finally once reset is complete we send reset
done event as usual.

Note that this implementation of engine reset is for i915 directly
submitting to the ELSP, where the driver manages the hang detection,
recovery and resubmission. With GuC submission these tasks are shared
between driver and firmware; i915 will still be responsible for detecting a
hang, and when it does it will have to request GuC to reset that Engine and
remind the firmware about the outstanding submissions. This will be
added in different patch.

v2: rebase, advertise engine reset availability in platform definition,
add note about GuC submission.
v3: s/*engine_reset*/*reset_engine*/. (Chris)
Handle reset as 2 level resets, by first going to engine only and falling
back to full/chip reset as needed, i.e. reset_engine will need the
struct_mutex.
v4: Pass the engine mask to i915_reset. (Chris)
v5: Rebase, update selftests.
v6: Rebase, prepare for mutex-less reset engine.


I'm not sure if there is any trace of the original patch left. Certainly
this is the first that has come close to making me happy and looks like
it might actually work. :)


diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e06af46f5a57..7bc5f552add7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -814,6 +814,7 @@ struct intel_csr {
func(has_ddi); \
func(has_decoupled_mmio); \
func(has_dp_mst); \
+   func(has_reset_engine); \
func(has_fbc); \
func(has_fpga_dbg); \
func(has_full_ppgtt); \
@@ -1616,6 +1617,9 @@ struct i915_gpu_error {
 #define I915_RESET_HANDOFF 1
 #define I915_WEDGED(BITS_PER_LONG - 1)

+   /* if available, engine-specific reset is tried before full gpu reset */
+   u32 reset_engine_mask;


I want to convince myself that storing this here is sensible. My
expectation was that this would be passed along as a function parameter,
so I'll need to go back and see why that doesn't work.



This is a left-over from the previous version, the engine mask can be 
passed as a parameter to i915_reset_and_wakeup.



/**
 * Waitqueue to signal when a hang is detected. Used to for waiters
 * to release the struct_mutex for the reset to procede.
@@ -3019,6 +3023,8 @@ extern void i915_driver_unload(struct drm_device *dev);
 extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
 extern void i915_reset(struct drm_i915_private *dev_priv);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
+extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index fd97fe00cd0d..ab1d77ab0977 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2645,12 +2645,33 @@ static void i915_reset_and_wakeup(struct 
drm_i915_private *dev_priv)
char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+   u32 engine_mask = dev_priv->gpu_error.reset_engine_mask;

kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);

-   DRM_DEBUG_DRIVER("resetting chip\n");
+   /*
+* This event needs to be sent before performing gpu reset. When
+* engine resets are supported we iterate through all engines and
+* reset hung engines individually. To keep the event dispatch
+* mechanism consistent with full gpu reset, this is only sent once
+* even when multiple engines are hung. It is also safe to move thi

Re: [Intel-gfx] [PATCH v6 17/20] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+

2017-04-18 Thread Michel Thierry


On 18/04/17 16:06, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 02:36:14PM -0700, Michel Thierry wrote:


On 18/04/17 14:20, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:32PM -0700, Michel Thierry wrote:

@@ -1329,10 +1331,29 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
req->ctx->ppgtt->pd_dirty_rings &= 
~intel_engine_flag(req->engine);
}

-   cs = intel_ring_begin(req, 4);
+   /* bb_start only */
+   num_dwords = 4;
+
+   /* check if watchdog will be required */
+   if (req->ctx->engine[req->engine->id].watchdog_threshold != 0) {
+   if (!req->engine->emit_start_watchdog ||
+   !req->engine->emit_stop_watchdog)
+   return -EINVAL;


This is still a bug in the context setparam to get to this point without
a watchdog.



This can't happen (threshold != 0 && no emit_watchdog func),
i915_gem_context_set_watchdog returns -ENODEV if vcs's
emit_start_watchdog is not defined (the assumption is if the vcs has
it, rcs does too).

I can remove it, if that's what you mean.


Yes, we shouldn't be setting the watchdog threshold if the watchdog is
not available. GEM_BUG_ON() would be fine. Throwing a very, very late
EINVAL is disconcerting.


But re i915_gem_context_set_watchdog, I think maybe it should return
ENODEV when there's no watchdog and the user is trying to get the
array size (args->size == 0), and don't give false hopes.


Seems reasonable.


+
+   /* + start_watchdog (6) + stop_watchdog (4) */
+   num_dwords += 10;
+   watchdog_running = true;
+   }
+static u32 *gen8_emit_stop_watchdog(struct drm_i915_gem_request *req, u32 *cs)
+{
+   struct intel_engine_cs *engine = req->engine;
+
+   /* XXX: no watchdog support in BCS engine */
+   GEM_BUG_ON(engine->id == BCS);
+
+   *cs++ = MI_LOAD_REGISTER_IMM(2);
+   *cs++ = i915_mmio_reg_offset(RING_CNTR(engine->mmio_base));
+   *cs++ = get_watchdog_disable(engine);
+   *cs++ = MI_NOOP;


Oops.


_context_set_watchdog also rejects if threshold[BCS] != 0.


LRI(2), but only setting one register not two.


Oh... proof that most of the time I only copy+paste stuff.


-Chris


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 13/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-18 Thread Michel Thierry

On 18/04/17 17:26, Daniele Ceraolo Spurio wrote:



On 18/04/17 13:23, Michel Thierry wrote:

From: Arun Siluvery 

GuC expects a list of registers from the driver which are saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save and
restore. This is not an issue in case of engine reset as driver
initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including
resubmission
of hung workload), it is necessary to provide this list, otherwise GuC
won't
be able to schedule further workloads after a reset. This is the minimal
set of registers identified for things to work as expected but if we see
any new issues, this register list can be expanded.

v2: REGSET_MASKED is too difficult for GuC, use REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers
(Daniele).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 60
+-
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 1ea36a88d2fb..d772718861df 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1003,6 +1003,24 @@ static void guc_policies_init(struct
guc_policies *policies)
 policies->is_valid = 1;
 }

+/*
+ * In this macro it is highly unlikely to exceed max value but even
if we did
+ * it is not an error so just throw a warning and continue. Only side
effect
+ * in continuing further means some registers won't be added to
save/restore
+ * list.
+ */
+#define GUC_ADD_MMIO_REG_ADS(node, reg_addr, _flags, defvalue)\
+do {\
+u32 __count = node->number_of_registers;\
+if (WARN_ON(__count >= GUC_REGSET_MAX_REGISTERS))\
+continue;\
+node->registers[__count].offset = reg_addr.reg;\
+node->registers[__count].flags = (_flags);\
+if (defvalue)\
+node->registers[__count].value = (defvalue);\
+node->number_of_registers++;\
+} while (0)
+
 static int guc_ads_create(struct intel_guc *guc)
 {
 struct drm_i915_private *dev_priv = guc_to_i915(guc);
@@ -1016,6 +1034,7 @@ static int guc_ads_create(struct intel_guc *guc)
 u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE];
 } __packed *blob;
 struct intel_engine_cs *engine;
+struct i915_workarounds *workarounds = &dev_priv->workarounds;
 enum intel_engine_id id;
 u32 base;

@@ -1035,6 +1054,39 @@ static int guc_ads_create(struct intel_guc *guc)

 /* MMIO reg state */
 for_each_engine(engine, dev_priv, id) {
+u32 i;
+struct guc_mmio_regset *eng_reg =
+&blob->reg_state.engine_reg[engine->guc_id];
+
+/*
+ * Provide a list of registers to be saved/restored during gpu
+ * reset. This is mainly required for Media reset (aka watchdog
+ * timeout) which is completely under the control of GuC
+ * (resubmission of hung workload is handled inside GuC).
+ */
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_HWS_PGA(engine->mmio_base),
+ GUC_REGSET_ENGINERESET |
+ GUC_REGSET_SAVE_CURRENT_VALUE, 0);
+
+/*
+ * Workaround the guc issue with masked registers, note that
+ * at this point guc submission is still disabled and the mode
+ * register doesnt have the irq_steering bit set, which we
+ * need to fwd irqs to GuC.
+ */
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_MODE_GEN7(engine),
+ GUC_REGSET_ENGINERESET |
+ GUC_REGSET_SAVE_DEFAULT_VALUE,
+ I915_READ(RING_MODE_GEN7(engine)) |
+ GFX_INTERRUPT_STEERING | (0x<<16));
+
+GUC_ADD_MMIO_REG_ADS(eng_reg, RING_IMR(engine->mmio_base),
+ GUC_REGSET_ENGINERESET |
+ GUC_REGSET_SAVE_CURRENT_VALUE, 0);
+


I might just be too paranoid, but I think that we also have to add the
registers that we use for WAs via mmio (i.e. not using an LRI in the
ringbuffer). I did a quick test for the registers in the
gen9_init_workarounds and skl_init_workarounds functions that we write
using I915_WRITE and it looks like some of them lose their value after
an RCS media reset:

 REG   WA BITSVAL PRE-MRVAL POST-MR
0x20D4(1<<2)0x00040x
0xb11c(1<<2)0x00050x00

Re: [Intel-gfx] [PATCH v6 16/20] drm/i915: Watchdog timeout: IRQ handler for gen8+

2017-04-19 Thread Michel Thierry



On 19/04/17 03:20, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:31PM -0700, Michel Thierry wrote:

*** General ***

Watchdog timeout (or "media engine reset") is a feature that allows
userland applications to enable hang detection on individual batch buffers.
The detection mechanism itself is mostly bound to the hardware and the only
thing that the driver needs to do to support this form of hang detection
is to implement the interrupt handling support as well as watchdog command
emission before and after the emitted batch buffer start instruction in the
ring buffer.

The principle of the hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a
particular batch buffer and the driver is in the process of emitting the
batch buffer start instruction into the ring buffer it also emits a
watchdog timer start instruction before and a watchdog timer cancellation
instruction after the batch buffer start instruction in the ring buffer.

2. Once the GPU execution reaches the watchdog timer start instruction
the hardware watchdog counter is started by the hardware. The counter
keeps counting until either reaching a previously configured threshold
value or the timer cancellation instruction is executed.

2a. If the counter reaches the threshold value the hardware fires a
watchdog interrupt that is picked up by the watchdog interrupt handler.
This means that a hang has been detected and the driver needs to deal with
it the same way it would deal with an engine hang detected by the periodic
hang checker. The only difference between the two is that we already blamed
the active request (to ensure an engine reset).

2b. If the batch buffer completes and the execution reaches the watchdog
cancellation instruction before the watchdog counter reaches its
threshold value the watchdog is cancelled and nothing more comes of it.
No hang is detected.

Note about future interaction with preemption: Preemption could happen
in a command sequence prior to watchdog counter getting disabled,
resulting in watchdog being triggered following preemption. The driver will
need to explicitly disable the watchdog counter as part of the
preemption sequence.


Does MI_ARB_ON_OFF do the trick? Shouldn't we basically be only turning
preemption on for the user buffers as it just causes hassle if we allow
preemption in our preamble + breadcrumb. (And there's little point in
preempting in the flushes.)



Mid-batch?
The watchdog counter is not aware of MI_ARB_ON_OFF (or any other cmd) 
and would keep running / expire. We could call emit_stop_watchdog 
unconditionally to prevent this.



*** This patch introduces: ***

1. IRQ handler code for watchdog timeout allowing direct hang recovery
based on hardware-driven hang detection, which then integrates directly
with the hang recovery path. This is independent of having per-engine reset
or just full gpu reset.

2. Watchdog specific register information.

Currently the render engine and all available media engines support
watchdog timeout (VECS is only supported in GEN9). The specifications allude
to the BCS engine being supported but that is currently not supported by
this commit.

Note that the value to stop the counter is different between render and
non-render engines in GEN8; GEN9 onwards it's the same.


Should mention the choice to piggyback the current hangcheck + capture
scheme.


+   if (iir & (GT_GEN8_WATCHDOG_INTERRUPT << test_shift)) {
+   tasklet_schedule(&engine->watchdog_tasklet);
+   }


Kill unwanted braces.


+#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
+static void gen8_watchdog_irq_handler(unsigned long data)
+{
+   struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+   struct drm_i915_private *dev_priv = engine->i915;
+   u32 current_seqno;
+
+   intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
+
+   /* Stop the counter to prevent further timeout interrupts */
+   I915_WRITE_FW(RING_CNTR(engine->mmio_base), 
get_watchdog_disable(engine));
+
+   current_seqno = intel_engine_get_seqno(engine);
+
+   /* did the request complete after the timer expired? */
+   if (intel_engine_last_submit(engine) == current_seqno)
+   goto fw_put;
+
+   if (engine->hangcheck.watchdog == current_seqno) {
+   /* Make sure the active request will be marked as guilty */
+   engine->hangcheck.stalled = true;
+   engine->hangcheck.seqno = intel_engine_get_seqno(engine);


Use current_seqno again. intel_engine_get_seqno() may have just changed.
-Chris


___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 16/20] drm/i915: Watchdog timeout: IRQ handler for gen8+

2017-04-19 Thread Michel Thierry



On 19/04/17 10:51, Chris Wilson wrote:

On Wed, Apr 19, 2017 at 10:11:37AM -0700, Michel Thierry wrote:



On 19/04/17 03:20, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:31PM -0700, Michel Thierry wrote:

*** General ***

Watchdog timeout (or "media engine reset") is a feature that allows
userland applications to enable hang detection on individual batch buffers.
The detection mechanism itself is mostly bound to the hardware and the only
thing that the driver needs to do to support this form of hang detection
is to implement the interrupt handling support as well as watchdog command
emission before and after the emitted batch buffer start instruction in the
ring buffer.

The principle of the hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a
particular batch buffer and the driver is in the process of emitting the
batch buffer start instruction into the ring buffer it also emits a
watchdog timer start instruction before and a watchdog timer cancellation
instruction after the batch buffer start instruction in the ring buffer.

2. Once the GPU execution reaches the watchdog timer start instruction
the hardware watchdog counter is started by the hardware. The counter
keeps counting until either reaching a previously configured threshold
value or the timer cancellation instruction is executed.

2a. If the counter reaches the threshold value the hardware fires a
watchdog interrupt that is picked up by the watchdog interrupt handler.
This means that a hang has been detected and the driver needs to deal with
it the same way it would deal with an engine hang detected by the periodic
hang checker. The only difference between the two is that we already blamed
the active request (to ensure an engine reset).

2b. If the batch buffer completes and the execution reaches the watchdog
cancellation instruction before the watchdog counter reaches its
threshold value the watchdog is cancelled and nothing more comes of it.
No hang is detected.

Note about future interaction with preemption: Preemption could happen
in a command sequence prior to watchdog counter getting disabled,
resulting in watchdog being triggered following preemption. The driver will
need to explicitly disable the watchdog counter as part of the
preemption sequence.


Does MI_ARB_ON_OFF do the trick? Shouldn't we basically be only turning
preemption on for the user buffers as it just causes hassle if we allow
preemption in our preamble + breadcrumb. (And there's little point in
preempting in the flushes.)



Mid-batch?
The watchdog counter is not aware of MI_ARB_ON_OFF (or any other
cmd) and would keep running / expire. We could call
emit_stop_watchdog unconditionally to prevent this.


No, I was thinking of the opposite where we had preemption after the
batch. Completely missed the point of the watchdog being enabled for the
low priority batch then being inherited by the high priority batch - and
vice versa that the watchdog counter would not be restored on the
context switch back. Does suggest that the watchdog should really be
part of the context image...


RING_CNTR (0x2178) & RING_THRESH (0x217c) are part of the context image, 
but there's still the issue of the ctx restore being slower (or maybe 
it's a lite-restore).


And the 'counter' isn't part of the image; when the pre-empted batch 
resumes, the counter will re-start from 0.

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v6 13/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-19 Thread Michel Thierry
From: Arun Siluvery 

GuC expects a list of registers from the driver which are saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save and
restore. This is not an issue in case of engine reset as driver initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including resubmission
of hung workload), it is necessary to provide this list, otherwise GuC won't
be able to schedule further workloads after a reset. This is the minimal
set of registers identified for things to work as expected but if we see
any new issues, this register list can be expanded.

In order to not lose any existing workarounds, we have to let GuC know
the registers and its values. These will be reapplied after the reset.
Note that we can't just read the current value because most of these
registers are masked (so we have a workaround for a workaround for a
workaround).

v2: REGSET_MASKED is too difficult for GuC, use REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers (Daniele).

v3: Workarounds added only once during _init_workarounds also have to
be restored, or we risk losing them after internal GuC reset
(Daniele).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h|  3 ++
 drivers/gpu/drm/i915/i915_guc_submission.c | 68 +-
 drivers/gpu/drm/i915/intel_engine_cs.c | 65 +++-
 3 files changed, 114 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fa3988c5033b..1ba1ac016973 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1913,7 +1913,10 @@ struct i915_wa_reg {
 
 struct i915_workarounds {
struct i915_wa_reg reg[I915_MAX_WA_REGS];
+   /* list of registers (and their values) that GuC will have to restore */
+   struct i915_wa_reg guc_reg[GUC_REGSET_MAX_REGISTERS];
u32 count;
+   u32 guc_count;
u32 hw_whitelist_count[I915_NUM_ENGINES];
 };
 
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 1ea36a88d2fb..f4081da88df2 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1003,6 +1003,24 @@ static void guc_policies_init(struct guc_policies 
*policies)
policies->is_valid = 1;
 }
 
+/*
+ * In this macro it is highly unlikely to exceed max value but even if we did
+ * it is not an error so just throw a warning and continue. Only side effect
+ * in continuing further means some registers won't be added to save/restore
+ * list.
+ */
+#define GUC_ADD_MMIO_REG_ADS(node, reg_addr, _flags, defvalue) \
+   do {\
+   u32 __count = node->number_of_registers;\
+   if (WARN_ON(__count >= GUC_REGSET_MAX_REGISTERS))   \
+   continue;   \
+   node->registers[__count].offset = reg_addr.reg; \
+   node->registers[__count].flags = (_flags);  \
+   if (defvalue)   \
+   node->registers[__count].value = (defvalue);\
+   node->number_of_registers++;\
+   } while (0)
+
 static int guc_ads_create(struct intel_guc *guc)
 {
struct drm_i915_private *dev_priv = guc_to_i915(guc);
@@ -1016,6 +1034,7 @@ static int guc_ads_create(struct intel_guc *guc)
u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE];
} __packed *blob;
struct intel_engine_cs *engine;
+   struct i915_workarounds *workarounds = &dev_priv->workarounds;
enum intel_engine_id id;
u32 base;
 
@@ -1035,6 +1054,47 @@ static int guc_ads_create(struct intel_guc *guc)
 
/* MMIO reg state */
for_each_engine(engine, dev_priv, id) {
+   u32 i;
+   struct guc_mmio_regset *eng_reg =
+   &blob->reg_state.engine_reg[engine->guc_id];
+
+   /*
+* Provide a list of registers to be saved/restored during gpu
+* reset. This is mainly required for Media reset (aka watchdog
+* timeout) which is completely under the control of GuC
+* (resubmission of hung workload is handled inside GuC).
+*/
+   GUC_ADD_MMIO_REG_ADS(eng_reg, RING_HWS_PGA(engine->mmio_base),
+  

Re: [Intel-gfx] [PATCH v6 14/20] drm/i915/guc: Add support for reset engine using GuC commands

2017-04-19 Thread Michel Thierry

On 19/04/17 03:27, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:29PM -0700, Michel Thierry wrote:

This patch adds per engine reset and recovery (TDR) support when GuC is
used to submit workloads to GPU.

In the case of i915 direct submission to ELSP, driver manages hang
detection, recovery and resubmission. With GuC submission these tasks
are shared between driver and GuC. i915 is still responsible for detecting
a hang, and when it does it only requests GuC to reset that Engine. GuC
internally manages acquiring forcewake and idling the engine before actually
resetting it.

Once the reset is successful, i915 takes over again and handles resubmission.
The scheduler in i915 knows which requests are pending so after resetting
a engine, pending workloads/requests are resubmitted again.

v2: s/i915_guc_request_engine_reset/i915_guc_reset_engine/ to match the
non-guc function names.

Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 7df278fe492e..6295760098a1 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1176,14 +1176,15 @@ static int gen8_init_common_ring(struct intel_engine_cs 
*engine)

/* After a GPU reset, we may have requests to replay */
clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-   if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
+   if (!execlists_elsp_idle(engine)) {
DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n",
 engine->name,
 port_seqno(&engine->execlist_port[0]),
 port_seqno(&engine->execlist_port[1]));
engine->execlist_port[0].count = 0;
engine->execlist_port[1].count = 0;
-   execlists_submit_ports(engine);
+   if (!dev_priv->guc.execbuf_client)
+   execlists_submit_ports(engine);


Not sure what you were intending to do here as this only resets the
submission count -- which is not used by guc dequeue. Some merit in the
making the code look similar, certainly adds the dbg message but I think
it is unrelated to the rest of the patch.


Yes, it only keeps the same debug message (originally added to check it 
was taking the right path). I can remove if you think it doesn't provide 
anything useful.

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-19 Thread Michel Thierry

On 18/04/17 13:23, Michel Thierry wrote:

Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 60000 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

Note, UABI engine ids and i915 engine ids are different, and this patch
uses the i915 ones. Some kind of mapping table [1] is required if we
decide to use the UABI engine ids.

[1] 
http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-ch...@chris-wilson.co.uk

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

v3: Add a way to get array size, short-cut to disable all thresholds,
return EFAULT / EINVAL as needed. Move the capture of the threshold
value in the error state into a new patch. BXT has a different
timestamp base (because why not?).

Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h |  29 +
 drivers/gpu/drm/i915/i915_gem_context.c | 102 
 drivers/gpu/drm/i915/intel_lrc.c|   5 +-
 include/uapi/drm/i915_drm.h |   1 +
 4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 203f2112dd18..f65a236fddef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3574,6 +3574,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
return &vm->timeline.engine[engine->id];
 }

+/*
+ * BDW & SKL+ Timestamp timer resolution = 0.080 uSec,
+ * or 12500000 counts per second, or ~12 counts per microsecond.
+ *
+ * But Broxton Timestamp timer resolution is different, 0.052 uSec,
+ * or 19200000 counts per second, or ~19 counts per microsecond.
+ */
+#define SKL_TIMESTAMP_CNTS_PER_USEC 12
+#define BXT_TIMESTAMP_CNTS_PER_USEC 19
+#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \
+  BXT_TIMESTAMP_CNTS_PER_USEC : \
+  SKL_TIMESTAMP_CNTS_PER_USEC)
+static inline u32
+watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)
+{
+   return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv);
+}
+
+static inline u32
+watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
+{
+   u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv);
+
+   if (overflows_type(threshold, u32))
+   return -EINVAL;
+
+   return threshold;
+}
+
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index edbed85a1c88..85a6467a25a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -422,6 +422,102 @@ i915_gem_context_create_gvt(struct drm_device *dev)
return ctx;
 }

+/* Return the timer count threshold in microseconds. */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+ struct drm_i915_gem_context_param *args)
+{
+   struct drm_i915_private *dev_priv = ctx->i915;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u32 threshold_in_us[I915_NUM_ENGINES];
+
+   if (args->size == 0)
+   goto out;
+
+   if (args->size < sizeof(threshold_in_us))
+   return -EFAULT;
+
+   if (!dev_priv->engine[VCS]->emit_start_watchdog)
+   return -ENODEV;
+
+   for_each_engine(engine, dev_priv, id) {
+   struct intel_context *ce = &ctx->engine[id];
+
+   threshold_in_us[id] = watchdog_to_us(dev_priv,
+ce->watchdog_threshold);
+   }
+
+   mutex_unlock(&dev_priv->drm.struct_mutex);
+   if (__copy_to_user(u64_to_user_ptr(args->value),
+  &threshold_in_us,
+  sizeof(threshold_in_us))) {
+   mutex_lock(&dev_priv->drm.struct_mutex);
+   return -EFAULT;
+   }
+   mutex_lock(&dev_priv->drm.struct_mutex);
+
+out:
+   args->size = sizeof(threshold_in_us);
+
+   return 0;
+}
+
+/*
+ * Based on time out value in microseconds (us) calculate
+ * timer count thresholds needed based on core frequency.
+ * Watchdog can be disabled by se

Re: [Intel-gfx] [PATCH v6 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-20 Thread Michel Thierry



On 20/04/17 01:52, Chris Wilson wrote:

On Wed, Apr 19, 2017 at 06:09:00PM -0700, Michel Thierry wrote:

This patch is missing:

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
b/drivers/gpu/drm/i915/i915_gem_context.c
index c1013af0b910..a8bdea43a217 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1135,7 +1135,7 @@ int i915_gem_context_getparam_ioctl(struct
drm_device *dev, void *data,
return PTR_ERR(ctx);
}

-   args->size = 0;
+   args->size = (args->param != I915_CONTEXT_PARAM_WATCHDOG) ? 0 :
args->size;
switch (args->param) {
case I915_CONTEXT_PARAM_BAN_PERIOD:
ret = -EINVAL;

Or there will be no way to get the current thresholds (chunk was
missed due to some TRTT code nearby). I'll be sure to include it in
the next version.


No. It is always preset to 0. The PARAM should set it to the actual
struct size (it would write) and *not* the user's size.
-Chris



Ok, then I'll change the shortcut in get_watchdog, because as it is you 
can query the size, but not the thresholds.


int i915_gem_context_get_watchdog()
{
...
if (args->size == 0)
goto out;
...
out:
args->size = sizeof(threshold_in_us);

return 0;
}
}
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 13/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-20 Thread Michel Thierry



On 20/04/17 09:39, Daniele Ceraolo Spurio wrote:



On 20/04/17 04:33, Joonas Lahtinen wrote:

On ke, 2017-04-19 at 11:35 -0700, Michel Thierry wrote:

From: Arun Siluvery 

GuC expects a list of registers from the driver which are saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save
and
restore. This is not an issue in case of engine reset as driver
initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including
resubmission
of hung workload), it is necessary to provide this list, otherwise
GuC won't
be able to schedule further workloads after a reset. This is the minimal
set of registers identified for things to work as expected but if we see
any new issues, this register list can be expanded.

In order to not lose any existing workarounds, we have to let GuC know
the registers and their values. These will be reapplied after the reset.
Note that we can't just read the current value because most of these
registers are masked (so we have a workaround for a workaround for a
workaround).

v2: REGSET_MASKED is too difficult for GuC, use
REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers
(Daniele).

v3: Workarounds added only once during _init_workarounds also have to
be restored, or we risk losing them after internal GuC reset
(Daniele).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 





@@ -732,15 +755,16 @@ static int gen9_init_workarounds(struct
intel_engine_cs *engine)

 int ret;


 /* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk */
-I915_WRITE(GEN9_CSFE_CHICKEN1_RCS,
_MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));
+I915_GUC_REG_WRITE(GEN9_CSFE_CHICKEN1_RCS,
+
_MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));


To make grepping easier, how about?

I915_WRITE(GUC_REG(GEN9_CSFE_CHICKEN1_RCS),
   ...);

Regards, Joonas



GUC_REG makes it sound like it is somehow related to GuC, while it
isn't, we just want GuC to restore its value. What about GUC_REG_RESTORE?



Honestly, I don't care about names; pick one and I'll add it.
Just a reminder, we not only need the reg offset, we want to save the 
value too.


-Michel
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 05/20] drm/i915/tdr: Add support for per engine reset recovery

2017-04-20 Thread Michel Thierry

On 19/04/17 03:49, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 01:23:20PM -0700, Michel Thierry wrote:

From: Arun Siluvery 

This change implements support for per-engine reset as an initial, less
intrusive hang recovery option to be attempted before falling back to the
legacy full GPU reset recovery mode if necessary. This is only supported
from Gen8 onwards.

Hangchecker determines which engines are hung and invokes error handler to
recover from it. Error handler schedules recovery for each of those engines
that are hung. The recovery procedure is as follows,
 - identifies the request that caused the hang and it is dropped
 - force engine to idle: this is done by issuing a reset request
 - reset and re-init engine
 - restart submissions to the engine

If engine reset fails then we fall back to heavy weight full gpu reset
which resets all engines and reinitializes complete state of HW and SW.

v2: Rebase.
v3: s/*engine_reset*/*reset_engine*/; freeze engine and irqs before
calling i915_gem_reset_engine (Chris).
v4: Rebase, modify i915_gem_reset_prepare to use a ring mask and
reuse the function for reset_engine.
v5: intel_reset_engine_start/cancel instead of request/unrequest_reset.
v6: Clean up reset_engine function to not require mutex, i.e. no need to call
revoke/restore_fences and _retire_requests (Chris).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 76 --
 drivers/gpu/drm/i915/i915_drv.h | 12 +++-
 drivers/gpu/drm/i915/i915_gem.c | 97 +++--
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 drivers/gpu/drm/i915/intel_uncore.c | 20 +++
 5 files changed, 158 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index e03d0643dbd6..634893cd93b3 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1810,7 +1810,7 @@ void i915_reset(struct drm_i915_private *dev_priv)

pr_notice("drm/i915: Resetting chip after gpu hang\n");
disable_irq(dev_priv->drm.irq);
-   ret = i915_gem_reset_prepare(dev_priv);
+   ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
if (ret) {
DRM_ERROR("GPU recovery failed\n");
intel_gpu_reset(dev_priv, ALL_ENGINES);
@@ -1852,7 +1852,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
i915_queue_hangcheck(dev_priv);

 finish:
-   i915_gem_reset_finish(dev_priv);
+   i915_gem_reset_finish(dev_priv, ALL_ENGINES);
enable_irq(dev_priv->drm.irq);

 wakeup:
@@ -1871,11 +1871,79 @@ void i915_reset(struct drm_i915_private *dev_priv)
  *
  * Reset a specific GPU engine. Useful if a hang is detected.
  * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ *  - identifies the request that caused the hang and it is dropped
+ *  - force engine to idle: this is done by issuing a reset request
+ *  - reset engine
+ *  - restart submissions to the engine
  */
 int i915_reset_engine(struct intel_engine_cs *engine)
 {
-   /* FIXME: replace me with engine reset sequence */
-   return -ENODEV;
+   int ret;
+   struct drm_i915_private *dev_priv = engine->i915;
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+   GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
+
+   DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
+
+   /*
+* We need to first idle the engine by issuing a reset request,
+* then perform soft reset and re-initialize hw state, for all of
+* this GT power need to be awake so ensure it does throughout the
+* process
+*/
+   intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);


Hmm, what path did we take to get here without taking rpm? I thought I
had fixed the last offender.



Too many rebases... As you say, this is no longer needed after 
1604a86d08053 "drm/i915: Extend rpm wakelock during i915_handle_error()"



+   disable_irq(dev_priv->drm.irq);


I am 99% certain that we don't need to disable_irq() now for per-engine
reset... I'd keep it in the global reset as simple paranoia.



100% correct.


+   ret = i915_gem_reset_prepare_engine(engine);
+   if (ret) {
+   DRM_ERROR("Previous reset failed - promote to full reset\n");
+   goto error;
+   }
+
+   /*
+* the request that caused the hang is stuck on elsp, identify the
+* active request and drop it, adjust head to skip the offending
+* request to resume executing remaining requests in the queue.
+*/


Hmm. Interesting. This relies on i915_gem_retire_requests() (i.e.
struct_mutex) to skip replaying innocent requests, but here we should be
asserting that we do 

Re: [Intel-gfx] [PATCH v6 13/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-21 Thread Michel Thierry



On 20/04/17 10:29, Michel Thierry wrote:



On 20/04/17 09:39, Daniele Ceraolo Spurio wrote:



On 20/04/17 04:33, Joonas Lahtinen wrote:

On ke, 2017-04-19 at 11:35 -0700, Michel Thierry wrote:

From: Arun Siluvery 

GuC expects a list of registers from the driver which are
saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save
and
restore. This is not an issue in case of engine reset as driver
initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including
resubmission
of hung workload), it is necessary to provide this list, otherwise
GuC won't
be able to schedule further workloads after a reset. This is the
minimal
set of registers identified for things to work as expected but if we
see
any new issues, this register list can be expanded.

In order to not lose any existing workarounds, we have to let GuC know
the registers and their values. These will be reapplied after the reset.
Note that we can't just read the current value because most of these
registers are masked (so we have a workaround for a workaround for a
workaround).

v2: REGSET_MASKED is too difficult for GuC, use
REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers
(Daniele).

v3: Workarounds added only once during _init_workarounds also have to
be restored, or we risk losing them after internal GuC reset
(Daniele).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 





@@ -732,15 +755,16 @@ static int gen9_init_workarounds(struct
intel_engine_cs *engine)

 int ret;


 /* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk */
-I915_WRITE(GEN9_CSFE_CHICKEN1_RCS,
_MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));
+I915_GUC_REG_WRITE(GEN9_CSFE_CHICKEN1_RCS,
+
_MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));


To make grepping easier, how about?

I915_WRITE(GUC_REG(GEN9_CSFE_CHICKEN1_RCS),
   ...);

Regards, Joonas



GUC_REG makes it sound like it is somehow related to GuC, while it
isn't, we just want GuC to restore its value. What about GUC_REG_RESTORE?



Honestly, I don't care about names; pick one and I'll add it.
Just a reminder, we not only need the reg offset, we want to save the
value too.



I915_WRITE_GUC_RESTORE(reg, value) ?

That would be in line with the others we have, e.g. I915_WRITE_FW, 
I915_WRITE_CTL, I915_WRITE_HEAD/TAIL.


-Michel
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 13/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-21 Thread Michel Thierry



On 21/04/17 13:21, Chris Wilson wrote:

On Fri, Apr 21, 2017 at 01:10:37PM -0700, Daniele Ceraolo Spurio wrote:



On 21/04/17 13:07, Michel Thierry wrote:



On 20/04/17 10:29, Michel Thierry wrote:



On 20/04/17 09:39, Daniele Ceraolo Spurio wrote:



On 20/04/17 04:33, Joonas Lahtinen wrote:

On ke, 2017-04-19 at 11:35 -0700, Michel Thierry wrote:

From: Arun Siluvery 

GuC expects a list of registers from the driver which are
saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save
and
restore. This is not an issue in case of engine reset as driver
initializes
most of them following an engine reset, but in case of media reset
(aka
watchdog reset) which is completely internal to GuC (including
resubmission
of hung workload), it is necessary to provide this list, otherwise
GuC won't
be able to schedule further workloads after a reset. This is the
minimal
set of registers identified for things to work as expected but if we
see
any new issues, this register list can be expanded.

In order to not lose any existing workarounds, we have to let GuC
know
the registers and their values. These will be reapplied after the reset.
Note that we can't just read the current value because most of these
registers are masked (so we have a workaround for a workaround for a
workaround).

v2: REGSET_MASKED is too difficult for GuC, use
REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers
(Daniele).

v3: Workarounds added only once during _init_workarounds also have to
be restored, or we risk losing them after internal GuC reset
(Daniele).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 





@@ -732,15 +755,16 @@ static int gen9_init_workarounds(struct
intel_engine_cs *engine)

int ret;


/* WaConextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk */
-I915_WRITE(GEN9_CSFE_CHICKEN1_RCS,
_MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));
+I915_GUC_REG_WRITE(GEN9_CSFE_CHICKEN1_RCS,
+
_MASKED_BIT_ENABLE(GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE));


To make grepping easier, how about?

   I915_WRITE(GUC_REG(GEN9_CSFE_CHICKEN1_RCS),
  ...);

Regards, Joonas



GUC_REG makes it sound like it is somehow related to GuC, while it
isn't, we just want GuC to restore its value. What about
GUC_REG_RESTORE?



Honestly, I don't care about names; pick one and I'll add it.
Just a reminder, we not only need the reg offset, we want to save the
value too.



I915_WRITE_GUC_RESTORE(reg, value) ?

That would be in line with the others we have, e.g. I915_WRITE_FW,
I915_WRITE_CTL, I915_WRITE_HEAD/TAIL.


I915_WRITE_FW is not the same class as I915_WRITE_CTL/HEAD/TAIL, and I
can say from experience the I915_*_CTL/HEAD/TAIL were a mistake (special
casing one particular access to the ring mmio, but we often deviate from
that pattern).

Looking at the above I see you are falling for the same trap as the ring
shorthand... So are you sure the convenience will not be lost later? And
in particular avoid using I915_WRITE_*() naming style as I would rather
that was earmarked for the different mmio accessors.


Ok, then can we follow the pattern of the other workarounds & whitelist reg 
code?


E.g. WA_REG_GUC_RESTORE or WA_MMIO_REG_GUC_RESTORE (to make it clearer 
that these are not registers in the ctx image).




___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v6 05/20] drm/i915/tdr: Add support for per engine reset recovery

2017-04-24 Thread Michel Thierry



On 20/04/17 17:17, Michel Thierry wrote:

Hmm. Interesting. This relies on i915_gem_retire_requests() (i.e.
struct_mutex) to skip replaying innocent requests, but here we should be
asserting that we do have the hung request.

i.e.
request = i915_gem_find_active_request(engine);
if (!request)
goto skip.

Bonus points for tying that into i915_gem_reset_prepare_engine() so that
we only search for the active_request once.



Will this do it?
https://patchwork.freedesktop.org/patch/152494/  (ignore the DRM_ERROR I 
still have to change)


I'm not sure about reusing the active request in full-reset (what if we 
have more than one engine hung?).


Thanks
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH] drm/i915: Report request restarts for both execlists/guc

2017-04-25 Thread Michel Thierry

On 25/04/17 05:30, Chris Wilson wrote:

On Tue, Apr 25, 2017 at 01:21:47PM +0100, Tvrtko Ursulin wrote:


On 25/04/2017 11:38, Chris Wilson wrote:

As we now share the execlist_port[] tracking for both execlists/guc, we
can reset the inflight count on both and report which requests are being
restarted.



Thanks, one less patch for me (and I arrived late to the party, I see 
it's already merged).



Suggested-by: Michel Thierry 
Signed-off-by: Chris Wilson 
Cc: Michel Thierry 
Cc: Mika Kuoppala 
Cc: Tvrtko Ursulin 
---
drivers/gpu/drm/i915/intel_lrc.c | 29 -
1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d3612969098f..961f4a2ad498 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1147,14 +1147,11 @@ static int intel_init_workaround_bb(struct 
intel_engine_cs *engine)
return ret;
}

-static u32 port_seqno(struct execlist_port *port)
-{
-return port->request ? port->request->global_seqno : 0;
-}
-
static int gen8_init_common_ring(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
+struct execlist_port *port = engine->execlist_port;
+unsigned int n;
int ret;

ret = intel_mocs_init_engine(engine);
@@ -1175,16 +1172,22 @@ static int gen8_init_common_ring(struct intel_engine_cs 
*engine)

/* After a GPU reset, we may have requests to replay */
clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
-DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n",
- engine->name,
- port_seqno(&engine->execlist_port[0]),
- port_seqno(&engine->execlist_port[1]));
-engine->execlist_port[0].count = 0;
-engine->execlist_port[1].count = 0;
-execlists_submit_ports(engine);
+
+for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) {
+if (!port[n].request)
+break;


At some point we'll maybe want to start thinking about the
for_each_port_request or something.


Something.


+
+DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n",
+ engine->name, n,
+ port[n].request->global_seqno);
+
+/* Discard the current inflight count */
+port[n].count = 0;
}

+if (!i915.enable_guc_submission && !execlists_elsp_idle(engine))
+execlists_submit_ports(engine);
+
return 0;
}




Looks okay to me. Someone has plans to start using counts in guc mode?


Spoilers. I moved the submission out of a few locks to reduce lock
contention (queued_spin_lock_slowpath exists for guc!), makes the CPU
numbers look better, but bxt/guc is still 3x higher latency. I just hope
it is broken firmware.


Beating a dead horse?
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/2] drm/i915/guc: Fix sleep under spinlock during reset

2017-04-27 Thread Michel Thierry


On 12/04/17 09:22, Michel Thierry wrote:

On 12/04/17 08:58, Chris Wilson wrote:

On Wed, Apr 12, 2017 at 04:48:42PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Looks like intel_guc_reset had the ability to sleep under the
uncore spinlock since forever but it wasn't detected until the
recent changes annotated the wait for register with might_sleep.

I have fixed it by removing holding of the uncore spinlock over
the call to gen6_hw_domain_reset, since I do not see that is
really needed. But there is always a possibility I am missing
some nasty detail so please double check.


Afaik, no we are not using the uncore.lock here to serialise resets so
yes we should be safe in dropping it.

Will the guc be coming under the same hw semaphore as gen8 per-engine
resets?


A bit unrelated, but should intel_guc_reset be intel_reset_guc instead?
Here we're trying to reset the microcontroller, not asking guc to do a
reset.


Ping?

Anyone unlucky enough to be using GuC submission should be seeing this 
warning when the firmware has to be reloaded (for example after any 
i-g-t hang test).


I still think the function should be renamed to _reset_guc though, since 
it's the hw resetting the guc, not the other way around.


Acked-by: Michel Thierry 
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 1/2] drm/i915/guc: Fix sleep under spinlock during reset

2017-04-27 Thread Michel Thierry


On 27/04/17 11:20, Tvrtko Ursulin wrote:


On 27/04/2017 19:14, Michel Thierry wrote:

On 12/04/17 09:22, Michel Thierry wrote:

On 12/04/17 08:58, Chris Wilson wrote:

On Wed, Apr 12, 2017 at 04:48:42PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Looks like intel_guc_reset had the ability to sleep under the
uncore spinlock since forever but it wasn't detected until the
recent changes annotated the wait for register with might_sleep.

I have fixed it by removing holding of the uncore spinlock over
the call to gen6_hw_domain_reset, since I do not see that is
really needed. But there is always a possibility I am missing
some nasty detail so please double check.


Afaik, no we are not using the uncore.lock here to serialise resets so
yes we should be safe in dropping it.

Will the guc be coming under the same hw semaphore as gen8 per-engine
resets?


A bit unrelated, but should intel_guc_reset be intel_reset_guc instead?
Here we're trying to reset the microcontroller, not asking guc to do a
reset.


Ping?

Anyone unlucky enough to be using GuC submission should be seeing this
warning when the firmware has to be reloaded (for example after any
i-g-t hang test).

I still think the function should be renamed to _reset_guc though, since
it's the hw resetting the guc, not the other way around.

Acked-by: Michel Thierry 


Thanks! Now just exercise restraint in suggesting bikesheds and if
someone can provide an r-b we could merge this. ;) (To be read as - let's
leave the renaming for a follow up work since this fix is not to blame
for the objectionable name.)

Regards,



_Invoking GuC experts_

Agreed, and since I'm the one that will tell the guc to perform a reset, 
I can include the bikeshed in my patches.



___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 12/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

GuC expects a list of registers from the driver which are saved/restored
during engine reset. The type of value to be saved is controlled by
flags. We provide a minimal set of registers that we want GuC to save and
restore. This is not an issue in case of engine reset as driver initializes
most of them following an engine reset, but in case of media reset (aka
watchdog reset) which is completely internal to GuC (including resubmission
of hung workload), it is necessary to provide this list, otherwise GuC won't
be able to schedule further workloads after a reset. This is the minimal
set of registers identified for things to work as expected but if we see
any new issues, this register list can be expanded.

In order to not lose any existing workarounds, we have to let GuC know
the registers and their values. These will be reapplied after the reset.
Note that we can't just read the current value because most of these
registers are masked (so we have a workaround for a workaround for a
workaround).

v2: REGSET_MASKED is too difficult for GuC, use REGSET_SAVE_DEFAULT_VALUE
and current value from RING_MODE reg instead; no need to preserve
head/tail either, be extra paranoid and save whitelisted registers (Daniele).

v3: Workarounds added only once during _init_workarounds also have to
be restored, or we risk losing them after internal GuC reset
(Daniele).

v4: Rename macro used to keep track of the workaround registers we will
have to restore after reset (s/I915_GUC_REG_WRITE/WA_REG_WR_GUC_RESTORE).

Cc: Daniele Ceraolo Spurio 
Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h|  3 ++
 drivers/gpu/drm/i915/i915_guc_submission.c | 68 +-
 drivers/gpu/drm/i915/intel_engine_cs.c | 65 +++-
 3 files changed, 114 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b00ea523a634..c9ff7f726d47 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1913,7 +1913,10 @@ struct i915_wa_reg {
 
 struct i915_workarounds {
struct i915_wa_reg reg[I915_MAX_WA_REGS];
+   /* list of registers (and their values) that GuC will have to restore */
+   struct i915_wa_reg guc_reg[GUC_REGSET_MAX_REGISTERS];
u32 count;
+   u32 guc_count;
u32 hw_whitelist_count[I915_NUM_ENGINES];
 };
 
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 2cfe5d3b7795..4d1784c84fd4 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1001,6 +1001,24 @@ static void guc_policies_init(struct guc_policies 
*policies)
policies->is_valid = 1;
 }
 
+/*
+ * In this macro it is highly unlikely to exceed max value but even if we did
+ * it is not an error so just throw a warning and continue. Only side effect
+ * in continuing further means some registers won't be added to save/restore
+ * list.
+ */
+#define GUC_ADD_MMIO_REG_ADS(node, reg_addr, _flags, defvalue) \
+   do {\
+   u32 __count = node->number_of_registers;\
+   if (WARN_ON(__count >= GUC_REGSET_MAX_REGISTERS))   \
+   continue;   \
+   node->registers[__count].offset = reg_addr.reg; \
+   node->registers[__count].flags = (_flags);  \
+   if (defvalue)   \
+   node->registers[__count].value = (defvalue);\
+   node->number_of_registers++;\
+   } while (0)
+
 static int guc_ads_create(struct intel_guc *guc)
 {
struct drm_i915_private *dev_priv = guc_to_i915(guc);
@@ -1014,6 +1032,7 @@ static int guc_ads_create(struct intel_guc *guc)
u8 reg_state_buffer[GUC_S3_SAVE_SPACE_PAGES * PAGE_SIZE];
} __packed *blob;
struct intel_engine_cs *engine;
+   struct i915_workarounds *workarounds = &dev_priv->workarounds;
enum intel_engine_id id;
u32 base;
 
@@ -1033,6 +1052,47 @@ static int guc_ads_create(struct intel_guc *guc)
 
/* MMIO reg state */
for_each_engine(engine, dev_priv, id) {
+   u32 i;
+   struct guc_mmio_regset *eng_reg =
+   &blob->reg_state.engine_reg[engine->guc_id];
+
+   /*
+* Provide a list of registers to be saved/restored during gpu
+* reset. This is mainly required for Media reset (aka watchdog
+* timeout) which is completely under the control of GuC
+* (resubmission of hung workload is handled inside GuC).
+  

[Intel-gfx] [PATCH v7 04/20] drm/i915: Skip reset request if there is one already

2017-04-27 Thread Michel Thierry
From: Mika Kuoppala 

To perform engine reset we first disable engine to capture its state. This
is done by issuing a reset request. Because we are reusing existing
infrastructure, again when we actually reset an engine, reset function
checks engine mask and issues reset request again which is unnecessary. To
avoid this we check if the engine is already prepared, if so we just exit
from that point.

Cc: Chris Wilson 
Signed-off-by: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/intel_uncore.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index 3ebba6b2dd74..120fb440bb8b 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1686,10 +1686,15 @@ int intel_wait_for_register(struct drm_i915_private 
*dev_priv,
 static int gen8_reset_engine_start(struct intel_engine_cs *engine)
 {
struct drm_i915_private *dev_priv = engine->i915;
+   const i915_reg_t reset_ctrl = RING_RESET_CTL(engine->mmio_base);
+   const u32 ready = RESET_CTL_REQUEST_RESET | RESET_CTL_READY_TO_RESET;
int ret;
 
-   I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
- _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
+   /* If engine has been already prepared, we can shortcut here */
+   if ((I915_READ_FW(reset_ctrl) & ready) == ready)
+   return 0;
+
+   I915_WRITE_FW(reset_ctrl, _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
 
ret = intel_wait_for_register_fw(dev_priv,
 RING_RESET_CTL(engine->mmio_base),
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 02/20] drm/i915: Modify error handler for per engine hang recovery

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

This is a preparatory patch which modifies error handler to do per engine
hang recovery. The actual patch which implements this sequence follows
later in the series. The aim is to prepare existing recovery function to
adapt to this new function where applicable (which fails at this point
because core implementation is lacking) and continue recovery using legacy
full gpu reset.

A helper function is also added to query the availability of engine
reset.

The error events that are used to notify the user of a reset are adapted
to engine reset such that this doesn't break users listening to these
events. In legacy we report an error event, a reset event before resetting
the gpu and a reset done event marking the completion of reset. The same
behaviour is adapted but reset event is only dispatched once even when
multiple engines are hung. Finally once reset is complete we send reset
done event as usual.

Note that this implementation of engine reset is for i915 directly
submitting to the ELSP, where the driver manages the hang detection,
recovery and resubmission. With GuC submission these tasks are shared
between driver and firmware; i915 will still be responsible for detecting a
hang, and when it does it will have to request GuC to reset that Engine and
remind the firmware about the outstanding submissions. This will be
added in different patch.

v2: rebase, advertise engine reset availability in platform definition,
add note about GuC submission.
v3: s/*engine_reset*/*reset_engine*/. (Chris)
Handle reset as 2 level resets, by first going to engine only and falling
back to full/chip reset as needed, i.e. reset_engine will need the
struct_mutex.
v4: Pass the engine mask to i915_reset. (Chris)
v5: Rebase, update selftests.
v6: Rebase, prepare for mutex-less reset engine.
v7: Pass reset_engine mask as a function parameter, and iterate over the
engine mask for reset_engine. (Chris)

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Ian Lister 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 15 +++
 drivers/gpu/drm/i915/i915_drv.h |  3 +++
 drivers/gpu/drm/i915/i915_irq.c | 33 ++---
 drivers/gpu/drm/i915/i915_pci.c |  5 -
 drivers/gpu/drm/i915/intel_uncore.c | 11 +++
 5 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c7d68e789642..48c8b69d9bde 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1800,6 +1800,8 @@ void i915_reset(struct drm_i915_private *dev_priv)
if (!test_bit(I915_RESET_HANDOFF, &error->flags))
return;
 
+   DRM_DEBUG_DRIVER("resetting chip\n");
+
/* Clear any previous failed attempts at recovery. Time to try again. */
if (!i915_gem_unset_wedged(dev_priv))
goto wakeup;
@@ -1863,6 +1865,19 @@ void i915_reset(struct drm_i915_private *dev_priv)
goto finish;
 }
 
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ */
+int i915_reset_engine(struct intel_engine_cs *engine)
+{
+   /* FIXME: replace me with engine reset sequence */
+   return -ENODEV;
+}
+
 static int i915_pm_suspend(struct device *kdev)
 {
struct pci_dev *pdev = to_pci_dev(kdev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e06af46f5a57..ab7e68626c49 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -814,6 +814,7 @@ struct intel_csr {
func(has_ddi); \
func(has_decoupled_mmio); \
func(has_dp_mst); \
+   func(has_reset_engine); \
func(has_fbc); \
func(has_fpga_dbg); \
func(has_full_ppgtt); \
@@ -3019,6 +3020,8 @@ extern void i915_driver_unload(struct drm_device *dev);
 extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
 extern void i915_reset(struct drm_i915_private *dev_priv);
+extern int i915_reset_engine(struct intel_engine_cs *engine);
+extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index fd97fe00cd0d..3a59ef1367ec 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2635,11 +2635,13 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 /**
  * i915_reset_and_wakeup - do process context error handling wo

[Intel-gfx] [PATCH v7 07/20] drm/i915: Export per-engine reset count info to debugfs

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

A new variable is added to export the reset counts to debugfs, this
includes full gpu reset and engine reset count. This is useful for tests
where they are expected to trigger reset; these counts are checked before
and after the test to ensure the same.

v2: Include reset engine count in i915_engine_info too (Chris).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_debugfs.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 870c470177b5..6444c1a9bd22 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1403,6 +1403,23 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
return 0;
 }
 
+static int i915_reset_info(struct seq_file *m, void *unused)
+{
+   struct drm_i915_private *dev_priv = node_to_i915(m->private);
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+
+   seq_printf(m, "full gpu reset = %u\n", i915_reset_count(error));
+
+   for_each_engine(engine, dev_priv, id) {
+   seq_printf(m, "%s = %u\n", engine->name,
+  i915_reset_engine_count(error, engine));
+   }
+
+   return 0;
+}
+
 static int ironlake_drpc_info(struct seq_file *m)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
@@ -3242,6 +3259,7 @@ static int i915_display_info(struct seq_file *m, void 
*unused)
 static int i915_engine_info(struct seq_file *m, void *unused)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
struct intel_engine_cs *engine;
enum intel_engine_id id;
 
@@ -3265,6 +3283,8 @@ static int i915_engine_info(struct seq_file *m, void 
*unused)
   engine->hangcheck.seqno,
   jiffies_to_msecs(jiffies - 
engine->hangcheck.action_timestamp),
   engine->timeline->inflight_seqnos);
+   seq_printf(m, "\tReset count: %d\n",
+  i915_reset_engine_count(error, engine));
 
rcu_read_lock();
 
@@ -4777,6 +4797,7 @@ static const struct drm_info_list i915_debugfs_list[] = {
{"i915_huc_load_status", i915_huc_load_status_info, 0},
{"i915_frequency_info", i915_frequency_info, 0},
{"i915_hangcheck_info", i915_hangcheck_info, 0},
+   {"i915_reset_info", i915_reset_info, 0},
{"i915_drpc_info", i915_drpc_info, 0},
{"i915_emon_status", i915_emon_status, 0},
{"i915_ring_freq_table", i915_ring_freq_table, 0},
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 00/20] Gen8+ engine-reset

2017-04-27 Thread Michel Thierry
These patches add the reset-engine feature from Gen8. This is also
referred to as Timeout detection and recovery (TDR). This complements
the full gpu reset feature available in i915, but it resets only a
particular engine instead of all engines, thus providing a lightweight
engine reset and recovery mechanism.

Thanks to recent changes merged, this implementation is now not only for
execlists, but for GuC based submission too; it is still limited from
Gen8 onwards. I have also included the changes for watchdog timeout
detection. The GuC related patches are functional, but can be seen as RFC.

Timeout detection relies on the existing hangcheck, which remains the same;
main changes are to the recovery mechanism. Once we detect a hang on a
particular engine we identify the request that caused the hang, skip the
request and adjust head pointers to allow the execution to proceed
normally. After some cleanup, submissions are restarted to process
remaining work queued to that engine.

If engine reset fails to recover engine correctly then we fallback to full
gpu reset.

We can argue about the effectiveness of reset-engine vs full reset when
more than one ring is hung, but the benefits of just resetting one engine
are reduced when the driver has to do it multiple times.

v2: ELSP queue request tracking and reset path changes to handle incomplete
requests during reset. Thanks to Chris Wilson for providing these patches.

v3: Let the waiter keep handling the full gpu reset if it already has the
lock; point out that GuC submission needs a different method to restart
workloads after the engine reset completes.

v4: Handle reset as 2 level resets, by first going to engine only and falling
back to full/chip reset as needed, i.e. reset_engine will need the
struct_mutex.

v5: Rebased after reset flag split in 2, add GuC support, include watchdog
detection patches, addressing comments from prev RFC.

v6: Mutex-less reset engine. Updates in watchdog abi and guc whitelist &
register-restore fixes (including an old patch from Daniele).

v7: Removed leftovers from v5; review comments; ability to cancel the reset
if there's no active request.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Cc: Daniele Ceraolo Spurio 

Arun Siluvery (7):
  drm/i915: Update i915.reset to handle engine resets
  drm/i915: Modify error handler for per engine hang recovery
  drm/i915: Add support for per engine reset recovery
  drm/i915: Add engine reset count to error state
  drm/i915: Export per-engine reset count info to debugfs
  drm/i915: Enable Engine reset and recovery support
  drm/i915/guc: Provide register list to be saved/restored during engine
reset

Daniele Ceraolo Spurio (1):
  drm/i915/guc: fix mmio whitelist mmio_start offset and add reminder

Michel Thierry (11):
  drm/i915: Cancel reset-engine if we couldn't find an active request
  drm/i915: Add engine reset count in get-reset-stats ioctl
  drm/i915/selftests: reset engine self tests
  drm/i915/guc: Rename the function that resets the GuC
  drm/i915/guc: Add support for reset engine using GuC commands
  drm/i915: Watchdog timeout: Pass GuC shared data structure during
param load
  drm/i915: Watchdog timeout: IRQ handler for gen8+
  drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+
  drm/i915: Watchdog timeout: DRM kernel interface to set the timeout
  drm/i915: Watchdog timeout: Include threshold value in error state
  drm/i915: Watchdog timeout: Export media reset count from GuC to
debugfs

Mika Kuoppala (1):
  drm/i915: Skip reset request if there is one already

 drivers/gpu/drm/i915/i915_debugfs.c  |  43 +++
 drivers/gpu/drm/i915/i915_drv.c  | 109 +++-
 drivers/gpu/drm/i915/i915_drv.h  |  67 +-
 drivers/gpu/drm/i915/i915_gem.c  | 116 ++---
 drivers/gpu/drm/i915/i915_gem_context.c  | 109 +++-
 drivers/gpu/drm/i915/i915_gem_context.h  |   4 +
 drivers/gpu/drm/i915/i915_gem_request.c  |   2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c|  14 +-
 drivers/gpu/drm/i915/i915_guc_submission.c   | 136 ++--
 drivers/gpu/drm/i915/i915_irq.c  |  45 ++-
 drivers/gpu/drm/i915/i915_params.c   |   6 +-
 drivers/gpu/drm/i915/i915_params.h   |   2 +-
 drivers/gpu/drm/i915/i915_pci.c  |   5 +-
 drivers/gpu/drm/i915/i915_reg.h  |   6 +
 drivers/gpu/drm/i915/intel_engine_cs.c   |  65 +++---
 drivers/gpu/drm/i915/intel_guc_fwif.h|  27 +++-
 drivers/gpu/drm/i915/intel_guc_loader.c  |  11 ++
 drivers/gpu/drm/i915/intel_hangcheck.c   |  13 +-
 drivers/gpu/drm/i915/intel_lrc.c | 155 ++-
 drivers/gpu/drm/i915/intel_ringbuffer.h  |   8 ++
 drivers/gpu/drm/i915/intel_uc.c  |   4 +-
 drivers/gpu/drm/i915/intel_uc.h  | 

[Intel-gfx] [PATCH v7 09/20] drm/i915: Add engine reset count in get-reset-stats ioctl

2017-04-27 Thread Michel Thierry
Users/tests relying on the total reset count will start seeing a smaller
number since most of the hangs can be handled by engine reset.
Note that if engine x is reset, context a running on engine y will be unaware
and unaffected.

To start the discussion, include just a total engine reset count. If it
is deemed useful, it can be extended to report each engine separately.

v2: s/engine_reset/reset_engine/, use union in uapi to not break compatibility.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_gem_context.c | 14 +++---
 include/uapi/drm/i915_drm.h |  6 +-
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index d46a69d3d390..e98d9daa3f00 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1074,9 +1074,11 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device 
*dev,
struct drm_i915_private *dev_priv = to_i915(dev);
struct drm_i915_reset_stats *args = data;
struct i915_gem_context *ctx;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
int ret;
 
-   if (args->flags || args->pad)
+   if (args->flags)
return -EINVAL;
 
if (args->ctx_id == DEFAULT_CONTEXT_HANDLE && !capable(CAP_SYS_ADMIN))
@@ -1092,10 +1094,16 @@ int i915_gem_context_reset_stats_ioctl(struct 
drm_device *dev,
return PTR_ERR(ctx);
}
 
-   if (capable(CAP_SYS_ADMIN))
+   if (capable(CAP_SYS_ADMIN)) {
args->reset_count = i915_reset_count(&dev_priv->gpu_error);
-   else
+   for_each_engine(engine, dev_priv, id)
+   args->reset_engine_count +=
+   i915_reset_engine_count(&dev_priv->gpu_error,
+   engine);
+   } else {
args->reset_count = 0;
+   args->reset_engine_count = 0;
+   }
 
args->batch_active = ctx->guilty_count;
args->batch_pending = ctx->active_count;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f24a80d2d42e..fadedefba6db 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1278,7 +1278,11 @@ struct drm_i915_reset_stats {
/* Number of batches lost pending for execution, for this context */
__u32 batch_pending;
 
-   __u32 pad;
+   union {
+   __u32 pad;
+   /* Engine resets since boot/module reload, for all contexts */
+   __u32 reset_engine_count;
+   };
 };
 
 struct drm_i915_gem_userptr {
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 13/20] drm/i915/guc: Rename the function that resets the GuC

2017-04-27 Thread Michel Thierry
intel_guc_reset sounds more like the microcontroller is the one performing
a reset, while in this case it is the opposite. intel_reset_guc not only
makes it clearer, it follows the other intel_reset functions available.

Cc: Tvrtko Ursulin 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h | 2 +-
 drivers/gpu/drm/i915/intel_uc.c | 4 ++--
 drivers/gpu/drm/i915/intel_uncore.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c9ff7f726d47..e9e04c92a376 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3031,7 +3031,7 @@ extern int i915_reset_engine(struct intel_engine_cs 
*engine);
 extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
 extern int intel_reset_engine_start(struct intel_engine_cs *engine);
 extern void intel_reset_engine_cancel(struct intel_engine_cs *engine);
-extern int intel_guc_reset(struct drm_i915_private *dev_priv);
+extern int intel_reset_guc(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
index 900e3767a899..bad282b6c886 100644
--- a/drivers/gpu/drm/i915/intel_uc.c
+++ b/drivers/gpu/drm/i915/intel_uc.c
@@ -46,9 +46,9 @@ static int __intel_uc_reset_hw(struct drm_i915_private 
*dev_priv)
int ret;
u32 guc_status;
 
-   ret = intel_guc_reset(dev_priv);
+   ret = intel_reset_guc(dev_priv);
if (ret) {
-   DRM_ERROR("GuC reset failed, ret = %d\n", ret);
+   DRM_ERROR("Reset GuC failed, ret = %d\n", ret);
return ret;
}
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c 
b/drivers/gpu/drm/i915/intel_uncore.c
index 120fb440bb8b..00251d83e7bd 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1792,7 +1792,7 @@ bool intel_has_reset_engine(struct drm_i915_private 
*dev_priv)
i915.reset == 2);
 }
 
-int intel_guc_reset(struct drm_i915_private *dev_priv)
+int intel_reset_guc(struct drm_i915_private *dev_priv)
 {
int ret;
 
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 16/20] drm/i915: Watchdog timeout: IRQ handler for gen8+

2017-04-27 Thread Michel Thierry
*** General ***

Watchdog timeout (or "media engine reset") is a feature that allows
userland applications to enable hang detection on individual batch buffers.
The detection mechanism itself is mostly bound to the hardware and the only
thing that the driver needs to do to support this form of hang detection
is to implement the interrupt handling support as well as watchdog command
emission before and after the emitted batch buffer start instruction in the
ring buffer.

The principle of the hang detection mechanism is as follows:

1. Once the decision has been made to enable watchdog timeout for a
particular batch buffer and the driver is in the process of emitting the
batch buffer start instruction into the ring buffer it also emits a
watchdog timer start instruction before and a watchdog timer cancellation
instruction after the batch buffer start instruction in the ring buffer.

2. Once the GPU execution reaches the watchdog timer start instruction
the hardware watchdog counter is started by the hardware. The counter
keeps counting until either reaching a previously configured threshold
value or the timer cancellation instruction is executed.

2a. If the counter reaches the threshold value the hardware fires a
watchdog interrupt that is picked up by the watchdog interrupt handler.
This means that a hang has been detected and the driver needs to deal with
it the same way it would deal with an engine hang detected by the periodic
hang checker. The only difference between the two is that we already blamed
the active request (to ensure an engine reset).

2b. If the batch buffer completes and the execution reaches the watchdog
cancellation instruction before the watchdog counter reaches its
threshold value the watchdog is cancelled and nothing more comes of it.
No hang is detected.

Note about future interaction with preemption: Preemption could happen
in a command sequence prior to watchdog counter getting disabled,
resulting in watchdog being triggered following preemption (e.g. when
watchdog had been enabled in the low priority batch). The driver will
need to explicitly disable the watchdog counter as part of the
preemption sequence.

*** This patch introduces: ***

1. IRQ handler code for watchdog timeout allowing direct hang recovery
based on hardware-driven hang detection, which then integrates directly
with the hang recovery path. This is independent of having per-engine reset
or just full gpu reset.

2. Watchdog specific register information.

Currently the render engine and all available media engines support
watchdog timeout (VECS is only supported in GEN9). The specifications allude
to the BCS engine being supported but that is currently not supported by
this commit.

Note that the value to stop the counter is different between render and
non-render engines in GEN8; GEN9 onwards it's the same.

v2: Move irq handler to tasklet, arm watchdog for a 2nd time to check
against false-positives.

v3: Don't use high priority tasklet, use engine_last_submit while
checking for false-positives. From GEN9 onwards, the stop counter bit is
the same for all engines.

v4: Remove unnecessary brackets, use current_seqno to mark the request
as guilty in the hangcheck/capture code.

Signed-off-by: Tomas Elf 
Signed-off-by: Ian Lister 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h |  4 ++
 drivers/gpu/drm/i915/i915_irq.c | 12 +-
 drivers/gpu/drm/i915/i915_reg.h |  6 +++
 drivers/gpu/drm/i915/intel_hangcheck.c  | 13 +--
 drivers/gpu/drm/i915/intel_lrc.c| 69 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++
 6 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cbefcd4b2507..2e1211e25945 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1608,6 +1608,9 @@ struct i915_gpu_error {
 * inspect the bit and do the reset directly, otherwise the worker
 * waits for the struct_mutex.
 *
+* #I915_RESET_WATCHDOG - When hw detects a hang before us, we can use
+* I915_RESET_WATCHDOG to report the hang detection cause accurately.
+*
 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
 * i915_gem_request_alloc(), this bit is checked and the sequence
@@ -1616,6 +1619,7 @@ struct i915_gpu_error {
unsigned long flags;
 #define I915_RESET_BACKOFF 0
 #define I915_RESET_HANDOFF 1
+#define I915_RESET_WATCHDOG2
 #define I915_WEDGED(BITS_PER_LONG - 1)
 
/** Number of times an engine has been reset */
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3a59ef1367ec..662cc3d93a18 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.

[Intel-gfx] [PATCH v7 03/20] drm/i915: Add support for per engine reset recovery

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

This change implements support for per-engine reset as an initial, less
intrusive hang recovery option to be attempted before falling back to the
legacy full GPU reset recovery mode if necessary. This is only supported
from Gen8 onwards.

Hangchecker determines which engines are hung and invokes error handler to
recover from it. Error handler schedules recovery for each of those engines
that are hung. The recovery procedure is as follows,
 - identifies the request that caused the hang and it is dropped
 - force engine to idle: this is done by issuing a reset request
 - reset and re-init engine
 - restart submissions to the engine

If engine reset fails then we fall back to heavy weight full gpu reset
which resets all engines and reinitializes complete state of HW and SW.

v2: Rebase.
v3: s/*engine_reset*/*reset_engine*/; freeze engine and irqs before
calling i915_gem_reset_engine (Chris).
v4: Rebase, modify i915_gem_reset_prepare to use a ring mask and
reuse the function for reset_engine.
v5: intel_reset_engine_start/cancel instead of request/unrequest_reset.
v6: Clean up reset_engine function to not require mutex, i.e. no need to call
revoke/restore_fences and _retire_requests (Chris).
v7: Remove leftovers from v5, i.e. no need to disable irq, hold
forcewake or wakeup the handoff bit (Chris).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 60 ++--
 drivers/gpu/drm/i915/i915_drv.h | 12 +++-
 drivers/gpu/drm/i915/i915_gem.c | 97 +++--
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 drivers/gpu/drm/i915/intel_uncore.c | 20 +++
 5 files changed, 142 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 48c8b69d9bde..ae891529dedd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1810,7 +1810,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
 
pr_notice("drm/i915: Resetting chip after gpu hang\n");
disable_irq(dev_priv->drm.irq);
-   ret = i915_gem_reset_prepare(dev_priv);
+   ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
if (ret) {
DRM_ERROR("GPU recovery failed\n");
intel_gpu_reset(dev_priv, ALL_ENGINES);
@@ -1852,7 +1852,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
i915_queue_hangcheck(dev_priv);
 
 finish:
-   i915_gem_reset_finish(dev_priv);
+   i915_gem_reset_finish(dev_priv, ALL_ENGINES);
enable_irq(dev_priv->drm.irq);
 
 wakeup:
@@ -1871,11 +1871,63 @@ void i915_reset(struct drm_i915_private *dev_priv)
  *
  * Reset a specific GPU engine. Useful if a hang is detected.
  * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ *  - identifies the request that caused the hang and it is dropped
+ *  - force engine to idle: this is done by issuing a reset request
+ *  - reset engine
+ *  - restart submissions to the engine
  */
 int i915_reset_engine(struct intel_engine_cs *engine)
 {
-   /* FIXME: replace me with engine reset sequence */
-   return -ENODEV;
+   int ret;
+   struct drm_i915_private *dev_priv = engine->i915;
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+   GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
+
+   DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
+
+   ret = i915_gem_reset_prepare_engine(engine);
+   if (ret) {
+   DRM_ERROR("Previous reset failed - promote to full reset\n");
+   goto out;
+   }
+
+   /*
+* the request that caused the hang is stuck on elsp, identify the
+* active request and drop it, adjust head to skip the offending
+* request to resume executing remaining requests in the queue.
+*/
+   i915_gem_reset_engine(engine);
+
+   /* forcing engine to idle */
+   ret = intel_reset_engine_start(engine);
+   if (ret) {
+   DRM_ERROR("Failed to disable %s\n", engine->name);
+   goto out;
+   }
+
+   /* finally, reset engine */
+   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+   if (ret) {
+   DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+   intel_reset_engine_cancel(engine);
+   goto out;
+   }
+
+   /* be sure the request reset bit gets cleared */
+   intel_reset_engine_cancel(engine);
+
+   i915_gem_reset_finish_engine(engine);
+
+   /* replay remaining requests in the queue */
+   ret = engine->init_hw(engine);
+   if (ret)
+   goto out; //XXX: ignore this line for now
+
+out:
+   return ret;
 }
 
 static int i915_pm_suspend(struct dev

[Intel-gfx] [PATCH v7 14/20] drm/i915/guc: Add support for reset engine using GuC commands

2017-04-27 Thread Michel Thierry
This patch adds per engine reset and recovery (TDR) support when GuC is
used to submit workloads to GPU.

In the case of i915 direct submission to ELSP, the driver manages hang
detection, recovery and resubmission. With GuC submission these tasks
are shared between driver and GuC. i915 is still responsible for detecting
a hang, and when it does it only requests GuC to reset that Engine. GuC
internally manages acquiring forcewake and idling the engine before actually
resetting it.

Once the reset is successful, i915 takes over again and handles resubmission.
The scheduler in i915 knows which requests are pending so after resetting
a engine, pending workloads/requests are resubmitted again.

v2: s/i915_guc_request_engine_reset/i915_guc_reset_engine/ to match the
non-guc function names.

v3: Removed debug message about engine restarting from which request,
since the new baseline does it regardless of submission mode. (Chris)

Signed-off-by: Arun Siluvery 
Signed-off-by: Jeff McGee 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c| 42 +-
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_guc_submission.c | 48 ++
 drivers/gpu/drm/i915/intel_guc_fwif.h  |  6 
 drivers/gpu/drm/i915/intel_uc.h|  1 +
 drivers/gpu/drm/i915/intel_uncore.c|  5 
 6 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 426db8756e95..df8e2e8e3b7f 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1918,24 +1918,34 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 */
i915_gem_reset_engine(engine, active_request);
 
-   /* forcing engine to idle */
-   ret = intel_reset_engine_start(engine);
-   if (ret) {
-   DRM_ERROR("Failed to disable %s\n", engine->name);
-   goto out;
-   }
+   if (!dev_priv->guc.execbuf_client) {
+   /* forcing engine to idle */
+   ret = intel_reset_engine_start(engine);
+   if (ret) {
+   DRM_ERROR("Failed to disable %s\n", engine->name);
+   goto out;
+   }
 
-   /* finally, reset engine */
-   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
-   if (ret) {
-   DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+   /* finally, reset engine */
+   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+   if (ret) {
+   DRM_ERROR("Failed to reset %s, ret=%d\n",
+ engine->name, ret);
+   intel_reset_engine_cancel(engine);
+   goto out;
+   }
+
+   /* be sure the request reset bit gets cleared */
intel_reset_engine_cancel(engine);
-   goto out;
+   } else {
+   ret = i915_guc_reset_engine(engine);
+   if (ret) {
+   DRM_ERROR("GuC failed to reset %s, ret=%d\n",
+ engine->name, ret);
+   goto out;
+   }
}
 
-   /* be sure the request reset bit gets cleared */
-   intel_reset_engine_cancel(engine);
-
i915_gem_reset_finish_engine(engine);
 
/* replay remaining requests in the queue */
@@ -1943,6 +1953,10 @@ int i915_reset_engine(struct intel_engine_cs *engine)
if (ret)
goto out;
 
+   /* for guc too */
+   if (dev_priv->guc.execbuf_client)
+   i915_guc_submission_reenable_engine(engine);
+
error->reset_engine_count[engine->id]++;
 out:
return ret;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e9e04c92a376..cbefcd4b2507 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3032,6 +3032,7 @@ extern bool intel_has_reset_engine(struct 
drm_i915_private *dev_priv);
 extern int intel_reset_engine_start(struct intel_engine_cs *engine);
 extern void intel_reset_engine_cancel(struct intel_engine_cs *engine);
 extern int intel_reset_guc(struct drm_i915_private *dev_priv);
+extern int i915_guc_reset_engine(struct intel_engine_cs *engine);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 4d1784c84fd4..57815edfc4df 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1344,6 +1344,25 @@ void i915_guc_submission_disable(struc

[Intel-gfx] [PATCH v7 19/20] drm/i915: Watchdog timeout: Include threshold value in error state

2017-04-27 Thread Michel Thierry
Save the watchdog threshold (in us) as part of the engine state.

Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h   |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 11 +++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7a64f67974cb..aaa7d3d96bda 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1022,6 +1022,7 @@ struct i915_gpu_state {
int ban_score;
int active;
int guilty;
+   int watchdog_threshold;
} context;
 
struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index a2ffb1ef2cfa..1b1a49bc0c3c 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -388,9 +388,10 @@ static void error_print_context(struct 
drm_i915_error_state_buf *m,
const char *header,
const struct drm_i915_error_context *ctx)
 {
-   err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d 
active %d\n",
+   err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d 
active %d, watchdog %dus\n",
   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
-  ctx->ban_score, ctx->guilty, ctx->active);
+  ctx->ban_score, ctx->guilty, ctx->active,
+  watchdog_to_us(m->i915, ctx->watchdog_threshold));
 }
 
 static void error_print_engine(struct drm_i915_error_state_buf *m,
@@ -1344,7 +1345,8 @@ static void error_record_engine_execlists(struct 
intel_engine_cs *engine,
 }
 
 static void record_context(struct drm_i915_error_context *e,
-  struct i915_gem_context *ctx)
+  struct i915_gem_context *ctx,
+  u32 engine_id)
 {
if (ctx->pid) {
struct task_struct *task;
@@ -1363,6 +1365,7 @@ static void record_context(struct drm_i915_error_context 
*e,
e->ban_score = ctx->ban_score;
e->guilty = ctx->guilty_count;
e->active = ctx->active_count;
+   e->watchdog_threshold = ctx->engine[engine_id].watchdog_threshold;
 }
 
 static void request_record_user_bo(struct drm_i915_gem_request *request,
@@ -1426,7 +1429,7 @@ static void i915_gem_record_rings(struct drm_i915_private 
*dev_priv,
ee->vm = request->ctx->ppgtt ?
&request->ctx->ppgtt->base : &ggtt->base;
 
-   record_context(&ee->context, request->ctx);
+   record_context(&ee->context, request->ctx, engine->id);
 
/* We need to copy these to an anonymous buffer
 * as the simplest method to avoid being overwritten
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 06/20] drm/i915: Add engine reset count to error state

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

The driver maintains a count of how many times a given engine has been reset;
it is useful to capture this in the error state as well. It gives an idea of
how the engine was coping with the workloads it was executing before this error state.

A follow-up patch will provide this information in debugfs.

v2: s/engine_reset/reset_engine/ (Chris)
Define count as unsigned int (Tvrtko)

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c   |  3 ++-
 drivers/gpu/drm/i915/i915_drv.h   | 10 ++
 drivers/gpu/drm/i915/i915_gpu_error.c |  3 +++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index a64e9b63cdbc..426db8756e95 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1941,8 +1941,9 @@ int i915_reset_engine(struct intel_engine_cs *engine)
/* replay remaining requests in the queue */
ret = engine->init_hw(engine);
if (ret)
-   goto out; //XXX: ignore this line for now
+   goto out;
 
+   error->reset_engine_count[engine->id]++;
 out:
return ret;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8e93189c2104..b00ea523a634 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -982,6 +982,7 @@ struct i915_gpu_state {
enum intel_engine_hangcheck_action hangcheck_action;
struct i915_address_space *vm;
int num_requests;
+   u32 reset_count;
 
/* position of active request inside the ring */
u32 rq_head, rq_post, rq_tail;
@@ -1617,6 +1618,9 @@ struct i915_gpu_error {
 #define I915_RESET_HANDOFF 1
 #define I915_WEDGED(BITS_PER_LONG - 1)
 
+   /** Number of times an engine has been reset */
+   u32 reset_engine_count[I915_NUM_ENGINES];
+
/**
 * Waitqueue to signal when a hang is detected. Used to for waiters
 * to release the struct_mutex for the reset to procede.
@@ -3439,6 +3443,12 @@ static inline u32 i915_reset_count(struct i915_gpu_error 
*error)
return READ_ONCE(error->reset_count);
 }
 
+static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
+ struct intel_engine_cs *engine)
+{
+   return READ_ONCE(error->reset_engine_count[engine->id]);
+}
+
 struct drm_i915_gem_request *
 i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index 14e2064b7653..a2ffb1ef2cfa 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -463,6 +463,7 @@ static void error_print_engine(struct 
drm_i915_error_state_buf *m,
err_printf(m, "  hangcheck action timestamp: %lu, %u ms ago\n",
   ee->hangcheck_timestamp,
   jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
+   err_printf(m, "  engine reset count: %u\n", ee->reset_count);
 
error_print_request(m, "  ELSP[0]: ", &ee->execlist[0]);
error_print_request(m, "  ELSP[1]: ", &ee->execlist[1]);
@@ -1244,6 +1245,8 @@ static void error_record_engine_registers(struct 
i915_gpu_state *error,
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
ee->hangcheck_action = engine->hangcheck.action;
ee->hangcheck_stalled = engine->hangcheck.stalled;
+   ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
+ engine);
 
if (USES_PPGTT(dev_priv)) {
int i;
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 20/20] drm/i915: Watchdog timeout: Export media reset count from GuC to debugfs

2017-04-27 Thread Michel Thierry
From firmware v8.8, GuC provides the count of media engine resets
(watchdog timeout). This information is available in the GuC shared
context data struct, which resides in the first page of the default
(kernel) lrc context.

Since GuC handled engine resets are transparent for kernel and user,
provide a simple debugfs entry to see the number of times media reset
has happened.

v2: Remove unnecessary struct_mutex, _get_dirty_page and kmap_atomic;
use READ_ONCE. (Chris)

Cc: Chris Wilson 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 22 ++
 drivers/gpu/drm/i915/intel_guc_fwif.h | 18 ++
 2 files changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
b/drivers/gpu/drm/i915/i915_debugfs.c
index 6444c1a9bd22..35ce771c8b8f 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1403,6 +1403,26 @@ static int i915_hangcheck_info(struct seq_file *m, void 
*unused)
return 0;
 }
 
+static u32 i915_watchdog_reset_count(struct drm_i915_private *dev_priv)
+{
+   struct i915_gem_context *ctx;
+   struct page *page;
+   struct guc_shared_ctx_data *guc_shared_data;
+   u32 guc_media_reset_count;
+
+   if (!i915.enable_guc_submission)
+   return 0;
+
+   ctx = dev_priv->kernel_context;
+   page = i915_gem_object_get_page(ctx->engine[RCS].state->obj,
+   LRC_GUCSHR_PN);
+   guc_shared_data = kmap(page);
+   guc_media_reset_count = READ_ONCE(guc_shared_data->media_reset_count);
+   kunmap(page);
+
+   return guc_media_reset_count;
+}
+
 static int i915_reset_info(struct seq_file *m, void *unused)
 {
struct drm_i915_private *dev_priv = node_to_i915(m->private);
@@ -1411,6 +1431,8 @@ static int i915_reset_info(struct seq_file *m, void 
*unused)
enum intel_engine_id id;
 
seq_printf(m, "full gpu reset = %u\n", i915_reset_count(error));
+   seq_printf(m, "GuC watchdog/media reset = %u\n",
+  i915_watchdog_reset_count(dev_priv));
 
for_each_engine(engine, dev_priv, id) {
seq_printf(m, "%s = %u\n", engine->name,
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index a2d0cba2f8b9..e45987f7aa50 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -502,6 +502,24 @@ union guc_log_control {
u32 value;
 } __packed;
 
+/* GuC Shared Context Data Struct */
+struct guc_shared_ctx_data {
+   u32 addr_of_last_preempted_data_low;
+   u32 addr_of_last_preempted_data_high;
+   u32 addr_of_last_preempted_data_high_tmp;
+   u32 padding;
+   u32 is_mapped_to_proxy;
+   u32 proxy_ctx_id;
+   u32 engine_reset_ctx_id;
+   u32 media_reset_count;
+   u32 reserved[8];
+   u32 uk_last_ctx_switch_reason;
+   u32 was_reset;
+   u32 lrca_gpu_addr;
+   u32 execlist_ctx;
+   u32 reserved1[32];
+} __packed;
+
 /* This Action will be programmed in C180 - SOFT_SCRATCH_O_REG */
 enum intel_guc_action {
INTEL_GUC_ACTION_DEFAULT = 0x0,
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 10/20] drm/i915/selftests: reset engine self tests

2017-04-27 Thread Michel Thierry
Check that we can reset specific engines, also check the fallback to
full reset if something didn't work.

v2: rebase.

Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 147 +++
 1 file changed, 147 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index aa31d6c0cdfb..f64fa0e4bb40 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -322,6 +322,56 @@ static int igt_global_reset(void *arg)
return err;
 }
 
+static int igt_reset_engine(void *arg)
+{
+   struct drm_i915_private *i915 = arg;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   unsigned int reset_count, reset_engine_count;
+   int err = 0;
+
+   /* Check that we can issue a global GPU and engine reset */
+
+   if (!intel_has_gpu_reset(i915))
+   return 0;
+
+   if (!intel_has_reset_engine(i915))
+   return 0;
+
+   set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+
+   for_each_engine(engine, i915, id) {
+   reset_count = i915_reset_count(&i915->gpu_error);
+   reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
+engine);
+
+   err = i915_reset_engine(engine);
+   if (err) {
+   pr_err("i915_reset_engine failed\n");
+   break;
+   }
+
+   if (i915_reset_count(&i915->gpu_error) != reset_count) {
+   pr_err("Full GPU reset recorded! (engine reset 
expected)\n");
+   err = -EINVAL;
+   break;
+   }
+
+   if (i915_reset_engine_count(&i915->gpu_error, engine) ==
+   reset_engine_count) {
+   pr_err("No %s engine reset recorded!\n", engine->name);
+   err = -EINVAL;
+   break;
+   }
+   }
+
+   clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+   if (i915_terminally_wedged(&i915->gpu_error))
+   err = -EIO;
+
+   return err;
+}
+
 static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
 {
u32 reset_count;
@@ -526,13 +576,110 @@ static int igt_reset_queue(void *arg)
return err;
 }
 
+static int igt_render_engine_reset_fallback(void *arg)
+{
+   struct drm_i915_private *i915 = arg;
+   struct intel_engine_cs *engine = i915->engine[RCS];
+   struct hang h;
+   struct drm_i915_gem_request *rq;
+   unsigned int reset_count, reset_engine_count;
+   int err = 0;
+
+   /* Check that we can issue a global GPU and engine reset */
+
+   if (!intel_has_gpu_reset(i915))
+   return 0;
+
+   if (!intel_has_reset_engine(i915))
+   return 0;
+
+   set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+   mutex_lock(&i915->drm.struct_mutex);
+
+   err = hang_init(&h, i915);
+   if (err)
+   goto unlock;
+
+   rq = hang_create_request(&h, engine, i915->kernel_context);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto fini;
+   }
+
+   i915_gem_request_get(rq);
+   __i915_add_request(rq, true);
+
+   /* make reset engine fail */
+   rq->fence.error = -EIO;
+
+   if (!wait_for_hang(&h, rq)) {
+   pr_err("Failed to start request %x\n", rq->fence.seqno);
+   err = -EIO;
+   goto fini;
+   }
+
+   reset_engine_count = i915_reset_engine_count(&i915->gpu_error, engine);
+   reset_count = fake_hangcheck(rq);
+
+   err = i915_reset_engine(engine);
+   if (err) {
+   pr_err("i915_reset_engine failed\n");
+   goto fini;
+   }
+
+   if (i915_reset_engine_count(&i915->gpu_error, engine) !=
+   reset_engine_count) {
+   pr_err("render engine reset recorded! (full reset expected)\n");
+   err = -EINVAL;
+   goto fini;
+   }
+
+   if (i915_reset_count(&i915->gpu_error) == reset_count) {
+   pr_err("No full GPU reset recorded!\n");
+   err = -EINVAL;
+   goto fini;
+   }
+
+   /*
+* by using fence.error = -EIO, full reset sets the wedged flag, do one
+* more full reset to re-enable the hw.
+*/
+   if (i915_terminally_wedged(&i915->gpu_error)) {
+   rq->fence.error = 0;
+
+   set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
+   i915_reset(i915);
+   GEM_BUG_ON(test_bit(I915_RESET_HAND

[Intel-gfx] [PATCH v7 11/20] drm/i915/guc: fix mmio whitelist mmio_start offset and add reminder

2017-04-27 Thread Michel Thierry
From: Daniele Ceraolo Spurio 

The mmio_start offset for the whitelist is the first FORCE_TO_NONPRIV
register the GuC can use to restore the provided whitelist when an
engine reset via GuC (which we still don't support) is triggered.

We're currently adding the mmio_base of the engine to the absolute
address of the RCS version of the register, which results in the wrong
offset. Fix it by using the definition we already have instead of
re-defining it in the GuC FW header.

Also add a comment to avoid future issues with FORCE_TO_NONPRIV
registers, which are also used by the workaround framework.

v2: improve comment (Michal), move comment about save/restore because it
is not related to the mmio_white_list field.

v3: rebase/resurrect.

Signed-off-by: Daniele Ceraolo Spurio 
Cc: Michał Winiarski 
Cc: Michal Wajdeczko 
Cc: Arkadiusz Hiler 
Cc: Oscar Mateo 
Reviewed-by: Michał Winiarski  (v2)
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 11 +--
 drivers/gpu/drm/i915/intel_guc_fwif.h  |  1 -
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 4cc97bf1bdac..2cfe5d3b7795 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1034,10 +1034,17 @@ static int guc_ads_create(struct intel_guc *guc)
/* MMIO reg state */
for_each_engine(engine, dev_priv, id) {
blob->reg_state.white_list[engine->guc_id].mmio_start =
-   engine->mmio_base + GUC_MMIO_WHITE_LIST_START;
+   
i915_mmio_reg_offset(RING_FORCE_TO_NONPRIV(engine->mmio_base, 0));
 
-   /* Nothing to be saved or restored for now. */
+   /*
+* Note: if the GuC whitelist management is enabled, the values
+* should be filled using the workaround framework to avoid
+* inconsistencies with the handling of FORCE_TO_NONPRIV
+* registers.
+*/
blob->reg_state.white_list[engine->guc_id].count = 0;
+
+   /* Nothing to be saved or restored for now. */
}
 
/*
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 6156845641a3..e6f8079df94a 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -394,7 +394,6 @@ struct guc_policies {
 #define GUC_REGSET_SAVE_CURRENT_VALUE  0x10
 
 #define GUC_REGSET_MAX_REGISTERS   25
-#define GUC_MMIO_WHITE_LIST_START  0x24d0
 #define GUC_MMIO_WHITE_LIST_MAX12
 #define GUC_S3_SAVE_SPACE_PAGES10
 
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 01/20] drm/i915: Update i915.reset to handle engine resets

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

In preparation for the engine reset work, update this parameter to handle more
than one type of reset. The default at the moment is still full gpu reset.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_params.c | 6 +++---
 drivers/gpu/drm/i915/i915_params.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_params.c 
b/drivers/gpu/drm/i915/i915_params.c
index b6a7e363d076..045cadb77285 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -46,7 +46,7 @@ struct i915_params i915 __read_mostly = {
.prefault_disable = 0,
.load_detect_test = 0,
.force_reset_modeset_test = 0,
-   .reset = true,
+   .reset = 1,
.error_capture = true,
.invert_brightness = 0,
.disable_display = 0,
@@ -115,8 +115,8 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
"Override/Ignore selection of SDVO panel mode in the VBT "
"(-2=ignore, -1=auto [default], index in VBT BIOS table)");
 
-module_param_named_unsafe(reset, i915.reset, bool, 0600);
-MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
+module_param_named_unsafe(reset, i915.reset, int, 0600);
+MODULE_PARM_DESC(reset, "Attempt GPU resets (0=disabled, 1=full gpu reset 
[default], 2=engine reset)");
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 module_param_named(error_capture, i915.error_capture, bool, 0600);
diff --git a/drivers/gpu/drm/i915/i915_params.h 
b/drivers/gpu/drm/i915/i915_params.h
index 34148cc8637c..febbfdbd30bd 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -51,6 +51,7 @@
func(int, use_mmio_flip); \
func(int, mmio_debug); \
func(int, edp_vswing); \
+   func(int, reset); \
func(unsigned int, inject_load_failure); \
/* leave bools at the end to not create holes */ \
func(bool, alpha_support); \
@@ -60,7 +61,6 @@
func(bool, prefault_disable); \
func(bool, load_detect_test); \
func(bool, force_reset_modeset_test); \
-   func(bool, reset); \
func(bool, error_capture); \
func(bool, disable_display); \
func(bool, verbose_state_checks); \
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 18/20] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

2017-04-27 Thread Michel Thierry
Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 60000 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

Note, UABI engine ids and i915 engine ids are different, and this patch
uses the i915 ones. Some kind of mapping table [1] is required if we
decide to use the UABI engine ids.

[1] 
http://patchwork.freedesktop.org/patch/msgid/20170329135831.30254-2-ch...@chris-wilson.co.uk

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

v3: Add a way to get array size, short-cut to disable all thresholds,
return EFAULT / EINVAL as needed. Move the capture of the threshold
value in the error state into a new patch. BXT has a different
timestamp base (because why not?).

v4: Checking if watchdog is available should be the first thing to
do, instead of giving false hopes to abi users; remove unnecessary & in
set_watchdog; ignore args->size in getparam.

Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.h | 29 ++
 drivers/gpu/drm/i915/i915_gem_context.c | 95 +
 drivers/gpu/drm/i915/intel_lrc.c|  5 +-
 include/uapi/drm/i915_drm.h |  1 +
 4 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2e1211e25945..7a64f67974cb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3578,6 +3578,35 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
return &vm->timeline.engine[engine->id];
 }
 
+/*
+ * BDW & SKL+ Timestamp timer resolution = 0.080 uSec,
+ * or 12500000 counts per second, or ~12 counts per microsecond.
+ *
+ * But Broxton Timestamp timer resolution is different, 0.052 uSec,
+ * or 19200000 counts per second, or ~19 counts per microsecond.
+ */
+#define SKL_TIMESTAMP_CNTS_PER_USEC 12
+#define BXT_TIMESTAMP_CNTS_PER_USEC 19
+#define TIMESTAMP_CNTS_PER_USEC(dev_priv) (IS_BROXTON(dev_priv) ? \
+  BXT_TIMESTAMP_CNTS_PER_USEC : \
+  SKL_TIMESTAMP_CNTS_PER_USEC)
+static inline u32
+watchdog_to_us(struct drm_i915_private *dev_priv, u32 value_in_clock_counts)
+{
+   return value_in_clock_counts / TIMESTAMP_CNTS_PER_USEC(dev_priv);
+}
+
+static inline u32
+watchdog_to_clock_counts(struct drm_i915_private *dev_priv, u64 value_in_us)
+{
+   u64 threshold = value_in_us * TIMESTAMP_CNTS_PER_USEC(dev_priv);
+
+   if (overflows_type(threshold, u32))
+   return -EINVAL;
+
+   return threshold;
+}
+
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c 
b/drivers/gpu/drm/i915/i915_gem_context.c
index e98d9daa3f00..574df077cf34 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -363,6 +363,95 @@ i915_gem_context_create_gvt(struct drm_device *dev)
return ctx;
 }
 
+/* Return the timer count threshold in microseconds. */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+ struct drm_i915_gem_context_param *args)
+{
+   struct drm_i915_private *dev_priv = ctx->i915;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u32 threshold_in_us[I915_NUM_ENGINES];
+
+   if (!dev_priv->engine[VCS]->emit_start_watchdog)
+   return -ENODEV;
+
+   for_each_engine(engine, dev_priv, id) {
+   struct intel_context *ce = &ctx->engine[id];
+
+   threshold_in_us[id] = watchdog_to_us(dev_priv,
+ce->watchdog_threshold);
+   }
+
+   mutex_unlock(&dev_priv->drm.struct_mutex);
+   if (__copy_to_user(u64_to_user_ptr(args->value),
+  &threshold_in_us,
+  sizeof(threshold_in_us))) {
+   mutex_lock(&dev_priv->drm.struct_mutex);
+   return -EFAULT;
+   }
+   mutex_lock(&dev_priv->drm.struct_mutex);
+
+   args->size = sizeof(threshold_in_us);
+
+   return 0;
+}
+
+/*
+ * Based on time out value in microseconds (us) calculate
+ * timer count thresholds needed based on core frequency.
+ * Watchdog can be disabled by sett

[Intel-gfx] [PATCH v7 08/20] drm/i915: Enable Engine reset and recovery support

2017-04-27 Thread Michel Thierry
From: Arun Siluvery 

This feature is made available only from Gen8, for previous gen devices
driver uses legacy full gpu reset.

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_params.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_params.c 
b/drivers/gpu/drm/i915/i915_params.c
index 045cadb77285..14e2c2e57f96 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -46,7 +46,7 @@ struct i915_params i915 __read_mostly = {
.prefault_disable = 0,
.load_detect_test = 0,
.force_reset_modeset_test = 0,
-   .reset = 1,
+   .reset = 2,
.error_capture = true,
.invert_brightness = 0,
.disable_display = 0,
@@ -116,7 +116,7 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
"(-2=ignore, -1=auto [default], index in VBT BIOS table)");
 
 module_param_named_unsafe(reset, i915.reset, int, 0600);
-MODULE_PARM_DESC(reset, "Attempt GPU resets (0=disabled, 1=full gpu reset 
[default], 2=engine reset)");
+MODULE_PARM_DESC(reset, "Attempt GPU resets (0=disabled, 1=full gpu reset, 
2=engine reset [default])");
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 module_param_named(error_capture, i915.error_capture, bool, 0600);
-- 
2.11.0

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v7 17/20] drm/i915: Watchdog timeout: Ringbuffer command emission for gen8+

2017-04-27 Thread Michel Thierry
Emit the required commands into the ring buffer for starting and
stopping the watchdog timer before/after batch buffer start during
batch buffer submission.

v2: Support watchdog threshold per context engine, merge lri commands,
and move watchdog commands emission to emit_bb_start. Request space of
combined start_watchdog, bb_start and stop_watchdog to avoid any error
after emitting bb_start.

v3: There were too many req->engine in emit_bb_start.
Use GEM_BUG_ON instead of returning a very late EINVAL in the remote
case of watchdog misprogramming; set correct LRI cmd size in
emit_stop_watchdog. (Chris)

Cc: Chris Wilson 
Signed-off-by: Tomas Elf 
Signed-off-by: Ian Lister 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_gem_context.h |  4 ++
 drivers/gpu/drm/i915/intel_lrc.c| 85 +++--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  4 ++
 3 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.h 
b/drivers/gpu/drm/i915/i915_gem_context.h
index 4af2ab94558b..88700bdbb4e1 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -150,6 +150,10 @@ struct i915_gem_context {
u32 *lrc_reg_state;
u64 lrc_desc;
int pin_count;
+   /** watchdog_threshold: hw watchdog threshold value,
+* in clock counts
+*/
+   u32 watchdog_threshold;
bool initialised;
} engine[I915_NUM_ENGINES];
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 69a73440ff12..207cf7d8721b 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1310,7 +1310,10 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
  u64 offset, u32 len,
  const unsigned int flags)
 {
+   struct intel_engine_cs *engine = req->engine;
u32 *cs;
+   u32 num_dwords;
+   bool watchdog_running = false;
int ret;
 
/* Don't rely in hw updating PDPs, specially in lite-restore.
@@ -1320,20 +1323,38 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
 * not idle). PML4 is allocated during ppgtt init so this is
 * not needed in 48-bit.*/
if (req->ctx->ppgtt &&
-   (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings) 
&&
+   (intel_engine_flag(engine) & req->ctx->ppgtt->pd_dirty_rings) &&
!i915_vm_is_48bit(&req->ctx->ppgtt->base) &&
!intel_vgpu_active(req->i915)) {
ret = intel_logical_ring_emit_pdps(req);
if (ret)
return ret;
 
-   req->ctx->ppgtt->pd_dirty_rings &= 
~intel_engine_flag(req->engine);
+   req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
+   }
+
+   /* bb_start only */
+   num_dwords = 4;
+
+   /* check if watchdog will be required */
+   if (req->ctx->engine[engine->id].watchdog_threshold != 0) {
+   GEM_BUG_ON(!engine->emit_start_watchdog ||
+  !engine->emit_stop_watchdog);
+
+   /* + start_watchdog (6) + stop_watchdog (4) */
+   num_dwords += 10;
+   watchdog_running = true;
}
 
-   cs = intel_ring_begin(req, 4);
+   cs = intel_ring_begin(req, num_dwords);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
+   if (watchdog_running) {
+   /* Start watchdog timer */
+   cs = engine->emit_start_watchdog(req, cs);
+   }
+
/* FIXME(BDW): Address space and security selectors. */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) |
@@ -1341,8 +1362,13 @@ static int gen8_emit_bb_start(struct 
drm_i915_gem_request *req,
*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
*cs++ = MI_NOOP;
-   intel_ring_advance(req, cs);
 
+   if (watchdog_running) {
+   /* Cancel watchdog timer */
+   cs = engine->emit_stop_watchdog(req, cs);
+   }
+
+   intel_ring_advance(req, cs);
return 0;
 }
 
@@ -1509,6 +1535,49 @@ static void gen8_watchdog_irq_handler(unsigned long data)
intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
+static u32 *gen8_emit_start_watchdog(struct drm_i915_gem_request *req, u32 *cs)
+{
+   struct intel_engine_cs *engine = req->engine;
+   struct i915_gem_context *ctx = req->ctx;
+   struct intel_context *ce = &ctx->engine[engine->id];
+
+   /* XXX: no watchdog support in BCS engine */
+   GEM_BUG_ON(engine->id == BCS);
+
+   

[Intel-gfx] [PATCH v7 05/20] drm/i915: Cancel reset-engine if we couldn't find an active request

2017-04-27 Thread Michel Thierry
Before resetting an engine, check if there is an active request, and if
the _hung_ request has completed. In these two cases, the seqno has moved
after hang declaration and we can skip the reset.

Also store the active request so that we only search for it once.

Suggested-by: Chris Wilson 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 37 +
 drivers/gpu/drm/i915/i915_drv.h |  6 --
 drivers/gpu/drm/i915/i915_gem.c | 37 -
 3 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ae891529dedd..a64e9b63cdbc 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1811,7 +1811,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
pr_notice("drm/i915: Resetting chip after gpu hang\n");
disable_irq(dev_priv->drm.irq);
ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
-   if (ret) {
+   if (ret == -EIO) {
DRM_ERROR("GPU recovery failed\n");
intel_gpu_reset(dev_priv, ALL_ENGINES);
goto error;
@@ -1883,23 +1883,40 @@ int i915_reset_engine(struct intel_engine_cs *engine)
int ret;
struct drm_i915_private *dev_priv = engine->i915;
struct i915_gpu_error *error = &dev_priv->gpu_error;
+   struct drm_i915_gem_request *active_request;
 
GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
 
-   ret = i915_gem_reset_prepare_engine(engine);
-   if (ret) {
-   DRM_ERROR("Previous reset failed - promote to full reset\n");
-   goto out;
+   active_request = i915_gem_reset_prepare_engine(engine);
+   if (!active_request) {
+   DRM_DEBUG_DRIVER("seqno moved after hang declaration, 
pardoned\n");
+   goto canceled;
+   }
+   if (IS_ERR(active_request)) {
+   ret = PTR_ERR(active_request);
+   if (ret == -ECANCELED) {
+   DRM_DEBUG_DRIVER("no active request found, skip 
reset\n");
+   goto canceled;
+   } else if (ret) {
+   DRM_DEBUG_DRIVER("Previous reset failed, promote to 
full reset\n");
+   goto out;
+   }
}
 
+   if (__i915_gem_request_completed(active_request, 
engine->hangcheck.seqno)) {
+   DRM_DEBUG_DRIVER("request completed, skip the reset\n");
+   goto canceled;
+   }
+
+
/*
-* the request that caused the hang is stuck on elsp, identify the
-* active request and drop it, adjust head to skip the offending
+* the request that caused the hang is stuck on elsp, we know the
+* active request and can drop it, adjust head to skip the offending
 * request to resume executing remaining requests in the queue.
 */
-   i915_gem_reset_engine(engine);
+   i915_gem_reset_engine(engine, active_request);
 
/* forcing engine to idle */
ret = intel_reset_engine_start(engine);
@@ -1928,6 +1945,10 @@ int i915_reset_engine(struct intel_engine_cs *engine)
 
 out:
return ret;
+
+canceled:
+   i915_gem_reset_finish_engine(engine);
+   return 0;
 }
 
 static int i915_pm_suspend(struct device *kdev)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index efbf34318893..8e93189c2104 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3439,7 +3439,8 @@ static inline u32 i915_reset_count(struct i915_gpu_error 
*error)
return READ_ONCE(error->reset_count);
 }
 
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
   unsigned int engine_mask);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
@@ -3448,7 +3449,8 @@ void i915_gem_reset_finish(struct drm_i915_private 
*dev_priv,
   unsigned int engine_mask);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
-void i915_gem_reset_engine(struct intel_engine_cs *engine);
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+  struct drm_i915_gem_request *request);
 
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bce38062f94e..4e357d333cc2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_

[Intel-gfx] [PATCH v7 15/20] drm/i915: Watchdog timeout: Pass GuC shared data structure during param load

2017-04-27 Thread Michel Thierry
For watchdog / media reset, the firmware must know the address of the shared
data page (the first page of the default context).

This information should be in DWORD 9 of the GUC_CTL structure.

v2: Use guc_ggtt_offset (Chris).
Store the ggtt offset of the default ctx, as it is needed for
suspend/resume/reset (Daniele).

Cc: Chris Wilson 
Cc: Daniele Ceraolo Spurio 
Reviewed-by: Daniele Ceraolo Spurio 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 21 ++---
 drivers/gpu/drm/i915/intel_guc_fwif.h  |  2 +-
 drivers/gpu/drm/i915/intel_guc_loader.c| 11 +++
 drivers/gpu/drm/i915/intel_uc.h|  2 ++
 4 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c 
b/drivers/gpu/drm/i915/i915_guc_submission.c
index 57815edfc4df..97392e1c04d1 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1370,7 +1370,6 @@ void i915_guc_submission_reenable_engine(struct 
intel_engine_cs *engine)
 int intel_guc_suspend(struct drm_i915_private *dev_priv)
 {
struct intel_guc *guc = &dev_priv->guc;
-   struct i915_gem_context *ctx;
u32 data[3];
 
if (guc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
@@ -1378,13 +1377,11 @@ int intel_guc_suspend(struct drm_i915_private *dev_priv)
 
gen9_disable_guc_interrupts(dev_priv);
 
-   ctx = dev_priv->kernel_context;
-
data[0] = INTEL_GUC_ACTION_ENTER_S_STATE;
/* any value greater than GUC_POWER_D0 */
data[1] = GUC_POWER_D1;
-   /* first page is shared data with GuC */
-   data[2] = guc_ggtt_offset(ctx->engine[RCS].state);
+   /* first page of default ctx is shared data with GuC */
+   data[2] = guc->shared_data_offset;
 
return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
@@ -1396,7 +1393,6 @@ int intel_guc_suspend(struct drm_i915_private *dev_priv)
 int intel_guc_resume(struct drm_i915_private *dev_priv)
 {
struct intel_guc *guc = &dev_priv->guc;
-   struct i915_gem_context *ctx;
u32 data[3];
 
if (guc->fw.load_status != INTEL_UC_FIRMWARE_SUCCESS)
@@ -1405,12 +1401,10 @@ int intel_guc_resume(struct drm_i915_private *dev_priv)
if (i915.guc_log_level >= 0)
gen9_enable_guc_interrupts(dev_priv);
 
-   ctx = dev_priv->kernel_context;
-
data[0] = INTEL_GUC_ACTION_EXIT_S_STATE;
data[1] = GUC_POWER_D0;
-   /* first page is shared data with GuC */
-   data[2] = guc_ggtt_offset(ctx->engine[RCS].state);
+   /* first page of default ctx is shared data with GuC */
+   data[2] = guc->shared_data_offset;
 
return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
@@ -1419,14 +1413,11 @@ int i915_guc_reset_engine(struct intel_engine_cs 
*engine)
 {
struct drm_i915_private *dev_priv = engine->i915;
struct intel_guc *guc = &dev_priv->guc;
-   struct i915_gem_context *ctx;
u32 data[7];
 
if (!i915.enable_guc_submission)
return 0;
 
-   ctx = dev_priv->kernel_context;
-
/*
 * The affected context report is populated by GuC and is provided
 * to the driver using the shared page. We request for it but don't
@@ -1438,8 +1429,8 @@ int i915_guc_reset_engine(struct intel_engine_cs *engine)
data[3] = 0;
data[4] = 0;
data[5] = guc->execbuf_client->stage_id;
-   /* first page is shared data with GuC */
-   data[6] = guc_ggtt_offset(ctx->engine[RCS].state);
+   /* first page of default ctx is shared data with GuC */
+   data[6] = guc->shared_data_offset;
 
return intel_guc_send(guc, data, ARRAY_SIZE(data));
 }
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h 
b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 081f2cf614e6..a2d0cba2f8b9 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -135,7 +135,7 @@
 #define   GUC_ADS_ADDR_SHIFT   11
 #define   GUC_ADS_ADDR_MASK0xf800
 
-#define GUC_CTL_RSRVD  9
+#define GUC_CTL_SHARED_DATA9
 
 #define GUC_CTL_MAX_DWORDS (SOFT_SCRATCH_COUNT - 2) /* [1..14] */
 
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c 
b/drivers/gpu/drm/i915/intel_guc_loader.c
index d9045b6e897b..8cd5c2bf9510 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -108,6 +108,7 @@ static void guc_params_init(struct drm_i915_private 
*dev_priv)
 {
struct intel_guc *guc = &dev_priv->guc;
u32 params[GUC_CTL_MAX_DWORDS];
+   struct i915_gem_context *ctx;
int i;
 
memset(&params, 0, sizeof(params));
@@ -156,6 +157,16 @@ static void guc_params_init(struct drm_i915_private 
*dev_priv)
params[GUC_CTL_FEATURE] &= ~GUC_CTL_DISABLE_SCHEDULER;
}
 

Re: [Intel-gfx] [PATCH v7 12/20] drm/i915/guc: Provide register list to be saved/restored during engine reset

2017-04-28 Thread Michel Thierry



On 4/27/2017 4:58 PM, Chris Wilson wrote:

On Thu, Apr 27, 2017 at 04:12:52PM -0700, Michel Thierry wrote:

+#define WA_REG_WR_GUC_RESTORE(addr, val) do { \
+   const int r = guc_wa_add(dev_priv, (addr), (val)); \
+   if (r) \
+   return r; \
+   } while (0)


Try to avoid burying returns inside macros. Does this macro help code
readability? Would perhaps just a table of registers + values be easier?
-Chris



Sure, I can change it to something else.
I only replicated what the other WA_* macros (the ones we need in ctx 
switch) were doing.

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v7 03/20] drm/i915: Add support for per engine reset recovery

2017-04-28 Thread Michel Thierry



On 4/27/2017 4:50 PM, Chris Wilson wrote:

-static void engine_retire_requests(struct intel_engine_cs *engine)
+void engine_retire_requests(struct intel_engine_cs *engine)

Fortunately stray chunk. I was about to scream.



This chunk has been there for quite a long time, at least since v4... 
thanks for spotting it (I'm the one that should be screaming).

___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v7 13/20] drm/i915/guc: Rename the function that resets the GuC

2017-05-01 Thread Michel Thierry

On 28/04/17 00:40, Tvrtko Ursulin wrote:

--- a/drivers/gpu/drm/i915/intel_uc.c
+++ b/drivers/gpu/drm/i915/intel_uc.c
@@ -46,9 +46,9 @@ static int __intel_uc_reset_hw(struct
drm_i915_private *dev_priv)
 int ret;
 u32 guc_status;

-ret = intel_guc_reset(dev_priv);
+ret = intel_reset_guc(dev_priv);
 if (ret) {
-DRM_ERROR("GuC reset failed, ret = %d\n", ret);
+DRM_ERROR("Reset GuC failed, ret = %d\n", ret);


As a non-native speaker I might be wrong, but was thinking something
like "Failed to reset GuC", "Resetting GuC failed", "Reset of GuC
failed" would be clearer? I leave it for someone more competent to decide.


"Failed to reset GuC" sounds good to me.
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v7 04/20] drm/i915: Skip reset request if there is one already

2017-05-01 Thread Michel Thierry



On 29/04/17 07:21, Chris Wilson wrote:

On Thu, Apr 27, 2017 at 04:12:44PM -0700, Michel Thierry wrote:

From: Mika Kuoppala 

To perform engine reset we first disable engine to capture its state. This
is done by issuing a reset request. Because we are reusing existing
infrastructure, again when we actually reset an engine, reset function
checks engine mask and issues reset request again which is unnecessary. To
avoid this we check if the engine is already prepared, if so we just exit
from that point.


Do we still need this? I am a bit dubious because it implies we have no
idea what we are doing, recursively calling resets.
-Chris



I can drop this one. It isn't really needed (the 'shortcut' it refers to
exists only because we already set the bit in intel_reset_engine_start).


btw here it's only setting/querying "Ready-ness for Reset", and I've 
heard rumours that the register may not clear itself sometimes (but I 
haven't seen that behaviour myself).


-Michel
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v7 03/20] drm/i915: Add support for per engine reset recovery

2017-05-03 Thread Michel Thierry

On 27/04/17 16:50, Chris Wilson wrote:

On Thu, Apr 27, 2017 at 04:12:43PM -0700, Michel Thierry wrote:

From: Arun Siluvery 

This change implements support for per-engine reset as an initial, less
intrusive hang recovery option to be attempted before falling back to the
legacy full GPU reset recovery mode if necessary. This is only supported
from Gen8 onwards.

Hangchecker determines which engines are hung and invokes error handler to
recover from it. Error handler schedules recovery for each of those engines
that are hung. The recovery procedure is as follows,
 - identifies the request that caused the hang and it is dropped
 - force engine to idle: this is done by issuing a reset request
 - reset and re-init engine
 - restart submissions to the engine

If engine reset fails then we fall back to the heavyweight full GPU reset,
which resets all engines and reinitializes the complete state of HW and SW.

v2: Rebase.
v3: s/*engine_reset*/*reset_engine*/; freeze engine and irqs before
calling i915_gem_reset_engine (Chris).
v4: Rebase, modify i915_gem_reset_prepare to use a ring mask and
reuse the function for reset_engine.
v5: intel_reset_engine_start/cancel instead of request/unrequest_reset.
v6: Clean up reset_engine function to not require mutex, i.e. no need to call
revoke/restore_fences and _retire_requests (Chris).
v7: Remove leftovers from v5, i.e. no need to disable irq, hold
forcewake or wakeup the handoff bit (Chris).

Cc: Chris Wilson 
Cc: Mika Kuoppala 
Signed-off-by: Tomas Elf 
Signed-off-by: Arun Siluvery 
Signed-off-by: Michel Thierry 
---
 drivers/gpu/drm/i915/i915_drv.c | 60 ++--
 drivers/gpu/drm/i915/i915_drv.h | 12 +++-
 drivers/gpu/drm/i915/i915_gem.c | 97 +++--
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 drivers/gpu/drm/i915/intel_uncore.c | 20 +++
 5 files changed, 142 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 48c8b69d9bde..ae891529dedd 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1810,7 +1810,7 @@ void i915_reset(struct drm_i915_private *dev_priv)

pr_notice("drm/i915: Resetting chip after gpu hang\n");
disable_irq(dev_priv->drm.irq);
-   ret = i915_gem_reset_prepare(dev_priv);
+   ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
if (ret) {
DRM_ERROR("GPU recovery failed\n");
intel_gpu_reset(dev_priv, ALL_ENGINES);
@@ -1852,7 +1852,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
i915_queue_hangcheck(dev_priv);

 finish:
-   i915_gem_reset_finish(dev_priv);
+   i915_gem_reset_finish(dev_priv, ALL_ENGINES);
enable_irq(dev_priv->drm.irq);

 wakeup:
@@ -1871,11 +1871,63 @@ void i915_reset(struct drm_i915_private *dev_priv)
  *
  * Reset a specific GPU engine. Useful if a hang is detected.
  * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ *  - identifies the request that caused the hang and it is dropped
+ *  - force engine to idle: this is done by issuing a reset request
+ *  - reset engine
+ *  - restart submissions to the engine


Why does the prospective caller need to know this?


  */
 int i915_reset_engine(struct intel_engine_cs *engine)
 {
-   /* FIXME: replace me with engine reset sequence */
-   return -ENODEV;
+   int ret;
+   struct drm_i915_private *dev_priv = engine->i915;
+   struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+   GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
+
+   DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
+
+   ret = i915_gem_reset_prepare_engine(engine);
+   if (ret) {
+   DRM_ERROR("Previous reset failed - promote to full reset\n");
+   goto out;
+   }
+
+   /*
+* the request that caused the hang is stuck on elsp, identify the
+* active request and drop it, adjust head to skip the offending
+* request to resume executing remaining requests in the queue.
+*/
+   i915_gem_reset_engine(engine);
+
+   /* forcing engine to idle */
+   ret = intel_reset_engine_start(engine);
+   if (ret) {
+   DRM_ERROR("Failed to disable %s\n", engine->name);
+   goto out;
+   }
+
+   /* finally, reset engine */
+   ret = intel_gpu_reset(dev_priv, intel_engine_flag(engine));
+   if (ret) {
+   DRM_ERROR("Failed to reset %s, ret=%d\n", engine->name, ret);
+   intel_reset_engine_cancel(engine);
+   goto out;
+   }
+
+   /* be sure the request reset bit gets cleared */
+   intel_reset_engine_cancel(engine);
+
+   i915_gem_reset_finish_engine(engine);
+
+   /* replay remaining requests in the queue */
+   ret = engine->init_hw(en

  1   2   3   4   5   6   7   8   9   10   >