On Tue, 2020-10-13 at 13:02 +0200, Janusz Krzysztofik wrote:
> Removal of igt_fork_hang_detector() from local_i915_healthcheck() by
> commit 1fbd127bd4e1 ("core_hotplug: Teach the healthcheck how to
> check
> execution status") resulted in unintentional removal of an important
> though implicit test feature of detecting, reporting as failures and
> recovering from potential misses of debugfs subdirs of hot rebound
> i915
> devices.  As a consequence, unexpected failures or skips of other
> unrelated but subsequently run tests have been observed on CI.
> 
> On the other hand, removal of the debugfs issue detection and subtest
> failures from right after hot rebinding the driver enabled the better
> version of the i915 GPU health check fixed by the same commit to
> detect
> and report other issues potentially triggered by device late close.
> 
> Restore the missing test feature by introducing an explicit sysfs
> health check, not limited to i915,  that verifies existence of device
> sysfs and debugfs areas.  Also, split hotrebind/hotreplug scenarios
> into a pair of each, one that performs the health check right after
> hot
> rebind/replug and delegates the device late close step to a follow up
> recovery phase, while the other one checks device health only after
> late closing it.
> 
> v2: Give GPU health check a better chance to detect issues - run it
>     before sysfs health checks.
> v3: Run sysfs health check on any hardware, not only i915.
> 
> Signed-off-by: Janusz Krzysztofik <janusz.krzyszto...@linux.intel.com
> >
> Cc: Chris Wilson <ch...@chris-wilson.co.uk>
> ---
> Even if the root cause has occurred to be sitting on the IGT lib side
> and has been already fixed by commit 937526629344 ("lib: Don't fail
> debugfs lookup on an expected absent drm device"), I think we should
> restore the debugfs health check just in case new issues with similar
> symptoms appear in the future and start affecting subsequent tests
> silently.
> 
> Thanks,
> Janusz
> 
>  tests/core_hotunplug.c | 68 ++++++++++++++++++++++++++++++++++++++
> ----
>  1 file changed, 62 insertions(+), 6 deletions(-)
> 
> diff --git a/tests/core_hotunplug.c b/tests/core_hotunplug.c
> index 70669c590..cdc07c85d 100644
> --- a/tests/core_hotunplug.c
> +++ b/tests/core_hotunplug.c
> @@ -308,7 +308,7 @@ static void node_healthcheck(struct hotunplug
> *priv, unsigned flags)
>               priv->failure = "Unrecoverable test failure";
>               if (local_i915_healthcheck(fd_drm, "") &&
>                   (!(flags & FLAG_RECOVER) ||
> local_i915_recover(fd_drm)))
> -                     priv->failure = "Healthcheck failure!";
> +                     priv->failure = "GPU healthcheck failure!";
>               else
>                       priv->failure = NULL;
>  
> @@ -317,6 +317,16 @@ static void node_healthcheck(struct hotunplug
> *priv, unsigned flags)
>               priv->failure = NULL;
>       }
>  
> +     if (!priv->failure) {
> +             char path[200];
> +
> +             priv->failure = "Device sysfs healthckeck failure!";
> +             local_debug("%s\n", "running device sysfs
> healthcheck");
> +             igt_assert(igt_sysfs_path(fd_drm, path, sizeof(path)));
> +             igt_assert(igt_debugfs_path(fd_drm, path,
> sizeof(path)));
> +             priv->failure = NULL;
> +     }
> +

LGTM,
Reviewed-by: Marcin Bernatowicz <marcin.bernatow...@linux.intel.com>

>       fd_drm = close_device(fd_drm, "", "health checked ");
>       if (closed || fd_drm < -1)      /* update status for
> post_healthcheck */
>               priv->fd.drm_hc = fd_drm;
> @@ -437,7 +447,7 @@ static void hotunplug_rescan(struct hotunplug
> *priv)
>       healthcheck(priv, false);
>  }
>  
> -static void hotrebind_lateclose(struct hotunplug *priv)
> +static void hotrebind(struct hotunplug *priv)
>  {
>       igt_assert_eq(priv->fd.drm, -1);
>       igt_assert_eq(priv->fd.drm_hc, -1);
> @@ -448,6 +458,30 @@ static void hotrebind_lateclose(struct hotunplug
> *priv)
>       driver_bind(priv, 0);
>  
>       healthcheck(priv, false);
> +}
> +
> +static void hotreplug(struct hotunplug *priv)
> +{
> +     igt_assert_eq(priv->fd.drm, -1);
> +     igt_assert_eq(priv->fd.drm_hc, -1);
> +     priv->fd.drm = local_drm_open_driver(false, "", " for hot
> replug");
> +
> +     device_unplug(priv, "hot ", 60);
> +
> +     bus_rescan(priv, 0);
> +
> +     healthcheck(priv, false);
> +}
> +
> +static void hotrebind_lateclose(struct hotunplug *priv)
> +{
> +     igt_assert_eq(priv->fd.drm, -1);
> +     igt_assert_eq(priv->fd.drm_hc, -1);
> +     priv->fd.drm = local_drm_open_driver(false, "", " for hot
> rebind");
> +
> +     driver_unbind(priv, "hot ", 60);
> +
> +     driver_bind(priv, 0);
>  
>       priv->fd.drm = close_device(priv->fd.drm, "late ", "unbound ");
>       igt_assert_eq(priv->fd.drm, -1);
> @@ -465,8 +499,6 @@ static void hotreplug_lateclose(struct hotunplug
> *priv)
>  
>       bus_rescan(priv, 0);
>  
> -     healthcheck(priv, false);
> -
>       priv->fd.drm = close_device(priv->fd.drm, "late ", "removed ");
>       igt_assert_eq(priv->fd.drm, -1);
>  
> @@ -570,7 +602,31 @@ igt_main
>               post_healthcheck(&priv);
>  
>       igt_subtest_group {
> -             igt_describe("Check if the driver hot unbound from a
> still open device can be cleanly rebound, then the old instance
> released");
> +             igt_describe("Check if the driver can be cleanly
> rebound to a device with a still open hot unbound driver instance");
> +             igt_subtest("hotrebind")
> +                     hotrebind(&priv);
> +
> +             igt_fixture
> +                     recover(&priv);
> +     }
> +
> +     igt_fixture
> +             post_healthcheck(&priv);
> +
> +     igt_subtest_group {
> +             igt_describe("Check if a hot unplugged and still open
> device can be cleanly restored");
> +             igt_subtest("hotreplug")
> +                     hotreplug(&priv);
> +
> +             igt_fixture
> +                     recover(&priv);
> +     }
> +
> +     igt_fixture
> +             post_healthcheck(&priv);
> +
> +     igt_subtest_group {
> +             igt_describe("Check if a hot unbound driver instance
> still open after hot rebind can be cleanly released");
>               igt_subtest("hotrebind-lateclose")
>                       hotrebind_lateclose(&priv);
>  
> @@ -582,7 +638,7 @@ igt_main
>               post_healthcheck(&priv);
>  
>       igt_subtest_group {
> -             igt_describe("Check if a still open while hot unplugged
> device can be cleanly restored, then the old instance released");
> +             igt_describe("Check if an instance of a still open
> while hot replugged device can be cleanly released");
>               igt_subtest("hotreplug-lateclose")
>                       hotreplug_lateclose(&priv);
>  

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to