On Fri, Jun 03, 2016 at 05:55:47PM +0100, Chris Wilson wrote:
> The error state is purposefully racy as we expect it to be called at any
> time and so have avoided any locking whilst capturing the crash dump.
> However, with multi-engine GPUs and multiple CPUs, those races can
> manifest into OOPSes as we attempt to chase dangling pointers freed on
> other CPUs. Under discussion are lots of ways to slow down normal
> operation in order to protect the post-mortem error capture, but what it
> we take the opposite approach and freeze the machine whilst the error
> capture runs (note the GPU may still running, but as long as we don't
> process any of the results the driver's bookkeeping will be static).
> 
> Note that by of itself, this is not a complete fix. It also depends on
> the compiler barriers in list_add/list_del to prevent traversing the
> lists into the void.
> 
> v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
> stop_machine() itself for its wbinvd fallback.
> 
> Signed-off-by: Chris Wilson <ch...@chris-wilson.co.uk>

rt folks will hate us for this I think. But yeah the only other options is
mass-rcu-ifying everything, which is much more fragile. Ack on the general
idea at least, need to look at what's all needed for list manipulation
still.
-Daniel

> ---
>  drivers/gpu/drm/i915/Kconfig          |  1 +
>  drivers/gpu/drm/i915/i915_drv.h       |  2 ++
>  drivers/gpu/drm/i915/i915_gpu_error.c | 48 
> +++++++++++++++++++++--------------
>  3 files changed, 32 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index 29a32b11953b..9398a4d06c0e 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -4,6 +4,7 @@ config DRM_I915
>       depends on X86 && PCI
>       select INTEL_GTT
>       select INTERVAL_TREE
> +     select STOP_MACHINE
>       # we need shmfs for the swappable backing store, and in particular
>       # the shmem_readpage() which depends upon tmpfs
>       select SHMEM
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index dbd3c6f3abbc..77564f378771 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -470,6 +470,8 @@ struct drm_i915_error_state {
>       struct kref ref;
>       struct timeval time;
>  
> +     struct drm_i915_private *i915;
> +
>       char error_msg[128];
>       bool simulated;
>       int iommu;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index f01f0ca4bb86..ab2ba76a2a3b 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -28,6 +28,7 @@
>   */
>  
>  #include <generated/utsrelease.h>
> +#include <linux/stop_machine.h>
>  #include "i915_drv.h"
>  
>  static const char *ring_str(int ring)
> @@ -682,14 +683,12 @@ i915_error_object_create(struct drm_i915_private 
> *dev_priv,
>  
>       dst->page_count = num_pages;
>       while (num_pages--) {
> -             unsigned long flags;
>               void *d;
>  
>               d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
>               if (d == NULL)
>                       goto unwind;
>  
> -             local_irq_save(flags);
>               if (use_ggtt) {
>                       void __iomem *s;
>  
> @@ -708,15 +707,10 @@ i915_error_object_create(struct drm_i915_private 
> *dev_priv,
>  
>                       page = i915_gem_object_get_page(src, i);
>  
> -                     drm_clflush_pages(&page, 1);
> -
>                       s = kmap_atomic(page);
>                       memcpy(d, s, PAGE_SIZE);
>                       kunmap_atomic(s);
> -
> -                     drm_clflush_pages(&page, 1);
>               }
> -             local_irq_restore(flags);
>  
>               dst->pages[i++] = d;
>               reloc_offset += PAGE_SIZE;
> @@ -1366,6 +1360,32 @@ static void i915_capture_gen_state(struct 
> drm_i915_private *dev_priv,
>       error->suspend_count = dev_priv->suspend_count;
>  }
>  
> +static int capture(void *data)
> +{
> +     struct drm_i915_error_state *error = data;
> +
> +     /* Ensure that what we readback from memory matches what the GPU sees */
> +     wbinvd();
> +
> +     i915_capture_gen_state(error->i915, error);
> +     i915_capture_reg_state(error->i915, error);
> +     i915_gem_record_fences(error->i915, error);
> +     i915_gem_record_rings(error->i915, error);
> +
> +     i915_capture_active_buffers(error->i915, error);
> +     i915_capture_pinned_buffers(error->i915, error);
> +
> +     do_gettimeofday(&error->time);
> +
> +     error->overlay = intel_overlay_capture_error_state(error->i915);
> +     error->display = intel_display_capture_error_state(error->i915);
> +
> +     /* And make sure we don't leave trash in the CPU cache */
> +     wbinvd();
> +
> +     return 0;
> +}
> +
>  /**
>   * i915_capture_error_state - capture an error record for later analysis
>   * @dev: drm device
> @@ -1394,19 +1414,9 @@ void i915_capture_error_state(struct drm_i915_private 
> *dev_priv,
>       }
>  
>       kref_init(&error->ref);
> +     error->i915 = dev_priv;
>  
> -     i915_capture_gen_state(dev_priv, error);
> -     i915_capture_reg_state(dev_priv, error);
> -     i915_gem_record_fences(dev_priv, error);
> -     i915_gem_record_rings(dev_priv, error);
> -
> -     i915_capture_active_buffers(dev_priv, error);
> -     i915_capture_pinned_buffers(dev_priv, error);
> -
> -     do_gettimeofday(&error->time);
> -
> -     error->overlay = intel_overlay_capture_error_state(dev_priv);
> -     error->display = intel_display_capture_error_state(dev_priv);
> +     stop_machine(capture, error, NULL);
>  
>       i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
>       DRM_INFO("%s\n", error->error_msg);
> -- 
> 2.8.1
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to