On 12/08/24 13:18, Raag Jadav wrote:
> From: Himal Prasad Ghimiray <himal.prasad.ghimi...@intel.com>
>
> This was dropped in commit 77a0d4d1cea2 ("drm/xe/uapi: Remove reset
> uevent for now") as part of refactoring.
>
> Now that we have better uapi semantics and naming for the uevent,
> bring it back. With this in place, userspace will be notified of
> wedged device along with its reason.
>
> $ udevadm monitor --property --kernel
> monitor will print the received events for:
> KERNEL - the kernel uevent
>
> KERNEL[871.188570] change   
> /devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0 (pci)
> ACTION=change
> DEVPATH=/devices/pci0000:00/0000:00:01.0/0000:01:00.0/0000:02:01.0/0000:03:00.0
> SUBSYSTEM=pci
> DEVICE_STATUS=NEEDS_RESET
> REASON=GT_RESET_FAILED
> TILE_ID=0
> GT_ID=0
> DRIVER=xe
> PCI_CLASS=30000
> PCI_ID=8086:56B1
> PCI_SUBSYS_ID=8086:1210
> PCI_SLOT_NAME=0000:03:00.0
> MODALIAS=pci:v00008086d000056B1sv00008086sd00001210bc03sc00i00
> SEQNUM=6104
>
> v2: Change authorship to Himal (Aravind)
>     Add uevent for all device wedged cases (Aravind)
>
> Signed-off-by: Himal Prasad Ghimiray <himal.prasad.ghimi...@intel.com>
> Signed-off-by: Raag Jadav <raag.ja...@intel.com>
> ---
>  drivers/gpu/drm/xe/xe_device.c     | 10 +++++++++-
>  drivers/gpu/drm/xe/xe_device.h     |  2 +-
>  drivers/gpu/drm/xe/xe_gt.c         | 23 +++++++++++++++++++----
>  drivers/gpu/drm/xe/xe_guc.c        | 13 ++++++++++++-
>  drivers/gpu/drm/xe/xe_guc_submit.c | 13 ++++++++++++-
>  include/uapi/drm/xe_drm.h          | 29 +++++++++++++++++++++++++++++
>  6 files changed, 82 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index 1aba6f9eaa19..d975bdce4a7d 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -955,6 +955,7 @@ static void xe_device_wedged_fini(struct drm_device *drm, 
> void *arg)
>  /**
>   * xe_device_declare_wedged - Declare device wedged
>   * @xe: xe device instance
> + * @event_params: parameters to be sent along with uevent
>   *
>   * This is a final state that can only be cleared with a mudule
>   * re-probe (unbind + bind).
> @@ -965,8 +966,10 @@ static void xe_device_wedged_fini(struct drm_device 
> *drm, void *arg)
>   * on every single execution timeout (a.k.a. GPU hang) right after 
> devcoredump
>   * snapshot capture. In this mode, GT reset won't be attempted so the state 
> of
>   * the issue is preserved for further debugging.
> + * Caller is expected to pass respective parameters to be sent along with
> + * uevent. Pass NULL in case of no params.
>   */
> -void xe_device_declare_wedged(struct xe_device *xe)
> +void xe_device_declare_wedged(struct xe_device *xe, char **event_params)
>  {
>       struct xe_gt *gt;
>       u8 id;
> @@ -984,12 +987,17 @@ void xe_device_declare_wedged(struct xe_device *xe)
>       xe_pm_runtime_get_noresume(xe);
>  
>       if (!atomic_xchg(&xe->wedged.flag, 1)) {
> +             struct pci_dev *pdev = to_pci_dev(xe->drm.dev);
> +
>               xe->needs_flr_on_fini = true;
>               drm_err(&xe->drm,
>                       "CRITICAL: Xe has declared device %s as wedged.\n"
>                       "IOCTLs and executions are blocked. Only a rebind may 
> clear the failure\n"
>                       "Please file a _new_ bug report at 
> https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n";,
>                       dev_name(xe->drm.dev));
> +
> +             /* Notify userspace about reset required */
> +             kobject_uevent_env(&pdev->dev.kobj, KOBJ_CHANGE, event_params);
>       }
>  
>       for_each_gt(gt, xe, id)
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index db6cc8d0d6b8..5d40fc6f0904 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -174,7 +174,7 @@ static inline bool xe_device_wedged(struct xe_device *xe)
>       return atomic_read(&xe->wedged.flag);
>  }
>  
> -void xe_device_declare_wedged(struct xe_device *xe);
> +void xe_device_declare_wedged(struct xe_device *xe, char **reset_event);
>  
>  struct xe_file *xe_file_get(struct xe_file *xef);
>  void xe_file_put(struct xe_file *xef);
> diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c
> index 58895ed22f6e..519f3c2cf9e2 100644
> --- a/drivers/gpu/drm/xe/xe_gt.c
> +++ b/drivers/gpu/drm/xe/xe_gt.c
> @@ -741,6 +741,24 @@ static int do_gt_restart(struct xe_gt *gt)
>       return 0;
>  }
>  
> +static void xe_gt_reset_failed(struct xe_gt *gt, int err)
> +{
> +     char *event_params[5];
> +
> +     xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
> +
> +     event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
> +     event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT;
> +     event_params[2] = kasprintf(GFP_KERNEL, "TILE_ID=%d", 
> gt_to_tile(gt)->id);
> +     event_params[3] = kasprintf(GFP_KERNEL, "GT_ID=%d", gt->info.id);

the TILE_ID and GT_ID can be passed for other events as well, with that you can
have a common function to send uevent which would take reason as an input.
> +     event_params[4] = NULL;
> +
> +     xe_device_declare_wedged(gt_to_xe(gt), event_params);
> +
> +     kfree(event_params[2]);
> +     kfree(event_params[3]);
> +}
> +
>  static int gt_reset(struct xe_gt *gt)
>  {
>       int err;
> @@ -796,10 +814,7 @@ static int gt_reset(struct xe_gt *gt)
>       XE_WARN_ON(xe_uc_start(&gt->uc));
>       xe_pm_runtime_put(gt_to_xe(gt));
>  err_fail:
> -     xe_gt_err(gt, "reset failed (%pe)\n", ERR_PTR(err));
> -
> -     xe_device_declare_wedged(gt_to_xe(gt));
> -
> +     xe_gt_reset_failed(gt, err);
>       return err;
>  }
>  
> diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c
> index de0fe9e65746..b544012f5b11 100644
> --- a/drivers/gpu/drm/xe/xe_guc.c
> +++ b/drivers/gpu/drm/xe/xe_guc.c
> @@ -560,6 +560,17 @@ static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc)
>       return ret ? ret : freq;
>  }
>  
> +static void xe_guc_load_failed(struct xe_gt *gt)
> +{
> +     char *event_params[3];
> +
> +     event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
> +     event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC;
> +     event_params[2] = NULL;
> +
> +     xe_device_declare_wedged(gt_to_xe(gt), event_params);
> +}
> +
>  /*
>   * Wait for the GuC to start up.
>   *
> @@ -684,7 +695,7 @@ static void guc_wait_ucode(struct xe_guc *guc)
>                       break;
>               }
>  
> -             xe_device_declare_wedged(gt_to_xe(gt));
> +             xe_guc_load_failed(gt);
>       } else if (delta_ms > GUC_LOAD_TIME_WARN_MS) {
>               xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, 
> timeouts = %d]\n",
>                          delta_ms, status, count);
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c 
> b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 460808507947..33ed6221f465 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -891,6 +891,17 @@ void xe_guc_submit_wedge(struct xe_guc *guc)
>       mutex_unlock(&guc->submission_state.lock);
>  }
>  
> +static void xe_exec_queue_timedout(struct xe_device *xe)
> +{
> +     char *event_params[3];
> +
> +     event_params[0] = DRM_XE_RESET_REQUIRED_UEVENT;
> +     event_params[1] = DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT;
> +     event_params[2] = NULL;
> +
> +     xe_device_declare_wedged(xe, event_params);
> +}
> +
>  static bool guc_submit_hint_wedged(struct xe_guc *guc)
>  {
>       struct xe_device *xe = guc_to_xe(guc);
> @@ -901,7 +912,7 @@ static bool guc_submit_hint_wedged(struct xe_guc *guc)
>       if (xe_device_wedged(xe))
>               return true;
>  

i see that this is even called from xe_guc_exec_queue_lr_cleanup which is for 
long running queue
from various call paths need to check in what scenarios those happen.
> -     xe_device_declare_wedged(xe);
> +     xe_exec_queue_timedout(xe);
>  
>       return true;
>  }
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index b6fbe4988f2e..dd2f36710057 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -20,6 +20,7 @@ extern "C" {
>   *   2. Extension definition and helper structs
>   *   3. IOCTL's Query structs in the order of the Query's entries.
>   *   4. The rest of IOCTL structs in the order of IOCTL declaration.
> + *   5. uEvents
>   */
>  
>  /**
> @@ -1694,6 +1695,34 @@ struct drm_xe_oa_stream_info {
>       __u64 reserved[3];
>  };
>  
> +/**
> + * DOC: uevent generated by xe on it's pci node.
> + *
> + * DRM_XE_RESET_REQUIRED_UEVENT - Event is generated when device needs reset.
> + * The REASON is provided along with the event for which reset is required.
> + * On the basis of REASONS, additional information might be supplied.
> + */
> +#define DRM_XE_RESET_REQUIRED_UEVENT "DEVICE_STATUS=NEEDS_RESET"
> +
> +/**
> + * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT - Reason provided to
> + * DRM_XE_RESET_REQUIRED_UEVENT incase of gt reset failure. The additional
> + * information supplied is tile id and gt id for which reset has failed.
> + */
> +#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GT "REASON=GT_RESET_FAILED"
> +
> +/**
> + * DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC - Reason provided to
> + * DRM_XE_RESET_REQUIRED_UEVENT incase of guc fw load failure.
> + */
> +#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_GUC "REASON=GUC_LOAD_FAILED"
> +
> +/**
> + * DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT - Reason provided to
> + * DRM_XE_RESET_REQUIRED_UEVENT incase of exec queue timeout.
> + */
> +#define DRM_XE_RESET_REQUIRED_UEVENT_REASON_TOUT "REASON=EXEC_QUEUE_TIMEDOUT"

better to  name DRM_XE_RESET_REQUIRED_UEVENT_REASON_EXEC_QUEUE_TIMEOUT for 
clarity

Thanks,
Aravind.
> +
>  #if defined(__cplusplus)
>  }
>  #endif

Reply via email to