One minor NIT (though i hope it could be fixed otw in as it adds a bit of 
ease-of-log-readibility).
That said, everything else looks good.

Reviewed-by: Alan Previn <alan.previn.teres.ale...@intel.com>
 
On Wed, 2022-07-27 at 19:20 -0700, john.c.harri...@intel.com wrote:
> From: John Harrison <john.c.harri...@intel.com>
> 
> When debugging GuC communication issues, it is useful to have the CTB
> info available. So add the state and buffer contents to the error
> capture log.
> 
> Also, add a sub-structure for the GuC specific error capture info as
> it is now becoming numerous.
> 
> Signed-off-by: John Harrison <john.c.harri...@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gpu_error.c | 59 +++++++++++++++++++++++----
>  drivers/gpu/drm/i915/i915_gpu_error.h | 20 +++++++--
>  2 files changed, 67 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
> b/drivers/gpu/drm/i915/i915_gpu_error.c
> index addba75252343..543ba63f958ea 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -671,6 +671,18 @@ static void err_print_pciid(struct 
> drm_i915_error_state_buf *m,
>                  pdev->subsystem_device);
>  }
>  
> +static void err_print_guc_ctb(struct drm_i915_error_state_buf *m,
> +                           const char *name,
> +                           const struct intel_ctb_coredump *ctb)
> +{
> +     if (!ctb->size)
> +             return;
> +
> +     err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 
> 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n",
> +                name, ctb->raw_status, ctb->raw_head, ctb->raw_tail,
> +                ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, 
> ctb->size);
> 
NIT: to make it more readible on first glance, would be nice to add more 
descriptive text like "raw: Sts:0x%08X,
Hd:0x%08X,Tl:0x@08X..." also, the not sure why cmds_offset is presented with a 
"x size" as opposed to just "desc-off =
foo1, cmd-off = foo2, size = foo3"?
> +}
> +
>  static void err_print_uc(struct drm_i915_error_state_buf *m,
>                        const struct intel_uc_coredump *error_uc)
>  {
> @@ -678,8 +690,12 @@ static void err_print_uc(struct drm_i915_error_state_buf 
> *m,
>  
>       intel_uc_fw_dump(&error_uc->guc_fw, &p);
>       intel_uc_fw_dump(&error_uc->huc_fw, &p);
> -     err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->timestamp);
> -     intel_gpu_error_print_vma(m, NULL, error_uc->guc_log);
> +     err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
> +     intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
> +     err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
> +     err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
> +     err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1);
> +     intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb);
>  }
>  
>  static void err_free_sgl(struct scatterlist *sgl)
> @@ -854,7 +870,7 @@ static void __err_print_to_sgl(struct 
> drm_i915_error_state_buf *m,
>       if (error->gt) {
>               bool print_guc_capture = false;
>  
> -             if (error->gt->uc && error->gt->uc->is_guc_capture)
> +             if (error->gt->uc && error->gt->uc->guc.is_guc_capture)
>                       print_guc_capture = true;
>  
>               err_print_gt_display(m, error->gt);
> @@ -1009,7 +1025,8 @@ static void cleanup_uc(struct intel_uc_coredump *uc)
>  {
>       kfree(uc->guc_fw.path);
>       kfree(uc->huc_fw.path);
> -     i915_vma_coredump_free(uc->guc_log);
> +     i915_vma_coredump_free(uc->guc.vma_log);
> +     i915_vma_coredump_free(uc->guc.vma_ctb);
>  
>       kfree(uc);
>  }
> @@ -1658,6 +1675,23 @@ gt_record_engines(struct intel_gt_coredump *gt,
>       }
>  }
>  
> +static void gt_record_guc_ctb(struct intel_ctb_coredump *saved,
> +                           const struct intel_guc_ct_buffer *ctb,
> +                           const void *blob_ptr, struct intel_guc *guc)
> +{
> +     if (!ctb || !ctb->desc)
> +             return;
> +
> +     saved->raw_status = ctb->desc->status;
> +     saved->raw_head = ctb->desc->head;
> +     saved->raw_tail = ctb->desc->tail;
> +     saved->head = ctb->head;
> +     saved->tail = ctb->tail;
> +     saved->size = ctb->size;
> +     saved->desc_offset = ((void *)ctb->desc) - blob_ptr;
> +     saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr;
> +}
> +
>  static struct intel_uc_coredump *
>  gt_record_uc(struct intel_gt_coredump *gt,
>            struct i915_vma_compress *compress)
> @@ -1684,9 +1718,16 @@ gt_record_uc(struct intel_gt_coredump *gt,
>        * log times to system times (in conjunction with the error->boottime 
> and
>        * gt->clock_frequency fields saved elsewhere).
>        */
> -     error_uc->timestamp = intel_uncore_read(gt->_gt->uncore, 
> GUCPMTIMESTAMP);
> -     error_uc->guc_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
> -                                             "GuC log buffer", compress);
> +     error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, 
> GUCPMTIMESTAMP);
> +     error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
> +                                                 "GuC log buffer", compress);
> +     error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma,
> +                                                 "GuC CT buffer", compress);
> +     error_uc->guc.last_fence = uc->guc.ct.requests.last_fence;
> +     gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send,
> +                       uc->guc.ct.ctbs.send.desc, (struct intel_guc 
> *)&uc->guc);
> +     gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
> +                       uc->guc.ct.ctbs.send.desc, (struct intel_guc 
> *)&uc->guc);
>  
>       return error_uc;
>  }
> @@ -2039,9 +2080,9 @@ __i915_gpu_coredump(struct intel_gt *gt, 
> intel_engine_mask_t engine_mask, u32 du
>                       error->gt->uc = gt_record_uc(error->gt, compress);
>                       if (error->gt->uc) {
>                               if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
> -                                     error->gt->uc->is_guc_capture = true;
> +                                     error->gt->uc->guc.is_guc_capture = 
> true;
>                               else
> -                                     
> GEM_BUG_ON(error->gt->uc->is_guc_capture);
> +                                     
> GEM_BUG_ON(error->gt->uc->guc.is_guc_capture);
>                       }
>               }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
> b/drivers/gpu/drm/i915/i915_gpu_error.h
> index d8a8b3d529e09..efc75cc2ffdb9 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -125,6 +125,15 @@ struct intel_engine_coredump {
>       struct intel_engine_coredump *next;
>  };
>  
> +struct intel_ctb_coredump {
> +     u32 raw_head, head;
> +     u32 raw_tail, tail;
> +     u32 raw_status;
> +     u32 desc_offset;
> +     u32 cmds_offset;
> +     u32 size;
> +};
> +
>  struct intel_gt_coredump {
>       const struct intel_gt *_gt;
>       bool awake;
> @@ -165,9 +174,14 @@ struct intel_gt_coredump {
>       struct intel_uc_coredump {
>               struct intel_uc_fw guc_fw;
>               struct intel_uc_fw huc_fw;
> -             struct i915_vma_coredump *guc_log;
> -             u32 timestamp;
> -             bool is_guc_capture;
> +             struct guc_info {
> +                     struct intel_ctb_coredump ctb[2];
> +                     struct i915_vma_coredump *vma_ctb;
> +                     struct i915_vma_coredump *vma_log;
> +                     u32 timestamp;
> +                     u16 last_fence;
> +                     bool is_guc_capture;
> +             } guc;
>       } *uc;
>  
>       struct intel_gt_coredump *next;
> -- 
> 2.37.1
> 

Reply via email to