Add a capture output size check function to provide a reasonable
minimum size for the error capture region before allocating the shared
buffer.

Signed-off-by: Zhanjun Dong <zhanjun.d...@intel.com>
---
 drivers/gpu/drm/xe/xe_guc_capture.c | 76 +++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/drivers/gpu/drm/xe/xe_guc_capture.c b/drivers/gpu/drm/xe/xe_guc_capture.c
index dde3a269d114..f4153dc4ab86 100644
--- a/drivers/gpu/drm/xe/xe_guc_capture.c
+++ b/drivers/gpu/drm/xe/xe_guc_capture.c
@@ -559,6 +559,81 @@ xe_guc_capture_getnullheader(struct xe_guc *guc, void **outptr, size_t *size)
        return 0;
 }
 
+/*
+ * Estimate the capture output size needed if every engine instance of this GT is
+ * dumped once. Returns that estimate, or -ENODEV when guc->capture is not set.
+ */
+static int
+guc_capture_output_min_size_est(struct xe_guc *guc)
+{
+       struct xe_gt *gt = guc_to_gt(guc);
+       enum xe_hw_engine_id id;
+       struct xe_hw_engine *hwe;
+       size_t list_size = 0;
+       int total = 0;
+
+       if (!guc->capture)
+               return -ENODEV;
+
+       /*
+        * Worst case: every engine instance fails in quick succession for unrelated
+        * reasons, so GuC dumps registers per instance, one at a time, repeating the
+        * global registers each time.
+        *
+        * Each dump is 1 x guc_state_capture_group_t output followed by
+        * 3 x guc_state_capture_t lists, one per register type (global, class, instance).
+        */
+       for_each_hw_engine(hwe, gt, id) {
+               /* Fixed part: one group header plus the three list headers. */
+               total += sizeof(struct guc_state_capture_group_header_t) +
+                        (3 * sizeof(struct guc_state_capture_header_t));
+
+               /* Only lists whose size query returns 0 are counted. */
+               if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_GLOBAL,
+                                            0, &list_size, true))
+                       total += list_size;
+               if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
+                                            hwe->class, &list_size, true))
+                       total += list_size;
+               if (!guc_capture_getlistsize(guc, 0, GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
+                                            hwe->class, &list_size, true))
+                       total += list_size;
+       }
+
+       return total;
+}
+
+/*
+ * Add on a 3x multiplier to allow for multiple back-to-back captures occurring
+ * before the driver can read the data out and process it
+ */
+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
+
+/*
+ * Warn if the estimate fails or exceeds xe_guc_log_section_size_capture(); log at debug
+ * level if only GUC_CAPTURE_OVERBUFFER_MULTIPLIER times the estimate exceeds it.
+ * NOTE: the estimate is much smaller than the region allocation (DG2: <80K vs 1MB) and is
+ * based on all engines getting reset at once within one G2H handler task slot, which is very
+ * unlikely. If GuC does run out of space anyway, a separate warning is seen when processing
+ * the G2H capture-notification event, search for: xe_guc_STATE_CAPTURE_EVENT_STATUS_NOSPACE.
+ */
+static void check_guc_capture_size(struct xe_guc *guc)
+{
+       int min_size = guc_capture_output_min_size_est(guc);
+       int spare_size = min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER;
+       u32 buffer_size = xe_guc_log_section_size_capture(&guc->log);
+
+       if (min_size < 0)
+               xe_gt_warn(guc_to_gt(guc), "Failed to calculate error state capture buffer minimum size: %d!\n",
+                          min_size);
+       else if (min_size > buffer_size)
+               xe_gt_warn(guc_to_gt(guc), "Error state capture buffer maybe small: %u < %d\n",
+                          buffer_size, min_size);
+       else if (spare_size > buffer_size)
+               xe_gt_dbg(guc_to_gt(guc), "Error state capture buffer lacks spare size: %u < %d (min = %d)\n",
+                         buffer_size, spare_size, min_size);
+}
+
 int xe_guc_capture_init(struct xe_guc *guc)
 {
        guc->capture = kzalloc(sizeof(*guc->capture), GFP_KERNEL);
@@ -570,6 +645,7 @@ int xe_guc_capture_init(struct xe_guc *guc)
        INIT_LIST_HEAD(&guc->capture->outlist);
        INIT_LIST_HEAD(&guc->capture->cachelist);
 
+       check_guc_capture_size(guc);
        return 0;
 }
 
-- 
2.34.1

Reply via email to