Over allocation of save area is not fatal, only under allocation is.
ROCm has various components that independently claim authority over save
area size.

Unless KFD decides to claim single authority, relax size checks with a
warning on over allocation.

Signed-off-by: Jonathan Kim <[email protected]>
---
 drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
index a65c67cf56ff..448043bc2937 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c
@@ -297,16 +297,21 @@ int kfd_queue_acquire_buffers(struct kfd_process_device 
*pdd, struct queue_prope
                goto out_err_unreserve;
        }
 
-       if (properties->ctx_save_restore_area_size != 
topo_dev->node_props.cwsr_size) {
-               pr_debug("queue cwsr size 0x%x not equal to node cwsr size 
0x%x\n",
+       if (properties->ctx_save_restore_area_size < 
topo_dev->node_props.cwsr_size) {
+               pr_debug("queue cwsr size 0x%x not sufficient for node cwsr 
size 0x%x\n",
                        properties->ctx_save_restore_area_size,
                        topo_dev->node_props.cwsr_size);
                err = -EINVAL;
                goto out_err_unreserve;
        }
 
-       total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
-                         * NUM_XCC(pdd->dev->xcc_mask);
+       if (properties->ctx_save_restore_area_size > 
topo_dev->node_props.cwsr_size)
+               pr_warn_ratelimited("queue cwsr size 0x%x exceeds recommended 
node cwsr size 0x%x\n",
+                                   properties->ctx_save_restore_area_size,
+                                   topo_dev->node_props.cwsr_size);
+
+       total_cwsr_size = (properties->ctx_save_restore_area_size +
+                          topo_dev->node_props.debug_memory_size) * 
NUM_XCC(pdd->dev->xcc_mask);
        total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
 
        err = kfd_queue_buffer_get(vm, (void 
*)properties->ctx_save_restore_area_address,
@@ -352,8 +357,8 @@ int kfd_queue_release_buffers(struct kfd_process_device 
*pdd, struct queue_prope
        topo_dev = kfd_topology_device_by_id(pdd->dev->id);
        if (!topo_dev)
                return -EINVAL;
-       total_cwsr_size = (topo_dev->node_props.cwsr_size + 
topo_dev->node_props.debug_memory_size)
-                         * NUM_XCC(pdd->dev->xcc_mask);
+       total_cwsr_size = (properties->ctx_save_restore_area_size +
+                          topo_dev->node_props.debug_memory_size) * 
NUM_XCC(pdd->dev->xcc_mask);
        total_cwsr_size = ALIGN(total_cwsr_size, PAGE_SIZE);
 
        kfd_queue_buffer_svm_put(pdd, 
properties->ctx_save_restore_area_address, total_cwsr_size);
-- 
2.34.1

Reply via email to