kq_acquire_packet_buffer() needs to check whether kq has been initialized.
Otherwise it will hit memory corruption during recovery, because recovery
uninitializes kq first.

The TLB also needs to be flushed after a successful recovery, as BOs may
have been created and mapped during recovery.

Signed-off-by: Emily Deng <emily.d...@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c       |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  4 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 22 +++++++++++++++++++
 4 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index b9c82be6ce13..eb2df5842618 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -1000,6 +1000,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
                return 0;
 
        for (i = 0; i < kfd->num_nodes; i++) {
+               kfd_flush_all_processes(kfd->nodes[i]);
                ret = kfd_resume(kfd->nodes[i]);
                if (ret)
                        return ret;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 2b0a830f5b29..5e4ae969818e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -238,6 +238,10 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
        uint64_t wptr64;
        unsigned int *queue_address;
 
+       if (!kq) {
+               pr_debug("kq has not been initialized\n");
+               goto err_no_space;
+       }
        /* When rptr == wptr, the buffer is empty.
         * When rptr == wptr + 1, the buffer is full.
         * It is always rptr that advances to the position of wptr, rather than
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f6aedf69c644..6c073ead2b06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1059,7 +1059,7 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
 int kfd_process_restore_queues(struct kfd_process *p);
 void kfd_suspend_all_processes(void);
 int kfd_resume_all_processes(void);
-
+void kfd_flush_all_processes(struct kfd_node *node);
 struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,
                                                         uint32_t gpu_id);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 7c0c24732481..4ed03359020b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2110,6 +2110,28 @@ int kfd_resume_all_processes(void)
        return ret;
 }
 
+void kfd_flush_all_processes(struct kfd_node *node)
+{      /* Flush the TLB of every process in kfd_processes_table that has a pdd and VM for @node. */
+       struct kfd_process *p;
+       struct kfd_process_device *pdd;
+       unsigned int temp; /* bucket cursor used by hash_for_each_rcu() */
+       int idx = srcu_read_lock(&kfd_processes_srcu); /* held across the whole table walk */
+       struct amdgpu_vm *vm;
+       /* Visit each process; entries with no pdd for @node or no VM are skipped. */
+       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+               pdd = kfd_get_process_device_data(node, p);
+               if (!pdd)
+                       continue;
+               vm = drm_priv_to_vm(pdd->drm_priv);
+               if (!vm)
+                       continue;
+               atomic64_inc(&vm->tlb_seq); /* NOTE(review): presumably so the flush below is not skipped as up to date - confirm */
+               kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
+       }
+       srcu_read_unlock(&kfd_processes_srcu, idx);
+
+}
+
 int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
                          struct vm_area_struct *vma)
 {
-- 
2.34.1

Reply via email to