From a4e53bda6f65b72b1f6a344c19677574d7842cd3 Mon Sep 17 00:00:00 2001
From: Shuotao Xu <shuotaoxu@microsoft.com>
Date: Wed, 6 Apr 2022 12:42:10 +0900
Subject: [PATCH] drm/amdkfd: Add PCIe Hotplug Support for AMDGPU

1. During PCIe probing, decrement the KFD lock which was incremented when
   the PCIe device was removed; otherwise kfd_open is going to fail.
2. Remove p2p links in sysfs when the device is hotplugged out.

Signed-off-by: Shuotao Xu <shuotaoxu@microsoft.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  4 ++
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 50 +++++++++++++++++++++--
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 62aa6c9d5123..c9638bc299dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -575,6 +575,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 	if (kfd_resume(kfd))
 		goto kfd_resume_error;
 
+	/* Drop the kfd_locked count left over from a PCIe hot-unplug */
+	if (kfd_is_locked())
+		atomic_dec(&kfd_locked);
+
 	if (kfd_topology_add_device(kfd)) {
 		dev_err(kfd_device, "Error adding device to topology\n");
 		goto kfd_topology_add_device_error;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 3bdcae239bc0..cfa3b16f6939 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -132,6 +132,21 @@ struct kfd_dev *kfd_device_by_adev(const struct amdgpu_device *adev)
 	return device;
 }
 
+/* Free every io_link of @dev that targets @node_id; called with write topology_lock acquired */
+static void kfd_release_link_prop(struct kfd_topology_device *dev, uint32_t node_id)
+{
+	struct kfd_iolink_properties *iolink, *tmp;
+
+	list_for_each_entry_safe(iolink, tmp, &dev->io_link_props, list) {
+		if (iolink->node_to != node_id)
+			continue;
+		pr_debug("%s, io_link from_node = %u, to_node = %u\n", __func__, iolink->node_from, iolink->node_to);
+		list_del(&iolink->list);
+		kfree(iolink);
+		dev->node_props.io_links_count--; /* keep the advertised count in sync with the list */
+	}
+}
+
 /* Called with write topology_lock acquired */
 static void kfd_release_topology_device(struct kfd_topology_device *dev)
 {
@@ -556,6 +571,21 @@ static void kfd_remove_sysfs_file(struct kobject *kobj, struct attribute *attr)
 	kobject_put(kobj);
 }
 
+/* Remove the sysfs files of @dev's io_links that point at @node_id */
+static void kfd_remove_sysfs_link_to(struct kfd_topology_device *dev, uint32_t node_id)
+{
+	struct kfd_iolink_properties *iolink;
+
+	if (!dev->kobj_iolink)
+		return;
+	list_for_each_entry(iolink, &dev->io_link_props, list)
+		if (iolink->kobj && iolink->node_to == node_id) {
+			pr_debug("%s, io_link from_node = %u, to_node = %u\n", __func__, iolink->node_from, iolink->node_to);
+			kfd_remove_sysfs_file(iolink->kobj, &iolink->attr);
+			iolink->kobj = NULL; /* cleared so the kobj check above skips it next time */
+		}
+}
+
 static void kfd_remove_sysfs_node_entry(struct kfd_topology_device *dev)
 {
 	struct kfd_iolink_properties *iolink;
@@ -1490,20 +1520,34 @@ int kfd_topology_remove_device(struct kfd_dev *gpu)
 	struct kfd_topology_device *dev, *tmp;
 	uint32_t gpu_id;
 	int res = -ENODEV;
+	uint32_t node_id = 0;
+	bool found = false;
 
 	down_write(&topology_lock);
 
-	list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
+	list_for_each_entry_safe(dev, tmp, &topology_device_list, list) {
 		if (dev->gpu == gpu) {
 			gpu_id = dev->gpu_id;
 			kfd_remove_sysfs_node_entry(dev);
 			kfd_release_topology_device(dev);
 			sys_props.num_devices--;
 			res = 0;
-			if (kfd_topology_update_sysfs() < 0)
-				kfd_topology_release_sysfs();
+			pr_debug("kfd_topology: removing gpu node, node id = %d", node_id);
+			found = true;
 			break;
 		}
+		node_id++;
+	}
+
+	if (found) {
+		list_for_each_entry(dev, &topology_device_list, list) {
+			kfd_remove_sysfs_link_to(dev, node_id);
+			kfd_release_link_prop(dev, node_id);
+		}
+		atomic_dec(&topology_crat_proximity_domain);
+		if (kfd_topology_update_sysfs() < 0)
+			kfd_topology_release_sysfs();
+	}
 
 	up_write(&topology_lock);
 
-- 
2.17.1

