Hi all,

I just sent out two patches that hopefully make the kernel module more robust in the face of page table shadows being swapped out.

However, even with those patches, I can still fairly reliably reproduce crashes with a backtrace of the shape

amdgpu_cs_ioctl
 -> amdgpu_vm_update_page_directory
 -> amdgpu_ttm_bind
 -> amdgpu_gtt_mgr_alloc

The plausible reason for these crashes is that nothing seems to prevent the shadow BOs from being moved between the calls to amdgpu_cs_validate in amdgpu_cs_parser_bos and the calls to amdgpu_ttm_bind.

The attached patch has fixed these crashes for me so far, but it's very heavy-handed: it collects all page table shadows and the page directory shadow and adds them all to the reservations for the callers of amdgpu_vm_update_page_directory.

I feel like there should be a better way. In part, I wonder why the shadows are needed in the first place. I vaguely recall the discussions about GPU reset and such, but I don't remember why the page tables can't just be rebuilt in some other way.

Cheers,
Nicolai
>From 70706d2283773f19b42312bc34e36c4717a9ce62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haeh...@amd.com>
Date: Mon, 12 Dec 2016 15:02:27 +0100
Subject: [PATCH] drm/amd/amdgpu: reserve shadows of page directory and tables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Nicolai Hähnle <nicolai.haeh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  9 +++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 10 +++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 45 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  3 +++
 5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f53e52f..7a84648 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -916,20 +916,21 @@ struct amdgpu_cs_parser {
 	unsigned		nchunks;
 	struct amdgpu_cs_chunk	*chunks;
 
 	/* scheduler job object */
 	struct amdgpu_job	*job;
 
 	/* buffer objects */
 	struct ww_acquire_ctx		ticket;
 	struct amdgpu_bo_list		*bo_list;
 	struct amdgpu_bo_list_entry	vm_pd;
+	struct amdgpu_bo_list_entry	*vm_pd_pt_shadows;
 	struct list_head		validated;
 	struct dma_fence		*fence;
 	uint64_t			bytes_moved_threshold;
 	uint64_t			bytes_moved;
 	struct amdgpu_bo_list_entry	*evictable;
 
 	/* user fence */
 	struct amdgpu_bo_list_entry	uf_entry;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 985f842..eecf2eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -510,20 +510,26 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	p->bo_list = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle);
 	if (p->bo_list) {
 		need_mmap_lock = p->bo_list->first_userptr !=
 			p->bo_list->num_entries;
 		amdgpu_bo_list_get_list(p->bo_list, &p->validated);
 	}
 
 	INIT_LIST_HEAD(&duplicates);
 	amdgpu_vm_get_pd_bo(&fpriv->vm, &p->validated, &p->vm_pd);
 
+	r = amdgpu_vm_get_pd_pt_shadows(&fpriv->vm, &p->validated, &p->vm_pd_pt_shadows);
+	if (r) {
+		DRM_ERROR("amdgpu_vm_get_pd_pt_shadows failed.\n");
+		goto error_release_list;
+	}
+
 	if (p->uf_entry.robj)
 		list_add(&p->uf_entry.tv.head, &p->validated);
 
 	if (need_mmap_lock)
 		down_read(&current->mm->mmap_sem);
 
 	while (1) {
 		struct list_head need_pages;
 		unsigned i;
 
@@ -673,20 +679,21 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	if (r) {
 		amdgpu_vm_move_pt_bos_in_lru(p->adev, &fpriv->vm);
 		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
 	}
 
 error_free_pages:
 
 	if (need_mmap_lock)
 		up_read(&current->mm->mmap_sem);
 
+error_release_list:
 	if (p->bo_list) {
 		for (i = p->bo_list->first_userptr;
 		     i < p->bo_list->num_entries; ++i) {
 			e = &p->bo_list->array[i];
 
 			if (!e->user_pages)
 				continue;
 
 			release_pages(e->user_pages,
 				      e->robj->tbo.ttm->num_pages,
@@ -736,20 +743,22 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error, bo
 		ttm_eu_backoff_reservation(&parser->ticket,
 					   &parser->validated);
 	}
 	dma_fence_put(parser->fence);
 
 	if (parser->ctx)
 		amdgpu_ctx_put(parser->ctx);
 	if (parser->bo_list)
 		amdgpu_bo_list_put(parser->bo_list);
 
+	drm_free_large(parser->vm_pd_pt_shadows);
+
 	for (i = 0; i < parser->nchunks; i++)
 		drm_free_large(parser->chunks[i].kdata);
 	kfree(parser->chunks);
 	if (parser->job)
 		amdgpu_job_free(parser->job);
 	amdgpu_bo_unref(&parser->uf_entry.robj);
 }
 
 static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p,
 				   struct amdgpu_vm *vm)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index ffb3e70..379ccde 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -495,39 +495,44 @@ static int amdgpu_gem_va_check(void *param, struct amdgpu_bo *bo)
  *
  * Update the bo_va directly after setting it's address. Errors are not
  * vital here, so they are not reported back to userspace.
  */
 static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
 				    struct amdgpu_bo_va *bo_va,
 				    uint32_t operation)
 {
 	struct ttm_validate_buffer tv, *entry;
 	struct amdgpu_bo_list_entry vm_pd;
+	struct amdgpu_bo_list_entry *vm_pd_pt_shadows;
 	struct amdgpu_bo *pd_shadow;
 	struct ww_acquire_ctx ticket;
 	struct list_head list, duplicates;
 	unsigned domain;
 	int r;
 
 	INIT_LIST_HEAD(&list);
 	INIT_LIST_HEAD(&duplicates);
 
 	tv.bo = &bo_va->bo->tbo;
 	tv.shared = true;
 	list_add(&tv.head, &list);
 
 	amdgpu_vm_get_pd_bo(bo_va->vm, &list, &vm_pd);
 
+	r = amdgpu_vm_get_pd_pt_shadows(bo_va->vm, &list, &vm_pd_pt_shadows);
+	if (r)
+		goto error_print;
+
 	/* Provide duplicates to avoid -EALREADY */
 	r = ttm_eu_reserve_buffers(&ticket, &list, true, &duplicates);
 	if (r)
-		goto error_print;
+		goto error_reserve_failed;
 
 	list_for_each_entry(entry, &list, head) {
 		domain = amdgpu_mem_type_to_domain(entry->bo->mem.mem_type);
 		/* if anything is swapped out don't swap it in here,
 		   just abort and wait for the next CS */
 		if (domain == AMDGPU_GEM_DOMAIN_CPU)
 			goto error_unreserve;
 	}
 
 	/* Also abort if the page directory shadow has been swapped out. */
@@ -551,20 +556,23 @@ static void amdgpu_gem_va_update_vm(struct amdgpu_device *adev,
 	r = amdgpu_vm_clear_freed(adev, bo_va->vm);
 	if (r)
 		goto error_unreserve;
 
 	if (operation == AMDGPU_VA_OP_MAP)
 		r = amdgpu_vm_bo_update(adev, bo_va, false);
 
 error_unreserve:
 	ttm_eu_backoff_reservation(&ticket, &list);
 
+error_reserve_failed:
+	drm_free_large(vm_pd_pt_shadows);
+
 error_print:
 	if (r && r != -ERESTARTSYS)
 		DRM_ERROR("Couldn't update BO_VA (%d)\n", r);
 }
 
 int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *filp)
 {
 	struct drm_amdgpu_gem_va *args = data;
 	struct drm_gem_object *gobj;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1dda932..b63a9bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -109,20 +109,65 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 {
 	entry->robj = vm->page_directory;
 	entry->priority = 0;
 	entry->tv.bo = &vm->page_directory->tbo;
 	entry->tv.shared = true;
 	entry->user_pages = NULL;
 	list_add(&entry->tv.head, validated);
 }
 
 /**
+ * TODO document
+ */
+int amdgpu_vm_get_pd_pt_shadows(struct amdgpu_vm *vm,
+				struct list_head *validated,
+				struct amdgpu_bo_list_entry **pentries)
+{
+	struct amdgpu_bo *shadow;
+	struct amdgpu_bo_list_entry *entry;
+	unsigned nentries;
+	unsigned i;
+
+	*pentries = drm_calloc_large(vm->max_pde_used + 2,
+				     sizeof(struct amdgpu_bo_list_entry));
+	if (!*pentries)
+		return -ENOMEM;
+
+	nentries = 0;
+
+	if (vm->page_directory->shadow) {
+		shadow = vm->page_directory->shadow;
+		entry = &(*pentries)[nentries++];
+		entry->robj = shadow;
+		entry->tv.bo = &shadow->tbo;
+		entry->tv.shared = true;
+		list_add(&entry->tv.head, validated);
+	}
+
+	for (i = 0; i <= vm->max_pde_used; ++i) {
+		struct amdgpu_bo *bo = vm->page_tables[i].bo;
+
+		if (!bo || !bo->shadow)
+			continue;
+
+		shadow = bo->shadow;
+		entry = &(*pentries)[nentries++];
+		entry->robj = shadow;
+		entry->tv.bo = &shadow->tbo;
+		entry->tv.shared = true;
+		list_add(&entry->tv.head, validated);
+	}
+
+	return 0;
+}
+
+/**
  * amdgpu_vm_validate_pt_bos - validate the page table BOs
  *
  * @adev: amdgpu device pointer
  * @vm: vm providing the BOs
  * @validate: callback to do the validation
  * @param: parameter for the validation callback
  *
  * Validate the page table BOs on command submission if neccessary.
  */
 int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index adbc2f5..ad0b87a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -159,20 +159,23 @@ struct amdgpu_vm_manager {
 	atomic64_t				client_counter;
 };
 
 void amdgpu_vm_manager_init(struct amdgpu_device *adev);
 void amdgpu_vm_manager_fini(struct amdgpu_device *adev);
 int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 			 struct list_head *validated,
 			 struct amdgpu_bo_list_entry *entry);
+int amdgpu_vm_get_pd_pt_shadows(struct amdgpu_vm *vm,
+				struct list_head *validated,
+				struct amdgpu_bo_list_entry **pentries);
 int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			      int (*callback)(void *p, struct amdgpu_bo *bo),
 			      void *param);
 void amdgpu_vm_move_pt_bos_in_lru(struct amdgpu_device *adev,
 				  struct amdgpu_vm *vm);
 int amdgpu_vm_grab_id(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
 		      struct amdgpu_sync *sync, struct dma_fence *fence,
 		      struct amdgpu_job *job);
 int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job);
 void amdgpu_vm_reset_id(struct amdgpu_device *adev, unsigned vm_id);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to