On 04/29/2015 05:01 PM, David Gibson wrote:
On Sat, Apr 25, 2015 at 10:14:52PM +1000, Alexey Kardashevskiy wrote:
We are adding support for DMA memory pre-registration to be used in
conjunction with VFIO. The idea is that the userspace that is going to
run a guest may want to pre-register a userspace memory region so
it all gets pinned once and never goes away. With this done,
a hypervisor will not have to pin/unpin pages on every DMA map/unmap
request. This helps with multiple pinnings of the same memory
and with in-kernel acceleration of DMA requests.

This adds a list of memory regions to mm_context_t. Each region consists
of a header and a list of physical addresses. This adds an API to:
1. register/unregister memory regions;
2. do final cleanup (which puts all pre-registered pages);
3. do userspace to physical address translation;
4. manage a mapped pages counter; when it is zero, it is safe to
unregister the region.

Multiple registrations of the same region are allowed; a kref is used
to track the number of registrations.
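
For illustration, the intended call flow is roughly as follows (a
sketch only, with a hypothetical caller and error handling omitted):

	struct mm_iommu_table_group_mem_t *mem;
	unsigned long hpa;

	/* 1. register the region; this pins every page once */
	if (mm_iommu_alloc(ua, entries, &mem) == -EBUSY)
		mem = mm_iommu_get(ua, entries); /* already registered */

	/* 3. + 4. on DMA map: translate, bump the mapped counter */
	mem = mm_iommu_lookup(ua, size);
	mm_iommu_ua_to_hpa(mem, ua, &hpa);
	mm_iommu_mapped_update(mem, true);

	/* on DMA unmap: drop the mapped counter */
	mm_iommu_mapped_update(mem, false);

	/* 2. unregister; -EBUSY while any page is still mapped */
	mm_iommu_put(mem);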

Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
Changes:
v8:
* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
* fixed error fallback loop (s/[i]/[j]/)
---
  arch/powerpc/include/asm/mmu-hash64.h      |   3 +
  arch/powerpc/include/asm/mmu_context.h     |  17 +++
  arch/powerpc/mm/Makefile                   |   1 +
  arch/powerpc/mm/mmu_context_hash64.c       |   6 +
  arch/powerpc/mm/mmu_context_hash64_iommu.c | 215 +++++++++++++++++++++++++++++
  5 files changed, 242 insertions(+)
  create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c

diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81..a82f534 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
        /* for 4K PTE fragment support */
        void *pte_frag;
  #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       struct list_head iommu_group_mem_list;
+#endif

Urgh.  I know I'm not one to talk, having done the hugepage crap in
there, but man, mm_context_t has grown into a bloated mess from
originally being intended as just a context ID integer :/.


Where else would it go, then?... The other way would be some global pid<->iommu_group_mem_list map, which would need to be available from both VFIO and KVM.


  } mm_context_t;


diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eb..d6116ca 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,23 @@
   */
  extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
  extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
+               struct mm_iommu_table_group_mem_t **pmem);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
+               unsigned long entries);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+               unsigned long size);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+               unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_update(struct mm_iommu_table_group_mem_t *mem,
+               bool inc);
+#endif

  extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
  extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 9c8770b..e216704 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)        += subpage-prot.o
  obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
  obj-$(CONFIG_HIGHMEM)         += highmem.o
  obj-$(CONFIG_PPC_COPRO_BASE)  += copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_hash64_iommu.o
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 178876ae..eb3080c 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
  #ifdef CONFIG_PPC_64K_PAGES
        mm->context.pte_frag = NULL;
  #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
+#endif
        return 0;
  }

@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)

  void destroy_context(struct mm_struct *mm)
  {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+       mm_iommu_cleanup(&mm->context);
+#endif

  #ifdef CONFIG_PPC_ICSWX
        drop_cop(mm->context.acop, mm);
diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c
new file mode 100644
index 0000000..af7668c
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
@@ -0,0 +1,215 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. <a...@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/kref.h>
+#include <asm/mmu_context.h>
+
+struct mm_iommu_table_group_mem_t {
+       struct list_head next;
+       struct rcu_head rcu;
+       struct kref kref;       /* one reference per VFIO container */
+       atomic_t mapped;        /* number of currently mapped pages */
+       u64 ua;                 /* userspace address */
+       u64 entries;            /* number of entries in hpas[] */

Maybe 'npages', since this is used to determine the range of user
addresses covered, not just the number of entries in hpas.


Hm. Ok :)


+       u64 *hpas;              /* vmalloc'ed */
+};
+
+bool mm_iommu_preregistered(void)
+{
+       if (!current || !current->mm)
+               return false;
+
+       return !list_empty(&current->mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+long mm_iommu_alloc(unsigned long ua, unsigned long entries,
+               struct mm_iommu_table_group_mem_t **pmem)
+{
+       struct mm_iommu_table_group_mem_t *mem;
+       long i, j;
+       struct page *page = NULL;
+
+       list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+                       next) {
+               if ((mem->ua == ua) && (mem->entries == entries))
+                       return -EBUSY;
+
+               /* Overlap? */
+               if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+                               (ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
+                       return -EINVAL;
+       }
+
+       mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+       if (!mem)
+               return -ENOMEM;
+
+       mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
+       if (!mem->hpas) {
+               kfree(mem);
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < entries; ++i) {
+               if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+                                       1/* pages */, 1/* iswrite */, &page)) {

Do you really need to call gup() in a loop?  It can do more than one
page at a time..


Ufff. gup() returns the number of pages pinned, or -errno if none were pinned. So if the return value is positive but less than the requested number of pages, it is still an error. Functions like this make me nervous :(
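
For reference, a batched variant would look roughly like this (a
sketch only; it needs a temporary pages[] array, which is exactly
what this patch avoids keeping around):

	struct page **pages = vmalloc(entries * sizeof(pages[0]));
	long pinned, i;

	if (!pages)
		return -ENOMEM;

	/* one call pins the whole range */
	pinned = get_user_pages_fast(ua, entries, 1/* iswrite */, pages);
	if (pinned != entries) {
		/* a short positive return is still a failure: put
		 * back whatever did get pinned before bailing out */
		for (i = 0; i < pinned; ++i)
			put_page(pages[i]);
		vfree(pages);
		return -EFAULT;
	}

	for (i = 0; i < entries; ++i)
		mem->hpas[i] = page_to_pfn(pages[i]) << PAGE_SHIFT;
	vfree(pages);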


That might work better if you kept a list of struct page *s instead of
hpas.

I only need struct page* when releasing the registered area. In other cases I just need a fast conversion from a userspace address to a host physical address, including in real mode. Ideally I would use page_address(), which will work in real mode in my case, but in general it does not have to. Using addresses rather than page structs makes it more explicit - I need an address, I store an address, simple.

I can change to page structs if you think it makes more sense, should I?
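
To make the trade-off concrete (illustration only; mem->pages is a
hypothetical field):

	/* storing physical addresses (this patch): translation is a
	 * single array load, see mm_iommu_ua_to_hpa() */
	mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
	*hpa = mem->hpas[i] | (ua & ~PAGE_MASK);

	/* storing page structs instead: the release path gets simpler
	 * (put_page(mem->pages[i])), but every translation pays for
	 * page_to_pfn() */
	mem->pages[i] = page;
	*hpa = (page_to_pfn(mem->pages[i]) << PAGE_SHIFT) |
			(ua & ~PAGE_MASK);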




+                       for (j = 0; j < i; ++j)
+                               put_page(pfn_to_page(
+                                               mem->hpas[j] >> PAGE_SHIFT));
+                       vfree(mem->hpas);
+                       kfree(mem);
+                       return -EFAULT;
+               }
+
+               mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+       }
+
+       kref_init(&mem->kref);
+       atomic_set(&mem->mapped, 0);
+       mem->ua = ua;
+       mem->entries = entries;
+       *pmem = mem;
+
+       list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_alloc);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+       long i;
+       struct page *page = NULL;
+
+       for (i = 0; i < mem->entries; ++i) {
+               if (!mem->hpas[i])
+                       continue;
+
+               page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+               if (!page)
+                       continue;
+
+               put_page(page);
+               mem->hpas[i] = 0;
+       }
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+       struct mm_iommu_table_group_mem_t *mem = container_of(head,
+                       struct mm_iommu_table_group_mem_t, rcu);
+
+       mm_iommu_unpin(mem);
+       vfree(mem->hpas);
+       kfree(mem);
+}
+
+static void mm_iommu_release(struct kref *kref)
+{
+       struct mm_iommu_table_group_mem_t *mem = container_of(kref,
+                       struct mm_iommu_table_group_mem_t, kref);
+
+       list_del_rcu(&mem->next);
+       call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
+               unsigned long entries)
+{
+       struct mm_iommu_table_group_mem_t *mem;
+
+       list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+                       next) {
+               if ((mem->ua == ua) && (mem->entries == entries)) {
+                       kref_get(&mem->kref);
+                       return mem;
+               }
+       }
+
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
+{
+       if (atomic_read(&mem->mapped))
+               return -EBUSY;

What prevents a race between the atomic_read() above and the release below?

Ouch. Nothing. And I cannot think of any nice fast solution here...
I could remove @mapped altogether and do kref_get/put(&mem->kref) instead; a container would hold one reference too. And add a flag to mm_iommu_table_group_mem_t to record whether mm_iommu_release() has been called - this way I will know when the very last reference is dropped, otherwise I'll return -EBUSY.

Or change mm_iommu_lookup() to do kref_get() and require every caller of it to also call mm_iommu_put(), and only call mm_iommu_mapped_update() while the reference is elevated. And change mm_iommu_put() to return a special code if that was the very last put() (which would be checked only by the VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY handler; other callers would not care).

Any ideas?

I am pretty sure there is something very cool (like RCU) that allows avoiding locks in this situation; I am just too ignorant to know it :)
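
For example, something along these lines for the second option (a
very rough sketch; kref_get_unless_zero() is needed because under
RCU the count may already have dropped to zero):

	/* mm_iommu_lookup() takes a reference which the caller must
	 * drop with mm_iommu_put() when done with the region */
	list_for_each_entry_rcu(mem,
			&current->mm->context.iommu_group_mem_list, next) {
		if ((mem->ua <= ua) && (ua + size <= mem->ua +
				(mem->entries << PAGE_SHIFT))) {
			if (!kref_get_unless_zero(&mem->kref))
				break; /* raced with the last put() */
			return mem;
		}
	}
	return NULL;

and mm_iommu_put() would become:

	long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
	{
		/* kref_put() returns 1 on the very last reference */
		return kref_put(&mem->kref, mm_iommu_release) ? 1 : 0;
	}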


+       kref_put(&mem->kref, mm_iommu_release);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+               unsigned long size)
+{
+       struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+       list_for_each_entry_rcu(mem,
+                       &current->mm->context.iommu_group_mem_list,
+                       next) {
+               if ((mem->ua <= ua) &&
+                               (ua + size <= mem->ua +
+                                (mem->entries << PAGE_SHIFT))) {
+                       ret = mem;
+                       break;
+               }
+       }
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+               unsigned long ua, unsigned long *hpa)

Return type should be int, it's just an error code.


Is it some generic rule that errors must always be "int"? I was just told that gcc on PPC64 generates an extra instruction to truncate a 64-bit long to a 32-bit int, so I am trying to use "long" everywhere. Very simple, but still an optimization :)


+{
+       const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+       u64 *va = &mem->hpas[entry];
+
+       if (entry >= mem->entries)
+               return -EFAULT;
+
+       *hpa = *va | (ua & ~PAGE_MASK);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+long mm_iommu_mapped_update(struct mm_iommu_table_group_mem_t *mem, bool inc)
+{
+       long ret = 0;
+
+       if (inc)
+               atomic_inc(&mem->mapped);
+       else
+               ret = atomic_dec_if_positive(&mem->mapped);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_update);

I think this would be clearer as separate inc and dec functions.

Okay.
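
I.e. roughly (a sketch; the names are made up):

	long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
	{
		atomic_inc(&mem->mapped);
		return 0;
	}

	long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
	{
		/* returns the new value, or -1 if it was already zero */
		return atomic_dec_if_positive(&mem->mapped);
	}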


+
+void mm_iommu_cleanup(mm_context_t *ctx)
+{
+       while (!list_empty(&ctx->iommu_group_mem_list)) {
+               struct mm_iommu_table_group_mem_t *mem;
+
+               mem = list_first_entry(&ctx->iommu_group_mem_list,
+                               struct mm_iommu_table_group_mem_t, next);
+               mm_iommu_release(&mem->kref);
+       }
+}



--
Alexey