On 02/03/2015 11:11 AM, Alex Williamson wrote: > On Thu, 2015-01-29 at 20:21 +1100, Alexey Kardashevskiy wrote: >> The existing implementation accounts the whole DMA window in >> the locked_vm counter which is going to be even worse with multiple >> containers and huge DMA windows. >> >> This introduces 2 ioctls to register/unregister DMA memory which >> receive user space address and size of the memory region which >> needs to be pinned/unpinned and counted in locked_vm. >> >> If any memory region was registered, all subsequent DMA map requests >> should address already pinned memory. If no memory was registered, >> then the amount of memory required for a single default memory will be >> accounted when the container is enabled and every map/unmap will pin/unpin >> a page. >> >> Dynamic DMA window and in-kernel acceleration will require memory to >> be registered in order to work. >> >> The accounting is done per VFIO container. When the support of >> multiple groups per container is added, we will have accurate locked_vm >> accounting. >> >> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru> >> --- >> drivers/vfio/vfio_iommu_spapr_tce.c | 333 >> ++++++++++++++++++++++++++++++++---- >> include/uapi/linux/vfio.h | 29 ++++ >> 2 files changed, 331 insertions(+), 31 deletions(-) >> >> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c >> b/drivers/vfio/vfio_iommu_spapr_tce.c >> index 8256275..d0987ae 100644 >> --- a/drivers/vfio/vfio_iommu_spapr_tce.c >> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c >> @@ -86,8 +86,169 @@ struct tce_container { >> struct mutex lock; >> struct iommu_group *grp; >> bool enabled; >> + struct list_head mem_list; >> }; >> >> +struct tce_memory { >> + struct list_head next; >> + struct rcu_head rcu; >> + __u64 vaddr; >> + __u64 size; >> + __u64 pfns[]; >> +}; > > So we're using 2MB of kernel memory per 1G of user mapped memory, right? > Or are we using bigger pages here? I'm not sure the kmalloc below is > the appropriate allocator for something that can be so large.
ok, vmalloc it is then. >> + >> +static void tce_unpin_pages(struct tce_container *container, >> + struct tce_memory *mem, __u64 vaddr, __u64 size) >> +{ >> + __u64 off; >> + struct page *page = NULL; >> + >> + >> + for (off = 0; off < size; off += PAGE_SIZE) { >> + if (!mem->pfns[off >> PAGE_SHIFT]) >> + continue; >> + >> + page = pfn_to_page(mem->pfns[off >> PAGE_SHIFT]); >> + if (!page) >> + continue; >> + >> + put_page(page); >> + mem->pfns[off >> PAGE_SHIFT] = 0; >> + } > > Seems cleaner to count by 1 rather than PAGE_SIZE (ie. shift size once > instead of off 3 times). > >> +} >> + >> +static void release_tce_memory(struct rcu_head *head) >> +{ >> + struct tce_memory *mem = container_of(head, struct tce_memory, rcu); >> + >> + kfree(mem); >> +} >> + >> +static void tce_do_unregister_pages(struct tce_container *container, >> + struct tce_memory *mem) >> +{ >> + tce_unpin_pages(container, mem, mem->vaddr, mem->size); >> + decrement_locked_vm(mem->size); >> + list_del_rcu(&mem->next); >> + call_rcu_sched(&mem->rcu, release_tce_memory); >> +} >> + >> +static long tce_unregister_pages(struct tce_container *container, >> + __u64 vaddr, __u64 size) >> +{ >> + struct tce_memory *mem, *memtmp; >> + >> + if (container->enabled) >> + return -EBUSY; >> + >> + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) >> + return -EINVAL; >> + >> + list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) { >> + if ((mem->vaddr == vaddr) && (mem->size == size)) { >> + tce_do_unregister_pages(container, mem); >> + return 0; >> + } >> + } >> + >> + return -ENOENT; >> +} >> + >> +static long tce_pin_pages(struct tce_container *container, >> + struct tce_memory *mem, __u64 vaddr, __u64 size) >> +{ >> + __u64 off; >> + struct page *page = NULL; >> + >> + for (off = 0; off < size; off += PAGE_SIZE) { >> + if (1 != get_user_pages_fast(vaddr + off, >> + 1/* pages */, 1/* iswrite */, &page)) { >> + tce_unpin_pages(container, mem, vaddr, off); >> + return -EFAULT; >> + } >> + >> + mem->pfns[off >> PAGE_SHIFT] = page_to_pfn(page); >> + } >> + >> + return 0; >> +} >> + >> +static long tce_register_pages(struct tce_container *container, >> + __u64 vaddr, __u64 size) >> +{ >> + long ret; >> + struct tce_memory *mem; >> + >> + if (container->enabled) >> + return -EBUSY; >> + >> + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || >> + ((vaddr + size) < vaddr)) >> + return -EINVAL; >> + >> + /* Any overlap with registered chunks? */ >> + rcu_read_lock(); >> + list_for_each_entry_rcu(mem, &container->mem_list, next) { >> + if ((mem->vaddr < (vaddr + size)) && >> + (vaddr < (mem->vaddr + mem->size))) { >> + ret = -EBUSY; >> + goto unlock_exit; >> + } >> + } >> + >> + ret = try_increment_locked_vm(size >> PAGE_SHIFT); >> + if (ret) >> + goto unlock_exit; >> + >> + mem = kzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)), GFP_KERNEL); > > > I suspect that userspace can break kmalloc with the potential size of > this structure. You might need a vmalloc. I also wonder if there isn't > a more efficient tree structure to use. Right, I'll use vmalloc. All this time I was thinking kmalloc() allocates non-contiguous memory :) How would the tree be more efficient here? I store pfns once and unpin them once as well. 
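(For reference, with 4 KiB system pages the pfns[] array needs 1 GiB / 4 KiB = 262144 slots of 8 bytes each, i.e. roughly 2 MiB of kernel memory per registered gigabyte, so a physically contiguous kmalloc() is indeed a poor fit.) A minimal sketch of the vmalloc-based allocation; the helper names below are invented for illustration and are not from the actual respin:

#include <linux/vmalloc.h>

/*
 * Illustrative sketch only (helper names are made up): back the pfns[]
 * array with vmalloc so a large registered region does not require
 * physically contiguous kernel memory.
 */
static struct tce_memory *tce_mem_alloc(__u64 size)
{
	/* one 8-byte pfn slot per system page of the registered region */
	return vzalloc(sizeof(struct tce_memory) + (size >> (PAGE_SHIFT - 3)));
}

static void tce_mem_free(struct tce_memory *mem)
{
	vfree(mem);	/* release_tce_memory() would use this instead of kfree() */
}

The flat array keeps pin/unpin at one lookup per page at the cost of 8 bytes per page, which is the trade-off against a tree discussed above.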
>> + if (!mem) >> + goto unlock_exit; >> + >> + if (tce_pin_pages(container, mem, vaddr, size)) >> + goto free_exit; >> + >> + mem->vaddr = vaddr; >> + mem->size = size; >> + >> + list_add_rcu(&mem->next, &container->mem_list); >> + rcu_read_unlock(); >> + >> + return 0; >> + >> +free_exit: >> + kfree(mem); >> + >> +unlock_exit: >> + decrement_locked_vm(size >> PAGE_SHIFT); >> + rcu_read_unlock(); >> + >> + return ret; >> +} >> + >> +static inline bool tce_preregistered(struct tce_container *container) >> +{ >> + return !list_empty(&container->mem_list); >> +} >> + >> +static bool tce_pinned(struct tce_container *container, >> + __u64 vaddr, __u64 size) >> +{ >> + struct tce_memory *mem; >> + bool ret = false; >> + >> + rcu_read_lock(); >> + list_for_each_entry_rcu(mem, &container->mem_list, next) { >> + if ((mem->vaddr <= vaddr) && >> + (vaddr + size <= mem->vaddr + mem->size)) { >> + ret = true; >> + break; >> + } >> + } >> + rcu_read_unlock(); >> + >> + return ret; >> +} >> + >> static bool tce_check_page_size(struct page *page, unsigned page_shift) >> { >> unsigned shift; >> @@ -166,14 +327,16 @@ static int tce_iommu_enable(struct tce_container >> *container) >> * as this information is only available from KVM and VFIO is >> * KVM agnostic. >> */ >> - iommu = iommu_group_get_iommudata(container->grp); >> - if (!iommu) >> - return -EFAULT; >> + if (!tce_preregistered(container)) { >> + iommu = iommu_group_get_iommudata(container->grp); >> + if (!iommu) >> + return -EFAULT; >> >> - tbl = &iommu->tables[0]; >> - ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl)); >> - if (ret) >> - return ret; >> + tbl = &iommu->tables[0]; >> + ret = try_increment_locked_vm(IOMMU_TABLE_PAGES(tbl)); >> + if (ret) >> + return ret; >> + } >> >> container->enabled = true; >> >> @@ -193,12 +356,14 @@ static void tce_iommu_disable(struct tce_container >> *container) >> if (!container->grp || !current->mm) >> return; >> >> - iommu = iommu_group_get_iommudata(container->grp); >> - if (!iommu) >> - return; >> + if (!tce_preregistered(container)) { >> + iommu = iommu_group_get_iommudata(container->grp); >> + if (!iommu) >> + return; >> >> - tbl = &iommu->tables[0]; >> - decrement_locked_vm(IOMMU_TABLE_PAGES(tbl)); >> + tbl = &iommu->tables[0]; >> + decrement_locked_vm(IOMMU_TABLE_PAGES(tbl)); >> + } >> } >> >> static void *tce_iommu_open(unsigned long arg) >> @@ -215,6 +380,7 @@ static void *tce_iommu_open(unsigned long arg) >> return ERR_PTR(-ENOMEM); >> >> mutex_init(&container->lock); >> + INIT_LIST_HEAD_RCU(&container->mem_list); >> >> return container; >> } >> @@ -222,6 +388,7 @@ static void *tce_iommu_open(unsigned long arg) >> static void tce_iommu_release(void *iommu_data) >> { >> struct tce_container *container = iommu_data; >> + struct tce_memory *mem, *memtmp; >> >> WARN_ON(container->grp); >> tce_iommu_disable(container); >> @@ -229,14 +396,19 @@ static void tce_iommu_release(void *iommu_data) >> if (container->grp) >> tce_iommu_detach_group(iommu_data, container->grp); >> >> + list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) >> + tce_do_unregister_pages(container, mem); >> + >> mutex_destroy(&container->lock); >> >> kfree(container); >> } >> >> -static void tce_iommu_unuse_page(unsigned long oldtce) >> +static void tce_iommu_unuse_page(struct tce_container *container, >> + unsigned long oldtce) >> { >> struct page *page; >> + bool do_put = !tce_preregistered(container); >> >> if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE))) >> return; >> @@ -245,7 +417,8 @@ static void 
tce_iommu_unuse_page(unsigned long oldtce) >> if (oldtce & TCE_PCI_WRITE) >> SetPageDirty(page); >> >> - put_page(page); >> + if (do_put) >> + put_page(page); >> } >> >> static int tce_iommu_clear(struct tce_container *container, >> @@ -261,7 +434,7 @@ static int tce_iommu_clear(struct tce_container >> *container, >> if (ret) >> continue; >> >> - tce_iommu_unuse_page(oldtce); >> + tce_iommu_unuse_page(container, oldtce); >> } >> >> return 0; >> @@ -279,42 +452,91 @@ static enum dma_data_direction >> tce_iommu_direction(unsigned long tce) >> return DMA_NONE; >> } >> >> +static unsigned long tce_get_hva_cached(struct tce_container *container, >> + unsigned page_shift, unsigned long tce) >> +{ >> + struct tce_memory *mem; >> + struct page *page = NULL; >> + unsigned long hva = -1; >> + >> + tce &= ~(TCE_PCI_READ | TCE_PCI_WRITE); >> + rcu_read_lock(); >> + list_for_each_entry_rcu(mem, &container->mem_list, next) { >> + if ((mem->vaddr <= tce) && (tce < (mem->vaddr + mem->size))) { >> + unsigned long gfn = (tce - mem->vaddr) >> PAGE_SHIFT; >> + unsigned long hpa = mem->pfns[gfn] << PAGE_SHIFT; >> + >> + page = pfn_to_page(mem->pfns[gfn]); >> + >> + if (!tce_check_page_size(page, page_shift)) >> + break; >> + >> + hva = (unsigned long) __va(hpa); >> + break; >> + } >> + } >> + rcu_read_unlock(); >> + >> + return hva; >> +} >> + >> +static unsigned long tce_get_hva(struct tce_container *container, >> + unsigned page_shift, unsigned long tce) >> +{ >> + long ret = 0; >> + struct page *page = NULL; >> + unsigned long hva = -1; >> + enum dma_data_direction direction = tce_iommu_direction(tce); >> + >> + ret = get_user_pages_fast(tce & PAGE_MASK, 1, >> + direction != DMA_TO_DEVICE, &page); >> + if (unlikely(ret != 1)) >> + return -1; >> + >> + if (!tce_check_page_size(page, page_shift)) { >> + put_page(page); >> + return -1; >> + } >> + >> + hva = (unsigned long) page_address(page) + >> + (tce & ~((1ULL << page_shift) - 1) & ~PAGE_MASK); >> + >> + return hva; >> +} >> + >> static long tce_iommu_build(struct tce_container *container, >> struct iommu_table *tbl, >> unsigned long entry, unsigned long tce, unsigned long pages) >> { >> long i, ret = 0; >> - struct page *page = NULL; >> unsigned long hva, oldtce; >> enum dma_data_direction direction = tce_iommu_direction(tce); >> + bool do_put = false; >> >> for (i = 0; i < pages; ++i) { >> - ret = get_user_pages_fast(tce & PAGE_MASK, 1, >> - direction != DMA_TO_DEVICE, &page); >> - if (unlikely(ret != 1)) { >> - ret = -EFAULT; >> - break; >> + hva = tce_get_hva_cached(container, tbl->it_page_shift, tce); >> + if (hva == -1) { >> + do_put = true; >> + WARN_ON_ONCE(1); >> + hva = tce_get_hva(container, tbl->it_page_shift, tce); >> } >> >> - if (!tce_check_page_size(page, tbl->it_page_shift)) { >> - ret = -EFAULT; >> - break; >> - } >> - >> - hva = (unsigned long) page_address(page) + >> - (tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK); >> oldtce = 0; >> - >> ret = iommu_tce_xchg(tbl, entry + i, hva, &oldtce, direction); >> if (ret) { >> - put_page(page); >> + if (do_put) >> + put_page(pfn_to_page(__pa(hva) >> PAGE_SHIFT)); >> pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, >> ret=%ld\n", >> __func__, entry << tbl->it_page_shift, >> tce, ret); >> break; >> } >> >> - tce_iommu_unuse_page(oldtce); >> + if (do_put) >> + put_page(pfn_to_page(__pa(hva) >> PAGE_SHIFT)); >> + >> + tce_iommu_unuse_page(container, oldtce); >> + >> tce += IOMMU_PAGE_SIZE(tbl); >> } >> >> @@ -416,6 +638,11 @@ static long tce_iommu_ioctl(void *iommu_data, >> if (ret) >> return ret; >> >> + 
/* If any memory is pinned, only allow pages from that region */ >> + if (tce_preregistered(container) && >> + !tce_pinned(container, param.vaddr, param.size)) >> + return -EPERM; >> + >> ret = tce_iommu_build(container, tbl, >> param.iova >> tbl->it_page_shift, >> tce, param.size >> tbl->it_page_shift); >> @@ -464,6 +691,50 @@ static long tce_iommu_ioctl(void *iommu_data, >> >> return ret; >> } >> + case VFIO_IOMMU_REGISTER_MEMORY: { >> + struct vfio_iommu_type1_register_memory param; >> + >> + minsz = offsetofend(struct vfio_iommu_type1_register_memory, >> + size); >> + >> + if (copy_from_user(&param, (void __user *)arg, minsz)) >> + return -EFAULT; >> + >> + if (param.argsz < minsz) >> + return -EINVAL; >> + >> + /* No flag is supported now */ >> + if (param.flags) >> + return -EINVAL; >> + >> + mutex_lock(&container->lock); >> + ret = tce_register_pages(container, param.vaddr, param.size); >> + mutex_unlock(&container->lock); >> + >> + return ret; >> + } >> + case VFIO_IOMMU_UNREGISTER_MEMORY: { >> + struct vfio_iommu_type1_unregister_memory param; >> + >> + minsz = offsetofend(struct vfio_iommu_type1_unregister_memory, >> + size); >> + >> + if (copy_from_user(&param, (void __user *)arg, minsz)) >> + return -EFAULT; >> + >> + if (param.argsz < minsz) >> + return -EINVAL; >> + >> + /* No flag is supported now */ >> + if (param.flags) >> + return -EINVAL; >> + >> + mutex_lock(&container->lock); >> + tce_unregister_pages(container, param.vaddr, param.size); >> + mutex_unlock(&container->lock); >> + >> + return 0; >> + } >> case VFIO_IOMMU_ENABLE: >> mutex_lock(&container->lock); >> ret = tce_iommu_enable(container); >> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h >> index 29715d2..2bb0c9b 100644 >> --- a/include/uapi/linux/vfio.h >> +++ b/include/uapi/linux/vfio.h >> @@ -437,6 +437,35 @@ struct vfio_iommu_type1_dma_unmap { >> #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) >> #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) >> >> +/** >> + * VFIO_IOMMU_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct >> vfio_iommu_type1_register_memory) >> + * >> + * Registers user space memory where DMA is allowed. It pins >> + * user pages and does the locked memory accounting so >> + * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls >> + * get simpler. >> + */ >> +struct vfio_iommu_type1_register_memory { >> + __u32 argsz; >> + __u32 flags; >> + __u64 vaddr; /* Process virtual address */ >> + __u64 size; /* Size of mapping (bytes) */ >> +}; >> +#define VFIO_IOMMU_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) >> + >> +/** >> + * VFIO_IOMMU_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct >> vfio_iommu_type1_unregister_memory) >> + * >> + * Unregisters user space memory registered with VFIO_IOMMU_REGISTER_MEMORY. >> + */ >> +struct vfio_iommu_type1_unregister_memory { >> + __u32 argsz; >> + __u32 flags; >> + __u64 vaddr; /* Process virtual address */ >> + __u64 size; /* Size of mapping (bytes) */ >> +}; >> +#define VFIO_IOMMU_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) >> + > > Is the user allowed to unregister arbitrary sub-regions of previously > registered memory? (I think I know the answer, but it should be > documented) The answer is "no" :) I'll update Documentation/vfio.txt. > Why are these "type1" structures, shouldn't they be down below? Pretty much because these do not look like they do anything powerpc-specific from the userspace perspective, just like DMA map/unmap.
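For completeness, a rough userspace sketch of driving the new ioctl as defined in the uapi chunk above; the helper name and error handling are made up for illustration:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * Hypothetical example: register one page-aligned chunk of guest RAM
 * with an open (and attached) container fd. Returns the raw ioctl()
 * result: 0 on success, -1 with errno set on failure (e.g. EBUSY if
 * the region overlaps an already registered one; registration can
 * also fail when the locked_vm accounting hits RLIMIT_MEMLOCK).
 */
static int register_ram_region(int container_fd, void *vaddr, uint64_t size)
{
	struct vfio_iommu_type1_register_memory reg = {
		.argsz = sizeof(reg),
		.flags = 0,				/* no flags defined yet */
		.vaddr = (uint64_t)(uintptr_t)vaddr,	/* page aligned */
		.size = size,				/* multiple of PAGE_SIZE */
	};

	return ioctl(container_fd, VFIO_IOMMU_REGISTER_MEMORY, &reg);
}

In the QEMU flow described below, a memory listener would call something like this for every RAM slot once the container starts being used; with any memory registered, VFIO_IOMMU_MAP_DMA then returns -EPERM for addresses outside the registered regions.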
> Do we need an extension or flag bit to describe these as present or is > it sufficient to call and fail? Sorry, I do not follow you here. A flag to describe what as present? As it is now, in QEMU I set up a memory listener which walks through all RAM regions and calls VFIO_IOMMU_REGISTER_MEMORY for every slot once the container starts being used, and I expect that this can fail (because of RLIMIT, etc). > Do we need two ioctls or one? There are map/unmap, enable/disable and set/unset-container pairs, so I thought a pin/unpin pair would look natural here too, no? > What about Documentation/vfio.txt? Yep. Thanks for the review! > >> /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */ >> >> /* > > > -- Alexey